In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from global_dialog import DialogModel, gl_device, daten, tokenisierer, lemmatisierer, stemming, parameter_encoder, stopp_woerter, satz_entcoder, MODEL_SAVE_PATH

In [2]:
# Notebook-level accumulators that the following cells fill in place:
#   g_woerter    - every token seen in the training patterns
#   g_kategorien - one entry per dialog category
#   g_xy         - (tokenised pattern, category) pairs
g_woerter, g_kategorien, g_xy = [], [], []

# Language identifier stored alongside the dialogs; used later for the stop-word lookup.
g_sprache = daten['data_dialoge']['sprache']

In [3]:
# Walk over every dialog definition and collect its category plus all of its
# tokenised example patterns.
for dialog in daten['data_dialoge']['dialoge']:
    kategorie = dialog['kategorie']
    g_kategorien.append(kategorie)

    # map() is lazy, so each pattern is tokenised right before it is recorded.
    for w in map(tokenisierer, dialog['muster']):
        g_woerter.extend(w)            # flat token list for the vocabulary
        g_xy.append((w, kategorie))    # training pair: tokens -> category


In [4]:
# Build the normalised vocabulary.
#
# The tokens are taken from g_xy (the tokenised patterns) rather than from
# g_woerter itself. On a fresh run both hold the same tokens in the same order,
# but reading from g_xy keeps this cell idempotent: executing it a second time
# no longer feeds already-stemmed words through the pipeline again.
#
# The stop-word collection is looked up once instead of once per token.
# Assumes stopp_woerter() returns the same collection for the same language on
# every call - TODO confirm.
stopp_liste = stopp_woerter(g_sprache)

# parameter_encoder() yields a pair; only its first element is stemmed and
# lemmatised, the second element is ignored here.
g_woerter = [
    lemmatisierer(stemming(kodiert))
    for mustersatz, _kategorie in g_xy
    for w in mustersatz
    if w not in stopp_liste
    for kodiert, _rest in [parameter_encoder(w)]
]

# NOTE(review): de-duplication via sorted(set(...)) of g_woerter / g_kategorien
# is currently disabled, so g_woerter keeps repeated stems in first-seen order
# and the "unique" wording in the last print below is nominal only.

print(len(g_xy), "patterns")
print(len(g_kategorien), "tags:", g_kategorien)
print(len(g_woerter), "unique stemmed words:", g_woerter)

14 patterns
6 tags: ['Begrüßungen', 'Verabschiedungen', 'Zeit', 'Wetter', 'Timer', 'Wikipedia']
34 unique stemmed words: ['hallo', 'wie', 'geht', "'s?", 'auf', 'wiederseh', 'tschuss', ',', 'bald', 'bi', 'nach', 'mal', 'tschuss', 'mach', 'gut', 'bi', 'bald', 'wie', 'spat', 'konnt', 'uhrzeit', 'sag', 'wa', 'aktuell', 'uhrzeit', 'wett', '[DATUM_EINHEIT]', '[ORT]', 'stell', 'tim', '[NUMMER]', '[ZEIT_EINHEIT]', 'wer', 'wa']


In [5]:
# Turn every (pattern, category) pair into one encoded feature vector and one
# integer class label.
kodierte_saetze = []
label_indizes = []

for mustersatz, kategorie in g_xy:  # one iteration per training pattern
    entcode_array = satz_entcoder(mustersatz, g_woerter)
    label = g_kategorien.index(kategorie)  # class id = position of the category in g_kategorien

    kodierte_saetze.append(entcode_array)
    label_indizes.append(label)

    # Debug dump: vocabulary, encoded vector and the raw tokens of this pattern.
    print(g_woerter, "\n", entcode_array, "\n", mustersatz, "\n")

X_train = np.array(kodierte_saetze)  # encoded patterns, one row per pattern
y_train = np.array(label_indizes)    # category index for each pattern

['hallo', 'wie', 'geht', "'s?", 'auf', 'wiederseh', 'tschuss', ',', 'bald', 'bi', 'nach', 'mal', 'tschuss', 'mach', 'gut', 'bi', 'bald', 'wie', 'spat', 'konnt', 'uhrzeit', 'sag', 'wa', 'aktuell', 'uhrzeit', 'wett', '[DATUM_EINHEIT]', '[ORT]', 'stell', 'tim', '[NUMMER]', '[ZEIT_EINHEIT]', 'wer', 'wa'] 
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] 
 ['Hallo', '!'] 

['hallo', 'wie', 'geht', "'s?", 'auf', 'wiederseh', 'tschuss', ',', 'bald', 'bi', 'nach', 'mal', 'tschuss', 'mach', 'gut', 'bi', 'bald', 'wie', 'spat', 'konnt', 'uhrzeit', 'sag', 'wa', 'aktuell', 'uhrzeit', 'wett', '[DATUM_EINHEIT]', '[ORT]', 'stell', 'tim', '[NUMMER]', '[ZEIT_EINHEIT]', 'wer', 'wa'] 
 [0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] 
 ['Wie', 'geht', "'s?"] 

['hallo', 'wie', 'geht', "'s?", 'auf', 'wiederseh', 'tschuss', ',', 'bald', 'bi', 'nach', 'mal', 'tschuss', 'mach', 'gut', 'bi', 'bald',

In [6]:
# --- Training schedule ---
h_epochs = 1000
h_batch_size = 8
h_learning_rate = 0.001

# --- Network dimensions ---
# Input width is the length of one encoded pattern (expected to match len(g_woerter)).
h_input_size = len(X_train[0])
h_hidden_layer_1_size, h_hidden_layer_2_size, h_hidden_layer_3_size = 128, 64, 32
# One output unit per entry in g_kategorien.
h_output_size = len(g_kategorien)

In [7]:
class DialogDataset(Dataset):
    """Map-style dataset pairing encoded patterns with their category indices.

    Args:
        x_data: Indexable collection of encoded patterns. When omitted, the
            notebook-level ``X_train`` is used.
        y_data: Indexable collection of labels, same length as ``x_data``.
            When omitted, the notebook-level ``y_train`` is used.

    Raises:
        ValueError: If ``x_data`` and ``y_data`` differ in length.
    """

    def __init__(self, x_data=None, y_data=None):
        # Fall back to the notebook globals so that DialogDataset() keeps working.
        self.x_data = X_train if x_data is None else x_data
        self.y_data = y_train if y_data is None else y_data

        if len(self.x_data) != len(self.y_data):
            raise ValueError(
                f"x_data and y_data must have the same length, "
                f"got {len(self.x_data)} and {len(self.y_data)}"
            )

        self.n_samples = len(self.x_data)  # number of training patterns

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.n_samples
    
# Instantiate the dataset; called without arguments it wraps the notebook-level
# X_train / y_train arrays.
dialog_dataset = DialogDataset()

In [8]:
# Mini-batch iterator over the dialog dataset; samples are reshuffled every
# epoch and loaded in the main process (no worker subprocesses).
train_loader = DataLoader(
    dialog_dataset,
    batch_size=h_batch_size,
    num_workers=0,
    shuffle=True,
)

In [9]:
# Build the classifier from the configured layer sizes, then move it to the
# target device.
model = DialogModel(
    h_input_size,
    h_hidden_layer_1_size,
    h_hidden_layer_2_size,
    h_hidden_layer_3_size,
    h_output_size,
)
model = model.to(gl_device)

# Cross-entropy over the category logits, optimised with Adam.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=h_learning_rate)

In [10]:
# Training loop: one optimiser step per mini-batch, progress line every 100 epochs.
for epoch in range(h_epochs):
    for woerter, labels in train_loader:
        woerter = woerter.to(gl_device)
        # CrossEntropyLoss needs int64 class indices, so cast while moving to the device.
        labels = labels.to(device=gl_device, dtype=torch.long)

        # Clear gradients from the previous step before the new forward/backward pass.
        optimizer.zero_grad()

        outputs = model(woerter)
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()

    abgeschlossen = epoch + 1
    if not abgeschlossen % 100:
        # The reported value is the loss of the last mini-batch of this epoch.
        print (f'Epoch [{abgeschlossen}/{h_epochs}], Loss: {loss.item():.4f}')

Epoch [100/1000], Loss: 0.0051
Epoch [200/1000], Loss: 0.0012
Epoch [300/1000], Loss: 0.0002
Epoch [400/1000], Loss: 0.0002
Epoch [500/1000], Loss: 0.0001
Epoch [600/1000], Loss: 0.0001
Epoch [700/1000], Loss: 0.0000
Epoch [800/1000], Loss: 0.0000
Epoch [900/1000], Loss: 0.0000
Epoch [1000/1000], Loss: 0.0000


In [11]:
# Checkpoint payload: the trained weights plus the layer sizes, vocabulary and
# category list that were used to build and train the model.
data_save = dict(
    model_state=model.state_dict(),
    input_size=h_input_size,
    hidden_1_size=h_hidden_layer_1_size,
    hidden_2_size=h_hidden_layer_2_size,
    hidden_3_size=h_hidden_layer_3_size,
    output_size=h_output_size,
    woerter=g_woerter,
    kategorien=g_kategorien,
)

In [12]:
# Write the checkpoint to disk; a failure is reported but does not abort the notebook.
try:
    torch.save(data_save, MODEL_SAVE_PATH)
except Exception as fehler:
    print(f'Fehler beim Speichern der Datei: {fehler}')
else:
    print(f'Datei gespeichert in {MODEL_SAVE_PATH}')

Datei gespeichert in assets\data.pth
