# Projekt -  Text generieren (buchstabenweise) mit Hilfe eines LSTM-Modells

Als Trainingsdatensatz wird das Buch "Die Verwandlung" von F. Kafka verwendet.

In [19]:
import numpy as np
import pickle

## Einlesen des Textes (Datensatzes)

Aus der Textdatei werden nur die Zeilen 60–1952 verwendet.


In [20]:
# Load the complete text file into memory.
with open("verwandlung.txt", encoding="utf-8") as source:
    raw_text = source.read()

# Only lines 60 to 1952 (1-based, inclusive) are used as the corpus.
FIRST_LINE = 60
LAST_LINE = 1952

# Re-join the selected lines with newlines so the paragraph structure survives.
all_lines = raw_text.split("\n")
contents = "\n".join(all_lines[FIRST_LINE - 1:LAST_LINE])


In [21]:
# Report the corpus size in characters (after restricting to the selected lines).
print(f"Total characters: {len(contents)}")

Total characters: 121130


## Datenformatierung

__Ziel:__ 

Das Modell soll auf Basis der letzten Zeichen (z. B. der letzten 40) das nächste Zeichen prädizieren. Dazu muss der Quelltext entsprechend für das Training vorbereitet werden:
- Text wird zerlegt in Buchstaben
- Jedem Buchstaben wird eine Zahl zugeordnet, sodass der Text als eine Abfolge von Integers dargestellt werden kann
- Die Zuordnungen (Buchstabe → Zahl und Zahl → Buchstabe) werden mit pickle abgespeichert

In [22]:
# Collect the distinct characters of the corpus.
# The vocabulary is sorted so that every character always receives the same
# index: iterating over a bare set of strings follows hash order, which can
# differ between interpreter runs. Without sorting, re-running this cell would
# overwrite the pickles with a different mapping that no longer matches a
# model trained earlier.
unique_chars = sorted(set(contents))
print(f"Unique characters: {len(unique_chars)}")

# Bidirectional lookup tables: index -> character and character -> index.
int_to_char = dict(enumerate(unique_chars))
char_to_int = {char: index for index, char in int_to_char.items()}

# Persist both mappings so they can be reloaded together with the saved model.
with open("char_to_int.pickle", "wb") as file:
    pickle.dump(char_to_int, file)

with open("int_to_char.pickle", "wb") as file:
    pickle.dump(int_to_char, file)

Unique characters: 68


### Aufbereitung der Daten für das LSTM-Modell
- Input- und Output-Daten generieren: 
    - Input: die letzten 40 Buchstaben (length)
    - Output: der nächste Buchstabe nach den 40 Buchstaben
- Codierung von X und y mit One-Hot-Encoding:
    - Input-Dimension: 40 letzte Buchstaben x 68 einzigartige Buchstaben
    - Output-Dimension: 68 einzigartige Buchstaben

In [23]:
from keras.utils import to_categorical

# Number of preceding characters the model sees for each prediction.
length = 40

# Translate the whole corpus into integer ids once, then slice windows from it.
encoded = [char_to_int[char] for char in contents]
n_samples = len(contents) - length

# Each sample is a window of `length` consecutive ids; its target is the id
# of the character that directly follows the window.
X = np.array([encoded[start:start + length] for start in range(0, n_samples)])
y = np.array(encoded[length:])

# One-hot encode inputs and targets over the character vocabulary.
n_classes = len(unique_chars)
X = to_categorical(X, num_classes=n_classes)
y = to_categorical(y, num_classes=n_classes)
print("Input-Dimension", X.shape)
print("Output-Dimension", y.shape)


Input-Dimension (121090, 40, 68)
Output-Dimension (121090, 68)


## Model Training

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Input

# Character-level language model: two stacked LSTM layers with dropout,
# followed by a softmax over the character vocabulary.
model = Sequential()
# Declare the input shape (window length, vocabulary size) with an explicit
# Input layer instead of passing `input_shape` to the first LSTM, which Keras
# warns about for Sequential models.
model.add(Input(shape=(X.shape[1], X.shape[2])))
# return_sequences=True so the second LSTM receives the output of every timestep.
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
# One output unit per character; softmax yields a probability distribution.
model.add(Dense(y.shape[1], activation='softmax'))

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

  super().__init__(**kwargs)


In [25]:
from keras.callbacks import ModelCheckpoint

# Optional variant (disabled): write a checkpoint file after every epoch.
# save_model = ModelCheckpoint("weights.{epoch:02d}-{loss:.2f}.keras")
# model.fit(X, y, batch_size=32, epochs=10, callbacks=[save_model])

# Train on the full dataset. The call is the last expression of the cell, so
# the notebook displays the returned History object.
model.fit(
    X,
    y,
    epochs=10,
    batch_size=32,
    verbose=1,
)

Epoch 1/10
[1m3785/3785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 31ms/step - accuracy: 0.3279 - loss: 2.4113
Epoch 2/10
[1m3785/3785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 30ms/step - accuracy: 0.4436 - loss: 1.9159
Epoch 3/10
[1m3785/3785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 31ms/step - accuracy: 0.4838 - loss: 1.7540
Epoch 4/10
[1m3785/3785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 31ms/step - accuracy: 0.5101 - loss: 1.6521
Epoch 5/10
[1m3785/3785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 29ms/step - accuracy: 0.5281 - loss: 1.5820
Epoch 6/10
[1m3785/3785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 30ms/step - accuracy: 0.5410 - loss: 1.5315
Epoch 7/10
[1m3785/3785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 31ms/step - accuracy: 0.5521 - loss: 1.4857
Epoch 8/10
[1m3785/3785[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m951s[0m 251ms/step - accuracy: 0.5616 - loss: 1.4509

<keras.src.callbacks.history.History at 0x4086cb850>

## Abspeichern des finalen Modells

In [26]:
# Write the trained model to disk in the native Keras format.
model.save(filepath="text_generator_model.keras")