# Projekt: Textvgenerierung mit Word2Vec und Embeddings

Ziel: Textgenerierung mit Word2Vec und Embeddings

## Einlesen der Quelldatei

In [1]:
# Choose between 'Die Verwandulung' und 'Faust'
book = "goethe"  # Options: "kafka" or "goethe"

In [2]:
if book == "kafka":
    with open("verwandlung.txt", "r", encoding="utf-8") as file:
        contents = file.read()
    contents = "\n".join(contents.split("\n")[59:1952])
elif book == "goethe":
    with open("faust.txt", "r", encoding="utf-8") as file:
        contents = file.read()
    contents = contents.split("\n")[52:7052]
    contents = [line.strip() for line in contents]
    contents = "\n".join(contents)
    contents = contents.replace("\n", " \\n ")


## Laden der NLTK Bibliothek

In [3]:
import nltk
from nltk import word_tokenize

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/alex/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Vorverarbeitung des Textes mithilfe des CountVectorizer
- Umwandlung des Textes in Tokens
- Fit des Vektorisierers auf die Tokens
- Ausgabe der Anzahl der Features
- Erstellung von zwei Dictionaries für die Abbildung
  - Wort zu Integer
  - Integer zu Wort

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle

# Umwandlung des Textes in Tokens
tokens = word_tokenize(contents)

cv = CountVectorizer(max_features=1000, lowercase=False, token_pattern="(.*)")
# fit the vectorizer to the tokens
cv.fit(tokens)

features = cv.get_feature_names_out()
print(len(features))

# create two dictionaries for mapping
word_to_int = {}
int_to_word = {}

# create the mapping of word to integer and integer to word
for i in range(0, len(features)):
    word = features[i]
    
    word_to_int[word] = i
    int_to_word[i] = word


# Speichern der Mappings
with open("word_to_int.pickle", "wb") as file:
    pickle.dump(word_to_int, file)
        
with open("int_to_word.pickle", "wb") as file:
    pickle.dump(int_to_word, file)

1000


## Vorbereitung der Daten für das LSTM-Modell
- Transformation der Tokens in Integer-Sequenzen
- Definition der Sequenzlänge
- Erstellung von Eingabe- und Ausgabesequenzen
- Reshape der Eingabedaten für das LSTM-Modell

In [5]:
import numpy as np

# Transformation der Tokens in Integer-Sequenzen
tokens_transformed = [word_to_int[word] for word in tokens if word in word_to_int]

# Vorbereitung der Daten für das LSTM-Modell
X = []
y = []
# Definition der Sequenzlänge
seq_length = 40

# Erstellung von Eingabe- und Ausgabesequenzen
for i in range(0, len(tokens_transformed) - seq_length):
    X.append(tokens_transformed[i:i+seq_length])
    y.append(tokens_transformed[i+seq_length])

# Reshape der Eingabedaten für das LSTM-Modell
X = np.reshape(X, (len(X), seq_length, 1))
# Umwandlung der Ausgabedaten in ein NumPy-Array
y = np.array(y)
print(X.shape)
print(y.shape)

(37145, 40, 1)
(37145,)


## Modell Training

- Definition des LSTM-Modells
    - Embedding Layer
    - LSTM Layer
- Kompilierung des Modells

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding

model = Sequential()
model.add(Embedding(cv.max_features, 150, input_shape=(seq_length,)))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(64, activation="relu"))
model.add(Dense(cv.max_features, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

model.summary()

  super().__init__(**kwargs)


In [7]:
from keras.utils import to_categorical
# Training des Modells mit One-Hot-Kodierung der Ausgabedaten
model.fit(X, to_categorical(y, num_classes=cv.max_features), epochs=10, batch_size=32)

Epoch 1/10
[1m1161/1161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 34ms/step - accuracy: 0.2316 - loss: 4.6575
Epoch 2/10
[1m1161/1161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 34ms/step - accuracy: 0.2666 - loss: 4.1871
Epoch 3/10
[1m1161/1161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 34ms/step - accuracy: 0.2752 - loss: 4.0591
Epoch 4/10
[1m1161/1161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 33ms/step - accuracy: 0.2820 - loss: 3.9685
Epoch 5/10
[1m1161/1161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 35ms/step - accuracy: 0.2853 - loss: 3.8925
Epoch 6/10
[1m1161/1161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 34ms/step - accuracy: 0.2897 - loss: 3.8324
Epoch 7/10
[1m1161/1161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 34ms/step - accuracy: 0.2923 - loss: 3.7750
Epoch 8/10
[1m1161/1161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 35ms/step - accuracy: 0.2929 - loss: 3.7270
Epoch 9/

<keras.src.callbacks.history.History at 0x327e942d0>

## Abspeichern des Modells

In [8]:
if book=="kafka":
    model.save("verwandlung.keras")
elif book=="goethe":
    model.save("faust.keras")