# Model training

## Import packages

In [1]:
import tensorflow as tf
import pandas as pd
import os
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import random
import re


In [2]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


## Read data

In [3]:
# Datensatz einlesen
df = pd.read_csv('data/out.csv')

## Transform dataframe

In [4]:
# Dataframe mit 3 Spalten. Werden so gejoint, dass ein neues Dataframe mit ein Haiku pro Zeile erstellt wird
df = df[['0', '1', '2']].agg(lambda x: '\n'.join(x.values), axis=1)
# Dataframe to list [[]] -> []
array_of_poems = df.values.tolist()

In [5]:
# num_poems an Haikus mit Semicolon zwshceneinander zusammenfuegen
num_poems = 150000

text = ';'.join(array_of_poems[:num_poems])

## Get unique chars in corpus

In [6]:
# Anzahl der unterschielichen characters im gesamt datensatz herausfinden
vocab = sorted(set(text))
vocab_size = len(vocab)
print(f'{vocab_size} unique chars')
print(vocab)

29 unique chars
['\n', ' ', ';', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


## Create training batches

In [7]:
print("Total chars:", vocab_size)
# Dictionary erstellen. Jeder character wird mit einer Zahl nummeriert
char_indices = {c: i for i, c in enumerate(vocab)}
indices_char = {i: c for i, c in enumerate(vocab)}

# cut the text in semi-redundant sequences of seq_len characters
seq_len = 150
step = 11
# Input String
sequences = []
#Output character
next_chars = []
for i in range(0, len(text) - seq_len, step):
    sequences.append(text[i : i + seq_len])
    next_chars.append(text[i + seq_len])
print("Number of sequences:", len(sequences))

# Input String onehot encoded
x = np.zeros((len(sequences), seq_len, vocab_size), dtype=bool)
# Output char onehot encoded
y = np.zeros((len(sequences), vocab_size), dtype=bool)
for i, sequence in enumerate(sequences):
    for t, char in enumerate(sequence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1



Total chars: 29
Number of sequences: 905098


In [8]:
x.shape[1]

150

In [9]:
x.shape[2]

29

In [10]:
model = keras.Sequential(
    [
        # input_shape=sequenz laenge, vocab_size
        # return sequences true -> input-shape = output-shape 
        # shape-input (NONE, seq_len, vocab_size)
        layers.LSTM(256, input_shape=(x.shape[1], x.shape[2]), return_sequences=True),
        layers.Dropout(0.2),
        # shape-input (NONE, seq_len, vocab_size)
        layers.LSTM(128, return_sequences=True),
        layers.Dropout(0.2),
        # shape-input (NONE, seq_len, vocab_size)
        layers.LSTM(64),
        # shape-input (NONE, vocab_size)
        layers.Dense(vocab_size, activation="softmax"),
        # bsp out [0.3, 0.2, 0.1, 0.4]
    ]
)

optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer)

In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 150, 256)          292864    
_________________________________________________________________
dropout (Dropout)            (None, 150, 256)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 150, 128)          197120    
_________________________________________________________________
dropout_1 (Dropout)          (None, 150, 128)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense (Dense)                (None, 29)                1885      
Total params: 541,277
Trainable params: 541,277
Non-trainable params: 0
__________________________________________________

In [12]:
# Standartfunktion Probability array to onehot to integerencoded char 
# [0.3, 0.2, 0.1, 0.4] -> [0, 0, 0, 1] -> return 4 (stelle, an der 1)
def sample(prob, temperature=1.0):
    # helper function to sample an index from a probability array
    prob = np.asarray(prob).astype("float64")
    prob = np.log(prob) / temperature
    exp_prob = np.exp(prob)
    prob = exp_prob / np.sum(exp_prob)
    probas = np.random.multinomial(1, prob)
    return np.argmax(probas)

In [None]:
epochs = 30
batch_size = 512

input_data = x
output_data = y


for epoch in range(epochs):
    print()
    print()
    print(f"EPOCH:{epoch}")
    model.fit(input_data, output_data, batch_size=batch_size, epochs=1)

    print()

    generate_chars = 200
    temperature = 1.0
    start_index = random.randint(0, len(text) - seq_len - 1)
    generated = ""

    seed =  text[start_index : start_index + seq_len]

    #print('...Generating with seed: "' + seed + '"')

    for i in range(generate_chars):
        x_pred = np.zeros((1, len(seed), vocab_size))
        for t, char in enumerate(seed):
            x_pred[0, t, char_indices[char]] = 1
        preds = model.predict(x_pred, verbose=0)[0]

        next_index = sample(preds, temperature)
        next_char = indices_char[next_index]
        seed = seed[1:] + next_char
        generated += next_char

        if next_char == ";":
            generated += "\n----------------------------------------\n"

    print(generated)



In [None]:
#model.save('myModel.h5')

In [13]:
model = keras.models.load_model('myModel.h5')

In [16]:

generate_chars = 500
temperature = 1.0
start_index = random.randint(0, len(text) - seq_len - 1)
generated = ""

seed =  text[start_index : start_index + seq_len]
    
print('...Generating with seed: "' + seed + '"')

    
for i in range(generate_chars):
    x_pred = np.zeros((1, len(seed), vocab_size))
    for t, char in enumerate(seed):
        x_pred[0, t, char_indices[char]] = 1
    preds = model.predict(x_pred, verbose=0)[0]
        
    next_index = sample(preds, temperature)
    next_char = indices_char[next_index]
    seed = seed[1:] + next_char
    generated += next_char
        
    if next_char == ";":
            generated += "\n----------------------------------------\n"
            
print(generated) 
        



...Generating with seed: "y a few years;besides young ma ive
 never heard a female rap
and think she was nice;when life is pulling
 you in one direction dont
waste your time fi"


  prob = np.log(prob) / temperature


ghts;
----------------------------------------
people when the brown
 i dont never gten mom
rose i feel anymore;
----------------------------------------
things she due adory
 which i wish the keyothe
show the school is is;
----------------------------------------
im subars too
 this second time such grupping
and antrifides;
----------------------------------------
wed short beat beef
 dance waters me lets see
it both and i have;
----------------------------------------
i love onibs
 shoes at the landy ids
better than you but;
----------------------------------------
now this o is
 garciwallis szancing
point when i wanna;
----------------------------------------
im not alone
 thats where its a city to
aware of a time;
----------------------------------------
mauch pretending
 you do expecting a bent
puols with twitter;
----------------------------------------
she wh
