# Model training

## Import packages

In [None]:
import tensorflow as tf
import pandas as pd
import os
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import random
import re
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters



## Read data

In [None]:
# Load the dataset
df = pd.read_csv('data/out.csv')

## Transform dataframe

In [None]:
# The frame has three text columns ('0', '1', '2'). Glue them together row by
# row with newlines so that every entry of the result is one complete haiku.
df = df[['0', '1', '2']].apply('\n'.join, axis=1)
# Flatten the result into a plain Python list of haiku strings
array_of_poems = df.to_numpy().tolist()

In [None]:
# Join the first num_poems haikus into one corpus string, separated by semicolons
num_poems = 10000

text = ';'.join(array_of_poems[:num_poems])

## Get unique chars in corpus

In [None]:
# Collect the distinct characters that occur anywhere in the corpus
vocab = sorted(set(text))
vocab_size = len(vocab)
print(f'{vocab_size} unique chars')
print(vocab)

## Create training batches

In [None]:
print("Total chars:", vocab_size)
# Lookup tables: every character gets an integer id, and back again
char_indices = dict(zip(vocab, range(len(vocab))))
indices_char = dict(enumerate(vocab))

# Slide a window of seq_len characters over the text, moving `step`
# characters at a time; consecutive windows therefore overlap.
seq_len = 150
step = 11
window_starts = range(0, len(text) - seq_len, step)
# Input strings (one window each)
sequences = [text[start:start + seq_len] for start in window_starts]
# Target character: the one that directly follows each window
next_chars = [text[start + seq_len] for start in window_starts]
print("Number of sequences:", len(sequences))

# One-hot encoded inputs: (number of windows, seq_len, vocab_size)
x = np.zeros((len(sequences), seq_len, vocab_size), dtype=bool)
# One-hot encoded targets: (number of windows, vocab_size)
y = np.zeros((len(sequences), vocab_size), dtype=bool)
for row, (window, target) in enumerate(zip(sequences, next_chars)):
    for position, symbol in enumerate(window):
        x[row, position, char_indices[symbol]] = True
    y[row, char_indices[target]] = True



## Hyperparameter tuning / model optimization

In [None]:
# Install / upgrade KerasTuner.
# NOTE(review): the import cell at the top of this notebook already imports
# kerastuner, so on a fresh environment this cell has to run before it.
!pip install -q -U keras-tuner

In [None]:
def build_model(hp):
    """Build and compile a stacked-LSTM next-character model for the tuner.

    Tunable hyperparameters:
      * ``input_unit``, ``lstm_{i}_units``, ``layer_2_neurons``: width of
        each LSTM layer, 32 to 512 in steps of 32.
      * ``Dropout_rate``: one dropout rate (0.1 to 0.5) shared by the first
        LSTM layer and the intermediate LSTM layers.
      * ``n_layers``: number of intermediate LSTM layers (1 or 2).

    Args:
        hp: Hyperparameter container passed in by the tuner.

    Returns:
        A compiled ``keras.Sequential`` model. Its output layer has
        ``vocab_size`` units with a softmax activation, i.e. one probability
        per character.
    """
    # A single shared dropout hyperparameter; look it up once and reuse it.
    dropout_rate = hp.Float('Dropout_rate', min_value=0.1, max_value=0.5, step=0.1)

    model = keras.Sequential()
    model.add(layers.LSTM(
        hp.Int('input_unit', min_value=32, max_value=512, step=32),
        input_shape=(x.shape[1], x.shape[2]),
        dropout=dropout_rate,
        return_sequences=True,
    ))
    for i in range(hp.Int('n_layers', 1, 2)):
        model.add(layers.LSTM(
            hp.Int(f'lstm_{i}_units', min_value=32, max_value=512, step=32),
            dropout=dropout_rate,
            return_sequences=True,
        ))
    # Last recurrent layer returns only its final state, which feeds the classifier.
    model.add(layers.LSTM(hp.Int('layer_2_neurons', min_value=32, max_value=512, step=32)))
    # The loss is categorical cross-entropy and the sampling code treats the
    # prediction as a probability array, so the output must be a softmax.
    # 'relu' / 'sigmoid' do not produce a normalised distribution.
    model.add(layers.Dense(vocab_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])

    return model

In [None]:
# Random search over the hyperparameters declared in build_model:
# up to 10 trials, each executed once.
# NOTE(review): the tuner.search call in this notebook passes no validation
# data, so "accuracy" here is the accuracy on the training data.
tuner= RandomSearch(
        build_model,
        max_trials=10,
        objective="accuracy", # example objective
        executions_per_trial=1
        )

In [None]:
# Run the hyperparameter search on the one-hot encoded training data;
# every trial trains for 3 epochs with a batch size of 512.
tuner.search(
        x=x,
        y=y,
        epochs=3,
        batch_size=512,
)

In [None]:
# Continue with the best model found by the search
model = tuner.get_best_models(num_models=1)[0]

In [None]:
# model = keras.Sequential(
#     [
#         # input_shape = (sequence length, vocab_size)
#         # return_sequences=True -> the layer emits one output vector per time step
#         # input shape: (None, seq_len, vocab_size)
#         layers.LSTM(256, input_shape=(x.shape[1], x.shape[2]), return_sequences=True),
#         layers.Dropout(0.2),
#         # input shape: (None, seq_len, 256)
#         layers.LSTM(128, return_sequences=True),
#         layers.Dropout(0.2),
#         # input shape: (None, seq_len, 128)
#         layers.LSTM(64),
#         # input shape: (None, 64)
#         layers.Dense(vocab_size, activation="softmax"),
#         # example output: [0.3, 0.2, 0.1, 0.4]
#     ]
# )

# optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
# model.compile(loss="categorical_crossentropy", optimizer=optimizer)

In [None]:
# Show the layer overview of the selected model
model.summary()

In [None]:
def sample(prob, temperature=1.0):
    """Draw one index from a probability array, reshaped by ``temperature``.

    The probabilities are rescaled as ``p ** (1 / temperature)`` and
    renormalised, then a single multinomial draw is made.
    Example: ``[0.3, 0.2, 0.1, 0.4]`` -> one-hot draw ``[0, 0, 0, 1]``
    -> returns ``3`` (the position of the 1).

    Args:
        prob: Array-like of probabilities, one per class.
        temperature: Values below 1 sharpen the distribution towards the
            most likely class, values above 1 flatten it. ``0`` selects the
            most likely class deterministically.

    Returns:
        The sampled class index.
    """
    # float64 keeps the renormalised values accurate enough for multinomial.
    prob = np.asarray(prob).astype("float64")
    if temperature == 0:
        # Limit of temperature -> 0: greedy choice, avoids dividing by zero.
        return np.argmax(prob)
    # Zero probabilities legitimately map to -inf; suppress the warning.
    with np.errstate(divide="ignore"):
        logits = np.log(prob) / temperature
    # Shift by the maximum before exponentiating. Without this, a small
    # temperature makes every exp() underflow to 0 and the normalisation
    # below turns into 0/0 = NaN.
    logits -= np.max(logits)
    exp_prob = np.exp(logits)
    prob = exp_prob / np.sum(exp_prob)
    probas = np.random.multinomial(1, prob)
    return np.argmax(probas)

In [None]:
#model = keras.models.load_model('myModel.h5')

In [None]:
epochs = 30
batch_size = 512

input_data, output_data = x, y

for epoch in range(epochs):
    # Train a single epoch per iteration so that a checkpoint can be written
    # and a text sample printed after every pass over the data.
    print(f"\n\nEPOCH:{epoch}")
    model.fit(input_data, output_data, batch_size=batch_size, epochs=1)
    model.save('myModelTuner.h5')
    print()

    generate_chars = 200
    temperature = 1.0
    # Start from a random seq_len-character window of the corpus.
    start_index = random.randint(0, len(text) - seq_len - 1)
    seed = text[start_index : start_index + seq_len]

    pieces = []
    for _ in range(generate_chars):
        # One-hot encode the current window as a batch of one.
        x_pred = np.zeros((1, len(seed), vocab_size))
        x_pred[0, np.arange(len(seed)), [char_indices[symbol] for symbol in seed]] = 1
        preds = model.predict(x_pred, verbose=0)[0]

        next_index = sample(preds, temperature)
        next_char = indices_char[next_index]
        # Slide the window: drop the oldest character, append the new one.
        seed = seed[1:] + next_char
        pieces.append(next_char)

        # ';' separates haikus in the corpus, so print a divider after it.
        if next_char == ";":
            pieces.append("\n----------------------------------------\n")

    generated = "".join(pieces)
    print(generated)



In [None]:
# Save the model (same file the training loop writes after every epoch)
model.save('myModelTuner.h5')

In [None]:
# Reload the saved model from disk
model = keras.models.load_model('myModelTuner.h5')

In [None]:
generate_chars = 200
# Very low temperature: sample() sharpens the distribution towards the most
# likely character, so the output is close to greedy decoding.
temperature = 0.001

# Use a random seq_len-character slice of the corpus as the prompt.
start_index = random.randint(0, len(text) - seq_len - 1)
seed = text[start_index : start_index + seq_len]

print('...Generating with seed: "' + seed + '"')

generated = ""
for _ in range(generate_chars):
    # One-hot encode the current window as a batch of one.
    x_pred = np.zeros((1, len(seed), vocab_size))
    for position, symbol in enumerate(seed):
        x_pred[0, position, char_indices[symbol]] = 1
    preds = model.predict(x_pred, verbose=0)[0]

    next_index = sample(preds, temperature)
    next_char = indices_char[next_index]
    # Slide the window forward by the newly generated character.
    seed = seed[1:] + next_char
    # ';' separates haikus in the corpus; follow it with a visual divider.
    generated += (
        next_char + "\n----------------------------------------\n"
        if next_char == ";"
        else next_char
    )

print(generated)
        

