# Model training

## Import packages

In [1]:
import tensorflow as tf
import pandas as pd
import os
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import random
import re
import matplotlib.pyplot as plt
from scipy import sparse
from tqdm import tqdm
import gensim
from keras.callbacks import LambdaCallback

# Deep learning: 
from keras.models import Input, Model
from keras.layers import Dense

2022-06-07 14:41:39.936445: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-07 14:41:39.936508: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Read data

In [2]:
# Load the haiku dataset
df = pd.read_csv('data/big_out.csv')

## Transform dataframe

In [3]:
# The three columns '0', '1' and '2' are joined row-wise with ' \n ' so that
# every row becomes a single haiku string.
df = df[['0', '1', '2']].agg(lambda x: ' \n '.join(x.values), axis=1)
# Convert the per-row result into a plain Python list of haiku strings
haikus = df.values.tolist()

In [4]:
# Peek at the first raw haiku strings
print(haikus[:10])

# Cap the training set at 10000 haikus (use len(haikus) to train on everything).
# Clamping with min() keeps the reported count correct when the dataset is
# smaller than the cap.
number_of_haikus = min(10000, len(haikus))

haikus = haikus[:number_of_haikus]
print('number of haikus: ' + str(number_of_haikus))

['last red in the sky \n a small girls moon face rises \n over the counter', 'christmas services \n a cellular phone rings out \n handels messiah', 'passover darkness  \n before the buds burst open \n a childs eyes in death', 'last night of summer \n the bright full moon of last night \n hidden by a cloud', 'midnight and full moon \n my neighbour asks to borrow \n the vacum cleaner', 'yellow walnut leaves \n slowly appear on the lawn \n early morning light', 'after its first flight \n the young gerfalcons talons \n tighter on my glove', 'sultry afternoon \n only the mailbox shadow \n crosses the dirt road', 'long journey back home  \n a forgotten bale of hay \n slowly rots away', 'autumn mist obscures \n the island in the distance \n she cleans her glasses']
number of haikus: 10000


In [5]:
# Clean the haikus and split each of them into a list of word tokens (all collected in one big list)
haikus = np.array(haikus)

def clean_and_split(sentence):
    """Strip '.', ',' and '_' from ``sentence`` and split it into tokens.

    The text is split on single spaces and empty fragments are dropped, so
    any other whitespace (e.g. a standalone newline) survives as its own
    token. A ';' marker is appended as the final token.
    """
    stripped = re.sub('[.,_]', '', sentence)
    tokens = [piece for piece in stripped.split(' ') if piece != '']
    return tokens + [';']

haikus = [clean_and_split(haiku) for haiku in haikus]

In [6]:
print(haikus[:10])

[['last', 'red', 'in', 'the', 'sky', '\n', 'a', 'small', 'girls', 'moon', 'face', 'rises', '\n', 'over', 'the', 'counter', ';'], ['christmas', 'services', '\n', 'a', 'cellular', 'phone', 'rings', 'out', '\n', 'handels', 'messiah', ';'], ['passover', 'darkness', '\n', 'before', 'the', 'buds', 'burst', 'open', '\n', 'a', 'childs', 'eyes', 'in', 'death', ';'], ['last', 'night', 'of', 'summer', '\n', 'the', 'bright', 'full', 'moon', 'of', 'last', 'night', '\n', 'hidden', 'by', 'a', 'cloud', ';'], ['midnight', 'and', 'full', 'moon', '\n', 'my', 'neighbour', 'asks', 'to', 'borrow', '\n', 'the', 'vacum', 'cleaner', ';'], ['yellow', 'walnut', 'leaves', '\n', 'slowly', 'appear', 'on', 'the', 'lawn', '\n', 'early', 'morning', 'light', ';'], ['after', 'its', 'first', 'flight', '\n', 'the', 'young', 'gerfalcons', 'talons', '\n', 'tighter', 'on', 'my', 'glove', ';'], ['sultry', 'afternoon', '\n', 'only', 'the', 'mailbox', 'shadow', '\n', 'crosses', 'the', 'dirt', 'road', ';'], ['long', 'journey', '

## The word_model

In [7]:
# Train the word2vec model on the tokenized haikus
word_model = gensim.models.Word2Vec(haikus, min_count=1) # min_count is otherwise larger by default, which discards all words that occur less often

In [8]:
# summarize the loaded word_model
print(word_model)

Word2Vec<vocab=11016, vector_size=100, alpha=0.025>


In [9]:
# access vector for one word
print('vector for \'girl\':')
print(word_model.wv['girl'])

vector for 'girl':
[ 1.57944846e-03  4.79702383e-01  1.39830023e-01  2.32443362e-01
 -2.48008892e-01 -4.77285028e-01  1.59128159e-01  4.97037202e-01
 -4.14219916e-01 -4.56947803e-01 -1.19305857e-01 -1.13006361e-01
 -4.89470316e-03  5.25909364e-02  1.62677288e-01 -2.62931585e-01
  2.48516634e-01 -2.77941346e-01 -3.44668418e-01 -6.84914947e-01
  6.54398231e-03 -3.69090326e-02  3.39929581e-01 -1.39447480e-01
 -5.92733435e-02 -4.46793949e-03 -1.83766633e-01  5.40932901e-02
 -2.79231161e-01  1.23383693e-01  9.54838190e-03 -2.06339657e-01
  2.08209325e-02 -3.76382232e-01 -1.11764356e-01  1.22187719e-01
  1.89152375e-01 -1.48662120e-01 -1.02862649e-01 -4.83266234e-01
 -1.68181062e-01 -1.70294326e-04 -3.38109463e-01  2.04289779e-01
  1.06395714e-01 -1.53483137e-01 -2.15118691e-01  1.29657567e-01
  1.43383488e-01  4.54440027e-01  4.59779613e-02 -4.42435622e-01
 -1.20817512e-01 -1.09769121e-01 -1.77901015e-01  6.78219274e-02
 -1.15541734e-01 -8.38335380e-02 -5.88402115e-02  3.26623842e-02
  5.66

In [10]:
print('top 10 words most similar to \'girl\':')
word_model.wv.most_similar('girl', topn=10)

top 10 words most similar to 'girl':


[('baby', 0.9995694756507874),
 ('again', 0.9995231628417969),
 ('then', 0.9994540810585022),
 ('him', 0.9993935227394104),
 ('still', 0.9993851184844971),
 ('she', 0.9993551969528198),
 ('cause', 0.9993375539779663),
 ('before', 0.9993357062339783),
 ('does', 0.9993078708648682),
 ('everyone', 0.9993048310279846)]

In [11]:
# similarity between two words
print('similarity between \'go\' and \'walk\' (regarding the haikus):')
print(word_model.wv.similarity(w1='go', w2='walk'))
print()

print('similarity between \'go\' and \'laugh\' (regarding the haikus):')
print(word_model.wv.similarity(w1='go', w2='laugh'))
print()

print('similarity between \'go\' and \'go\':')
print(word_model.wv.similarity(w1='go', w2='go'))

similarity between 'go' and 'walk' (regarding the haikus):
0.9801817

similarity between 'go' and 'laugh' (regarding the haikus):
0.9883106

similarity between 'go' and 'go':
1.0


In [12]:
# save word_model
#word_model.save('w2v_model.bin')

In [13]:
# load model
#new_model = Word2Vec.load('w2v_model.bin')
#print(new_model)

In [14]:
# extract the words & their vectors, as numpy arrays
vectors = np.asarray(word_model.wv.vectors)
labels = np.asarray(word_model.wv.index_to_key)  # fixed-width numpy strings

print('vectors:')
print(vectors[:2])
print()
print('labels:')
print(labels[:10])

vectors:
[[-0.02006438  1.5637118   0.6815313   0.95296234 -1.0103472  -1.6432259
   0.38379577  1.4681886  -1.299237   -1.2814176  -0.27528042 -0.35464862
   0.12460577  0.01586231  0.42715144 -0.6344724   0.667469   -0.60655695
  -1.0345472  -1.8220502   0.04231731  0.14243126  0.68242687 -0.34836373
  -0.18041295  0.21925704 -0.5731937   0.28884244 -1.1853275   0.53131753
  -0.41879782 -0.67153263 -0.09016158 -0.8381077  -0.5402491   0.27035642
   0.7001314  -0.6446366  -0.029839   -1.56917    -0.8088404   0.07606144
  -1.123368    0.4198324   0.26193577 -0.35734543 -0.6477      0.6020401
   0.42673004  1.7353376  -0.00772176 -1.2835473  -0.42641953 -0.34523803
  -0.41307467  0.28775847 -0.34370133  0.01172935 -0.25937247  0.42898273
  -0.1205273  -0.21848534 -0.12997341  0.02379183 -0.6135289   1.2666781
   0.27158913  1.2231308  -0.8192191   1.0312066   0.32499903  0.9899771
   1.0708064  -0.29692984 -0.1008736   0.93253887  0.4496775   0.09123478
  -0.4988745  -0.46505553 -0.2570

In [15]:
len(vectors)

11016

In [16]:
len(labels)

11016

In [17]:
# https://projector.tensorflow.org/

In [18]:
# Save metadata (labels) into a tsv file for https://projector.tensorflow.org/
os.makedirs("model_dir", exist_ok=True)  # to_csv does not create missing directories
# The vocabulary contains the newline token '\n'. Written raw it becomes a quoted
# field spanning two physical lines, which shifts every following label against
# its vector, so it is stored as the visible two-character text '\n' instead.
metadata = pd.Series(labels).str.replace('\n', '\\n', regex=False)
# A single-column metadata file must not have a header row; otherwise the header
# is read as the first label.
metadata.to_csv("model_dir/metadata.tsv", sep='\t', index=False, header=False)

In [19]:
# Save vectors into a tsv file for https://projector.tensorflow.org/
os.makedirs("model_dir", exist_ok=True)  # to_csv does not create missing directories
# One vector per line and no header row, so line i of this file matches label i.
pd.DataFrame(vectors).to_csv("model_dir/vectors.tsv", sep='\t', index=False, header=False)

## Creating Model for HaikuGen

In [20]:
# Largest number of tokens found in any haiku of the dataset
max_haiku_len = max(len(haiku) for haiku in haikus)
max_features = 20000

In [21]:
haikus[4]

['midnight',
 'and',
 'full',
 'moon',
 '\n',
 'my',
 'neighbour',
 'asks',
 'to',
 'borrow',
 '\n',
 'the',
 'vacum',
 'cleaner',
 ';']

In [22]:
word_model.wv["out"]

array([-0.11142737,  1.3569297 ,  0.3834107 ,  0.68843246, -0.7094674 ,
       -1.2384541 ,  0.34475657,  1.2290188 , -1.216967  , -1.280157  ,
       -0.36968192, -0.3127435 ,  0.03081723,  0.08334462,  0.4768745 ,
       -0.6999552 ,  0.69164765, -0.69219   , -0.84108776, -1.7297027 ,
       -0.11817645, -0.0819692 ,  0.84486747, -0.4489803 , -0.08038399,
        0.06905802, -0.55460745,  0.17209111, -0.7952637 ,  0.3244609 ,
       -0.04219412, -0.5925541 , -0.09798866, -0.9510048 , -0.36299366,
        0.3249889 ,  0.42437774, -0.34720448, -0.1957351 , -1.3518991 ,
       -0.4604878 ,  0.01482105, -0.9969884 ,  0.60189366,  0.2684482 ,
       -0.4044062 , -0.51662916,  0.30371323,  0.42953113,  1.2160685 ,
        0.09136987, -1.2464862 , -0.31920072, -0.3307937 , -0.4878993 ,
        0.21514072, -0.36327156, -0.26939368, -0.20047665,  0.04584768,
        0.17755835, -0.40301216,  0.34076568,  0.18358415, -0.67770576,
        1.36355   ,  0.15298508,  1.0931408 , -0.891092  ,  0.81

In [23]:
def word2idx(word):
    """Return the integer index of ``word`` in the word2vec vocabulary.

    The lookup goes through ``word_model.wv.key_to_index``; a word that is not
    a key there fails the lookup (presumably with ``KeyError`` — verify
    against the installed gensim version).
    """
    return word_model.wv.key_to_index[word]
def idx2word(idx):
    """Return the vocabulary word stored at position ``idx`` of ``word_model.wv.index_to_key``."""
    return word_model.wv.index_to_key[idx]

In [24]:
# Flatten all tokenized haikus into one continuous stream of words.
# The nested comprehension is linear in the number of tokens, whereas
# sum(haikus, []) copies the growing list once per haiku (quadratic).
haikus_combined = [word for haiku in haikus for word in haiku]

# Cut the stream into semi-redundant windows of seq_len words, moving the
# window start by `step` words each time.
seq_len = max_haiku_len
step = 3
# Input windows (lists of seq_len words)
sequences = []
# Target for each window: the word that directly follows it
next_words = []
for i in range(0, len(haikus_combined) - seq_len, step):
    sequences.append(haikus_combined[i : i + seq_len])
    next_words.append(haikus_combined[i + seq_len])
print("Number of sequences:", len(sequences))

# Encode words as vocabulary indices: train_x holds one window per row,
# train_y the index of the word to predict for that row.
train_x = np.zeros([len(sequences), seq_len], dtype=np.int32)
train_y = np.zeros([len(next_words)], dtype=np.int32)

for i, sequence in enumerate(sequences):
    for t, word in enumerate(sequence):
        train_x[i, t] = word2idx(word)
    train_y[i] = word2idx(next_words[i])
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)


Number of sequences: 52714
train_x shape: (52714, 20)
train_y shape: (52714,)


In [25]:
#print('\nPreparing the data for LSTM...')




#train_x = np.zeros([len(haikus), max_haiku_len], dtype=np.int32)
#train_y = np.zeros([len(haikus)], dtype=np.int32)
#for i, haiku in enumerate(haikus):
#    for t, word in enumerate(haiku[:-1]):
#        train_x[i, t] = word2idx(word)
#    train_y[i] = word2idx(haiku[-1])
#print('train_x shape:', train_x.shape)
#print('train_y shape:', train_y.shape)

In [26]:
# Take the trained word2vec matrix; it is passed to the Embedding layer as weights later on
pretrained_weights = word_model.wv.vectors
# Number of rows is used as the vocabulary size, number of columns as the embedding dimension
vocab_size = pretrained_weights.shape[0]
# NOTE(review): 'emdedding_size' is a misspelling of 'embedding_size'; the model-building
# cell uses the same spelling, so both places have to be renamed together.
emdedding_size = pretrained_weights.shape[1]
print('Result embedding shape:', pretrained_weights.shape)

Result embedding shape: (11016, 100)


In [27]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation 
from keras.layers import LSTM

In [28]:
print('Build model...')
# Word indices -> embeddings initialised from word2vec -> LSTM -> softmax over the vocabulary
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]),
    LSTM(emdedding_size, dropout=0.2, recurrent_dropout=0.2),
    Dense(vocab_size),
    Activation('softmax'),
])
# Targets are integer word indices, hence the sparse variant of the loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

#print('\nTraining LSTM...')
#model = Sequential()
#model.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]))
#model.add(LSTM(units=emdedding_size))
#model.add(Dense(units=vocab_size))
#model.add(Activation('softmax'))

Build model...


2022-06-07 14:41:58.534649: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-06-07 14:41:58.534760: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-06-07 14:41:58.534812: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jupyter-acq716): /proc/driver/nvidia/version does not exist
2022-06-07 14:41:58.535175: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [29]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         1101600   
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 11016)             1112616   
                                                                 
 activation (Activation)     (None, 11016)             0         
                                                                 
Total params: 2,294,616
Trainable params: 2,294,616
Non-trainable params: 0
_________________________________________________________________


In [30]:
# NOTE(review): the saved output of this cell shows "Epoch 1/5" ... "Epoch 5/5", i.e. it
# was produced by a run with epochs=5, while the code below sets epochs=1. Re-run the
# cell or restore the intended number of epochs so code and output agree.
model.fit(train_x, train_y,
          batch_size=512,
          epochs=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa5d6db22e0>

In [31]:
def sample(preds, temperature=1.0):
    """Pick an index from ``preds``, a sequence of probabilities (expected to be 1-D).

    With ``temperature <= 0`` the index of the largest entry is returned
    directly. Otherwise every probability is rescaled as
    ``exp(log(p) / temperature)``, the result is renormalised to sum to one,
    and a single index is drawn from it with ``np.random.multinomial``.
    """
    if temperature <= 0:
        return np.argmax(preds)
    # Work in float64 for the log/exp rescaling and the renormalisation
    scaled_logits = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logits)
    distribution = weights / np.sum(weights)
    # One trial, one experiment: the draw is a one-hot row marking the chosen index
    one_hot_draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(one_hot_draw)

In [32]:
def generate_next(text, num_generated=10, temperature=0.7):
    """Extend a seed text by ``num_generated`` words sampled from the model.

    Args:
        text: Seed text. It is lower-cased and split on whitespace; every
            resulting word is looked up with ``word2idx``, so a word that is
            not in the vocabulary makes that lookup fail.
        num_generated: Number of words to append to the seed.
        temperature: Sampling temperature forwarded to ``sample``. Defaults
            to 0.7, the value that used to be hard-coded here.

    Returns:
        The seed words followed by the generated words, joined by single spaces.

    Raises:
        ValueError: If ``text`` contains no words.
    """
    word_idxs = [word2idx(word) for word in text.lower().split()]
    if not word_idxs:
        raise ValueError('text must contain at least one word')
    for _ in range(num_generated):
        # Feed everything generated so far as ONE sequence of shape (1, n).
        # A flat (n,) array is interpreted along the batch axis (n samples)
        # rather than as a single n-word sequence, which would drop all
        # context except the last word.
        prediction = model.predict(x=np.array([word_idxs]), verbose=0)
        idx = sample(prediction[0], temperature=temperature)
        word_idxs.append(int(idx))
    return ' '.join(idx2word(idx) for idx in word_idxs)

In [None]:
generate_words = 50
temperature = 1.0
# Pick a random training row. train_x.size counts every element
# (rows * seq_len), so it must not serve as the upper bound of a row index.
start_index = random.randint(0, len(train_x) - 1)
generated = ""

# Slice instead of indexing to keep the batch axis: one row of seq_len word indices
seed = train_x[start_index:start_index + 1]
print(seed.shape)

# Next-word distribution for the seed sequence
preds = model.predict(seed)
print(preds.shape)

# Draw once and show both the sampled index and the word it stands for
next_idx = sample(preds[-1], temperature=temperature)
print(next_idx)
print(idx2word(next_idx))

starts = [
    'deep convolutional',
    'simple and effective',
    'a nonconvex',
]

for start in starts:
    try:
        # Do not bind the result to the name `sample`: that would replace the
        # sampling function that generate_next calls on the next iteration.
        generated_text = generate_next(start)
    except KeyError as missing_word:
        # A KeyError here means a seed word failed the vocabulary lookup
        print('%s... -> skipped, word not in vocabulary: %s' % (start, missing_word))
        continue
    print('%s... -> %s' % (start, generated_text))


#for i in range(generate_words):
#    x_pred = np.zeros((1, len(seed)))
#    for t, char in enumerate(seed):
#        x_pred[0, t, char_indices[char]] = 1
#    preds = model.predict(x_pred, verbose=0)[0]
#        
#    next_index = sample(preds, temperature)
#    next_char = indices_char[next_index]
#    seed = seed[1:] + next_char
#    generated += next_char
        
#    if next_char == ";":
#            generated += "\n----------------------------------------\n"
            
#print(generated) 