# Model training

## Import packages

In [1]:
# Upgrade gensim in the notebook environment before it is imported further down
!pip install --upgrade gensim

You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
import tensorflow as tf
import pandas as pd
import os
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import random
import re
import matplotlib.pyplot as plt
from scipy import sparse
from tqdm import tqdm
import gensim
from keras.callbacks import LambdaCallback

# Deep learning: 
from keras.models import Input, Model
from keras.layers import Dense

## Read data

In [3]:
# Load the haiku dataset from CSV
df = pd.read_csv('data/out.csv')

## Transform dataframe

In [4]:
# The frame has three columns ('0', '1', '2'), one per haiku line. Join the three
# lines of every row with ' \n ' so that each entry becomes one complete haiku.
df = df[['0', '1', '2']].agg(lambda row: ' \n '.join(row.values), axis=1)
# Turn the resulting Series into a plain Python list of haiku strings
haikus = df.to_numpy().tolist()

In [5]:
# Preview the first ten haikus
print(haikus[:10])

# Use at most 400000 haikus. Capping the limit at len(haikus) keeps the stored and
# printed count truthful when the dataset holds fewer entries than the limit.
number_of_haikus = min(400000, len(haikus))

haikus = haikus[:number_of_haikus]
print('number of haikus: ' + str(number_of_haikus))

['last red in the sky \n a small girls moon face rises \n over the counter', 'christmas services \n a cellular phone rings out \n handels messiah', 'passover darkness  \n before the buds burst open \n a childs eyes in death', 'last night of summer \n the bright full moon of last night \n hidden by a cloud', 'midnight and full moon \n my neighbour asks to borrow \n the vacum cleaner', 'yellow walnut leaves \n slowly appear on the lawn \n early morning light', 'after its first flight \n the young gerfalcons talons \n tighter on my glove', 'sultry afternoon \n only the mailbox shadow \n crosses the dirt road', 'long journey back home  \n a forgotten bale of hay \n slowly rots away', 'autumn mist obscures \n the island in the distance \n she cleans her glasses']
number of haikus: 400000


In [6]:
# Clean the haikus and store each one as a list of words inside one outer list
haikus = np.array(haikus)

def clean_and_split(sentence):
    """Tokenise one haiku string.

    The characters '.', ',' and '_' are removed, the text is split on single
    spaces, empty fragments (caused by consecutive spaces) are dropped and a
    ';' token is appended to mark the end of the haiku. Stand-alone '\n'
    tokens survive because only the space character is used as separator.

    Returns:
        list of str: the tokens followed by the ';' end marker.
    """
    without_punctuation = re.sub('[.,_]', '', sentence)
    tokens = [token for token in without_punctuation.split(' ') if token != '']
    tokens.append(';')
    return tokens

# Tokenise every haiku; the result is a list of word lists
haikus = [clean_and_split(haiku) for haiku in haikus]

In [7]:
print(haikus[:10])

[['last', 'red', 'in', 'the', 'sky', '\n', 'a', 'small', 'girls', 'moon', 'face', 'rises', '\n', 'over', 'the', 'counter', ';'], ['christmas', 'services', '\n', 'a', 'cellular', 'phone', 'rings', 'out', '\n', 'handels', 'messiah', ';'], ['passover', 'darkness', '\n', 'before', 'the', 'buds', 'burst', 'open', '\n', 'a', 'childs', 'eyes', 'in', 'death', ';'], ['last', 'night', 'of', 'summer', '\n', 'the', 'bright', 'full', 'moon', 'of', 'last', 'night', '\n', 'hidden', 'by', 'a', 'cloud', ';'], ['midnight', 'and', 'full', 'moon', '\n', 'my', 'neighbour', 'asks', 'to', 'borrow', '\n', 'the', 'vacum', 'cleaner', ';'], ['yellow', 'walnut', 'leaves', '\n', 'slowly', 'appear', 'on', 'the', 'lawn', '\n', 'early', 'morning', 'light', ';'], ['after', 'its', 'first', 'flight', '\n', 'the', 'young', 'gerfalcons', 'talons', '\n', 'tighter', 'on', 'my', 'glove', ';'], ['sultry', 'afternoon', '\n', 'only', 'the', 'mailbox', 'shadow', '\n', 'crosses', 'the', 'dirt', 'road', ';'], ['long', 'journey', '

## The word_model

In [8]:
# Train the word_model on the tokenised haikus
word_model = gensim.models.Word2Vec(haikus, min_count=1) # min_count is larger by default, which discards every word that occurs less often

In [9]:
# summarize the loaded word_model
print(word_model)

Word2Vec<vocab=43984, vector_size=100, alpha=0.025>


In [10]:
# access vector for one word
print('vector for \'girl\':')
print(word_model.wv['girl'])

vector for 'girl':
[-1.7813144   1.390223   -0.04524625  0.44154942  1.0800593   0.26244986
  0.15901282 -0.5032187   1.0188013   1.1947584   1.1067681   2.5262969
  0.17680652 -1.217741   -3.147762   -0.2669855   0.78237444 -2.3427348
  0.3931561  -0.9248451   0.8086693   0.26590395 -0.08549678 -1.3627003
  0.3085851  -1.9268545  -2.2421246   0.17599379  0.51561797  1.269628
  2.7472997   2.2557468   1.4816093   0.33526328 -0.656519    1.1573275
 -0.54194933  0.9562054  -1.2792095  -1.4622083   0.3493488  -1.009699
  0.75311345 -0.95323175 -0.72239596 -0.23170902 -0.9226991   0.18376273
 -1.5672365   0.35558397 -0.07680202  1.5370051  -1.5546169   2.088508
  1.7737682  -1.0869023   3.8877852   0.22606003 -0.99964124  0.4793532
  1.7235538  -2.3479352   0.01460907  0.51211745 -0.50773156 -0.7129143
 -1.3684447  -1.4520518  -0.19157673 -1.3626063  -0.40733343 -0.08036793
 -0.30704677 -0.47070426 -1.1997057   1.0842602  -1.668591   -0.0764354
  0.01873258  1.559697   -1.228483    1.25388

In [11]:
# Ask word_model for the ten vocabulary entries it ranks as most similar to 'girl'
print('top 10 words most similar to \'girl\':')
word_model.wv.most_similar('girl', topn=10)

top 10 words most similar to 'girl':


[('woman', 0.8348810076713562),
 ('kid', 0.8027123212814331),
 ('guy', 0.7991846203804016),
 ('lady', 0.7637879848480225),
 ('chick', 0.7459321618080139),
 ('boy', 0.7285013794898987),
 ('girlfriend', 0.7156926393508911),
 ('bitch', 0.6953952312469482),
 ('sister', 0.6900787949562073),
 ('person', 0.687271237373352)]

In [12]:
# Similarity between two words, as scored by word_model

# First pair: 'go' vs. 'walk'
print('similarity between \'go\' and \'walk\' (regarding the haikus):')
print(word_model.wv.similarity(w1='go', w2='walk'))
print()

# Second pair for comparison: 'go' vs. 'laugh'
print('similarity between \'go\' and \'laugh\' (regarding the haikus):')
print(word_model.wv.similarity(w1='go', w2='laugh'))
print()

# A word compared with itself
print('similarity between \'go\' and \'go\':')
print(word_model.wv.similarity(w1='go', w2='go'))

similarity between 'go' and 'walk' (regarding the haikus):
0.725224

similarity between 'go' and 'laugh' (regarding the haikus):
0.21716416

similarity between 'go' and 'go':
1.0


In [13]:
# save word_model
#word_model.save('w2v_model.bin')

In [14]:
# load model (only `gensim` itself is imported, so the class is reached through the package)
#new_model = gensim.models.Word2Vec.load('w2v_model.bin')
#print(new_model)

In [15]:
# Pull the embedding matrix and the vocabulary out of word_model as numpy arrays
vectors = np.asarray(word_model.wv.vectors)
labels = np.asarray(word_model.wv.index_to_key)  # becomes a fixed-width numpy string array

# Preview: the first two vectors and the first ten labels
print('vectors:', vectors[:2], '', 'labels:', labels[:10], sep='\n')

vectors:
[[ 0.3261586  -1.9390801  -0.8637559   0.60089767  0.8030032  -0.56526315
   0.00928922  0.9588842   0.2725503   0.82826823 -1.4006827   0.6719266
  -0.73364866  0.37084472 -0.851748   -1.9351178   2.5940175  -1.269681
   0.57628906 -1.7919712  -0.98557836  0.5238613  -1.5037771  -0.6535714
   0.36955866 -0.11089121  1.2329588  -0.7234964   0.8659579   0.17947729
   2.1434555  -0.5281072   0.86789626 -0.22335638  0.05619117  0.8480278
  -0.7587801   1.0685723   0.01354888  0.2048348   0.10864631 -0.21553792
   0.7137529   1.5754474   0.5840277  -0.1095252  -0.7474503   0.61101633
  -1.2372096   1.3347064   1.512253   -0.28660384 -1.0613586  -0.24530767
  -1.1773177  -0.18630698  1.1211948  -0.7936223  -1.3308219  -0.6921227
   0.620634    0.09585609 -1.9262667   1.4781322   0.5913094  -0.420123
   1.3143097   0.3534205  -0.7647251  -0.97070414 -0.3863496  -0.25491062
   0.9201879  -0.89660233 -1.0710522   1.5603441   0.0866755   0.27795032
   0.2694539  -0.51851416  0.10907073

In [16]:
len(vectors)

43984

In [17]:
len(labels)

43984

In [18]:
# https://projector.tensorflow.org/

In [19]:
# Save metadata (labels) into a tab-separated file, one vocabulary entry per row
# NOTE(review): header is left at its default, so to_csv also writes a header row, and the
# vocabulary contains the '\n' token, which occupies two physical lines in the file.
# Confirm that whatever reads metadata.tsv copes with both.
pd.DataFrame(labels).to_csv("model_dir/metadata.tsv", sep = '\t', index=False)

In [20]:
# Save the embedding vectors into a tab-separated file, one row per vocabulary entry
# NOTE(review): header is left at its default, so the first row of the file holds column
# labels rather than a vector. Confirm that whatever reads vectors.tsv expects that.
pd.DataFrame(vectors).to_csv("model_dir/vectors.tsv", sep = '\t', index=False)

## Creating Model for HaikuGen

In [21]:
# Largest number of tokens found in any single haiku of the dataset
max_haiku_len = max(len(haiku) for haiku in haikus)
max_features = 20000

In [22]:
haikus[4]

['midnight',
 'and',
 'full',
 'moon',
 '\n',
 'my',
 'neighbour',
 'asks',
 'to',
 'borrow',
 '\n',
 'the',
 'vacum',
 'cleaner',
 ';']

In [23]:
word_model.wv["out"]

array([ 3.5820458e+00,  2.8715370e+00, -2.0762448e+00, -1.2740515e-01,
        2.8581939e+00, -1.9330012e+00,  3.7553601e-02,  2.6797233e+00,
        2.5666494e+00,  2.3984778e+00, -4.8610780e-02,  2.7564251e+00,
        7.6587105e-01, -1.4019598e+00,  2.0969794e+00, -1.3246665e+00,
       -5.6277877e-01,  2.8978521e-01,  1.1375366e-03,  1.0887215e+00,
       -2.8589797e+00,  1.2886252e+00,  1.8312030e+00,  1.4780592e+00,
       -1.1087697e+00, -9.4945067e-01, -1.2071359e+00,  1.1930063e+00,
       -1.3305489e+00, -1.2799729e+00,  1.5982162e+00,  1.7785784e+00,
        2.2997376e-01, -6.0743862e-01, -9.5628017e-01,  2.5486543e+00,
        9.0234995e-02, -4.8573482e-01, -9.4896585e-01,  2.0705674e+00,
       -2.4946468e+00, -4.2541456e-01,  1.9732209e+00, -3.5095584e-02,
       -2.1348510e+00,  2.0176039e+00, -6.7293227e-01, -9.1287255e-02,
        6.3855618e-01,  1.9062890e+00,  5.2674782e-01, -6.0647037e-02,
       -2.6684752e+00, -1.4846808e+00, -1.3958681e+00, -1.3744413e+00,
      

In [24]:
def word2idx(word):
    """Return the integer index that word_model assigns to `word`.

    Raises:
        KeyError: if `word` is not part of the word_model vocabulary.
    """
    index_by_word = word_model.wv.key_to_index
    return index_by_word[word]
def idx2word(idx):
    """Return the vocabulary word stored at position `idx` in word_model."""
    words_by_index = word_model.wv.index_to_key
    return words_by_index[idx]

In [25]:
# Flatten all haikus into one long stream of words
# (sum(haikus, []) would give the same list but copies the accumulator on every step)
haikus_combined = [word for haiku in haikus for word in haiku]
print(haikus_combined[1:20])

# Cut the stream into overlapping windows of seq_len words, taken every `step` words.
# Each window is one input; the word that directly follows it is the target.
seq_len = max_haiku_len
step = 7
window_starts = range(0, len(haikus_combined) - seq_len, step)
# Input word windows
sequences = [haikus_combined[start : start + seq_len] for start in window_starts]
# Word to predict for each window
next_words = [haikus_combined[start + seq_len] for start in window_starts]
print("Number of sequences:", len(sequences))

# Encode the words as vocabulary indices
train_x = np.zeros([len(sequences), max_haiku_len], dtype=np.int32)
train_y = np.zeros([len(next_words)], dtype=np.int32)

for row, (sequence, target_word) in enumerate(zip(sequences, next_words)):
    train_x[row, :] = [word2idx(word) for word in sequence]
    train_y[row] = word2idx(target_word)
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)


['red', 'in', 'the', 'sky', '\n', 'a', 'small', 'girls', 'moon', 'face', 'rises', '\n', 'over', 'the', 'counter', ';', 'christmas', 'services', '\n']
Number of sequences: 910379
train_x shape: (910379, 20)
train_y shape: (910379,)


In [26]:
#print('\nPreparing the data for LSTM...')




#train_x = np.zeros([len(haikus), max_haiku_len], dtype=np.int32)
#train_y = np.zeros([len(haikus)], dtype=np.int32)
#for i, haiku in enumerate(haikus):
#    for t, word in enumerate(haiku[:-1]):
#        train_x[i, t] = word2idx(word)
#    train_y[i] = word2idx(haiku[-1])
#print('train_x shape:', train_x.shape)
#print('train_y shape:', train_y.shape)

In [27]:
# Embedding matrix learned by word_model: one row per vocabulary entry,
# one column per embedding dimension
pretrained_weights = word_model.wv.vectors
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)

Result embedding shape: (43984, 100)


In [28]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation 
from keras.layers import LSTM

In [29]:
print('Build model...')
# Next-word predictor: embedding initialised from word_model -> two stacked LSTMs ->
# softmax over the whole vocabulary
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]),
    LSTM(emdedding_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    LSTM(emdedding_size * 2, dropout=0.2, recurrent_dropout=0.2),
    Dense(vocab_size),
    Activation('softmax'),
])
# Targets are integer word indices, hence the sparse variant of the cross-entropy loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

#print('\nTraining LSTM...')
#model = Sequential()
#model.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]))
#model.add(LSTM(units=emdedding_size))
#model.add(Dense(units=vocab_size))
#model.add(Activation('softmax'))

Build model...


2022-06-10 11:07:03.228812: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-10 11:07:03.271230: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-10 11:07:03.271808: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-10 11:07:03.272921: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil



In [30]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         4398400   
                                                                 
 lstm (LSTM)                 (None, None, 100)         80400     
                                                                 
 lstm_1 (LSTM)               (None, 200)               240800    
                                                                 
 dense (Dense)               (None, 43984)             8840784   
                                                                 
 activation (Activation)     (None, 43984)             0         
                                                                 
Total params: 13,560,384
Trainable params: 13,560,384
Non-trainable params: 0
_________________________________________________________________


In [31]:
#model.fit(train_x, train_y,
#          batch_size=512,
#          epochs=1)

In [32]:
def sample(preds, temperature=1.0):
    """Draw one index from the probability vector `preds`.

    Args:
        preds: array-like of probabilities (e.g. one softmax output row).
        temperature: values <= 0 select the most probable index outright.
            Otherwise the log-probabilities are divided by `temperature`
            and renormalised before a single multinomial draw, so values
            below 1 sharpen the distribution and values above 1 flatten it.

    Returns:
        The chosen index.
    """
    if temperature <= 0:
        return np.argmax(preds)
    # float64 keeps the renormalised probabilities precise enough for the draw
    scaled_logs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logs)
    distribution = weights / np.sum(weights)
    one_hot_draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(one_hot_draw)

In [33]:
def generate_next(text, num_generated=10):
    """Continue `text` with `num_generated` sampled words.

    Args:
        text: seed text; it is lower-cased and split on whitespace, and every
            resulting word must be in the word_model vocabulary.
        num_generated: number of words to append.

    Returns:
        str: the seed words followed by the generated words, joined by spaces.

    Raises:
        ValueError: if `text` contains no words.
        KeyError: if a seed word is not in the vocabulary (raised by word2idx).
    """
    word_idxs = [word2idx(word) for word in text.lower().split()]
    if not word_idxs:
        raise ValueError('text must contain at least one word')
    for _ in range(num_generated):
        # The model takes input of shape (batch, timesteps). Wrapping the index
        # list in an outer list gives one sample holding the whole sequence so
        # far; a flat 1-D array has no timestep axis for that context.
        prediction = model.predict(x=np.array([word_idxs]))
        idx = sample(prediction[-1], temperature=0.7)
        word_idxs.append(idx)
    return ' '.join(idx2word(idx) for idx in word_idxs)

In [34]:
def generate_haikus(generate_words, temperature=1.0):
    """Generate `generate_words` words with the model and print them.

    A random row of train_x serves as the seed window. After each prediction
    the window drops its oldest word and appends the new one. A separator line
    is added after every ';' token, which marks the end of a haiku.

    Args:
        generate_words: number of words to generate.
        temperature: sampling temperature handed to sample().
    """
    # Every row of train_x is a complete window, so any row can be the seed
    start_index = random.randint(0, train_x.shape[0] - 1)
    generated = ""

    # Copy the row: a basic slice is a view, and the in-place window update
    # below would otherwise overwrite the training data itself.
    seed = train_x[start_index:start_index + 1].copy()

    for _ in range(generate_words):
        preds = model.predict(seed)
        pred_index = sample(preds[-1], temperature)
        pred_word = idx2word(pred_index)

        # Slide the window: drop the oldest word, append the predicted one
        seed[0] = np.concatenate((seed[0][1:], [pred_index]))

        generated += pred_word + " "

        if pred_word == ";":
            generated += "\n----------------------------------------\n"

    print(generated)
    
    


In [35]:
epochs = 20
batch_size = 512

# Train one epoch per iteration so that a checkpoint is written and a text
# sample is generated after every single epoch.
for epoch in range(epochs):
    print()
    print()
    print(f"EPOCH:{epoch}")
    model.fit(train_x, train_y, 
              batch_size=batch_size, 
              epochs=1)
    
    # Same path every time, so the file always holds the most recent state
    model.save('myModelEmbedding.h5')

    # Print 30 generated words as a quick qualitative check
    generate_haikus(30)




EPOCH:0
is 
 ever happy the made practically reviews really 
 dude thinks your carry ; 
----------------------------------------
off considered ; 
----------------------------------------
how i agree we play 
 heavy bitches to with black 
 


EPOCH:1
dad know ; 
----------------------------------------
i know a a naked 
 seem to mean up however 
 id living pretty ; 
----------------------------------------
visiting level 
 is your venue but you am 
 
 


EPOCH:2
to vote 
 for laughing but they can want 
 to give to rain up ; 
----------------------------------------
if it can i 
 look great th from all which 
 you understand 


EPOCH:3
isnt so much 
 late things in shot here ; 
----------------------------------------
im contained of 
 them and cant prove you smile to 
 increase and would move ; 
----------------------------------------
deleted believe 
 


EPOCH:4
was 
 how like it has grown metal 
 responses i think ; 
----------------------------------------
yeah a best mod is 
 b

KeyboardInterrupt: 

In [36]:
generate_haikus(200)

learn to live on ; 
----------------------------------------
again what that is 
 introduced it is so much 
 i cant be deep cheap ; 
----------------------------------------
i want opinion 
 with the golden bus were friends 
 for drugs with the sky ; 
----------------------------------------
you dont know when you 
 cant try anything on them 
 for usual this clown ; 
----------------------------------------
you met epic run 
 ill my flight a lot for what 
 asked you for all yet ; 
----------------------------------------
but it is exactly 
 what they were commenting from 
 the world to save soft ; 
----------------------------------------
hes not saying bubbles 
 sunglasses here p bounce so 
 far hype ovation soon ; 
----------------------------------------
was a lovely can 
 be able to tell you on 
 my hardware ask you ; 
----------------------------------------
can read feel on her 
 to see or start obsessing 
 on everyone else ; 
----------------------------------------
colorado wit