# Model training

## Import packages

In [None]:
import tensorflow as tf
import pandas as pd
import os
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import random
import re
import matplotlib.pyplot as plt
from scipy import sparse
from tqdm import tqdm
import gensim

# Deep learning: 
from keras.models import Input, Model
from keras.layers import Dense

## Read data

In [None]:
# Load the haiku dataset from CSV
df = pd.read_csv('data/big_out.csv')

## Transform dataframe

In [None]:
# The dataframe holds the three lines of each haiku in columns '0', '1' and '2'.
# Join them row-wise with ' \n ' so that every row becomes one haiku string.
# NOTE(review): `df` is rebound to the joined result here, so re-running this cell
# without first re-running the read cell will no longer find the three columns.
df = df[['0', '1', '2']].agg(lambda x: ' \n '.join(x.values), axis=1)
# Convert the joined result into a flat Python list of haiku strings
haikus = df.values.tolist()

In [None]:
# Preview the first ten haikus and report how many there are in total
print(haikus[:10])
number_of_haikus = len(haikus)
print('number of haikus: ' + str(number_of_haikus))

['last red in the sky \n a small girls moon face rises \n over the counter', 'christmas services \n a cellular phone rings out \n handels messiah', 'passover darkness  \n before the buds burst open \n a childs eyes in death', 'last night of summer \n the bright full moon of last night \n hidden by a cloud', 'midnight and full moon \n my neighbour asks to borrow \n the vacum cleaner', 'yellow walnut leaves \n slowly appear on the lawn \n early morning light', 'after its first flight \n the young gerfalcons talons \n tighter on my glove', 'sultry afternoon \n only the mailbox shadow \n crosses the dirt road', 'long journey back home  \n a forgotten bale of hay \n slowly rots away', 'autumn mist obscures \n the island in the distance \n she cleans her glasses']
number of haikus: 438234


In [None]:
# Clean the haikus and split each one into its word tokens, so that the
# collection ends up as a list of token lists.
# First step: wrap the haiku strings in a numpy array.
haikus = np.array(haikus)

def clean_and_split(sentence):
    """Strip punctuation from a haiku and split it into tokens.

    Every '.', ',' and '_' character is removed, the text is split on single
    spaces, the empty strings left behind by consecutive spaces are dropped,
    and ';' is appended as the final token. Line-break tokens ('\\n') that
    are surrounded by spaces survive as tokens of their own.

    Args:
        sentence: the haiku text as one string.

    Returns:
        list of str: the remaining tokens followed by ';'.
    """
    stripped = re.sub('[.,_]', '', sentence)
    tokens = [piece for piece in stripped.split(' ') if piece != '']
    tokens += [';']
    return tokens

# Tokenise every haiku; the result is a plain list of token lists
haikus = [clean_and_split(haiku_text) for haiku_text in haikus]

In [None]:
# Preview the first ten tokenised haikus
print(haikus[:10])

[['last', 'red', 'in', 'the', 'sky', '\n', 'a', 'small', 'girls', 'moon', 'face', 'rises', '\n', 'over', 'the', 'counter', ';'], ['christmas', 'services', '\n', 'a', 'cellular', 'phone', 'rings', 'out', '\n', 'handels', 'messiah', ';'], ['passover', 'darkness', '\n', 'before', 'the', 'buds', 'burst', 'open', '\n', 'a', 'childs', 'eyes', 'in', 'death', ';'], ['last', 'night', 'of', 'summer', '\n', 'the', 'bright', 'full', 'moon', 'of', 'last', 'night', '\n', 'hidden', 'by', 'a', 'cloud', ';'], ['midnight', 'and', 'full', 'moon', '\n', 'my', 'neighbour', 'asks', 'to', 'borrow', '\n', 'the', 'vacum', 'cleaner', ';'], ['yellow', 'walnut', 'leaves', '\n', 'slowly', 'appear', 'on', 'the', 'lawn', '\n', 'early', 'morning', 'light', ';'], ['after', 'its', 'first', 'flight', '\n', 'the', 'young', 'gerfalcons', 'talons', '\n', 'tighter', 'on', 'my', 'glove', ';'], ['sultry', 'afternoon', '\n', 'only', 'the', 'mailbox', 'shadow', '\n', 'crosses', 'the', 'dirt', 'road', ';'], ['long', 'journey', '

## The model

In [None]:
# Train a Word2Vec model on the tokenised haikus
model = gensim.models.Word2Vec(haikus, min_count=1) # min_count=1 keeps every word; the default is higher and discards all words that occur less often

In [None]:
# summarize the trained model
print(model)

Word2Vec<vocab=45075, vector_size=100, alpha=0.025>


In [None]:
# Look up the learned embedding vector of a single word
print('vector for \'girl\':')
print(model.wv['girl'])

vector for 'girl':
[ 0.31229058  0.6946153  -0.43084306 -0.0282332   2.0903203   0.10101435
 -3.202442   -3.315119   -0.16451664 -0.07796551 -0.3646245   1.6244198
 -1.4507171   0.13299742 -1.3371615  -0.01770525  1.1575513  -1.3556007
  0.49317735 -1.2439383  -0.2931738   1.2420111   1.8485087   0.9806938
  0.83782506 -1.5369533  -0.7325633   2.2399924   1.739332    1.4475107
  1.8041458   0.01433887 -0.34128883  0.58657515 -0.227564    2.5924487
 -0.13076484  0.32296994 -1.9745549   0.6936519   1.1308823  -1.6093214
  1.8798903  -0.7457563  -1.351318    0.3401661  -0.6295531  -1.3076602
 -0.6241337   0.02563074  2.3840942  -2.65126     0.07908862  0.21345182
 -2.3988001   0.68495125 -0.00364678 -1.380609    1.2683995   1.6524339
  0.28577283 -0.35384795  1.2622461  -2.4703054   0.14531802 -0.3807358
 -0.7369605  -0.7078896   0.022359   -0.39785108  0.80460495  0.7874859
 -0.7940091  -0.7360059  -0.34914967 -1.2929767  -1.2863843   0.64583355
 -0.8362492   1.5712146   0.45155326  0.94

In [None]:
# Ten nearest neighbours of 'girl'; the bare last expression is displayed as
# a list of (word, score) pairs
print('top 10 words most similar to \'girl\':')
model.wv.most_similar('girl', topn=10)

top 10 words most similar to 'girl':


[('woman', 0.8308090567588806),
 ('kid', 0.8020614385604858),
 ('guy', 0.7984545826911926),
 ('lady', 0.7733927369117737),
 ('chick', 0.722667396068573),
 ('girlfriend', 0.7165722250938416),
 ('person', 0.702984631061554),
 ('dog', 0.7027512192726135),
 ('boy', 0.6944789886474609),
 ('cat', 0.679090678691864)]

In [None]:
# Similarity between two words: a related pair, an unrelated pair, and a word
# compared with itself as a sanity check
print('similarity between \'go\' and \'walk\' (regarding the haikus):')
print(model.wv.similarity(w1='go', w2='walk'))
print()

print('similarity between \'go\' and \'laugh\' (regarding the haikus):')
print(model.wv.similarity(w1='go', w2='laugh'))
print()

print('similarity between \'go\' and \'go\':')
print(model.wv.similarity(w1='go', w2='go'))

similarity between 'go' and 'walk' (regarding the haikus):
0.7328923

similarity between 'go' and 'laugh' (regarding the haikus):
0.21766543

similarity between 'go' and 'go':
1.0


In [None]:
# save model (disabled; uncomment to write the trained model to disk)
#model.save('w2v_model.bin')

In [None]:
# load model (disabled; uncomment to restore a previously saved model)
# Only `gensim` itself is imported in this notebook, so the class has to be
# reached through its full path.
#new_model = gensim.models.Word2Vec.load('w2v_model.bin')
#print(new_model)

In [None]:
# extract the words & their vectors, as numpy arrays
# NOTE(review): row i of `vectors` is presumed to belong to `labels[i]`;
# the export cells below rely on that ordering.
vectors = np.asarray(model.wv.vectors)
labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

# Show only a small slice of each array
print('vectors:')
print(vectors[:2])
print()
print('labels:')
print(labels[:10])

vectors:
[[ 1.8460613  -0.59029365  0.00520649  0.01850292 -0.6254876  -2.408798
  -1.3775347   0.27895278 -1.2913343  -0.02673046 -0.56462675  0.30756867
   0.19352806 -0.30972335  0.78330106 -1.1697611  -0.52091134  1.0120761
  -1.1409519  -1.0345565  -0.7452162   0.739466   -0.08062295 -1.6796914
  -0.69315815 -0.5457272  -0.6219647  -0.39156497 -1.174594    0.56472206
   1.1135521   0.6292329  -0.8439422  -0.07067981 -0.02860794 -1.4582332
   0.9359705   1.265377   -1.7603954  -0.36900902  0.16194849  0.40116593
   1.5610093  -0.16510755  0.26651558 -1.0860605   0.3353955  -0.6647575
   1.5359938  -0.84802586  2.6239214  -0.9592528   0.4508637  -0.19657782
  -0.76031494  0.3630984   0.21663533 -1.1813757  -0.50540286  1.5246313
  -1.2286513  -0.8289276  -0.29078034 -0.9965098  -0.1317179  -0.24465352
   0.5494164  -0.36317626 -1.151054    0.22485976  0.37415656 -0.18563044
  -0.39827275 -0.06534689 -0.64337486 -1.616646   -0.46929845  1.5926719
   0.02401546 -0.6839747   0.8495007 

In [None]:
# Number of embedding vectors
len(vectors)

45075

In [None]:
# Number of labels; should equal the number of vectors
len(labels)

45075

In [None]:
# The TSV files written below can be loaded into the TensorFlow Embedding
# Projector for visualisation: https://projector.tensorflow.org/

In [None]:
# Save metadata (labels) into tsv file, one label per line
# Create the output directory first so the cell also works on a fresh checkout.
os.makedirs("model_dir", exist_ok=True)
# The vocabulary contains the line-break token '\n'. Written raw it would spread
# one label over two lines and shift every following label against its vector,
# so it is stored as the two visible characters backslash + n instead.
metadata_labels = pd.Series(labels).str.replace('\n', '\\n', regex=False)
# header=False: a single-column metadata file must not carry a header row,
# otherwise the file has one more line than there are vectors.
metadata_labels.to_csv("model_dir/metadata.tsv", sep = '\t', index=False, header=False)

In [None]:
# Save vectors into tsv file, one tab-separated vector per line
# Create the output directory first so the cell also works on a fresh checkout.
os.makedirs("model_dir", exist_ok=True)
# header=False: pandas would otherwise write the column numbers as a first line,
# which is not a vector and leaves the file one line longer than the vocabulary.
pd.DataFrame(vectors).to_csv("model_dir/vectors.tsv", sep = '\t', index=False, header=False)

## Creating Model for HaikuGen

In [None]:
# Largest number of tokens found in any haiku of the dataset
max_haiku_len = max(map(len, haikus))

In [None]:
# Inspect one tokenised haiku
haikus[1]

['christmas',
 'services',
 '\n',
 'a',
 'cellular',
 'phone',
 'rings',
 'out',
 '\n',
 'handels',
 'messiah',
 ';']

In [None]:
# Embedding vector of 'out', one of the tokens of the haiku inspected above
model.wv["out"]

array([ 1.115937  ,  3.7550073 , -1.3472165 , -0.04846412,  2.9191742 ,
       -0.5201832 , -3.9541988 ,  0.56951237,  0.32365316,  1.9051774 ,
       -0.83292013,  3.0557196 , -1.263514  , -0.21320315,  1.3758903 ,
       -0.9768148 ,  0.59615505,  1.7158791 , -1.2663797 ,  0.52242506,
        1.6789254 ,  4.347519  ,  1.6123829 ,  1.4382718 , -0.4821828 ,
       -0.5197544 ,  0.9906136 ,  0.9818026 , -2.7984228 ,  1.395255  ,
        1.4432368 , -1.0875987 , -1.6443707 ,  0.92232853, -0.40474302,
        0.0200003 , -0.3714413 ,  0.7583213 , -3.6758907 ,  1.0969849 ,
       -0.46610647,  3.0717294 ,  1.1727097 ,  1.8695028 , -0.9054868 ,
       -0.2845151 ,  1.6753987 ,  0.389066  , -0.72950745, -0.3828555 ,
       -3.908614  , -0.91224897,  0.1990495 , -3.0756981 , -0.5532407 ,
        0.32998967, -1.9459828 , -1.6822494 ,  0.97183484, -1.204235  ,
        2.6566327 , -0.03324043,  1.5522405 ,  3.088333  ,  0.08327124,
        0.8334451 , -0.9421077 ,  1.1408944 ,  0.09539839,  0.71

In [None]:
def word2idx(word):
    """Return the vocabulary index of `word` in the trained Word2Vec model.

    The index is read from the model's key-to-index mapping; a word that is
    not in the vocabulary makes the lookup fail.
    """
    index_by_word = model.wv.key_to_index
    return index_by_word[word]
def idx2word(idx):
    """Return the vocabulary word stored at position `idx`.

    Reads `index_to_key`, the same attribute this notebook uses when it
    exports the labels. The older `index2word` attribute is no longer
    available on gensim 4 keyed vectors and raises an AttributeError there.
    """
    return model.wv.index_to_key[idx]

In [None]:
# Build integer-encoded training arrays: each row of train_x receives the
# vocabulary indices of one haiku's tokens except the last one, and train_y
# receives the index of that last token.
print('\nPreparing the data for LSTM...')
# Rows of haikus shorter than max_haiku_len keep their trailing zeros.
# NOTE(review): 0 is presumably also the index of a real vocabulary word, so
# padding and that word would be indistinguishable — confirm this is intended.
train_x = np.zeros([len(haikus), max_haiku_len], dtype=np.int32)
train_y = np.zeros([len(haikus)], dtype=np.int32)
for i, haiku in enumerate(haikus):
    for t, word in enumerate(haiku[:-1]):
        train_x[i, t] = word2idx(word)
    # NOTE(review): clean_and_split appends ';' to every haiku, so haiku[-1] —
    # and therefore every entry of train_y — looks to be the same token.
    # Verify that this is the intended prediction target.
    train_y[i] = word2idx(haiku[-1])
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)


Preparing the data for LSTM...
train_x shape: (438234, 20)
train_y shape: (438234,)
