# Model training

## Import packages

In [1]:
import tensorflow as tf
import pandas as pd
import os
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import random
import re
import matplotlib.pyplot as plt
from scipy import sparse
from tqdm import tqdm
import gensim

# Deep learning: 
from keras.models import Input, Model
from keras.layers import Dense

In [2]:
# print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

## Read data

In [3]:
# Datensatz einlesen
df = pd.read_csv('data/out.csv')

## Transform dataframe

In [4]:
# Dataframe mit 3 Spalten. Werden so gejoint, dass ein neues Dataframe mit ein Haiku pro Zeile erstellt wird
df = df[['0', '1', '2']].agg(lambda x: ' '.join(x.values), axis=1)
# Dataframe to list [[]] -> []
haikus = df.values.tolist()


In [5]:
# alle Haikus in Array
print(haikus[:10])
number_of_haikus = len(haikus)
print('number of haikus: ' + str(number_of_haikus))

['last red in the sky a small girls moon face rises over the counter', 'christmas services a cellular phone rings out handels messiah', 'passover darkness  before the buds burst open a childs eyes in death', 'last night of summer the bright full moon of last night hidden by a cloud', 'midnight and full moon my neighbour asks to borrow the vacum cleaner', 'yellow walnut leaves slowly appear on the lawn early morning light', 'after its first flight the young gerfalcons talons tighter on my glove', 'sultry afternoon only the mailbox shadow crosses the dirt road', 'long journey back home  a forgotten bale of hay slowly rots away', 'autumn mist obscures the island in the distance she cleans her glasses']
number of haikus: 93390


In [6]:
# die Haikus cleanen und selber auch noch mal als Wort-Arrays in großen Array

haikus = np.array(haikus)

def clean_and_split(sentence):
    return re.sub('[.,_]', '', sentence).split(' ')

haikus = list(map(lambda x: clean_and_split(x), haikus))

In [7]:
print(haikus[:10])

[['last', 'red', 'in', 'the', 'sky', 'a', 'small', 'girls', 'moon', 'face', 'rises', 'over', 'the', 'counter'], ['christmas', 'services', 'a', 'cellular', 'phone', 'rings', 'out', 'handels', 'messiah'], ['passover', 'darkness', '', 'before', 'the', 'buds', 'burst', 'open', 'a', 'childs', 'eyes', 'in', 'death'], ['last', 'night', 'of', 'summer', 'the', 'bright', 'full', 'moon', 'of', 'last', 'night', 'hidden', 'by', 'a', 'cloud'], ['midnight', 'and', 'full', 'moon', 'my', 'neighbour', 'asks', 'to', 'borrow', 'the', 'vacum', 'cleaner'], ['yellow', 'walnut', 'leaves', 'slowly', 'appear', 'on', 'the', 'lawn', 'early', 'morning', 'light'], ['after', 'its', 'first', 'flight', 'the', 'young', 'gerfalcons', 'talons', 'tighter', 'on', 'my', 'glove'], ['sultry', 'afternoon', 'only', 'the', 'mailbox', 'shadow', 'crosses', 'the', 'dirt', 'road'], ['long', 'journey', 'back', 'home', '', 'a', 'forgotten', 'bale', 'of', 'hay', 'slowly', 'rots', 'away'], ['autumn', 'mist', 'obscures', 'the', 'island',

In [8]:
def clean_text(
    string: str, 
    punctuations=r'''!()-[]{};:'"\,<>/?@#$%^&*_~=`''',
    stop_words=['the', 'a', 'and', 'is', 'be', 'will']) -> str:
    """
    A method to clean text 
    """
    # Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+', '', string)

    # Cleaning the html elements
    string = re.sub(r'<.*?>', '', string)

    # Removing the punctuations
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, "") 

    # Converting the text to lower
    string = string.lower()

    # Removing stop words
    string = ' '.join([word for word in string.split() if word not in stop_words])

    # Cleaning the whitespaces
    string = re.sub(r'\s+', ' ', string).strip()
    
    # Removing numbers#
    string = re.sub(r'\d', '', string)

    return string  

# Datensatz einlesen
path_to_file = tf.keras.utils.get_file('Harry%20Potter%202%20-%20Chamber%20of%20Secrets.txt',
                                       'http://www.glozman.com/TextPages/Harry%20Potter%202%20-%20Chamber%20of%20Secrets.txt')

# Read, then decode for py2 compat.
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
text = clean_text(text).split('.')
#print(text[:10])

hp_array = list(map(lambda x: clean_and_split(x), text))

[' harry potter chamber of secrets by j', ' k', ' rowling this book  in harry potter series original scannedocr friday april   v', ' edit where needed change version number by ', ' c h p t e o n e worst birthday not for first time an argument had broken out over breakfast at number four privet drive', ' mr', ' vernon dursley had been woken in early hours of morning by loud hooting noise from his nephew harrys room', ' third time this week he roared across table', ' if you cant control that owl itll have to go harry tried yet again to explain', ' shes bored he said']


## The model

In [9]:
# train model
model = gensim.models.Word2Vec(hp_array)

In [10]:
# summarize the loaded model
print(model)

Word2Vec<vocab=1943, vector_size=100, alpha=0.025>


In [11]:
# access vector for one word
print('vector for \'girl\':')
print(model.wv['girl'])

vector for 'girl':
[-0.0489248   0.18681888  0.03319269  0.03603826  0.14984688 -0.1654673
  0.15224434  0.3821418  -0.14198801 -0.05851124 -0.09174149 -0.2001487
  0.01571479  0.0088606   0.20546536 -0.09817459  0.04179309 -0.06742151
  0.00516966 -0.29627094  0.20089614  0.00789949  0.07393959 -0.14401254
  0.00651163  0.05856685 -0.06709082 -0.19387507 -0.11185332  0.02075415
  0.16296817  0.07231785 -0.00572357 -0.18847522 -0.04731258  0.18734443
 -0.02862754 -0.04059602 -0.030715   -0.27051035  0.10323879 -0.04915797
 -0.05320074  0.01392652  0.2391323  -0.07508141 -0.1710209   0.00318667
  0.18462081  0.0140131   0.0197587  -0.11320188 -0.09901898 -0.04548169
  0.02633374  0.04613921  0.0774593   0.1236912  -0.07847422  0.0735085
  0.01917881  0.1024448   0.05188182 -0.00429058 -0.25333625  0.19324161
  0.09037547  0.18758845 -0.16218056  0.15377995 -0.07535589  0.0314659
  0.26763743 -0.08472732  0.20143652 -0.00042409 -0.0207449  -0.00779402
 -0.22393502  0.03520436 -0.07960196

In [28]:
print('top 10 words most similar to \'girl\':')
model.wv.most_similar('car', topn=10)

top 10 words most similar to 'girl':


[('very', 0.999654233455658),
 ('toward', 0.9996378421783447),
 ('came', 0.9996365308761597),
 ('next', 0.9996352195739746),
 ('long', 0.9996311068534851),
 ('still', 0.9996151924133301),
 ('which', 0.999612033367157),
 ('few', 0.9996045231819153),
 ('again', 0.9996023178100586),
 ('first', 0.9996015429496765)]

In [13]:
# similarity between two words
print('similarity between \'go\' and \'walk\' (regarding the haikus):')
print(model.wv.similarity(w1='go', w2='walk'))
print()

print('similarity between \'go\' and \'laugh\' (regarding the haikus):')
print(model.wv.similarity(w1='go', w2='laugh'))
print()

print('similarity between \'go\' and \'go\':')
print(model.wv.similarity(w1='go', w2='go'))

similarity between 'go' and 'walk' (regarding the haikus):
0.99578285

similarity between 'go' and 'laugh' (regarding the haikus):
0.9971985

similarity between 'go' and 'go':
1.0


In [14]:
# save model
#model.save('w2v_model.bin')

In [15]:
# load model
#new_model = Word2Vec.load('w2v_model.bin')
#print(new_model)

In [16]:
# extract the words & their vectors, as numpy arrays
vectors = np.asarray(model.wv.vectors)
labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

print('vectors:')
print(vectors[:2])
print()
print('labels:')
print(labels[:10])

vectors:
[[-0.13968319  0.5039783   0.08849982  0.08344058  0.37877145 -0.44779658
   0.3779186   1.0413792  -0.3710803  -0.18029764 -0.18503423 -0.52688724
  -0.01334676  0.0043259   0.50906676 -0.28334302  0.11979121 -0.14377664
   0.05835876 -0.7649864   0.50436354  0.01947019  0.20050618 -0.3582738
   0.0431937   0.16984197 -0.1581541  -0.51988554 -0.27227524  0.01201792
   0.37299243  0.20391868  0.0031895  -0.50887203 -0.08475018  0.46862692
  -0.04682276 -0.13558753 -0.07548592 -0.7260794   0.26088592 -0.068873
  -0.16426778  0.0215367   0.6039142  -0.15868577 -0.47599307  0.05856868
   0.5057014   0.03344871  0.05360534 -0.29869542 -0.25509396 -0.06994253
   0.09403911  0.10323661  0.18496366  0.29347643 -0.19522241  0.19901723
  -0.00849691  0.27679744  0.12782975 -0.0200706  -0.660942    0.5165119
   0.22386695  0.4763842  -0.46559688  0.4260287  -0.18487327  0.11791668
   0.71495295 -0.20919794  0.51777965 -0.01633553 -0.04108058 -0.04874487
  -0.5648129   0.09661488 -0.2211

In [17]:
len(vectors)

1943

In [18]:
len(labels)

1943

In [None]:
# https://projector.tensorflow.org/

In [19]:
# Save metadata (labels) into tsv file
pd.DataFrame(labels).to_csv("model_dir/metadata.tsv", sep = '\t', index=False)

In [20]:
# Save vectors into tsv file
pd.DataFrame(vectors).to_csv("model_dir/vectors.tsv", sep = '\t', index=False)