# Model training

## Import packages

In [1]:
import tensorflow as tf
import pandas as pd
import os
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

import random
import re
import matplotlib.pyplot as plt
from scipy import sparse
from tqdm import tqdm
from scipy import sparse

# Deep learning: 
from keras.models import Input, Model
from keras.layers import Dense

In [2]:
#Activate tensorboard extension
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

%load_ext tensorboard

#Import necessary libs
import tensorflow_datasets as tfds
from tensorboard.plugins import projector

In [3]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


2022-05-30 10:05:44.851727: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-30 10:05:44.959195: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-30 10:05:44.959790: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


## Read data

In [4]:
# Datensatz einlesen
df = pd.read_csv('data/out.csv')

## Transform dataframe

In [5]:
# Dataframe mit 3 Spalten. Werden so gejoint, dass ein neues Dataframe mit ein Haiku pro Zeile erstellt wird
df = df[['0', '1', '2']].agg(lambda x: ' '.join(x.values), axis=1)
# Dataframe to list [[]] -> []
haikus = df.values.tolist()

# ---------------------------------------------------
# Erst mal nur mit 1000 Haikus!
haikus = haikus[:5000]
# ---------------------------------------------------

In [6]:
# alle Haikus in Array
print(haikus[-10:])
number_of_haikus = len(haikus)
print('number of haikus: ' + str(number_of_haikus))

['its finals week so  im gonna be less active than normal love yall', 'money attract hoes  but i dont want hoes that shit getting so boring', 'without coffee i  dont think i could get out of bed in the mornings', 'you bitches are wild  believe it or not your friend can have other friends', 'i dont really care  if you cry on the real you shouldve never lied', 'i need a logo  i hope the logo maker does something i like', 'i dont understand  how people go to the gym and dont sweat at all', 'does anyone know  of a good accessible rooftop in munich', 'thinking of going  to see bae but then rain and tests are on my case', 'let yourself be drawn  by the stronger pull of that which you truly love']
number of haikus: 5000


## Clean text 

In [7]:
huge_string = ' SATZENDE SATZANFANG '.join(haikus)
huge_string_cleaned = re.sub('[.,_]', '', huge_string)

# TODO remove stopwords!

huge_list = huge_string_cleaned.split(' ')

In [8]:
#Get vocabulary
vocab = sorted(set(huge_list))
print(vocab[:20])
vocab_size = len(vocab)
print(vocab_size)

['', 'SATZANFANG', 'SATZENDE', 'a', 'aaron', 'abandon', 'abandoned', 'abandoning', 'abbey', 'abducted', 'ability', 'able', 'aboard', 'aboating', 'abolition', 'abort', 'abound', 'about', 'above', 'abroad']
7942


In [9]:
vocab = np.array(vocab)

## Create context windows

In [10]:
# Defining the window for context
window = 3

word_lists = []

# Creating a context dictionary
for i, word in enumerate(huge_list):
    for w in range(window):
        # Getting the context that is ahead by *window* words
        if i + 1 + w < len(huge_list): 
            word_lists.append([word] + [huge_list[(i + 1 + w)]])
        # Getting the context that is behind by *window* words    
        if i - w - 1 >= 0:
            word_lists.append([word] + [huge_list[(i - w - 1)]])

In [11]:
print(word_lists[-100:])

[['case', 'SATZANFANG'], ['case', 'on'], ['case', 'let'], ['case', 'are'], ['SATZENDE', 'SATZANFANG'], ['SATZENDE', 'case'], ['SATZENDE', 'let'], ['SATZENDE', 'my'], ['SATZENDE', 'yourself'], ['SATZENDE', 'on'], ['SATZANFANG', 'let'], ['SATZANFANG', 'SATZENDE'], ['SATZANFANG', 'yourself'], ['SATZANFANG', 'case'], ['SATZANFANG', 'be'], ['SATZANFANG', 'my'], ['let', 'yourself'], ['let', 'SATZANFANG'], ['let', 'be'], ['let', 'SATZENDE'], ['let', 'drawn'], ['let', 'case'], ['yourself', 'be'], ['yourself', 'let'], ['yourself', 'drawn'], ['yourself', 'SATZANFANG'], ['yourself', ''], ['yourself', 'SATZENDE'], ['be', 'drawn'], ['be', 'yourself'], ['be', ''], ['be', 'let'], ['be', 'by'], ['be', 'SATZANFANG'], ['drawn', ''], ['drawn', 'be'], ['drawn', 'by'], ['drawn', 'yourself'], ['drawn', 'the'], ['drawn', 'let'], ['', 'by'], ['', 'drawn'], ['', 'the'], ['', 'be'], ['', 'stronger'], ['', 'yourself'], ['by', 'the'], ['by', ''], ['by', 'stronger'], ['by', 'drawn'], ['by', 'pull'], ['by', 'be'], 

## One-hot encode

In [12]:
# Creating the dictionary for the unique words
unique_word_dict = {}
for i, word in enumerate(vocab):
    unique_word_dict.update({
        word: i
    })
    
print(unique_word_dict.get('a'))

3


In [13]:
# Defining the number of features (unique words)
n_words = len(unique_word_dict)

# Getting all the unique words 
words = list(unique_word_dict.keys())

In [14]:
# word_lists1 = word_lists[:int(len(word_lists)/10000)]
print('len(word_lists): ' + str(len(word_lists)))
#print('len(word_lists1): ' + str(len(word_lists1)))


# Creating the X and Y matrices using one hot encoding
X = np.zeros((len(word_lists), n_words), dtype=bool)
Y = np.zeros((len(word_lists), n_words), dtype=bool)

for i, word_list in tqdm(enumerate(word_lists)):
    # Getting the indices
    main_word_index = unique_word_dict.get(word_list[0])      #First word in tupel
    context_word_index = unique_word_dict.get(word_list[1])   #Second word in Tupel

    # One hot encoding the main word
    X[i, main_word_index] = 1

    # One hot encoding the Y matrix words 
    Y[i, context_word_index] = 1


#! Hier schmiert der Kernel bei der Iteration immer ab.. 

len(word_lists): 469434


469434it [00:02, 197887.49it/s]


In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((X, Y))

In [None]:
train_dataset

## The model

In [None]:
# Defining the size of the embedding (Often in practice, pre-trained word embeddings are used with typical word embedding dimensions being either 100, 200 or 300.)
embed_size = 100

# Defining the neural network
inp = Input(shape=(X.shape[1],))
x = Dense(units=embed_size, activation='linear')(inp)
x = Dense(units=Y.shape[1], activation='softmax')(x)
model = Model(inputs=inp, outputs=x)

In [None]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [None]:
train_in_batches():
    

In [None]:
# Optimizing the network weights
model.fit(
    train_dataset,
    batch_size=256,
    epochs=50
    )

In [None]:
model.summary()

In [None]:
# Obtaining the weights from the neural network. 
# These are the so called word embeddings

# The input layer 
weights = model.get_weights()[0]

In [None]:
# Creating a dictionary to store the embeddings in. The key is a unique word and 
# the value is the numeric vector
embedding_dict = {}
for word in words: 
    embedding_dict.update({
        word: np.asarray(weights[unique_word_dict.get(word)])
        })

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))

for word in list(unique_word_dict.keys()):
    coord = embedding_dict.get(word)
    plt.scatter(coord[0], coord[1])
    plt.annotate(word, (coord[0], coord[1]))

In [None]:
# Save metadata into tsv file
pd.DataFrame(embedding_dict.keys()).to_csv("model_dir/metadata.tsv", sep = '\t', index=False)

In [None]:
# Save embeddings_vectors into tsv file
pd.DataFrame(embedding_dict.values()).to_csv("model_dir/vectors_2.tsv", sep = '\t', index=False)