# Model training

## Import packages

In [1]:
import tensorflow as tf
import pandas as pd
import os
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

import random
import re
import matplotlib.pyplot as plt
from scipy import sparse
from tqdm import tqdm
from scipy import sparse

# Deep learning: 
from keras.models import Input, Model
from keras.layers import Dense

2022-05-23 13:07:28.206066: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-23 13:07:28.206107: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
#Activate tensorboard extension
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

%load_ext tensorboard

#Import necessary libs
import tensorflow_datasets as tfds
from tensorboard.plugins import projector

In [3]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


2022-05-23 13:07:31.407232: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-23 13:07:31.407282: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-23 13:07:31.407308: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jupyter-acs277): /proc/driver/nvidia/version does not exist


## Read data

In [4]:
# Datensatz einlesen
df = pd.read_csv('data/out.csv')

## Transform dataframe

In [5]:
# Dataframe mit 3 Spalten. Werden so gejoint, dass ein neues Dataframe mit ein Haiku pro Zeile erstellt wird
df = df[['0', '1', '2']].agg(lambda x: ' '.join(x.values), axis=1)
# Dataframe to list [[]] -> []
haikus = df.values.tolist()

# ---------------------------------------------------
# Erst mal nur mit 1000 Haikus!
haikus = haikus[:1000]
# ---------------------------------------------------

In [6]:
# alle Haikus in Array
print(haikus[-10:])
number_of_haikus = len(haikus)
print('number of haikus: ' + str(number_of_haikus))

['why are you letting  your infant have table food so early stop please', 'never trust virgin  east coast they dont care unless u are elderly', 'paces wrigley field  express is a game changer and a life saver', 'now that the secrets  out i can stop living this dumb secretive life', 'everyday i thank  god for a roof over my head and my bills paid', 'nobody talk to  me for the rest of the day im going to sleep', 'i found the cutest  set from vs i hope its in leeds tomorrow', 'bother me tell me  awful things you know i love it when you do that', 'walmart kid doesnt  even tap his foot in time and that is the tea', 'me when people ask  if i got highlights but ive never dyed my hair']
number of haikus: 1000


## Clean text 

In [7]:
huge_string = ' SATZENDE SATZANFANG '.join(haikus)
huge_string_cleaned = re.sub('[.,_]', '', huge_string)

# TODO remove stopwords!

huge_list = huge_string_cleaned.split(' ')

In [8]:
#Get vocabulary
vocab = sorted(set(huge_list))
print(vocab[:20])
vocab_size = len(vocab)
print(vocab_size)

['', 'SATZANFANG', 'SATZENDE', 'a', 'aaron', 'abandon', 'abandoning', 'abbey', 'aboating', 'abound', 'about', 'above', 'absent', 'absolute', 'absolutely', 'abstract', 'accepted', 'according', 'acrid', 'across']
3454


In [9]:
vocab = np.array(vocab)

## Create context windows

In [10]:
# Defining the window for context
window = 3

word_lists = []

# Creating a context dictionary
for i, word in enumerate(huge_list):
    for w in range(window):
        # Getting the context that is ahead by *window* words
        if i + 1 + w < len(huge_list): 
            word_lists.append([word] + [huge_list[(i + 1 + w)]])
        # Getting the context that is behind by *window* words    
        if i - w - 1 >= 0:
            word_lists.append([word] + [huge_list[(i - w - 1)]])

In [11]:
print(word_lists[-100:])

[['tea', 'SATZANFANG'], ['tea', 'is'], ['tea', 'me'], ['tea', 'that'], ['SATZENDE', 'SATZANFANG'], ['SATZENDE', 'tea'], ['SATZENDE', 'me'], ['SATZENDE', 'the'], ['SATZENDE', 'when'], ['SATZENDE', 'is'], ['SATZANFANG', 'me'], ['SATZANFANG', 'SATZENDE'], ['SATZANFANG', 'when'], ['SATZANFANG', 'tea'], ['SATZANFANG', 'people'], ['SATZANFANG', 'the'], ['me', 'when'], ['me', 'SATZANFANG'], ['me', 'people'], ['me', 'SATZENDE'], ['me', 'ask'], ['me', 'tea'], ['when', 'people'], ['when', 'me'], ['when', 'ask'], ['when', 'SATZANFANG'], ['when', ''], ['when', 'SATZENDE'], ['people', 'ask'], ['people', 'when'], ['people', ''], ['people', 'me'], ['people', 'if'], ['people', 'SATZANFANG'], ['ask', ''], ['ask', 'people'], ['ask', 'if'], ['ask', 'when'], ['ask', 'i'], ['ask', 'me'], ['', 'if'], ['', 'ask'], ['', 'i'], ['', 'people'], ['', 'got'], ['', 'when'], ['if', 'i'], ['if', ''], ['if', 'got'], ['if', 'ask'], ['if', 'highlights'], ['if', 'people'], ['i', 'got'], ['i', 'if'], ['i', 'highlights'], 

## One-hot encode

In [12]:
# Creating the dictionary for the unique words
unique_word_dict = {}
for i, word in enumerate(vocab):
    unique_word_dict.update({
        word: i
    })
    
print(unique_word_dict.get('a'))

3


In [13]:
# Defining the number of features (unique words)
n_words = len(unique_word_dict)

# Getting all the unique words 
words = list(unique_word_dict.keys())

In [14]:
# word_lists1 = word_lists[:int(len(word_lists)/10000)]
print('len(word_lists): ' + str(len(word_lists)))
#print('len(word_lists1): ' + str(len(word_lists1)))


# Creating the X and Y matrices using one hot encoding
X = np.zeros((len(word_lists),n_words),dtype=bool)
Y = np.zeros((len(word_lists),n_words),dtype=bool)

X.shape

for i, word_list in tqdm(enumerate(word_lists)):
    # Getting the indices
    main_word_index = unique_word_dict.get(word_list[0])      #First word in tupel
    context_word_index = unique_word_dict.get(word_list[1])   #Second word in Tupel

    # One hot encoding the main word
    X[i, main_word_index] = 1

    # One hot encoding the Y matrix words 
    Y[i, context_word_index] = 1


#! Hier schmiert der Kernel bei der Iteration immer ab.. 

len(word_lists): 89394


89394it [00:00, 381958.76it/s]


In [15]:
train_dataset = tf.data.Dataset.from_tensor_slices((X, Y))

2022-05-23 13:07:36.987440: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
train_dataset

<TensorSliceDataset element_spec=(TensorSpec(shape=(3454,), dtype=tf.bool, name=None), TensorSpec(shape=(3454,), dtype=tf.bool, name=None))>

## The model

In [27]:
# Defining the size of the embedding (Often in practice, pre-trained word embeddings are used with typical word embedding dimensions being either 100, 200 or 300.)
embed_size = 100

# Defining the neural network
inp = Input(shape=(X.shape[1],))
x = Dense(units=embed_size, activation='linear')(inp)
x = Dense(units=Y.shape[1], activation='softmax')(x)
model = Model(inputs=inp, outputs=x)

In [28]:

model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [29]:
# Optimizing the network weights
model.fit(
    train_dataset,
    batch_size=256,
    epochs=50
    )



Epoch 1/50


ValueError: in user code:

    File "/opt/conda/lib/python3.9/site-packages/keras/engine/training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "/opt/conda/lib/python3.9/site-packages/keras/engine/training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/conda/lib/python3.9/site-packages/keras/engine/training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "/opt/conda/lib/python3.9/site-packages/keras/engine/training.py", line 859, in train_step
        y_pred = self(x, training=True)
    File "/opt/conda/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/opt/conda/lib/python3.9/site-packages/keras/engine/input_spec.py", line 228, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" '

    ValueError: Exception encountered when calling layer "model_3" (type Functional).
    
    Input 0 of layer "dense_6" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (3454,)
    
    Call arguments received:
      • inputs=tf.Tensor(shape=(3454,), dtype=bool)
      • training=True
      • mask=None


In [None]:
model.summary()

In [None]:
# Obtaining the weights from the neural network. 
# These are the so called word embeddings

# The input layer 
weights = model.get_weights()[0]

In [None]:
# Creating a dictionary to store the embeddings in. The key is a unique word and 
# the value is the numeric vector
embedding_dict = {}
for word in words: 
    embedding_dict.update({
        word: np.asarray(weights[unique_word_dict.get(word)])
        })

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))

for word in list(unique_word_dict.keys()):
  coord = embedding_dict.get(word)
  plt.scatter(coord[0], coord[1])
  plt.annotate(word, (coord[0], coord[1]))

In [None]:
# Save metadata into tsv file
pd.DataFrame(embedding_dict.keys()).to_csv("model_dir/metadata.tsv", sep = '\t', index=False)

In [None]:
# Save embeddings_vectors into tsv file
pd.DataFrame(embedding_dict.values()).to_csv("model_dir/vectors_2.tsv", sep = '\t', index=False)