## Initializing programming env

In [None]:
#Color printing
from termcolor import colored

#General data operations library
import math, string, glob
from datetime import datetime
import numpy as np
import functools

#The tensorflow library
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
import tensorflow  as tf
import tensorflow_datasets as tfds
import tensorflow_text as tf_text

#Plotting libraries
import matplotlib as mpl
import matplotlib.pyplot as plt

# Use extra-large fonts and a 10x7 default canvas for every figure in the notebook
params = {
    'figure.figsize': (10, 7),
    **dict.fromkeys(
        ('legend.fontsize', 'axes.labelsize', 'axes.titlesize',
         'xtick.labelsize', 'ytick.labelsize'),
        'xx-large',
    ),
}
plt.rcParams.update(params)

# Work from the course directory so that relative paths such as "../modules" resolve.
# NOTE(review): this absolute path is specific to one machine - adjust it to the local checkout.
import os
os.chdir("/scratch_hdd/akalinow/Zajecia/2023-2024/Lato/Uczenie_maszynowe_2/UczenieMaszynoweII/PL/")

#append path with python modules
import importlib
import sys
sys.path.append("../modules")

#Private functions
import plotting_functions as plf
importlib.reload(plf);

import text_functions as txt_fcn
importlib.reload(txt_fcn);
#Hide GPU
#tf.config.set_visible_devices([], 'GPU')

<br/><br/>
<br/><br/>

<h1 align="center">
 Machine learning II
</h1>

<br/><br/>
<br/><br/>
<br/><br/>
<br/><br/>

<h1 align="right">
Artur Kalinowski <br>
University of Warsaw <br>
Faculty of Physics <br>    
</h1>

There is a set of standard operations (the list below is not exhaustive) that we perform on various types of data before they are used as input to the model.
The Keras API provides ready-made layers that perform many of these [operations](https://www.tensorflow.org/guide/keras/preprocessing_layers).
In this notebook, we will use several of them for different types of data: **numeric**, **text**, **images**.

## Numerical data

### Normalization

The standard operation we perform on numerical data before feeding it to the model input is normalisation.
Normalisation makes the order of magnitude of the weights similar for all features and the weights themselves are not too large.

```Python

normalization = tf.keras.layers.Normalization(mean=None, variance=None) # Normalising data: (x - mean)/sqrt(variance),
                                                               # so that the output has mean=0 and variance=1
                                                               # normalisation is carried out for each feature separately
                                                               # if mean and variance are not given, they have to be
                                                               # determined from the data by the adapt(x) method
normalization.adapt(x)
```


**Please:**

* generate a set of `(n,3)` numbers derived from a flat distribution in the ranges `[[-5,5],[-4,2],[2,2]]`.
* print the minimum, maximum and mean values of the features in the set.
* normalise the data to mean `0` and variance `1` (the default of the `Normalization` layer) for each feature separately.
* print the minimum, maximum and mean values of the features in the normalised set.
* check whether the normalisation has worked as expected.

In [None]:
n = 10_000
#BEGIN_SOLUTION
def print_feature_stats(data):
    """Print the per-feature min, mean, max and standard deviation of `data`.

    `data` is reduced along axis 0, so one value per column is printed.
    """
    print(colored("min =", "blue"), tf.math.reduce_min(data, axis=0).numpy())
    print(colored("mean =", "blue"), tf.math.reduce_mean(data, axis=0).numpy())
    print(colored("max =", "blue"), tf.math.reduce_max(data, axis=0).numpy())
    print(colored("stddev =", "blue"), tf.math.reduce_std(data, axis=0).numpy())


# Uniform random numbers, stretched and shifted to the requested ranges
x = tf.random.uniform([n, 3])
scales = np.array([[-5,5],[-4,2],[2,2]])   # one [low, high] pair per feature
ranges = scales[:,1] - scales[:,0]
x = x * ranges + scales[:,0]
print_feature_stats(x)

# Learn the per-feature mean and variance from the data, then normalise
normalization = tf.keras.layers.Normalization()
normalization.adapt(x)
x = normalization(x)
# The standard deviation is printed after normalisation as well,
# otherwise the result of the normalisation cannot be fully checked
print_feature_stats(x)
#END_SOLUTION
pass


### Discretisation 

Sometimes it is useful to divide numerical data into categories - **discretisation**.
In situations where we do not need a high resolution, floating point values can be divided into `small`, `medium` and `large` or similar.
Reducing the resolution from floating-point to a list of categories can also facilitate training.

```Python

discretization = tf.keras.layers.Discretization(num_bins, bin_boundaries, output_mode) 
                 # Convert a continuous variable to a discrete variable of the form:
                 # output_mode = int - interval number (default value).
                 #               one_hot - one-hot encoded vector
                 # num_bins - number of intervals (requires calling adapt(x) method)
                 # bin_boundaries - ranges of intervals
```

**Please**

* discretise the data from the previous cell into 10 intervals.
* draw a histogram of the interval numbers for **all** the characteristics.

In [None]:
#BEGIN_SOLUTION
# Determine the boundaries of 10 intervals from the data,
# then replace every value by the number of its interval
discretization = tf.keras.layers.Discretization(num_bins=10)
discretization.adapt(x)
x = discretization(x)

# A single histogram of the interval numbers, pooled over all features
fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(5,5))
interval_numbers = tf.reshape(x, (-1,))
axis.hist(interval_numbers)
axis.set_xlabel('index')
axis.set_ylabel('counts')
#END_SOLUTION
pass

## Images

**Please:**

* using the `tensorflow_datasets` library load the set `imagenette/160px`
* draw a few pictures

In [None]:
#BEGIN_SOLUTION
# Load the training split together with the dataset metadata,
# then preview three examples in a single row
ds, ds_info = tfds.load(
    'imagenette/160px',
    split='train',
    with_info=True,
)
fig = tfds.show_examples(ds, ds_info, rows=1, cols=3);
#END_SOLUTION
pass


### Scaling

Changing the resolution - image scaling. Scaling requires an interpolation algorithm to be specified, which is used
to calculate the pixel values in the new image.

```Python
tf.keras.layers.Resizing(
    height, width,                # height and width of the new image
    interpolation='bilinear',     # interpolation algorithm
    crop_to_aspect_ratio=False,   # cropping the image to achieve
                                  # the same width/length ratio
                                  # as in the original image
)
```


### Cropping

A fragment - a `frame` - is cut out of the whole image:


```Python
tf.keras.layers.CenterCrop(
    height, width              # height and width of the rectangle cropping
                               # the frame in the middle of the image
)
```

Clipping at a random point can be used to enrich the sample by generating random image fragments - `augmenting`. Layers performing random operations on images are only active during training by default.

```Python
tf.keras.layers.RandomCrop(
    height, width, seed=None,  # height and width of the rectangle cropping
                               # the frame in the random place
)
```



### Rotation

```Python
tf.keras.layers.RandomRotation(
    factor,                         # range of rotation in 2pi units: (min, max)
    fill_mode='reflect',            # algorithm to fill the space created after image rotation
    interpolation='bilinear',
    seed=None,
    fill_value=0.0,                 # the value of the pixel used to fill the space created after moving the image,
                                    # if `fill_mode=constant` is set.
)
```




### Translation

```Python
tf.keras.layers.RandomTranslation(
    height_factor,                  # relative vertical displacement factor: (min, max)
    width_factor,                   # relative horizontal displacement coefficient: (min, max)
    fill_mode='reflect',            # algorithm to fill the space created after image translation
    interpolation='bilinear',
    seed=None,
    fill_value=0.0,                 # the value of the pixel used to fill the space created after moving the image,
                                    # if `fill_mode=constant` is set.
)
```


**Please:**

Draw random images from a set of `imagenette/160px` subjected to:
* scaling the area to a resolution of `(160,160)`.
* plot the resolution of the first example on the screen

**Hints:**
* use the `tf.data.Dataset.map()` method with the appropriate mapping function based on the appropriate layer.
* note the data type in the tensor containing the processed images.

In [None]:
#BEGIN_SOLUTION
# NOTE(review): the exercise text asks for (160,160) while this solution resizes to (320,320) - confirm which is intended.
def resize_example(example):
    """Return `example` with its image resized to 320x320 (uint8); the label is passed through."""
    resizing = tf.keras.layers.Resizing(320,320, crop_to_aspect_ratio=True, dtype=tf.uint8)
    return {"image": resizing(example["image"]), "label": example["label"]}

ds = ds.map(resize_example)
tfds.show_examples(ds, ds_info, rows=1, cols=3)

# Read the resolution from the shape of the first image
item = next(iter(ds))
image_shape = item["image"].shape
x_res, y_res = image_shape[0], image_shape[1]
print(colored("Resolution: ", "blue"), x_res, y_res)
#END_SOLUTION
pass

**Please:**

Draw random images from a set of `imagenette/160px` subjected to:
* clipping to a central area of size `(64,64)`.

**Tips:**
* use the `tf.data.Dataset.map()` method with an appropriate mapping function based on `tf.keras.layers.CenterCrop`.
* note the data type in the tensor containing the processed images.

In [None]:
#BEGIN_SOLUTION
# The cropping layer is created once, outside of the mapped function
layer = tf.keras.layers.CenterCrop(64,64, dtype=tf.uint8)

def center_crop_example(example):
    """Return `example` with its image replaced by the 64x64 frame produced by `layer`."""
    return {"image": layer(example["image"]), "label": example["label"]}

# NOTE: despite its name, `ds_randomCrop` holds centrally cropped images here
ds_randomCrop = ds.map(center_crop_example)
tfds.show_examples(ds_randomCrop, ds_info, rows=1, cols=3);
#END_SOLUTION
pass

**Please:**

Draw random images from a set of `imagenette/160px` subjected to:
* clipping to a random area of size `(64,64)`.

**Hint:**
* the use of layer in the definition of the lambda function will cause errors. Please try to interpret the error message and correct the code accordingly.


In [2]:
#BEGIN_SOLUTION
# The layer has to be created outside of the mapped function
# (creating it inside the lambda is the source of the error mentioned in the hint).
layer = tf.keras.layers.RandomCrop(64,64, dtype=tf.uint8)
# training=True is passed explicitly: layers performing random operations are only
# active in training mode, and here the layer is called outside of model.fit().
ds_randomCrop = ds.map(lambda x: {"image": layer(x["image"], training=True), "label": x["label"]})
tfds.show_examples(ds_randomCrop, ds_info, rows=1, cols=3);
#END_SOLUTION
pass

NameError: name 'tf' is not defined

**Please:**

Draw random images from a set of `imagenette/160px` subjected to:
* a random rotation in the range of $\pm\pi/4$.
* fill in the blanks after rotation with black color.

**Hint:**
* the use of layer in the definition of the lambda function will cause errors. Please try to interpret the error message and correct the code accordingly.

In [None]:
#BEGIN_SOLUTION
# factor is given in units of 2*pi: 1/8 corresponds to rotations within +-pi/4.
# fill_mode='constant' together with fill_value=0.0 paints the uncovered area black.
layer = tf.keras.layers.RandomRotation(1/8.0, fill_mode='constant', fill_value=0.0, dtype=tf.uint8)
# training=True is passed explicitly: layers performing random operations are only
# active in training mode, and here the layer is called outside of model.fit().
ds_randomRotation = ds.map(lambda x: {"image": layer(x["image"], training=True), "label": x["label"]})
tfds.show_examples(ds_randomRotation, ds_info, rows=1, cols=3);
#END_SOLUTION
pass

## Text data

Converting text into digital form can be done in a number of ways. The two most common are:
* **text vectorisation** - each tag (`token`) is assigned an integer, an index in the dictionary. The mapping $\mathrm{text}\leftrightarrow \mathrm{index}$ is determined by the contents of the dataset. 

* **embedding** - each tag is assigned an n-dimensional vector of floating point numbers. The mapping $\mathrm{text}\leftrightarrow \mathrm{vector}$ is found during model training.

### Vectorization

```Python
tf.keras.layers.TextVectorization(
    max_tokens=None,                           # maximum number of tokens in the dictionary
    standardize='lower_and_strip_punctuation', # text standardisation algorithm
    split='whitespace',                        # word splitting algorithm
    ngrams=None,                               # size(s) of the word n-grams to build from the split text
    output_mode='int',                         # output type  
    output_sequence_length=None,               # maximum length of encoded 'sentence' sequence 
    pad_to_max_tokens=False,                   # whether to complete the sequence with zeros up to the maximum length
    vocabulary=None                            # Dictionary. If not specified vocabulary generation requires calling the adapt() method
)
 ```

 Tags not present in the dictionary will be given the same index as the OOV tag (`out of vocabulary`) 

**Please:**

* build a dictionary on the `wksf/Korpus_surowy` text loaded as part of the homework from the previous notebook.
* vectorise the text `Król zasiada na tronie.`
* print the vectorised form to the screen.
* reverse engineer the text from the vectorised form.
* repeat the procedure for the text `Ania ma małego kotka.`

**Hints**: 
* the dictionary created by the `tf.keras.layers.TextVectorization` layer is obtained by the `get_vocabulary()` method.
* a text can be created from the elements of the `words` sequence as follows:
```Python
sentence = " ".join(words)
```


In [None]:
#BEGIN_SOLUTION
# Read the raw corpus and build the dictionary from it
filePath = "../data/wksf/Korpus_surowy/"
dataset = txt_fcn.load_wksf_dataset(filePath)

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=1_000_000,
    output_mode="int",
)
vectorize_layer.adapt(dataset.batch(128))

# Alternative sentences to try:
#text = 'Królowa zasiada na tronie.'
#text = 'Ania ma małego kotka.'
text = 'Król zasiada na tronie.'
print(colored("Text:", "blue"), text)
encoded = vectorize_layer(tf.constant(text))
token_ids = encoded.numpy()
print(colored("Encoded:", "blue"), token_ids)

# Decoding: look every index up in the dictionary and join the words with spaces
vocabulary = vectorize_layer.get_vocabulary()
vocab_arr = np.array(vocabulary)
decoded = " ".join(vocab_arr[token_ids])
print(colored("Decoded: ", "blue"), decoded)
#END_SOLUTION
pass


### Embedding


```Python
tf.keras.layers.Embedding(
    input_dim,                          # dictionary size - number of tags (tokens)
    output_dim,                         # representation dimension
)
```

The embedding layer assigns a floating point value to each token.
Such an operation can be represented by a matrix `(output_dim, input_dim)` which acts on a one-hot vector of length `(input_dim)`.
Here `output_dim=3`:

$$
\huge{
\begin{bmatrix}
a_{0} & b_{0} & c_{0} & \dots \\
a_{1} & b_{1} & c_{1} & \dots \\
a_{2} & b_{2} & c_{2} & \dots \\
\end{bmatrix}
\cdot
\begin{bmatrix}
1 \\
0 \\
0 \\
\dots \\
0
\end{bmatrix}
=
\begin{bmatrix}
a_{0} \\
a_{1} \\
a_{2} 
\end{bmatrix}
}
$$
The `tf.keras.layers.Embedding()` layer performs this operation in an optimised way.
The embedding matrix is usually changed when training the model that contains it, so it is not a standard preprocessing layer.


**Please:**

* vectorize the text `Król zasiada na tronie.`
* feed the vectorised form to the input of the embedding layer with `nDims = 4`
* print both forms of the text to the screen

In [None]:
#BEGIN_SOLUTION
# One embedding vector of length nDims for every entry of the dictionary
nDims = 4
nTokens = len(vocabulary)
embedding_layer = tf.keras.layers.Embedding(input_dim=nTokens, output_dim=nDims)

#text = 'Królowa zasiada na tronie.'
text = 'Król zasiada na tronie.'
encoded = vectorize_layer(tf.constant(text))
embedded = embedding_layer(encoded)
print(colored("Encoded:", "blue"), encoded.numpy())
print(colored("Embedded: ", "blue"), embedded.numpy())
#END_SOLUTION
pass

### Division into n-grams

During analysis, a text is usually divided into sections containing `n` tokens - n-grams.
We will divide the sentences read from the Polish language corpus into segments of `n` word length. We will make use of ready-to-use functions for operating on text, available in the dedicated library `tensorflow_text`.

* splitting text into fragments (here words separated by a space):
```Python
tensorflow_text.WhitespaceTokenizer().tokenize(text)
```

* create groups of the chosen length using a sliding window - consecutive groups overlap in all but one word
```Python
tensorflow_text.sliding_window(data,        # token list
                               width,       # width of the window sliding along the list
                               axis=-1,     # the dimension along which the window slides
                               name=None)   # operation name
```

In [None]:
import tensorflow_text as tf_text
import functools

# sliding_window with every argument fixed except for the input data
window_size = 5
slidingWindowWithWidth = functools.partial(tf_text.sliding_window, width=window_size)

dataset = (
    dataset
    # split lines into words
    .map(tf_text.WhitespaceTokenizer().tokenize)
    # apply the sliding window to each line; this produces a tensor of shape
    # (n, width) per line, where n is the number of word groups of length width
    .map(slidingWindowWithWidth)
    # remove empty lines
    .filter(lambda windows: tf.size(windows) > 0)
    # split the (n, width) tensor into n tensors of shape (width)
    .unbatch()
    # merge words into sentence fragments
    .map(lambda words: tf.strings.reduce_join(words, separator=' '))
)

print(colored("First five five-word blocks:", "blue"))
for item in dataset.take(5):
    fragment = item.numpy().decode()
    print(colored("Text: ", "blue"), fragment)

**Please:**

* create a dictionary for text vectorization using `tf.keras.layers.TextVectorization.adapt(...)` layer
* create a vectorised dataset: `dataset_vectorized` using `tf.keras.layers.TextVectorization` layer and `dataset.map()` operation
* store the dictionary in the `vocabulary` variable as a numpy array.
* print the number of tokens in the dictionary to the screen.
* print the first five examples to the screen in a vectorised form.

**Hint:**
* operations on datasets can be sped up by performing them on batches:
```Python
dataset.batch(n).map(...).unbatch()
``` 

In [1]:
#BEGIN_SOLUTION
# Build the dictionary from the five-word fragments
vectorize_layer = tf.keras.layers.TextVectorization(output_mode="int")
vectorize_layer.adapt(dataset.batch(1024))
vocabulary = np.array(vectorize_layer.get_vocabulary())
vocabulary_length = len(vocabulary)

def has_fewer_than_two_unknowns(token_ids):
    """True when fewer than two entries of `token_ids` equal 1 (the out-of-vocabulary index of TextVectorization)."""
    return tf.math.count_nonzero(tf.equal(token_ids, 1), axis=0) < 2

# Vectorise in batches for speed, then go back to single examples
dataset_vectorized = (
    dataset.batch(1024)
    .map(vectorize_layer, num_parallel_calls=tf.data.AUTOTUNE)
    .unbatch()
    .filter(has_fewer_than_two_unknowns)
)
print(colored("Vocabulary length: ", "blue"), vocabulary_length)
#END_SOLUTION

print(colored("First five five-word blocks in the vectorized form:", "blue"))
for item in dataset_vectorized.take(5):
    print(colored("Text: ", "blue"), item)

NameError: name 'tf' is not defined

**Please:**

* transform the vectorized set containing n-grams to the form `(features, label)` where:
    * **label** - middle word
    * **features** - the words outside the middle word
* the transformation should use the `Dataset.map(...)` method using a custom mapping function `map_fn(...)`.
* assume that the collection has been split into batches, so that a single feature has the shape `(None,width)`.
* print the features and labels for the five examples to the screen.

**Hints**: 
* it can be assumed that `n=5`.
* the middle word can be assumed to have index `2`.

In [None]:
###################################################
def map_fn(x):
    """Split a batch of n-grams into (features, label).

    `x` is indexed as (example, position): the label is the token at
    position 2, the features are all the remaining positions.
    """
    #BEGIN_SOLUTION
    middle = 2
    before = x[:, :middle]
    label = x[:, middle]
    after = x[:, middle+1:]
    features = tf.concat([before, after], axis=1)
    #END_SOLUTION
    return features, label
###################################################
def print_item(batch, vocabulary, width=2):
    """Print the first example of `batch` as text, with the label highlighted.

    Args:
        batch: pair (features, labels); only element 0 of each is printed.
        vocabulary: numpy array translating token indices into words.
        width: number of feature words printed before the label; the
            remaining feature words are printed after it.
    """
    batch_index = 0
    feature_ids = batch[0][batch_index].numpy()
    label_id = batch[1][batch_index].numpy()

    words_before = " ".join(vocabulary[feature_ids[0:width]])
    words_after = " ".join(vocabulary[feature_ids[width:]])
    label = vocabulary[label_id]

    print(colored("Features", "blue"), end=" ")
    print(colored("(Label):", "red"), end=" ")

    print(words_before, end=" ")
    print(colored(label,"red"), end=" ")
    print(words_after)
################################################### 

# Batches of 32 n-grams, each batch split into (features, label)
dataset_final = dataset_vectorized.batch(32).map(map_fn)

for item in dataset_final.take(5):
    features_batch, labels_batch = item
    # Text form followed by the vectorised form of the first example in the batch
    print_item(item, vocabulary)
    print(colored("Vectorized form:", "blue"), )
    print(colored("Features: ", "blue"), features_batch[0].numpy(), end=" ")
    print(colored("Label: ", "blue"), labels_batch[0].numpy())

**Please:**

* check the reading speed of the final dataset using the `benchmark` function.


In [None]:
#BEGIN_SOLUTION
# dataset_final is batched by 32; passing the batch size lets the benchmark
# report examples per second instead of batches per second.
tfds.benchmark(dataset_final, batch_size=32)
#END_SOLUTION
pass

# Homework

**Please:**

* load the text from the file `shakespeare.txt`:

```Python

filePath = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
```

* do the `preprocessing` of the text:
    * division of the text into fragments of five words. One example in the new set should consist of a single five-word fragment, not of the whole group of fragments formed by dividing a sentence into chunks of five words:
      ```
      
      Features (Label): before we proceed any further
      Features:  [128  33 123 639] Label:  1267
      ```
    * tokenisation with a dictionary limited to **1000** tokens
    * division of fragments into label (middle word) and features (other words)
* print out five examples on the screen, together with their features and the label
* create an embedding layer with `128` dimensions.
* output to the screen the five words closest to the word `man` in the embedding space with cosine distance:
```Python
cosine_similarity = tf.keras.losses.cosine_similarity(...)
```
* print out on the screen the five words closest to the sum of the words `mother` and `father` made in the embedding space

**Hint:**
* the largest `n` values in the list can be obtained with the `tf.math.top_k(...)` function


<hr>

**Optional:**

* perform embedding layer training with `128` dimensions using a continuous bag-of-words algorithm - [`Continuous Bag of Words (CBOW)`](https://d2l.ai/chapter_natural-language-processing-pretraining/word2vec.html#the-continuous-bag-of-words-cbow-model) (naive version).

**Hint:**

* calculating the scalar product of the feature representation and all words of the dictionary requires the definition of a scalar product counting layer:
```Python
class Dot(tf.keras.Layer):
    def call(self, x):
        dot_product = ...
        return dot_product

```
and using its definition in the model.


In [None]:

# load text
#BEGIN_SOLUTION
filePath = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
dataset = txt_fcn.load_wksf_dataset(filePath)
#END_SOLUTION

# adapt vectorization layer (the dictionary is limited to 1000 tokens, as required by the exercise)
#BEGIN_SOLUTION
vectorize_layer = tf.keras.layers.TextVectorization(max_tokens=1000, output_mode = "int")
vectorize_layer.adapt(dataset.batch(128))
#END_SOLUTION

# split lines into words
#BEGIN_SOLUTION
dataset = dataset.map(tf_text.WhitespaceTokenizer().tokenize)
#END_SOLUTION

# fix all tf_text.sliding_window function arguments except for the input data
#BEGIN_SOLUTION
window_size = 5
slidingWindowWithWidth = functools.partial(tf_text.sliding_window, width=window_size)
#END_SOLUTION

# apply the sliding window to each line.
# this will produce a tensor of shape (n, width) for each line,
# where n is the number of groups of words with length width
#BEGIN_SOLUTION
dataset = dataset.map(slidingWindowWithWidth)
#END_SOLUTION

# remove empty lines 
#BEGIN_SOLUTION
dataset = dataset.filter(lambda x: tf.size(x) > 0)
#END_SOLUTION

# split the (n, width) tensor into (n) tensors of shape (width)
#BEGIN_SOLUTION
dataset = dataset.unbatch()
#END_SOLUTION

# merge words into sentence fragments
#BEGIN_SOLUTION
dataset = dataset.map(lambda x: tf.strings.reduce_join(x, separator=' '))
#END_SOLUTION

#Vectorize
#BEGIN_SOLUTION
# The layer adapted above is reused here. Creating a fresh TextVectorization
# without max_tokens at this point would discard the 1000-token limit and
# make the earlier adapt() call pointless.
vocabulary = np.array(vectorize_layer.get_vocabulary())
vocabulary_length = vocabulary.shape[0] 
dataset_vectorized = dataset.batch(1024).map(vectorize_layer, num_parallel_calls=tf.data.AUTOTUNE).unbatch()
# keep only the fragments with fewer than two out-of-vocabulary tokens (index 1)
dataset_vectorized = dataset_vectorized.filter(lambda x: tf.math.count_nonzero(x==1, axis=0) < 2)
print(colored("Vocabulary length: ", "blue"), vocabulary_length)

dataset_final = dataset_vectorized.batch(32).map(map_fn)
#END_SOLUTION

for item in dataset_final.take(5):
    print_item(item, vocabulary, width=2)
    print(colored("Features: ", "blue"), item[0][0].numpy(), end=" ")
    print(colored("Label: ", "blue"), item[1][0].numpy())  

In [None]:
# CBOW model training (optional)
#BEGIN_SOLUTION
class Dot(tf.keras.Layer):
    """Scalar-product layer: multiplies its two inputs and sums over axis 2."""

    def call(self, x):
        # x is a pair of tensors; their element-wise product is taken with
        # broadcasting and must have (at least) three axes, since axis 2 is summed
        first, second = x[0], x[1]
        elementwise = tf.math.multiply(first, second)
        return tf.math.reduce_sum(elementwise, axis=2)


embedding_depth = 128
# Model input: the window_size-1 token indices surrounding the middle word
input_layer = tf.keras.layers.Input(shape=(window_size-1,), dtype=tf.int32)
embedding_layer = tf.keras.layers.Embedding(vocabulary_length, embedding_depth, name="embedding")
# The same embedding layer is applied to the context words ...
context_embedding = embedding_layer(input_layer)
# ... and to every index of the dictionary
vocabulary_embedding = embedding_layer(tf.range(vocabulary_length))
# Average of the context vectors; keepdims=True keeps the averaged axis with length 1
context_mean = tf.keras.layers.GlobalAveragePooling1D(keepdims=True)(context_embedding)
# Scalar product of the averaged context with the embedding of every dictionary entry
dot_product = Dot()([context_mean, vocabulary_embedding])
model = tf.keras.Model(inputs=input_layer, outputs=dot_product)
model.summary()
# NOTE(review): presumably the 'fig_png' directory has to exist already - confirm
tf.keras.utils.plot_model(model, 'fig_png/ML_model.png', show_shapes=True)
# The model outputs raw scalar products (no softmax), hence from_logits=True
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
#END_SOLUTION

#Evaluate non trained model
model.evaluate(dataset_final.take(16))

#Training 
#BEGIN_SOLUTION
nEpochs = 100
initial_learning_rate = 2E-2
    
nStepsPerEpoch = 2200
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate,
                decay_steps=nStepsPerEpoch*10,
                decay_rate=0.95,
                staircase=False)

# Compile again with an optimizer that actually uses the learning-rate schedule:
# the model was compiled with the plain 'adam' string before the schedule existed,
# so without this step the schedule defined above would never be applied.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1)
callbacks = [early_stop_callback]
           
# The first 16 batches are held out for validation, training uses the batches that follow
history = model.fit(dataset_final.skip(16).take(nStepsPerEpoch), 
                    validation_data=dataset_final.take(16),
                    epochs=nEpochs,
                    callbacks=callbacks, 
                    verbose=0)
    
model.evaluate(dataset_final.take(16))  
txt_fcn.dump_embedding(model, vocabulary)
plf.plotTrainHistory(history)

# Print model predictions
for batch in dataset_final.skip(16).take(5):
    print_item(batch, vocabulary)
    # index of the highest score for the first example of the batch
    response = tf.math.argmax(model(batch[0]), axis=1)[0]
    print(colored("Response:", "blue"), vocabulary[response])
#END_SOLUTION
pass

In [None]:
# Embedding space exploration - words similar to "man"
#BEGIN_SOLUTION
#embedding_layer = tf.keras.layers.Embedding(vocabulary_length, embedding_depth, name="embedding")
embedding_layer = model.get_layer("embedding")
vocabulary_embedding = embedding_layer(tf.range(vocabulary_length))

word = "man"
word_index = np.where(vocabulary == word)[0][0]
word_embedding = vocabulary_embedding[word_index]

print(colored("Word embedding:", "blue"), word_embedding.shape)
print(colored("Vocabulary embedding:", "blue"), vocabulary_embedding.shape)
# tf.keras.losses.cosine_similarity is a loss, i.e. the negative of the similarity - hence the minus sign
cosine_similarity = -tf.keras.losses.cosine_similarity(word_embedding, vocabulary_embedding, axis=-1)
euclidean_distance = tf.norm(word_embedding - vocabulary_embedding, axis=-1)

# top_k selects the largest values: a similarity is used directly,
# while a distance has to be negated to obtain the nearest words
top_k = tf.math.top_k(cosine_similarity, k=5)
#top_k = tf.math.top_k(-euclidean_distance, k=5)
top_k_indices = top_k.indices.numpy()
top_k_values = top_k.values.numpy() 
top_k_words = vocabulary[top_k_indices]
print(colored("Top 5 words similar to: ", "blue"), word)
# separate loop names, so that `word` keeps the query word after the loop
for similar_word, similarity in zip(top_k_words, top_k_values):
    print(colored("\t"+similar_word+"\t", "red"), similarity)
#END_SOLUTION
pass   

In [None]:
# Word arithmetics - words similar to "mother" + "father"
words = np.array(["father", "mother"])
#BEGIN_SOLUTION
# Dictionary index of each word (first match)
words_indices = [np.flatnonzero(vocabulary == entry)[0] for entry in words]
words_embedding = tf.gather(vocabulary_embedding, words_indices)
# Sum of the two word vectors
word_embedding = tf.math.add(words_embedding[0], words_embedding[1])
euclidean_distance = tf.norm(word_embedding - vocabulary_embedding, axis=-1)
cosine_similarity =  tf.keras.losses.cosine_similarity(word_embedding, vocabulary_embedding, axis=-1)
# top_k returns the largest entries, so the distances are negated to get the nearest words;
# the printed values are therefore negative distances
top_k = tf.math.top_k(-euclidean_distance, k=5)
#top_k = tf.math.top_k(-cosine_similarity, k=5)
top_k_indices = top_k.indices.numpy()
top_k_values = top_k.values.numpy()
top_k_words = vocabulary[top_k_indices]
print(colored("Top words similar to: ", "blue"), words[1]," + ", words[0])
for neighbour, value in zip(top_k_words, top_k_values):
    print(colored("\t"+neighbour+"\t", "red"), value)
#END_SOLUTION
pass    

The embedding can be visualised using the portal [Embedding Projector](http://projector.tensorflow.org/?_gl=1*u2l7wh*_ga*MTg4NTM3NDUwOC4xNzA3OTg4NTU4*_ga_W0YLR4190T*MTcxNTI0MzQxOC44Ny4xLjE3MTUyNDQ5NzMuMC4wLjA.)
The `vectors.tsv` and `metadata.tsv` files obtained from the embedding layer need to be uploaded to the site.

In [None]:
def dump_embedding(model, vocabulary):
    """Write the weights of the model's 'embedding' layer to two TSV files.

    Two files are created in the current working directory:
    'vectors.tsv' with one tab-separated embedding vector per line and
    'metadata.tsv' with the corresponding word on each line.
    Index 0 is skipped in both files.

    Args:
        model: object whose get_layer('embedding').get_weights()[0] can be
            indexed by token index and yields the components of a vector.
        vocabulary: iterable of words, ordered by token index.
    """
    weights = model.get_layer('embedding').get_weights()[0]
    # Context managers close both files even when writing fails half-way
    with open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
         open('metadata.tsv', 'w', encoding='utf-8') as out_m:
        for index, word in enumerate(vocabulary):
            if index == 0:
                continue  # skip 0, it's padding.
            vec = weights[index]
            out_v.write('\t'.join([str(x) for x in vec]) + "\n")
            out_m.write(word + "\n")

# Write 'vectors.tsv' and 'metadata.tsv' for the current model into the working directory
dump_embedding(model, vocabulary)