# Question auto-encoder evaluation

In this notebook we're going to evaluate the question auto-encoder results.

The first part until the model loading is the same as the `auto_encoder_training` notebook since we need to perform the same operations and pre-processing needed for predicting with the model.

### Imports

In [12]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from nltk.tokenize import word_tokenize
from collections import Counter
from keras.preprocessing.sequence import *
from keras.models import *
from keras.layers import *
from keras.utils import plot_model
from keras.callbacks import ModelCheckpoint
from sklearn.manifold import TSNE

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle

from pyspark.sql.window import Window

sns.set(style="ticks")

spark = SparkSession \
    .builder \
    .appName("QuestionRephrasing-AutoEncoder") \
    .config("spark.executor.memory", "5G")\
    .config("spark.driver.memory", "10G")\
    .config("spark.driver.maxResultSize", "5G")\
    .getOrCreate()

w = Window().orderBy(F.lit('A'))

spark.sparkContext.setCheckpointDir('data/checkpoints')
questions = spark.read.parquet("data/processed/union/*").withColumn("columnindex", F.row_number().over(w))
questions.printSchema()

root
 |-- question: string (nullable = true)
 |-- answer: string (nullable = true)
 |-- image_id: string (nullable = true)
 |-- tokenized_question: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- question_len: double (nullable = true)
 |-- question_word_len: double (nullable = true)
 |-- first_word: string (nullable = true)
 |-- columnindex: integer (nullable = true)



In [3]:
max_word_len = int(questions.agg({"question_word_len": "max"}).collect()[0]["max(question_word_len)"])

f"Maximum word length is {max_word_len}."

'Maximum word length is 28.'

## Vocabulary build

We need to extract a numerical representation for *tokens*.

In [4]:
# Tokens vocabulary and mappers
tokens = questions.select('tokenized_question')\
    .rdd\
    .flatMap(lambda x: x['tokenized_question'])\
    .collect()

word_mapping = {}
word_mapping_reversed = {}
word_counter = Counter(tokens)
for idx, value in enumerate(word_counter):
    word_mapping[value] = idx
    word_mapping_reversed[idx] = value
    
f"Word mapping example for 'is': {word_mapping['is']}."

"Word mapping example for 'is': 1."

### Input pre-processing

Now let's pre-process the input to have the corresponding **mappings** for *words*.

In [5]:
extract_word_embeddings = F.udf(lambda tokenized_question: [[word_mapping[word] + 1] for word in tokenized_question], ArrayType(ArrayType(IntegerType())))

questions = questions.withColumn('question_word_embeddings', extract_word_embeddings(F.col('tokenized_question')))
questions.head(1)

[Row(question='what is this photo taken looking through?', answer='net', image_id='458752', tokenized_question=['what', 'is', 'this', 'photo', 'taken', 'looking', 'through', '?'], question_len=41.0, question_word_len=8.0, first_word='what', question_word_embeddings=[[1], [2], [3], [4], [5], [6], [7], [8]])]

In [6]:
word_embeddings = questions.select('question_word_embeddings')\
    .rdd\
    .map(lambda x: x['question_word_embeddings'])\
    .collect()
word_embeddings = pad_sequences(word_embeddings, maxlen=max_word_len, dtype='int32', padding='post', truncating='pre', value=0.0)
word_embeddings[:1]

array([[[1],
        [2],
        [3],
        [4],
        [5],
        [6],
        [7],
        [8],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0]]], dtype=int32)

### Load the model

Load the model by dropping it's last 3 layers in order to obtain the array representation of the input computed by the neural network.

In [7]:
encoding_dim = 100

model = Sequential()
model.add(LSTM(encoding_dim, activation='relu', input_shape=(max_word_len, 1)))
model.add(RepeatVector(max_word_len))
model.add(LSTM(max_word_len, activation='relu', return_sequences=True))
model.add(TimeDistributed(Dense(1)))
model.compile(optimizer='adam', loss='mse', metrics=['mae', 'accuracy'])
model.summary()

model.load_weights("model-checkpoints/autoencoder-words/autoencoder-model-10-0.01.hdf5")

model.pop()
model.pop()
model.pop()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100)               40800     
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 28, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 28, 28)            14448     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 28, 1)             29        
Total params: 55,277
Trainable params: 55,277
Non-trainable params: 0
_________________________________________________________________


### Extract the question embeddings.

Compute the embeddings and dump them as pickle (so we can play with them without running the above cells). 

In [8]:
question_embeddings = model.predict(word_embeddings, verbose=1)
pickle.dump(question_embeddings, open("model-checkpoints/autoencoder-words/question_embeddings.pickle", "wb" ) )



### Original dataframe mapping

Let's map now the question embeddings to the original dataframe so we can create some nice plots below.

In [9]:
question_embeddings = pickle.load(open("model-checkpoints/autoencoder-words/question_embeddings.pickle", "rb" ) )
question_embeddings.shape, questions.count()

((1895874, 100), 1895874)

In [10]:
question_embeddings_arrays = [(embedding.tolist(), ) for embedding in question_embeddings]
question_embeddings_df = spark.createDataFrame(question_embeddings_arrays, ["question_embeddings"])\
    .withColumn("columnindex", F.row_number().over(w))
question_embeddings_df.head(10)

[Row(question_embeddings=[0.0004767561622429639, 0.02725340984761715, 0.015538804233074188, 0.0, 1.1279046248091618e-06, 0.1348886489868164, 0.45405861735343933, 0.4031505584716797, 0.0, 0.3173833191394806, 0.8812439441680908, 0.26736029982566833, 0.00025665387511253357, 0.14521507918834686, 0.05515861511230469, 0.00047478172928094864, 0.0011164809111505747, 0.5989505648612976, 0.6091572642326355, 0.002047825139015913, 0.28944143652915955, 0.08717868477106094, 0.006879337597638369, 0.007671233732253313, 0.0, 0.0, 0.40932971239089966, 2.3797378540039062, 0.00675981817767024, 0.0, 0.0, 0.009526976384222507, 0.032301124185323715, 0.09007634967565536, 0.0021635936573147774, 0.0, 0.0, 0.0017508039018139243, 0.08948977291584015, 0.44869595766067505, 0.14767955243587494, 0.006362385582178831, 0.00017617909179534763, 0.49680453538894653, 0.007712991908192635, 4.136678218841553, 0.17391712963581085, 0.5104081630706787, 2.8478558306233026e-05, 0.040004707872867584, 0.006689118687063456, 0.935730

In [13]:
questions_with_embeddings = questions.join(question_embeddings_df, "columnindex")
questions_with_embeddings.write.mode('overwrite').parquet("data/processed/auto-encoder-questions-with-embeddings")

In [14]:
questions_with_embeddings = spark.read.parquet("data/processed/auto-encoder-questions-with-embeddings/*")
questions_with_embeddings.head(5)

[Row(columnindex=1, question='what is this photo taken looking through?', answer='net', image_id='458752', tokenized_question=['what', 'is', 'this', 'photo', 'taken', 'looking', 'through', '?'], question_len=41.0, question_word_len=8.0, first_word='what', question_embeddings=[0.0004767561622429639, 0.02725340984761715, 0.015538804233074188, 0.0, 1.1279046248091618e-06, 0.1348886489868164, 0.45405861735343933, 0.4031505584716797, 0.0, 0.3173833191394806, 0.8812439441680908, 0.26736029982566833, 0.00025665387511253357, 0.14521507918834686, 0.05515861511230469, 0.00047478172928094864, 0.0011164809111505747, 0.5989505648612976, 0.6091572642326355, 0.002047825139015913, 0.28944143652915955, 0.08717868477106094, 0.006879337597638369, 0.007671233732253313, 0.0, 0.0, 0.40932971239089966, 2.3797378540039062, 0.00675981817767024, 0.0, 0.0, 0.009526976384222507, 0.032301124185323715, 0.09007634967565536, 0.0021635936573147774, 0.0, 0.0, 0.0017508039018139243, 0.08948977291584015, 0.44869595766067

## Apply dimensionality reduction through TSNE and plot the question embeddings. 

Extract a sample of data from the initial dataset.

In [20]:
limit = 5000
embeddings = questions_with_embeddings.select("question_embeddings")\
    .orderBy(F.rand())\
    .limit(limit)\
    .rdd\
    .map(lambda x: x["question_embeddings"])\
    .collect()

embeddings[:10]

[[0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  48.3398323059082,
  56948.83984375,
  89681.25,
  0.0,
  39265.5625,
  143636.59375,
  0.0,
  0.0,
  0.0,
  253999.75,
  0.0,
  3.065093480674602e-13,
  79896.890625,
  403532.5625,
  3.5779035091400146,
  1.2449005317126636e-25,
  46541.6796875,
  0.0,
  0.0,
  5.087332920492505e-35,
  0.0,
  98116.015625,
  1493.719482421875,
  0.0,
  0.0,
  122.41236877441406,
  0.0,
  0.0,
  0.0,
  0.0,
  66.15975952148438,
  0.0,
  0.0,
  0.0,
  212684.53125,
  0.0,
  149.26373291015625,
  0.0,
  0.0,
  0.0,
  1148460.625,
  0.0,
  287067.15625,
  0.0,
  0.0,
  0.0,
  294569.78125,
  211.09207153320312,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  296410.28125,
  0.0,
  528259.6875,
  0.0,
  77902.734375,
  0.0,
  0.0,
  0.0,
  114.57537841796875,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.6075478196144104,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  32032.451171875,
  0.0,
  968585.75,
  0.0,
  0.0,
  0.0,
  0.0,
  0

### 2D TSNE representation

In [51]:
tsne = TSNE(n_components=2, perplexity=10)
X_embedded = tsne.fit_transform(embeddings)
pickle.dump(X_embedded, open("model-checkpoints/autoencoder-words/question_embeddings_tsne.pickle", "wb" ) )
tsne.kl_divergence_

0.5653679966926575

In [52]:
question_embeddings_tsne_df = spark.createDataFrame([x.tolist() for x in X_embedded], ["X_tsne", "Y_tsne"])
question_embeddings_tsne_df = question_embeddings_tsne_df.withColumn("columnindex", F.row_number().over(w))

questions_with_tsne_embeddings = questions_with_embeddings.limit(limit)\
    .join(question_embeddings_tsne_df, questions_with_embeddings.columnindex == question_embeddings_tsne_df.columnindex)\
    .select("question", "X_tsne", "Y_tsne")\
    .toPandas()
questions_with_tsne_embeddings[:10]

Unnamed: 0,question,X_tsne,Y_tsne
0,what is this photo taken looking through?,-68.651619,-66.453575
1,what position is this man playing?,-29.373774,47.110592
2,what color is the players shirt?,-73.331406,51.280445
3,is this man a professional baseball player?,110.457047,8.151302
4,what color is the snow?,46.114807,-13.601019
5,what is the person doing?,15.066262,16.832659
6,what color is the persons headwear?,97.528236,-14.53904
7,what is in the person's hand?,58.032219,16.795269
8,is the dog waiting?,-3.288669,20.97929
9,is the dog looking at a tennis ball or frisbee?,14.343685,12.81972


In [58]:
# Use column names of df for the different parameters x, y, color, ...
fig = px.scatter(questions_with_tsne_embeddings, x="X_tsne", y="Y_tsne",
                 hover_name="question",
                 title="Question embeddings", 
                 range_color=[0, 1],
                 opacity=0.3
                )

fig.show()

### 3D TSNE representation

In [59]:
tsne = TSNE(n_components=3, perplexity=10)
X_embedded = tsne.fit_transform(embeddings)
pickle.dump(X_embedded, open("model-checkpoints/autoencoder-words/question_embeddings_tsne_3d.pickle", "wb" ) )
tsne.kl_divergence_

0.4420629143714905

In [60]:
question_embeddings_tsne_df = spark.createDataFrame([x.tolist() for x in X_embedded], ["X_tsne", "Y_tsne", "Z_tsne"])
question_embeddings_tsne_df = question_embeddings_tsne_df.withColumn("columnindex", F.row_number().over(w))

questions_with_tsne_embeddings = questions_with_embeddings.limit(limit)\
    .join(question_embeddings_tsne_df, questions_with_embeddings.columnindex == question_embeddings_tsne_df.columnindex)\
    .select("question", "X_tsne", "Y_tsne", "Z_tsne")\
    .toPandas()
questions_with_tsne_embeddings[:10]

Unnamed: 0,question,X_tsne,Y_tsne,Z_tsne
0,what is this photo taken looking through?,3.229079,-18.718359,11.808313
1,what position is this man playing?,-16.531904,11.154267,-12.307136
2,what color is the players shirt?,14.653105,3.713481,25.622866
3,is this man a professional baseball player?,14.690291,-20.245443,2.061845
4,what color is the snow?,8.08521,9.521637,-14.623901
5,what is the person doing?,17.471678,-5.899321,2.547429
6,what color is the persons headwear?,5.435132,-5.909604,1.619166
7,what is in the person's hand?,20.827772,0.961384,-22.968546
8,is the dog waiting?,-3.14932,-13.663744,-1.032894
9,is the dog looking at a tennis ball or frisbee?,17.124624,-4.374685,-0.198798


In [61]:
# Use column names of df for the different parameters x, y, color, ...
fig = px.scatter_3d(questions_with_tsne_embeddings, x="X_tsne", y="Y_tsne", z="Z_tsne",
                 hover_name="question",
                 title="3D Question embeddings", opacity=0.3
                )

fig.show()