# Create an LSH dataset for testing purposes

### Import libraries

In [None]:
%matplotlib inline

import sys
# Add the parent directory to the module search path; presumably this is what
# lets the project-local deeper_model / deeper_utils imports below resolve.
sys.path.append('../')

In [None]:
import time
import os
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from keras.callbacks import EarlyStopping
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import matplotlib.pyplot as plt
import seaborn as sns
from deeper_model import build_model
from deeper_utils import preprocess_data

### Read the dataset

In [None]:
# Load the 'Fodors_Zagats' data through the project helper and unpack its five
# return values: the three data splits, the embedding matrix, and the
# collection of words for which no embedding was found.
(
    trainData,
    testData,
    valData,
    embeddingMatrix,
    wordsWithNoEmbeddings,
) = preprocess_data('Fodors_Zagats', baseDir='..')

Print the words that have no embedding in GloVe

In [None]:
# List every word that has no embedding, one per line.
for word in wordsWithNoEmbeddings:
    print(word)

# Blank separator line, followed by the total count.
print()
missingCount = len(wordsWithNoEmbeddings)
print("There are {} words with no embeddings in GloVe".format(missingCount))

Print embedding matrix shape

In [None]:
# Display the embedding matrix's shape as the cell output.
embeddingMatrix.shape

In [None]:
# Each split is a 3-item sequence: left-table data, right-table data, labels.
# Unpack all three splits in a single nested assignment (train, test, val).
(
    (leftTableTrainData, rightTableTrainData, trainLabels),
    (leftTableTestData, rightTableTestData, testLabels),
    (leftTableValData, rightTableValData, valLabels),
) = (trainData, testData, valData)

Print training set size

In [None]:
# Left-table shape on the first line, right-table shape on the second.
print(leftTableTrainData.shape, rightTableTrainData.shape, sep="\n")

Print test set size

In [None]:
# Left-table shape on the first line, right-table shape on the second.
print(leftTableTestData.shape, rightTableTestData.shape, sep="\n")

Print validation set size

In [None]:
# Left-table shape on the first line, right-table shape on the second.
print(leftTableValData.shape, rightTableValData.shape, sep="\n")

### Build the DeepER model 

In [None]:
# Build the model from the embedding matrix; denseUnits=[32, 16] presumably
# sets the widths of its dense layers — TODO confirm against build_model.
model = build_model(embeddingMatrix, denseUnits=[32, 16])
# Print the model's summary.
model.summary()

### Plot DeepER architecture

In [None]:
# Convert the model to a dot graph (with shapes shown), render it to SVG with
# the 'dot' program, and display the result inline as the cell output.
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))

### Testing LSH
Build a new neural network that, given two inputs (a record from the left table and a record from the right table), outputs their corresponding embeddings.

In [None]:
from keras.models import Model

# Reuse the inputs of the full model, but expose the outputs of the two
# tuple-embedding layers (left first, then right) as this model's outputs.
recordEmbeddingNN = Model(
    inputs=model.input,
    outputs=[
        model.get_layer(layerName).output
        for layerName in ("left_tuple_embedding", "right_tuple_embedding")
    ],
)

In [None]:
# Print the summary of the embedding-extraction model.
recordEmbeddingNN.summary()

In [None]:
# Render the embedding-extraction model as an inline SVG graph (shapes shown).
SVG(model_to_dot(recordEmbeddingNN, show_shapes=True).create(prog='dot', format='svg'))

In [None]:
# Run the training pairs through the embedding model. The inputs are passed in
# [left, right] order; the result is indexed as [0] = left, [1] = right below.
predictedEmbeddings = recordEmbeddingNN.predict(x=[leftTableTrainData, rightTableTrainData])

In [None]:
# The model was built with two outputs, so the prediction holds exactly two
# entries: the left-table embeddings followed by the right-table embeddings.
leftTableEmbeddings, rightTableEmbeddings = predictedEmbeddings

In [None]:
# Replace the predicted left-table embeddings with the result of .tolist().
# NOTE(review): the name is rebound in place, so if .tolist() yields a plain
# list (as it does for NumPy arrays) this cell cannot be re-run on its own.
leftTableEmbeddings = leftTableEmbeddings.tolist()

In [None]:
# Replace the predicted right-table embeddings with the result of .tolist().
# NOTE(review): the name is rebound in place, so if .tolist() yields a plain
# list (as it does for NumPy arrays) this cell cannot be re-run on its own.
rightTableEmbeddings = rightTableEmbeddings.tolist()

In [None]:
# Display the raw training labels as the cell output.
trainLabels

In [None]:
import numpy as np

# Reduce each row of trainLabels to the index of its largest entry
# (presumably the labels are one-hot encoded — TODO confirm).
labels = np.argmax(trainLabels, axis=1)
# Display the resulting label indices as the cell output.
labels

In [None]:
import pandas as pd

# One row per training pair: the left and right record embeddings plus the
# pair's label index.
lshTestDfDict = {
    "left_table": leftTableEmbeddings,
    "right_table": rightTableEmbeddings,
    "label": labels,
}
lshTestDf = pd.DataFrame(data=lshTestDfDict)

# Preview the first rows as the cell output.
lshTestDf.head()

In [None]:
# Show the left-table embedding stored in the row labelled 0.
lshTestDf.at[0, 'left_table']

In [None]:
# Check which Python type the stored embedding has.
type(lshTestDf.at[0, 'left_table'])

In [None]:
# Check the length of the stored embedding.
len(lshTestDf.at[0, 'left_table'])