# Create LSH dataset for testing purpose

### Import libraries

In [1]:
%matplotlib inline

import sys
sys.path.append('../')

In [2]:
import time
import pprint
import os
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from keras.callbacks import EarlyStopping
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import matplotlib.pyplot as plt
import seaborn as sns
from deeper_model import build_model
from deeper_utils import preprocess_data

Using TensorFlow backend.


### Read the dataset

In [6]:
DATASET_NAME = 'DI2KG'

In [None]:
#cella per eseguire su Kaggle
trainData, testData, valData, embeddingMatrix, wordsWithNoEmbeddings = preprocess_data(
    DATASET_NAME, baseDir = '../input', embeddingDir='fasttextmodel', datasetDir='deeperdatasets/datasets', maxSequenceLength=100)

In [7]:
#cella per leggere in locale
trainData, testData, valData, embeddingMatrix, wordsWithNoEmbeddings = preprocess_data(
    DATASET_NAME, baseDir = '..',usePretrainedModel=False,embeddingDir='glove',embeddingFilename='glove.840B.300d.txt' ,
    datasetDir='datasets', maxSequenceLength=100)

In [13]:
leftTableTrainData, rightTableTrainData, trainLabels = trainData
leftTableTestData, rightTableTestData, testLabels = testData
leftTableValData, rightTableValData, valLabels = valData

In [26]:
print(leftTableTrainData.shape)
print(leftTableValData.shape)
print(leftTableTestData.shape)

(4195, 100)
(525, 100)
(524, 100)


In [24]:
leftTableData = np.concatenate((leftTableTrainData,leftTableValData,leftTableTestData),axis=0)

In [34]:
rightTableData = np.concatenate((rightTableTrainData,rightTableValData,rightTableTestData),axis=0)

### Build the DeepER model 

In [27]:
from keras.models import load_model

model = load_model('../models/' + DATASET_NAME + '-model.h5')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     201600      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 150)          225600      embedding_1[0][0]                
          

### Plot DeepER architecture

In [28]:
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))

ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

### Testing LSH
Build a new NN that given two inputs (a record from the left table and another one from the right table) outputs their corresponding embeddings.

In [29]:
import pprint
from keras.models import load_model
indexToLayerMap = {i: v for i, v in enumerate(model.layers)}

pprint.pprint(indexToLayerMap)

{0: <keras.engine.input_layer.InputLayer object at 0x7f2410ee7748>,
 1: <keras.engine.input_layer.InputLayer object at 0x7f2410ee75f8>,
 2: <keras.layers.embeddings.Embedding object at 0x7f2410ee7b70>,
 3: <keras.layers.wrappers.Bidirectional object at 0x7f24112dddd8>,
 4: <keras.layers.merge.Subtract object at 0x7f2410ef28d0>,
 5: <keras.layers.core.Dense object at 0x7f2410ef2ac8>,
 6: <keras.layers.core.Dense object at 0x7f2410ef2f28>,
 7: <keras.layers.core.Dense object at 0x7f2410f06c18>}


In [30]:
from keras.models import Model

recordToEmbeddingNN = Model(inputs=model.input, 
                          outputs= [model.get_layer(index=3).get_output_at(0), model.get_layer(index=3).get_output_at(1)])

recordToEmbeddingNN.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     201600      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 150)          225600      embedding_1[0][0]                
          

In [31]:
SVG(model_to_dot(recordToEmbeddingNN, show_shapes=True).create(prog='dot', format='svg'))

ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

In [35]:
predictedEmbeddings = recordToEmbeddingNN.predict(x=[leftTableData, rightTableData])

leftTableEmbeddings = predictedEmbeddings[0]
rightTableEmbeddings = predictedEmbeddings[1]

leftTableEmbeddings = leftTableEmbeddings.tolist()
rightTableEmbeddings = rightTableEmbeddings.tolist()

labels = np.argmax(testLabels, axis=1)

In [36]:
len(labels)

524

In [13]:
lshDfDict = {"left_table": leftTableEmbeddings, "right_table": rightTableEmbeddings, "label": labels}
lshDf = pd.DataFrame(lshDfDict)
lshDf.head()

Unnamed: 0,left_table,right_table,label
0,"[0.0, 0.0, 0.9993600249290466, -0.918881118297...","[-0.0, -0.0, 0.21460436284542084, -0.984001100...",0
1,"[0.0, 0.0, 0.9993600249290466, -0.918881118297...","[-0.0, 0.0, 0.7097654938697815, 0.450796663761...",0
2,"[0.0, 0.0, 0.9993600249290466, -0.918881118297...","[-0.0, 0.0, 0.47325998544692993, -0.9972549080...",0
3,"[0.0, 0.0, 0.9993600249290466, -0.918881118297...","[-0.0, 0.0, 0.03747762367129326, -0.5199741125...",0
4,"[0.0, -0.0, -0.74958735704422, -0.999116063117...","[-0.0, 0.0, 0.03747762367129326, -0.5199741125...",0


In [14]:
lshDf.shape

(5742, 3)

In [15]:
type(lshDf['left_table'][0])

list

In [16]:
len(lshDf['left_table'][0])

150

In [17]:
lshDf.to_csv('../lsh-testing-data/' + DATASET_NAME + '-embeddings.csv', encoding='utf-8', index=False)