# Create an LSH dataset for testing purposes

### Import libraries

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import sys
# Put the parent directory on the import path (the deeper_* modules are imported in the next cell).
sys.path.append('../')

In [2]:
import os
import pprint
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import SVG
from keras.callbacks import EarlyStopping
from keras.models import Model, load_model
from keras.utils.vis_utils import model_to_dot
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from deeper_model import build_model
from deeper_utils import preprocess_data

Using TensorFlow backend.


### Read the dataset

In [4]:
# Dataset identifier: passed to preprocess_data and used to build the saved-model and output-CSV paths.
DATASET_NAME = 'DI2KG'

In [5]:
# Cell for running on Kaggle: embeddings and datasets are looked up under ../input.
# In the recorded local run this call failed with a ValueError because the
# fastText binary could not be opened.
trainData, testData, valData, embeddingMatrix, wordsWithNoEmbeddings = preprocess_data(
    DATASET_NAME,
    baseDir='../input',
    embeddingDir='fasttextmodel',
    datasetDir='deeperdatasets/datasets',
    maxSequenceLength=100,
)




ValueError: ../input\fasttextmodel\crawl-300d-2M-subword.bin cannot be opened for loading!

In [6]:
# Cell for reading the data locally: GloVe text embeddings and datasets
# are looked up relative to the parent directory.
trainData, testData, valData, embeddingMatrix, wordsWithNoEmbeddings = preprocess_data(
    DATASET_NAME,
    baseDir='../',
    usePretrainedModel=False,
    embeddingDir='glove',
    embeddingFilename='glove.840B.300d.txt',
    datasetDir='datasets',
    maxSequenceLength=100,
)

In [7]:
# Each split is a (left-table data, right-table data, labels) triple.
(
    (leftTableTrainData, rightTableTrainData, trainLabels),
    (leftTableTestData, rightTableTestData, testLabels),
    (leftTableValData, rightTableValData, valLabels),
) = (trainData, testData, valData)

In [8]:
# Show the left-table shape of each split, in train / validation / test order.
for leftSplit in (leftTableTrainData, leftTableValData, leftTableTestData):
    print(leftSplit.shape)

(4195, 100)
(525, 100)
(524, 100)


In [9]:
# Stack the left-table splits row-wise in train, validation, test order (axis 0 is the default).
leftTableData = np.concatenate([leftTableTrainData, leftTableValData, leftTableTestData])

In [10]:
# Stack the right-table splits row-wise in train, validation, test order (axis 0 is the default).
rightTableData = np.concatenate([rightTableTrainData, rightTableValData, rightTableTestData])

In [21]:
# Stack the label arrays in the same train, validation, test order as the table data.
dataLabels = np.concatenate([trainLabels, valLabels, testLabels])

### Load the trained DeepER model

In [11]:
# Load the previously saved DeepER model for this dataset and print its layer summary.
# `load_model` is imported in the import cell at the top of the notebook.
modelPath = '../models/' + DATASET_NAME + '-model.h5'
model = load_model(modelPath)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     201600      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 150)          225600      embedding_1[0][0]                
          

### Plot DeepER architecture

In [12]:
# Draw the model graph as an inline SVG.
# In the recorded run this raised OSError because pydot could not call GraphViz.
SVG(
    model_to_dot(model, show_shapes=True).create(prog='dot', format='svg')
)

OSError: `pydot` failed to call GraphViz.Please install GraphViz (https://www.graphviz.org/) and ensure that its executables are in the $PATH.

### Testing LSH
Build a new NN that, given two inputs (a record from the left table and one from the right table), outputs their corresponding embeddings.

In [13]:
# Map each layer's position in the model to the layer object, so the index
# of the layer to tap can be read off the printed listing.
indexToLayerMap = dict(enumerate(model.layers))

pprint.pprint(indexToLayerMap)

{0: <keras.engine.input_layer.InputLayer object at 0x0000027CCDDB4EB8>,
 1: <keras.engine.input_layer.InputLayer object at 0x0000027CCDDB47F0>,
 2: <keras.layers.embeddings.Embedding object at 0x0000027CCDDB4CF8>,
 3: <keras.layers.wrappers.Bidirectional object at 0x0000027CCDDAE630>,
 4: <keras.layers.merge.Subtract object at 0x0000027CCDDB4860>,
 5: <keras.layers.core.Dense object at 0x0000027CCDDB4748>,
 6: <keras.layers.core.Dense object at 0x0000027CCDDB4358>,
 7: <keras.layers.core.Dense object at 0x0000027CCDDB9B70>}


In [14]:
# Index of the Bidirectional layer in the loaded model (see the layer listing printed above).
ENCODER_LAYER_INDEX = 3
encoderLayer = model.get_layer(index=ENCODER_LAYER_INDEX)

# Sub-model that reuses the trained model's inputs and exposes the encoder
# layer's two outputs (nodes 0 and 1) instead of the final prediction.
# Downstream cells treat output 0 as the left-table embedding and output 1
# as the right-table embedding.
# `Model` is imported in the import cell at the top of the notebook.
recordToEmbeddingNN = Model(
    inputs=model.input,
    outputs=[encoderLayer.get_output_at(0), encoderLayer.get_output_at(1)],
)

recordToEmbeddingNN.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     201600      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 150)          225600      embedding_1[0][0]                
          

In [None]:
# Draw the embedding sub-model graph as an inline SVG (uses pydot with the GraphViz `dot` program).
SVG(
    model_to_dot(recordToEmbeddingNN, show_shapes=True).create(prog='dot', format='svg')
)

In [22]:
# Run both tables through the embedding network; the result holds one array per model output.
predictedEmbeddings = recordToEmbeddingNN.predict(x=[leftTableData, rightTableData])

# Convert each output to nested Python lists: output 0 -> left table, output 1 -> right table.
leftTableEmbeddings, rightTableEmbeddings = (
    tableEmbeddings.tolist() for tableEmbeddings in predictedEmbeddings
)

# Reduce each label row to the index of its largest entry
# (presumably one-hot encoded labels; confirm against preprocess_data).
labels = dataLabels.argmax(axis=1)

In [24]:
# Sanity check: number of left-table embedding rows.
len(leftTableEmbeddings)

5244

In [34]:
# Read the raw train / validation / test CSVs of the selected dataset.
# The paths are derived from DATASET_NAME so this cell follows the dataset
# chosen at the top of the notebook instead of a hard-coded name.
datasetCsvPrefix = '../datasets/' + DATASET_NAME + '/' + DATASET_NAME + '_'
train_df = pd.read_csv(datasetCsvPrefix + 'train.csv')
valid_df = pd.read_csv(datasetCsvPrefix + 'valid.csv')
test_df = pd.read_csv(datasetCsvPrefix + 'test.csv')

In [53]:
# Stack the three splits (train, validation, test order) and renumber the
# rows from 0 so the index is continuous across splits.
camera_df = pd.concat(
    [train_df, valid_df, test_df],
    axis=0,
    ignore_index=True,
)

In [55]:
# Inspect the last 10 rows of the combined raw data.
camera_df.tail(10)

Unnamed: 0,attributi_x,attributi_y,label
5234,Sony Alpha NEX-F3 / 18-55mm Kit - Price compar...,Canon PowerShot SX20 IS - PowerShot and IXUS d...,0
5235,Pentax Q10 / 5-15mm & 15-45mm Kit - Price comp...,Canon IXUS 105 - PowerShot and IXUS digital co...,0
5236,Nikon D4s - Price comparison & reviews - Digit...,Canon PowerShot S120 - PowerShot and IXUS digi...,0
5237,Nikon D3200 / 18-200mm Kit - Price comparison ...,Nikon D3200 Twin Kit with Nikon 18-55mm VR II ...,1
5238,Canon EOS 5D Mark III Black SLR Digital Camera...,HASSELBLAD STELLAR ORANGE W/WENGE GRIP H-3012709,0
5239,Canon EOS 100D / 24-105mm Kit - Price comparis...,Canon PowerShot G3 - PowerShot and IXUS digita...,0
5240,Sony Alpha NEX-5T/B Black SLR Digital Camera (...,"SONY DSC-W800 SILVER 20.1MP 5X WIDE ANGLE 2.7""...",0
5241,Fujifilm X-Pro 1 / 60mm Kit - Price comparison...,Canon IXUS 125 HS - PowerShot and IXUS digital...,0
5242,Pentax K-S1 - Price comparison & reviews - Dig...,Canon Digital IXUS 50 - PowerShot and IXUS dig...,0
5243,Olympus PEN E-PM1 Purple Silver Camera Kit W/ ...,"SONY DSC-W800B BLACK 20.1MP 5X WIDE ANGLE 2.7""...",0


In [47]:
# Inspect the first rows of the combined raw data.
camera_df.head()

Unnamed: 0,attributi_x,attributi_y,label
0,Samsung Galaxy EK-GC110 16.3 Megapixel Compact...,"SAMSUNG WB350F BROWN 16MP 21X 3"" SMART CAMERA ...",0
1,Canon PowerShot SX510 HS Black Digital Camera ...,"NIKON COOLPIX L610 BLACK 16MP 14X 3""LCD 32408",0
2,Panasonic Lumix DMC-GH4 - Price comparison & r...,Canon Digital IXUS 800 IS - PowerShot and IXUS...,0
3,Nikon D600 / 24-85mm Kit - Price comparison & ...,Canon PowerShot G11 - PowerShot and IXUS digit...,0
4,Canon EOS Rebel T3i Black SLR Digital Camera K...,"FUJIFILM JX600 WHITE 14MP 5X 2.7""LCD CCD 60001...",0


In [25]:
# Sanity check: number of labels.
len(labels)

5244

In [38]:
# One row per record pair: left embedding, right embedding and the pair's label.
lshDfDict = dict(
    left_table=leftTableEmbeddings,
    right_table=rightTableEmbeddings,
    label=labels,
)
lshDf = pd.DataFrame(lshDfDict)
lshDf.tail(20)

Unnamed: 0,left_table,right_table,label
5224,"[-0.9988965392112732, -0.0, -0.094288572669029...","[0.039297040551900864, -0.0, -0.80969095230102...",0
5225,"[-0.9986437559127808, -0.0, -0.063912093639373...","[0.03933756798505783, -0.0, -0.809261441230773...",0
5226,"[-0.9986437559127808, -0.0, -0.063912093639373...","[0.03929021954536438, -0.0, -0.811318218708038...",0
5227,"[-0.0, -0.0002873741614166647, -0.874802410602...","[-0.0, -0.2524072229862213, 0.2572113573551178...",1
5228,"[-0.34073543548583984, -0.0, 0.449658751487731...","[-0.3613590896129608, 0.0, 0.22806447744369507...",1
5229,"[-1.0, 0.0, -0.0, -0.7615934014320374, 0.0, 0....","[-0.0, -0.10287119448184967, 0.024670321494340...",0
5230,"[-0.9999977350234985, 0.0, -0.0, -0.7615934610...","[-0.0, -0.0, -0.26511502265930176, 0.855300188...",0
5231,"[-0.9987872838973999, -0.0, -0.161479070782661...","[0.03912555053830147, -0.0, -0.809287488460540...",0
5232,"[-0.9999997615814209, 0.0, -0.0, -0.7615923881...","[-0.00032894365722313523, -0.00078655791003257...",0
5233,"[-1.0, 0.0, -0.0, -0.7615925073623657, 0.0, 0....","[0.0, -0.0, -0.418541818857193, 0.531643748283...",0


In [27]:
# Sanity check: (rows, columns) of the embeddings table.
lshDf.shape

(5244, 3)

In [56]:
# Place the raw text columns and the embeddings side by side, aligned on the row index.
# Both frames have a 'label' column, so the result contains two columns named 'label'.
lshTestDataframe = pd.concat([camera_df, lshDf], axis='columns')

In [58]:
# Selects both columns named 'label' (one from each source frame), which lets them be compared side by side.
lshTestDataframe['label']

Unnamed: 0,label,label.1
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,1,1
7,0,0
8,0,0
9,0,0


In [28]:
# Check the Python type of a single stored embedding.
type(lshDf['left_table'][0])

list

In [29]:
# Check the length of a single stored embedding.
len(lshDf['left_table'][0])

150

In [31]:
# Save the embeddings/label table for the LSH tests.
# Create the output directory first so the write does not fail when it is missing.
lshOutputDir = '../lsh-test-data/'
os.makedirs(lshOutputDir, exist_ok=True)
lshDf.to_csv(lshOutputDir + DATASET_NAME + '-embeddings.csv', encoding='utf-8', index=False)