# Importing Libraries

In [1]:
from google.colab import drive

# Attach Google Drive to the Colab runtime; the sys.path entries added in a
# later cell point at folders underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
import io
from google.colab import files
from google.colab import drive
import sys

In [4]:
# Make the project's Drive folders importable. Each entry is pushed to the
# front of sys.path, so the last folder listed ends up being searched first.
for project_dir in ('/content/drive/My Drive/CERTA/models',
                    '/content/drive/My Drive/CERTA/certa'):
    sys.path.insert(0, project_dir)

In [19]:
import DeepER as dp
import eval
import os
from local_explain import find_thresholds
from local_explain import dataset_local
from triangles_method import explainSamples
from eval import expl_eval
import numpy as np
import pandas as pd
import gensim.downloader as api

# Preprocessing

In [24]:
def to_deeper_data(df: pd.DataFrame):
    """Convert a paired-record DataFrame into a list of per-row tuples.

    For every row, the values of the columns whose names start with
    ``ltable_`` and of those starting with ``rtable_`` are each collected
    into a NumPy array of strings.

    Args:
        df: Frame with ``ltable_*`` / ``rtable_*`` columns and, optionally,
            a ``label`` column.

    Returns:
        A list with one tuple per row: ``(left_values, right_values, label)``
        when a ``label`` column is present, otherwise
        ``(left_values, right_values)``.
    """
    def _side_as_str(record, pattern):
        # Select the entries whose index label matches `pattern`, stringified.
        return record.filter(regex=pattern).values.astype('str')

    pairs = []
    for position in range(len(df)):
        record = df.iloc[position]
        entry = (_side_as_str(record, '^ltable_'), _side_as_str(record, '^rtable_'))
        if 'label' in record:
            entry = entry + (record['label'],)
        pairs.append(entry)
    return pairs

In [15]:
def change_prefix_for_deeper(df: pd.DataFrame):
  """Rename ``left_*`` / ``right_*`` columns to ``ltable_*`` / ``rtable_*``.

  Only columns that actually start with ``left_`` or ``right_`` are renamed.
  ``id``, ``label`` and any other column (for example a stray index column)
  keep their names. Previously every column not starting with ``left_`` was
  assumed to start with ``right_`` and had its first six characters cut off,
  and the ``id``/``label`` guard used ``or`` and was therefore always true.

  Args:
      df: Frame whose columns use the ``left_`` / ``right_`` prefixes.

  Returns:
      A renamed copy of ``df``; the input frame is not modified.
  """
  prefix_map = {"left_": "ltable_", "right_": "rtable_"}
  new_names = {}
  for col_name in df.columns:
    # Non-string labels cannot carry a prefix; id/label are never renamed.
    if not isinstance(col_name, str) or col_name in ("id", "label"):
      continue
    for old_prefix, new_prefix in prefix_map.items():
      if col_name.startswith(old_prefix):
        new_names[col_name] = new_prefix + col_name[len(old_prefix):]
        break

  return df.rename(columns=new_names)

In [16]:
# Read the three "RM" splits (train / validation / test) from the working directory.
train, valid, test = (
    pd.read_csv(csv_name)
    for csv_name in ('trainRM.csv', 'validationRM.csv', 'testRM.csv')
)

In [17]:
# Switch every split to the ltable_/rtable_ column prefixes that
# to_deeper_data selects on, then show the resulting test columns.
train, valid, test = map(change_prefix_for_deeper, (train, valid, test))
test.columns

Index(['id', 'label', 'ltable_id', 'ltable_name', 'ltable_host_id',
       'ltable_host_name', 'ltable_neighbourhood_group',
       'ltable_neighbourhood', 'ltable_latitude', 'ltable_longitude',
       'ltable_room_type', 'ltable_price', 'ltable_minimum_nights',
       'ltable_number_of_reviews', 'ltable_last_review',
       'ltable_reviews_per_month', 'ltable_calculated_host_listings_count',
       'ltable_availability_365', 'rtable_id', 'rtable_name', 'rtable_host_id',
       'rtable_host_name', 'rtable_neighbourhood_group',
       'rtable_neighbourhood', 'rtable_latitude', 'rtable_longitude',
       'rtable_room_type', 'rtable_price', 'rtable_minimum_nights',
       'rtable_number_of_reviews', 'rtable_last_review',
       'rtable_reviews_per_month', 'rtable_calculated_host_listings_count',
       'rtable_availability_365'],
      dtype='object')

# Load embeddings

In [21]:
# Fetch the 50-dimensional GloVe vectors through gensim only when no local
# copy exists, and store them as text so later cells can read the file.
# NOTE(review): save_word2vec_format writes a "<count> <dim>" header line by
# default; the log of the next cell reports 400001 embeddings, which may mean
# that header is being read as a word - confirm how dp.init_embeddings_index
# parses the file.
glove_path = 'glove.6B.50d.txt'
glove_is_cached = os.path.exists(glove_path)
if not glove_is_cached:
    word_vectors = api.load("glove-wiki-gigaword-50")
    word_vectors.save_word2vec_format(glove_path, binary=False)

In [22]:
# Build the embeddings index from the GloVe text file
# (presumably a word -> vector mapping; verify in DeepER.init_embeddings_index).
embeddings_index = dp.init_embeddings_index('glove.6B.50d.txt')
# Derive the embedding size from the entry stored under 'cat', used here as a probe key.
emb_dim = len(embeddings_index['cat'])
# Create the embeddings model and its tokenizer from the index.
embeddings_model, tokenizer = dp.init_embeddings_model(embeddings_index)
# Create the (still untrained) DeepER model for that embedding size.
model = dp.init_DeepER_model(emb_dim)

* Costruzione indice degli embeddings.....Fatto. 400001 embeddings totali.
* Creazione del modello per il calcolo degli embeddings....
* Inizializzo il tokenizzatore.....Fatto: 400001 parole totali.
* Preparazione della matrice di embedding.....Fatto. Dimensioni matrice embeddings: (400002, 50)

°°° EMBEDDING MODEL °°°
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Tupla_A (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
Tupla_B (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding_lookup (Embedding)    (None, None, 50)     20000100    Tupla

# Training

In [None]:
# Train on the RM training split; `model` is rebound to the value returned by dp.train_model_ER.
# NOTE(review): this cell shows no execution count although the evaluation
# cells below have outputs - confirm the notebook reproduces with
# Restart & Run All. The `valid` split loaded earlier is not passed here.
train_pairs = to_deeper_data(train)
model = dp.train_model_ER(train_pairs, model, embeddings_model, tokenizer)

# Testing

Testing on Rome's test set

In [26]:
# Evaluate on the currently loaded test split; the call stays the last
# expression so its return value is displayed as the cell output.
test_pairs = to_deeper_data(test)
dp.model_statistics(test_pairs, model, embeddings_model, tokenizer)

* Avvio test metriche....
-- Corpus size: 3997
-- Non Match: 3189
-- Match: 808
* Preparazione input......Fatto. 3997 tuple totali, esempio label: 0 -> [1. 0.], Table1 shape: (3997, 28), Table2 shape: (3997, 28)
Precision: 0.9744245524296675, Recall: 0.943069306930693, f1-score: 0.9584905660377359
Total retrieved: 782, retrieved/total matches: 762/808


(0.9744245524296675, 0.943069306930693, 0.9584905660377359)

Testing on Amsterdam's test set (cross-city evaluation: the model was trained only on the RM training split, i.e. Rome)

In [27]:
# Replace `test` with the AMS test split, renamed to the ltable_/rtable_ prefixes.
# Note: this overwrites the test frame loaded earlier.
test = change_prefix_for_deeper(pd.read_csv('testAMS.csv'))

In [28]:
# Score the model on whatever `test` currently holds (the AMS split when the
# cells are run in order); the call's return value is the cell output.
ams_pairs = to_deeper_data(test)
dp.model_statistics(ams_pairs, model, embeddings_model, tokenizer)

* Avvio test metriche....
-- Corpus size: 1057
-- Non Match: 890
-- Match: 167
* Preparazione input......Fatto. 1057 tuple totali, esempio label: 0 -> [1. 0.], Table1 shape: (1057, 24), Table2 shape: (1057, 22)
Precision: 0.5035971223021583, Recall: 0.8383233532934131, f1-score: 0.6292134831460674
Total retrieved: 278, retrieved/total matches: 140/167


(0.5035971223021583, 0.8383233532934131, 0.6292134831460674)

Testing on Bergamo's test set (cross-city evaluation: the model was trained only on the RM training split, i.e. Rome)

In [29]:
# Replace `test` with the BER test split, renamed to the ltable_/rtable_ prefixes.
# Note: this overwrites the test frame loaded earlier.
test = change_prefix_for_deeper(pd.read_csv('testBER.csv'))

In [30]:
# Score the model on whatever `test` currently holds (the BER split when the
# cells are run in order); the call's return value is the cell output.
ber_pairs = to_deeper_data(test)
dp.model_statistics(ber_pairs, model, embeddings_model, tokenizer)

* Avvio test metriche....
-- Corpus size: 49
-- Non Match: 8
-- Match: 41
* Preparazione input......Fatto. 49 tuple totali, esempio label: 1 -> [0. 1.], Table1 shape: (49, 20), Table2 shape: (49, 20)
Precision: 0.8, Recall: 0.5853658536585366, f1-score: 0.676056338028169
Total retrieved: 30, retrieved/total matches: 24/41


(0.8, 0.5853658536585366, 0.676056338028169)