In [79]:
# import modules
import pandas as pd
from constants import *
from helpers import normalise_word_data, cross_join_dataframes, get_jaccard_sim
import numpy as np
from IPython.display import display
from sklearn import preprocessing
import matplotlib.pyplot as plt

**<span style="color:crimson">1. Naive data linkage without blocking</span>**

In [80]:
# read csv files
google_products = pd.read_csv(GOOGLE_SMALL_PATH)
amazon_products = pd.read_csv(AMAZON_SMALL_PATH)

In [81]:
# normalise the text in the name/title columns of the dataframes
google_products[google_word_columns] = normalise_word_data(google_products[['name']].astype(str))
amazon_products[amazon_word_columns] = normalise_word_data(amazon_products[['title']].astype(str))

In [114]:
# join the 2 dataframes
joined = cross_join_dataframes(google_products, amazon_products)
# calculate the scores for the names/titles using the jaccard index
joined['name_score'] =joined.apply(lambda row: get_jaccard_sim(row['name'], row['title']), axis=1)
# calculate the scores for the price similarities
joined['price_score'] = joined.apply(lambda row: (min(row['price_x'], row['price_y'])/max(row['price_x'], row['price_y'])), axis=1)
# calculate the final scores with the price weighted less as there is more chance of duplicates
joined['final_score'] = joined['name_score'] + joined['price_score']/2

In [115]:
# threshold determined through trial and error, only concerned with values above this threshold
THRESHOLD = 0.55
joined = joined[joined['final_score'] > THRESHOLD]
# take only the largest score for each amazonID for comparison
joined = joined.sort_values(by='final_score', ascending=False).drop_duplicates(['idAmazon'])
# create new dataframes for faster calculations
predicted = joined.loc[:, ['idAmazon', 'idGoogleBase']].sort_values(by='idAmazon')
true = pd.read_csv(GOOGLE_AMAZON_SMALL_TRUTH_PATH).sort_values(by='idAmazon')

In [123]:
# create a dataframe of tp values
tp_values = []
for index, row in predicted.iterrows():
    for index2, row2 in true.iterrows():
        # check matches
        if row['idAmazon'] == row2['idAmazon'] and row['idGoogleBase'] == row2['idGoogleBase']:
            tp_values.append((row['idAmazon'], row['idGoogleBase']))

tp_df = pd.DataFrame(tp_values)
tp_df.columns = ['idAmazon', 'idGoogleBase']
display(tp_df.head())

Unnamed: 0,idAmazon,idGoogleBase
0,1931102953,http://www.google.com/base/feeds/snippets/1272...
1,b00002s6sc,http://www.google.com/base/feeds/snippets/1049...
2,b00004nhn7,http://www.google.com/base/feeds/snippets/1843...
3,b000051sgq,http://www.google.com/base/feeds/snippets/1758...
4,b00006gxbz,http://www.google.com/base/feeds/snippets/9070...


In [129]:
# calculate precision and recall using tp, fp, fn
tp = len(tp_df)
fp = len(predicted) - tp
fn = len(true) - tp - fp
precision = tp/(tp+fp)
recall = tp/(tp+fn)
print(f'precision = {precision}')
print(f'recall = {recall}')

precision = 0.926829268292683
recall = 0.9421487603305785


**<span style="color:green">DISCUSSION**</span><br>
After testing, it was decided that manufacturer and description should not be used in linkage as this lead to too much variance in results. 

A jaccard index, which is used to measure the overlap of two strings, was used to compare the normalized strings in order to get a score based on the similarity of the titles, this was used as titles/names will usually be quite similar across platforms and the jaccard index will usually lead to accurate results while measuring short titles such as the ones in these datasets.

to calculate the similarity of the prices, i took the minimum value of the two and divided that by the maximum value of the two, this leads to creating a (smaller than one) score based on how much smaller the first number is from the second.

The final score was decided by summing the name score and half the price score. The reason half the price score was used was due to it being a less accurate representation of similarity (multiple items can have the same/similar price) in comparison to name which will very rarely have the same/similar values.

The threshold for determining the scores was done through trial and error in order to get the best balance between precision and recall. We also only accounted for the idAmazon's with the highest final scores when comparing with our truth dataset as there can only be one true match for each id, this sufficienty improved the performance of our linkage.

The performance shows us precision ~ 0.927 and recall ~ 0.942. These values both appear to be very good as we have a very high rate of correct linkage between our datasets while still covering a large amount of the true values. 

**<span style="color:crimson">1. Blocking for efficient data linkage</span>**

In [130]:
google_products = pd.read_csv(GOOGLE_PATH)
amazon_products = pd.read_csv(AMAZON_PATH)

PART 2

In [85]:
yeast_data = pd.read_csv(YEAST_PATH)

In [86]:
yeast_data['Class'] = yeast_data.Class.replace(to_replace=yeast_data.Class.unique(), value=[0, 1])


In [87]:
from sklearn.impute import SimpleImputer    

In [88]:
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')

In [89]:
X_mean = pd.DataFrame(mean_imputer.fit_transform(yeast_data.drop('Class', axis=1)))
X_median = pd.DataFrame(median_imputer.fit_transform(yeast_data.drop('Class', axis=1)))
Y = yeast_data['Class']

In [90]:
X_mean.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0
mean,742.5,0.499349,0.499876,0.505848,0.264379,0.506921,0.0075,0.503816,0.279816
std,428.538213,0.131357,0.121945,0.199837,0.150286,0.091573,0.075683,0.153481,0.151736
min,1.0,0.11,0.13,0.21,0.0,0.5,0.0,0.0,0.0
25%,371.75,0.41,0.42,0.46,0.17,0.5,0.0,0.48,0.22
50%,742.5,0.499349,0.49,0.51,0.23,0.5,0.0,0.51,0.22
75%,1113.25,0.5625,0.57,0.55,0.3,0.5,0.0,0.53,0.29
max,1484.0,1.0,1.0,7.501819,3.000728,3.500849,0.83,6.001456,4.501092


In [91]:
X_median.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0
mean,742.5,0.497628,0.499643,0.506167,0.260432,0.506739,0.0075,0.504199,0.276712
std,428.538213,0.131472,0.121954,0.19984,0.150817,0.09158,0.075683,0.153488,0.152315
min,1.0,0.11,0.13,0.21,0.0,0.5,0.0,0.0,0.0
25%,371.75,0.41,0.42,0.46,0.17,0.5,0.0,0.48,0.22
50%,742.5,0.48,0.49,0.51,0.22,0.5,0.0,0.51,0.22
75%,1113.25,0.5625,0.57,0.55,0.3,0.5,0.0,0.53,0.29
max,1484.0,1.0,1.0,7.501819,3.000728,3.500849,0.83,6.001456,4.501092


In [92]:
from sklearn.preprocessing import StandardScaler, normalize
stdscl = StandardScaler(with_mean=True, with_std=True)

In [93]:
X_std = pd.DataFrame(stdscl.fit_transform(X_median))
X_norm = pd.DataFrame(normalize(X_median))

In [94]:
X_std.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0
mean,-4.174558e-17,5.850367e-16,-1.911469e-16,2.817827e-16,1.8029900000000003e-17,3.727738e-16,4.263399e-16,-1.979175e-16,-4.642139e-16
std,1.000337,1.000337,1.000337,1.000337,1.000337,1.000337,1.000337,1.000337,1.000337
min,-1.730884,-2.949354,-3.032012,-1.482523,-1.72739,-0.07361222,-0.0991314,-3.286049,-1.817324
25%,-0.865442,-0.6667373,-0.653274,-0.231098,-0.5998152,-0.07361222,-0.0991314,-0.1577143,-0.3724614
50%,0.0,-0.1341268,-0.07909596,0.01918687,-0.2681755,-0.07361222,-0.0991314,0.03780662,-0.3724614
75%,0.865442,0.4935928,0.5771075,0.2194148,0.262448,-0.07361222,-0.0991314,0.1681539,0.08726755
max,1.730884,3.822409,4.104201,35.01812,18.17582,32.70504,10.87141,35.82762,27.74385


In [95]:
X_norm.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0
mean,0.999462,0.002552,0.002574,0.002529,0.00128,0.0025,7.9e-05,0.002537,0.001256
std,0.010533,0.013511,0.014612,0.01229,0.005868,0.012421,0.002144,0.012575,0.005638
min,0.635629,0.00014,0.000112,0.000177,0.0,0.000337,0.0,0.0,0.0
25%,0.999995,0.000435,0.000441,0.000454,0.000193,0.000449,0.0,0.000442,0.000232
50%,0.999999,0.000669,0.000666,0.000681,0.000342,0.000679,0.0,0.000682,0.000378
75%,0.999999,0.001314,0.001318,0.001354,0.000699,0.001371,0.0,0.001378,0.000739
max,1.0,0.368665,0.387734,0.298746,0.114957,0.317814,0.081594,0.305102,0.139838


In [96]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [97]:
knn = KNeighborsClassifier(n_neighbors=5)
dtc = DecisionTreeClassifier()

In [98]:
X_norm['label'] = Y
X_norm = X_norm.sample(frac=1).reset_index(drop=True)

In [99]:
X_train_with_labels = X_norm[0:1000]
X_test_with_labels = X_norm[1000:]

In [100]:
Y_train = X_train_with_labels['label']
X_train = X_train_with_labels.drop('label', axis=1)
Y_test = X_test_with_labels['label']
X_test = X_test_with_labels.drop('label', axis=1)

In [101]:
knn.fit(X_train, Y_train)
y_pred = knn.predict(X_train)

In [102]:
from sklearn.metrics import precision_score
score = precision_score(Y_train, y_pred)

In [103]:
score

0.7195945945945946