In [1]:
import pandas as pd
import os
import py_entitymatching as em
import numpy as np

# --- Input locations ---------------------------------------------------------
DATA_DIR = "../data"
datasets_dir = DATA_DIR + os.sep

pathA = os.path.join(datasets_dir, "imdb_clean.csv")
pathB = os.path.join(datasets_dir, "tomato_clean.csv")
pathS = os.path.join(datasets_dir, "labeled_data.csv")

# --- Source tables -----------------------------------------------------------
# Both tables are reduced to the same five columns; 'movie_no' is registered
# as the key of each table with py_entitymatching.
movie_cols = ['movie_no', 'movie_name', 'movie_year', 'movie_director', 'movie_star']

A = pd.read_csv(pathA, encoding='utf-8')[movie_cols]
B = pd.read_csv(pathB, encoding='utf-8')[movie_cols]

for source_table in (A, B):
    em.set_key(source_table, 'movie_no')

# --- Labeled pairs -----------------------------------------------------------
# S references rows of A and B through the two foreign-key columns.
S = em.read_csv_metadata(
    pathS,
    key='_id',
    ltable=A,
    rtable=B,
    fk_ltable='ltable_movie_no',
    fk_rtable='rtable_movie_no',
)

# --- Train / test split ------------------------------------------------------
# 70% of the labeled pairs go to I (train), the rest to J (test); the seed is
# fixed so the split is repeatable.
IJ = em.split_train_test(S, train_proportion=0.7, random_state=0)
I, J = IJ['train'], IJ['test']

Metadata file is not present in the given path; proceeding to read the csv file.


In [2]:
# Optional (disabled): write the train (I) and test (J) splits to CSV for inspection.
# I.to_csv("I.csv")
# J.to_csv("J.csv")

In [3]:
# Candidate learning-based matchers.
# Listed in the order they are later passed to cross-validation.
# DecisionTree, RF, SVM and LogReg are given a fixed seed; LinReg and NB are
# constructed without one.
dt = em.DTMatcher(random_state=0, name='DecisionTree')
rf = em.RFMatcher(random_state=0, name='RF')
svm = em.SVMMatcher(random_state=0, name='SVM')
ln = em.LinRegMatcher(name='LinReg')
lg = em.LogRegMatcher(random_state=0, name='LogReg')
nb = em.NBMatcher(name='NB')

In [4]:
# Columns that identify or label a pair; they are left out of imputation.
id_and_label_cols = ['_id', 'ltable_movie_no', 'rtable_movie_no', 'label']

# Feature table generated from the attributes of A and B
# (attribute-type validation prompt disabled).
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

# Feature vectors for the training pairs, with the label column appended.
train_vecs = em.extract_feature_vecs(
    I,
    feature_table=F,
    attrs_after='label',
    show_progress=False,
)

# Replace missing feature values using the 'mean' strategy.
H = em.impute_table(
    train_vecs,
    exclude_attrs=id_and_label_cols,
    strategy='mean',
)

In [5]:
# Cross-validation: compare the six matchers on the training feature vectors
# and select the one with the best F1.
candidate_matchers = [dt, rf, svm, ln, lg, nb]

result = em.select_matcher(
    candidate_matchers,
    table=H,
    exclude_attrs=['_id', 'ltable_movie_no', 'rtable_movie_no', 'label'],
    target_attr='label',
    k=5,  # number of folds
    metric_to_select_matcher='f1',
    random_state=0,
)



In [7]:
# Per-matcher cross-validation summary (average precision, recall and F1).
cv_summary = result['cv_stats']
cv_summary

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.984615,1.0,0.992157
1,RF,0.984615,0.992857,0.98852
2,SVM,0.991667,0.896529,0.941052
3,LinReg,0.984615,0.977473,0.98052
4,LogReg,0.984615,0.992857,0.98852
5,NB,0.984615,0.985165,0.984599


In [8]:
# Apply the selected matcher to the held-out test set (J).

# Matcher chosen by select_matcher using F1. In the cross-validation table
# recorded for this run, DecisionTree has the highest average F1
# -- TODO confirm which matcher is actually selected on re-run.
# The name `dt` is kept as-is; note that it rebinds the DTMatcher variable.
dt = result['selected_matcher']

# Identifier and label columns are never used as features.
non_feature_attrs = ['_id', 'ltable_movie_no', 'rtable_movie_no', 'label']

# Train on the full (imputed) training feature vectors.
dt.fit(table=H,
       exclude_attrs=non_feature_attrs,
       target_attr='label')

# Build the test feature vectors and prepare them the same way as the
# training vectors: missing feature values are filled with the 'mean'
# strategy so that no NaN reaches predict().
L = em.extract_feature_vecs(J, feature_table=F,
                            attrs_after='label', show_progress=False)
L = em.impute_table(L,
                    exclude_attrs=non_feature_attrs,
                    strategy='mean')

# Predictions are appended as a 'predicted' column on a new table.
predictions = dt.predict(table=L, exclude_attrs=non_feature_attrs,
                         append=True, target_attr='predicted', inplace=False)

# Optional preview of the predictions.
# NOTE(review): the predict call above does not request a 'proba' column --
# confirm before enabling this line.
# predictions[['_id', 'ltable_movie_no', 'rtable_movie_no', 'predicted', 'proba']].head()

# Compare predictions with the gold labels and print precision/recall/F1.
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)

Precision : 98.18% (54/55)
Recall : 96.43% (54/56)
F1 : 97.3%
False positives : 1 (out of 55 positive predictions)
False negatives : 2 (out of 98 negative predictions)


In [9]:
# False positives: pairs predicted as a match (1) whose label is non-match (0).
# NOTE: the variable is named `fn` for historical reasons, but it holds the
# false positives.
predicted_match = predictions['predicted'] == 1
labeled_non_match = predictions['label'] == 0
fn = predictions[predicted_match & labeled_non_match]

# Show the corresponding labeled pairs with their original attribute values.
S[S['_id'].isin(fn['_id'])]

Unnamed: 0.1,Unnamed: 0,_id,ltable_movie_no,rtable_movie_no,ltable_movie_name,ltable_movie_year,ltable_movie_director,ltable_movie_star,rtable_movie_name,rtable_movie_year,rtable_movie_director,rtable_movie_star,label
144,144,203430,1008,774,the hunger games: mockingjay - part 1,2014,Francis Lawrence,"Jennifer Lawrence, Josh Hutcherson, Liam Hemsworth, Woody Harrelson",the hunger games: mockingjay - part 2,2015,Francis Lawrence,"Jennifer Lawrence, Josh Hutcherson, Liam Hemsworth, Julianne Moore, Gwendoline Christie, Robert ...",0


In [10]:
# Evaluate all six learning-based matchers on the test set.

# Name / matcher-object pairs taken from the cross-validation drill-down table.
classifiers = result['drill_down_cv_stats']['precision'][['Name','Matcher']]

# Identifier and label columns are never used as features.
non_feature_attrs = ['_id', 'ltable_movie_no', 'rtable_movie_no', 'label']

# The test feature vectors do not depend on the matcher, so they are built
# once, outside the loop. Missing feature values are filled with the 'mean'
# strategy, matching how the training vectors (H) were prepared.
# predict() is called with inplace=False, so L is presumably not modified
# between iterations -- TODO confirm.
L = em.extract_feature_vecs(J, feature_table=F,
                            attrs_after='label', show_progress=False)
L = em.impute_table(L,
                    exclude_attrs=non_feature_attrs,
                    strategy='mean')

# NOTE: this loop rebinds `predictions` and `eval_result`; after it finishes
# they refer to the last matcher evaluated, not to the selected matcher.
for name, cl in zip(classifiers['Name'], classifiers['Matcher']):
    # Apply the format string (previously the name was passed as a second
    # print argument, so a literal "%s" was printed).
    print("-----  %s ----- " % name)

    cl.fit(table=H,
           exclude_attrs=non_feature_attrs,
           target_attr='label')

    predictions = cl.predict(table=L, exclude_attrs=non_feature_attrs,
                             append=True, target_attr='predicted', inplace=False)

    eval_result = em.eval_matches(predictions, 'label', 'predicted')
    em.print_eval_summary(eval_result)
    print("")

-----  %s -----  DecisionTree
Precision : 98.18% (54/55)
Recall : 96.43% (54/56)
F1 : 97.3%
False positives : 1 (out of 55 positive predictions)
False negatives : 2 (out of 98 negative predictions)

-----  %s -----  RF
Precision : 96.36% (53/55)
Recall : 94.64% (53/56)
F1 : 95.5%
False positives : 2 (out of 55 positive predictions)
False negatives : 3 (out of 98 negative predictions)

-----  %s -----  SVM
Precision : 98.0% (49/50)
Recall : 87.5% (49/56)
F1 : 92.45%
False positives : 1 (out of 50 positive predictions)
False negatives : 7 (out of 103 negative predictions)

-----  %s -----  LinReg




Precision : 98.18% (54/55)
Recall : 96.43% (54/56)
F1 : 97.3%
False positives : 1 (out of 55 positive predictions)
False negatives : 2 (out of 98 negative predictions)

-----  %s -----  LogReg
Precision : 96.49% (55/57)
Recall : 98.21% (55/56)
F1 : 97.35%
False positives : 2 (out of 57 positive predictions)
False negatives : 1 (out of 96 negative predictions)

-----  %s -----  NB
Precision : 96.49% (55/57)
Recall : 98.21% (55/56)
F1 : 97.35%
False positives : 2 (out of 57 positive predictions)
False negatives : 1 (out of 96 negative predictions)

