In [2]:
import pandas as pd
import os
import py_entitymatching as em
import numpy as np

# The input CSV files are looked up in the current working directory.
datasets_dir = os.getcwd() + os.sep

pathA = os.path.join(datasets_dir, "imdb.csv")
pathB = os.path.join(datasets_dir, "tomato.csv")
pathS = os.path.join(datasets_dir, "labeled_data.csv")

# Both source tables are reduced to the same five columns.
movie_columns = ['movie_no', 'movie_name', 'movie_year',
                 'movie_director', 'movie_star']

A = pd.read_csv(pathA)[movie_columns]
B = pd.read_csv(pathB)[movie_columns]

# Register 'movie_no' as the key of each table.
for movie_table in (A, B):
    em.set_key(movie_table, 'movie_no')

# Labeled candidate pairs; the foreign keys point back into A and B.
S = em.read_csv_metadata(
    pathS,
    key='_id',
    ltable=A,
    rtable=B,
    fk_ltable='ltable_movie_no',
    fk_rtable='rtable_movie_no',
)

# 70/30 train/test split of the labeled pairs with a fixed seed.
IJ = em.split_train_test(S, train_proportion=0.7, random_state=0)
I, J = IJ['train'], IJ['test']

No handlers could be found for logger "py_entitymatching.io.parsers"


In [3]:
# Candidate matchers that are compared during matcher selection.

# These four are created with a fixed random_state.
rf = em.RFMatcher(random_state=0, name='RF')
dt = em.DTMatcher(random_state=0, name='DecisionTree')
svm = em.SVMMatcher(random_state=0, name='SVM')
lg = em.LogRegMatcher(random_state=0, name='LogReg')

# These two are created with only a name.
nb = em.NBMatcher(name='NB')
ln = em.LinRegMatcher(name='LinReg')

In [4]:
# Build the feature table for matching from the two source tables.
F = em.get_features_for_matching(
    A,
    B,
    validate_inferred_attr_types=False,
)

In [5]:
# F.feature_name

In [6]:
# Turn the training pairs (I) into feature vectors using feature table F;
# the 'label' column is carried over after the feature columns.
H = em.extract_feature_vecs(
    I,
    feature_table=F,
    attrs_after='label',
    show_progress=False,
)

In [7]:
# Key, foreign-key and label columns are left out of the imputation.
non_feature_cols = ['_id', 'ltable_movie_no', 'rtable_movie_no', 'label']

# Impute the remaining columns of the training table with the 'mean' strategy.
H = em.impute_table(H, exclude_attrs=non_feature_cols, strategy='mean')

In [8]:
# Compare all candidate matchers with 5-fold cross-validation on the
# training feature vectors and pick the one with the best F1.
candidate_matchers = [dt, rf, svm, ln, lg, nb]
non_feature_cols = ['_id', 'ltable_movie_no', 'rtable_movie_no', 'label']

result = em.select_matcher(
    candidate_matchers,
    table=H,
    exclude_attrs=non_feature_cols,
    target_attr='label',
    metric_to_select_matcher='f1',
    k=5,
    random_state=0,
)


In [9]:
# Display the per-matcher cross-validation statistics stored under
# 'cv_stats' in the result of em.select_matcher.
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.97592,0.962637,0.96806
1,RF,0.975612,0.992,0.983634
2,SVM,0.991667,0.896529,0.941052
3,LinReg,0.984615,1.0,0.992157
4,LogReg,0.984615,0.985714,0.984749
5,NB,0.984615,0.992308,0.988235


In [11]:
# Use the matcher chosen by cross-validation. It is bound to its own name
# instead of overwriting `dt`, so the DecisionTree matcher is still part of
# the candidate list if the selection cell is re-run.
best_matcher = result['selected_matcher']

# Key, foreign-key and label columns are never used as features.
non_feature_cols = ['_id', 'ltable_movie_no', 'rtable_movie_no', 'label']

# Train the selected matcher on the (already imputed) training vectors H.
best_matcher.fit(table=H,
                 exclude_attrs=non_feature_cols,
                 target_attr='label')

# Feature vectors for the held-out test pairs J.
L = em.extract_feature_vecs(J, feature_table=F,
                            attrs_after='label', show_progress=False)

# Apply the same imputation that was applied to H. Without it, any missing
# feature value in the test pairs reaches predict() un-imputed.
L = em.impute_table(L,
                    exclude_attrs=non_feature_cols,
                    strategy='mean')

# Predict on the test vectors, appending the predicted label and its
# probability as new columns of a copy of L.
predictions = best_matcher.predict(table=L,
                                   exclude_attrs=non_feature_cols,
                                   append=True, target_attr='predicted',
                                   inplace=False, return_probs=True,
                                   probs_attr='proba')

# NOTE: this expression is not the last one in the cell, so the notebook does
# not render it; move it to its own cell to inspect the first predictions.
predictions[['_id', 'ltable_movie_no', 'rtable_movie_no', 'predicted', 'proba']].head()

# Compare gold labels against predictions and print precision/recall/F1.
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)

Precision : 96.49% (55/57)
Recall : 98.21% (55/56)
F1 : 97.35%
False positives : 2 (out of 57 positive predictions)
False negatives : 1 (out of 96 negative predictions)


In [14]:
# False positives: pairs predicted as 1 whose gold label is 0.
# NOTE: the result keeps the name `fn` because the lookup cell reads it,
# even though these rows are false positives rather than false negatives.
predicted_positive = predictions['predicted'] == 1
labeled_negative = predictions['label'] == 0
fn = predictions[predicted_positive & labeled_negative]

In [13]:
# Show the full labeled rows of S for the pairs collected in `fn`.
selected_pair_ids = fn['_id']
S.loc[S['_id'].isin(selected_pair_ids)]

Unnamed: 0.1,Unnamed: 0,_id,ltable_movie_no,rtable_movie_no,ltable_movie_name,ltable_movie_year,ltable_movie_director,ltable_movie_star,rtable_movie_name,rtable_movie_year,rtable_movie_director,rtable_movie_star,label
144,144,203430,1008,774,the hunger games: mockingjay - part 1,2014,Francis Lawrence,"Jennifer Lawrence, Josh Hutcherson, Liam Hemsworth, Woody Harrelson",the hunger games: mockingjay - part 2,2015,Francis Lawrence,"Jennifer Lawrence, Josh Hutcherson, Liam Hemsworth, Julianne Moore, Gwendoline Christie, Robert ...",0
400,400,610699,1476,2378,the purge: election year,2016,James DeMonaco,"Frank Grillo, Elizabeth Mitchell, Mykelti Williamson, Joseph Julian Soria",the purge: anarchy,2014,James DeMonaco,"Frank Grillo, Carmen Ejogo, Zach Gilford, Kiele Sanchez, Michael Kenneth Williams, Zoe Soul, Jus...",0
