# Introduction

This IPython notebook illustrates how to select the best learning-based matcher. First, we need to import the py_entitymatching package and other libraries as follows:

In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

from py_entitymatching.feature.scalers import scale_features
from py_entitymatching.feature.selectfeatures import select_features_univariate

# Seed value for the notebook (presumably meant to be passed as `random_state`
# to randomized steps so that runs are repeatable; confirm against the cells that use it)
seed = 0

In [2]:
# Locate the 'datasets' directory under the py_entitymatching install path.
# os.path.join is used instead of manual `+ os.sep +` concatenation so the
# separator handling is left to the standard library.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# CSV files read by the following cells: the two input tables and the labeled data
path_A = os.path.join(datasets_dir, 'dblp_demo.csv')
path_B = os.path.join(datasets_dir, 'acm_demo.csv')
path_labeled_data = os.path.join(datasets_dir, 'labeled_data_demo.csv')

In [3]:
# Read the two input tables, declaring 'id' as the key column of each
A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')

# Read the pre-labeled data; it is keyed by '_id' and linked to A and B
# through the 'ltable_id' / 'rtable_id' foreign-key columns
S = em.read_csv_metadata(
    path_labeled_data,
    key='_id',
    ltable=A,
    rtable=B,
    fk_ltable='ltable_id',
    fk_rtable='rtable_id',
)

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


Then, split the labeled data into a development set and an evaluation set, and convert them into feature vectors.

In [4]:
# Split S into I and J: half of the labeled pairs go to the 'train' part (I)
# and the rest to the 'test' part (J). The notebook-level `seed` is passed as
# random_state instead of a second hard-coded 0, so the seed is set in one place.
IJ = em.split_train_test(S, train_proportion=0.5, random_state=seed)
I = IJ['train']
J = IJ['test']

In [5]:
# Build the feature table F for matching A against B,
# with validation of the inferred attribute types switched off
F = em.get_features_for_matching(
    A,
    B,
    validate_inferred_attr_types=False,
)

In [6]:
# Compute feature vectors for the pairs in I using the features in F;
# the 'label' column is carried along after the feature columns
H = em.extract_feature_vecs(
    I,
    attrs_after='label',
    feature_table=F,
    show_progress=False,
)

# Selecting the most relevant features from F

In [7]:
# MinMax-scale H, passing the id and label columns as exclude_attrs.
# The call returns two values: the first replaces H, the second is kept as H_scaler.
H, H_scaler = scale_features(
    table=H,
    scaling_method='MinMax',
    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
)
H.head()

Unnamed: 0,_id,ltable_id,rtable_id,id_id_lev_dist,id_id_lev_sim,id_id_jar,id_id_jwn,id_id_exm,id_id_jac_qgm_3_qgm_3,title_title_jac_qgm_3_qgm_3,...,venue_venue_cos_dlm_dc0_dlm_dc0,venue_venue_lev_dist,venue_venue_sw,venue_venue_mel,venue_venue_lev_sim,year_year_exm,year_year_anm,year_year_lev_dist,year_year_lev_sim,label
430,430,l1494,r1257,0.666667,0.333333,0.636364,0.636364,0.0,0.0,0.0,...,1.0,0.0,0.923077,1.0,1.0,1.0,1.0,0.0,1.0,0
35,35,l1385,r1160,0.666667,0.333333,0.636364,0.636364,0.0,0.0,0.025641,...,0.0,0.672131,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0
394,394,l1345,r85,0.666667,0.333333,0.0,0.0,0.0,0.454545,1.0,...,0.0,0.213115,0.0,0.790695,0.227113,1.0,1.0,0.0,1.0,1
29,29,l611,r141,0.333333,0.416667,0.909091,0.909091,0.0,0.454545,0.049383,...,0.0,0.196721,0.0,0.516518,0.049914,1.0,1.0,0.0,1.0,0
181,181,l1164,r1161,0.0,1.0,1.0,1.0,0.0,0.384615,1.0,...,0.0,0.213115,0.0,0.790695,0.227113,1.0,1.0,0.0,1.0,1


In [8]:
# Univariate feature selection over the scaled vectors in H, scored against 'label'.
# score='mutual_info' with mode='k_best' and parameter=2
# (presumably keeps the 2 best-scoring features; confirm against the function's docs).
FF = select_features_univariate(
    table=H,
    feature_table=F,
    target_attr='label',
    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
    mode='k_best',
    parameter=2,
    score='mutual_info',
)
FF.head()

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,title_title_jac_qgm_3_qgm_3,title,title,qgm_3,qgm_3,jaccard,<function title_title_jac_qgm_3_qgm_3 at 0x7f2fa9e96a60>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,title_title_lev_sim,title,title,,,lev_sim,<function title_title_lev_sim at 0x7f2fa9e96d08>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


In [9]:
# Extract feature vectors for I again, this time using only the selected features in FF
HH = em.extract_feature_vecs(
    I,
    attrs_after='label',
    feature_table=FF,
    show_progress=False,
)

# MinMax-scale the reduced feature vectors, passing the id and label columns as exclude_attrs
HH, HH_scaler = scale_features(
    table=HH,
    scaling_method='MinMax',
    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
)
HH.head()

Unnamed: 0,_id,ltable_id,rtable_id,title_title_jac_qgm_3_qgm_3,title_title_lev_sim,label
430,430,l1494,r1257,0.0,0.102564,0
35,35,l1385,r1160,0.025641,0.193307,0
394,394,l1345,r85,1.0,1.0,1
29,29,l611,r141,0.049383,0.159989,0
181,181,l1164,r1161,1.0,1.0,1
