# Introduction

This IPython notebook illustrates how to rescale/normalize certain features in a table. First, we need to import py_entitymatching package and other libraries as follows:

In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
from py_entitymatching.feature.scalers import scale_features

# Set the seed value 
seed = 0

In [2]:
# Get the datasets directory
datasets_dir = em.get_install_path() + os.sep + 'datasets'

path_A = datasets_dir + os.sep + 'dblp_demo.csv'
path_B = datasets_dir + os.sep + 'acm_demo.csv'
path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'

In [3]:
A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')
# Load the pre-labeled data
S = em.read_csv_metadata(path_labeled_data, 
                         key='_id',
                         ltable=A, rtable=B, 
                         fk_ltable='ltable_id', fk_rtable='rtable_id')

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


Then, split the labeled data into development set and evaluation set and convert them into feature vectors

In [4]:
# Split S into I an J
IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
I = IJ['train']
J = IJ['test']

In [5]:
# Generate a set of features
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

In [6]:
# Convert I into feature vectors using updated F
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='label',
                            show_progress=False)
H.head()

Unnamed: 0,_id,ltable_id,rtable_id,id_id_lev_dist,id_id_lev_sim,id_id_jar,id_id_jwn,id_id_exm,id_id_jac_qgm_3_qgm_3,title_title_jac_qgm_3_qgm_3,...,venue_venue_nmw,venue_venue_jac_dlm_dc0_dlm_dc0,venue_venue_jac_qgm_3_qgm_3,venue_venue_sw,venue_venue_cos_dlm_dc0_dlm_dc0,year_year_exm,year_year_anm,year_year_lev_dist,year_year_lev_sim,label
430,430,l1494,r1257,4,0.2,0.466667,0.466667,0,0.0,0.0,...,9.0,0.666667,0.619048,13.0,0.816497,1,1.0,0.0,1.0,0
35,35,l1385,r1160,4,0.2,0.466667,0.466667,0,0.0,0.025641,...,-41.0,0.0,0.0,1.0,0.0,1,1.0,0.0,1.0,0
394,394,l1345,r85,4,0.2,0.0,0.0,0,0.090909,1.0,...,-13.0,0.0,0.035714,1.0,0.0,1,1.0,0.0,1.0,1
29,29,l611,r141,3,0.25,0.666667,0.666667,0,0.090909,0.049383,...,-12.0,0.0,0.0,1.0,0.0,1,1.0,0.0,1.0,0
181,181,l1164,r1161,2,0.6,0.733333,0.733333,0,0.076923,1.0,...,-13.0,0.0,0.035714,1.0,0.0,1,1.0,0.0,1.0,1


# Rescaling/normalizing feature vectors in H

In [7]:
# Specify a scaling method to scale H
M, M_scaler = scale_features(table=H,
                                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                                scaling_method='MinMax')
M.head()

Unnamed: 0,_id,ltable_id,rtable_id,id_id_lev_dist,id_id_lev_sim,id_id_jar,id_id_jwn,id_id_exm,id_id_jac_qgm_3_qgm_3,title_title_jac_qgm_3_qgm_3,...,venue_venue_nmw,venue_venue_jac_dlm_dc0_dlm_dc0,venue_venue_jac_qgm_3_qgm_3,venue_venue_sw,venue_venue_cos_dlm_dc0_dlm_dc0,year_year_exm,year_year_anm,year_year_lev_dist,year_year_lev_sim,label
430,430,l1494,r1257,0.666667,0.333333,0.636364,0.636364,0.0,0.0,0.0,...,1.0,1.0,1.0,0.923077,1.0,1.0,1.0,0.0,1.0,0
35,35,l1385,r1160,0.666667,0.333333,0.636364,0.636364,0.0,0.0,0.025641,...,0.285714,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0
394,394,l1345,r85,0.666667,0.333333,0.0,0.0,0.0,0.454545,1.0,...,0.685714,0.0,0.057692,0.0,0.0,1.0,1.0,0.0,1.0,1
29,29,l611,r141,0.333333,0.416667,0.909091,0.909091,0.0,0.454545,0.049383,...,0.7,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0
181,181,l1164,r1161,0.0,1.0,1.0,1.0,0.0,0.384615,1.0,...,0.685714,0.0,0.057692,0.0,0.0,1.0,1.0,0.0,1.0,1


In [8]:
# Use a pre-fitted scaler to scale H
N, N_scaler = scale_features(table=H,
                                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                                scaler=M_scaler)
N.head()

Unnamed: 0,_id,ltable_id,rtable_id,id_id_lev_dist,id_id_lev_sim,id_id_jar,id_id_jwn,id_id_exm,id_id_jac_qgm_3_qgm_3,title_title_jac_qgm_3_qgm_3,...,venue_venue_nmw,venue_venue_jac_dlm_dc0_dlm_dc0,venue_venue_jac_qgm_3_qgm_3,venue_venue_sw,venue_venue_cos_dlm_dc0_dlm_dc0,year_year_exm,year_year_anm,year_year_lev_dist,year_year_lev_sim,label
430,430,l1494,r1257,-0.444444,0.555556,0.867769,0.867769,0.0,0.0,0.0,...,0.885714,1.5,1.615385,-0.005917,1.224745,1.0,1.0,0.0,1.0,0
35,35,l1385,r1160,-0.444444,0.555556,0.867769,0.867769,0.0,0.0,0.025641,...,0.87551,0.0,0.0,-0.076923,0.0,1.0,1.0,0.0,1.0,0
394,394,l1345,r85,-0.444444,0.555556,0.0,0.0,0.0,2.272727,1.0,...,0.881224,0.0,0.093195,-0.076923,0.0,1.0,1.0,0.0,1.0,1
29,29,l611,r141,-0.555556,0.694444,1.239669,1.239669,0.0,2.272727,0.049383,...,0.881429,0.0,0.0,-0.076923,0.0,1.0,1.0,0.0,1.0,0
181,181,l1164,r1161,-0.666667,1.666667,1.363636,1.363636,0.0,1.923077,1.0,...,0.881224,0.0,0.093195,-0.076923,0.0,1.0,1.0,0.0,1.0,1
