In [44]:
import pandas as pd
from splycer.blocker import BlockDB
from splycer.record_set import RecordDB
from splycer.pairs_set import PairsDB
from splycer.feature_engineer import FeatureEngineer
import recordlinkage as rl
import pyodbc
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import pickle as pkl
from tqdm import tqdm

# Connect to the 'rec_db' database through turbodbc.
import turbodbc
conn = turbodbc.connect('rec_db')

# Absolute paths of the train/test pair files inside the project's split folder.
import os.path
basePath = r'R:\JoePriceResearch\record_linking\projects\deep_learning\paper_RR\CensusTree_2020\final'
splitDir = os.path.join(basePath, '2-split_train_test')
trainPath = os.path.abspath(os.path.join(splitDir, 'train_1910_1920.csv'))
testPath = os.path.abspath(os.path.join(splitDir, 'test_1910_1920.csv'))

## Create the classes for comparing features.

In [5]:
from recordlinkage.base import BaseCompareFeature

class eucledian_distance(BaseCompareFeature):
    """Compare feature giving the L2 distance between two groups of columns.

    ``left_on`` and ``right_on`` are lists of column labels; the number of
    left labels is remembered so the vectors handed to
    ``_compute_vectorized`` can be split into a left and a right half.
    """

    def __init__(self, left_on, right_on):
        super().__init__(left_on, right_on)
        # Number of leading positional vectors that belong to the left side.
        self.n = len(left_on)

    def _compute_vectorized(self, *args):
        """Return the per-pair 2-norm of (left vectors - right vectors).

        The first ``self.n`` positional arguments are treated as the left
        columns and the remainder as the right columns.
        """
        left = np.array(args[:self.n])
        right = np.array(args[self.n:])
        # axis=0 collapses the column dimension, leaving one value per pair.
        return np.linalg.norm(left - right, ord=2, axis=0)
    
class commonality_weight(BaseCompareFeature):
    """Compare feature computing ``1 / log1p(mean of the two inputs)``."""

    def __init__(self, left_on, right_on):
        super().__init__(left_on, right_on)

    def _compute_vectorized(self, s1, s2):
        """Return the reciprocal of ``log1p`` of the element-wise mean of s1 and s2."""
        pair_mean = (s1 + s2) / 2
        return 1 / np.log1p(pair_mean)
    
def get_compare_engine(drop=None):
    """Build the ``rl.Compare`` object used to featurize candidate pairs.

    Parameters
    ----------
    drop : container of str, optional
        Names of comparisons to leave out. Recognised names, in the order the
        comparisons are registered: 'res', 'bp', 'first_jaro', 'last_jaro',
        'birth_year', 'immigration', 'occ', 'comm_first', 'comm_last', then
        the exact-match columns 'marstat', 'mbp', 'fbp', 'rel',
        'first_nysiis', 'last_nysiis'. Defaults to dropping nothing.

    Returns
    -------
    rl.Compare
        The configured comparison object. Callers that rename the output
        columns by position rely on the registration order listed above.
    """
    # None sentinel instead of a shared mutable list as the default value.
    if drop is None:
        drop = []

    exact_match_features = ['marstat','mbp','fbp','rel','first_nysiis','last_nysiis']
    exact_match_features = [feat for feat in exact_match_features if feat not in drop]

    c = rl.Compare() # declare comparison object

    # Geographic similarity of residence and birthplace coordinates.
    if 'res' not in drop:
        c.geo('res_lat','res_lon','res_lat','res_lon',method = 'exp',scale=500)
    if 'bp' not in drop:
        c.geo('bp_lat','bp_lon','bp_lat','bp_lon', method = 'exp',scale=500)

    # String similarity of first and last names.
    if 'first_jaro' not in drop:
        c.string('first','first',method = 'jarowinkler')
    if 'last_jaro' not in drop:
        c.string('last','last', method = 'jarowinkler')
    # Disabled alternatives:
    #c.string('first','first',method = 'qgram')
    #c.string('last','last', method = 'qgram')

    # Numeric closeness of birth year and immigration year.
    if 'birth_year' not in drop:
        c.numeric('birth_year','birth_year', method = 'lin', scale = 1, offset = 1)
    if 'immigration' not in drop:
        c.numeric('immigration','immigration', method = 'lin', scale = 1, offset = 1)

    # Custom features: distance over the occ_vec0..occ_vec127 columns and
    # the commonality weights of first and last names.
    if 'occ' not in drop:
        vec_cols = [f'occ_vec{i}' for i in range(128)]
        c.add(eucledian_distance(vec_cols,vec_cols))
    if 'comm_first' not in drop:
        c.add(commonality_weight('first_comm','first_comm'))
    if 'comm_last' not in drop:
        c.add(commonality_weight('last_comm','last_comm'))

    # Exact-agreement indicators for the remaining categorical columns.
    for col in exact_match_features:
        c.exact(col,col)
    return c

## Load the training data

In [32]:
# Read the labelled training pairs.
df = pd.read_csv(trainPath)

# Record stores for the two compiled census tables.
sql1910 = RecordDB('compiled_1910','ark1910','rec_db')
sql1920 = RecordDB('compiled_1920','ark1920','rec_db')

# Pull the full record for every distinct ark referenced by the training pairs.
train_arks_1910 = df['ark1910'].drop_duplicates()
train_arks_1920 = df['ark1920'].drop_duplicates()
rec1910 = sql1910.get_records(train_arks_1910).set_index('index')
rec1920 = sql1920.get_records(train_arks_1920).set_index('index')

In [33]:
# Label each candidate pair: True when the candidate 1920 ark equals the
# value in the 'true_ark_1920' column.
y = df['ark1920'].eq(df['true_ark_1920'])

# Pair index (ark1910, ark1920) in the same row order as the labels.
pairs = pd.MultiIndex.from_arrays([df['ark1910'], df['ark1920']])

# Class balance of the training labels.
y.value_counts(normalize=True)

False    0.931924
True     0.068076
dtype: float64

In [34]:
# Use each record frame's 'index_' column as its row index.
for records in (rec1910, rec1920):
    records.index = records['index_']

In [35]:
# Build the comparison engine without the occupation-vector and NYSIIS features.
dropped_features = ['occ','first_nysiis','last_nysiis']
c = get_compare_engine(drop=dropped_features)

# Compute one feature row per training pair and name the columns by position.
X = c.compute(pairs, rec1910, rec1920)
feature_names = [
    'res','bp','first_jaro','last_jaro','birth_year','immigration',
    'first_comm','last_comm','marstat','mbp','fbp','rel',
]
X.columns = feature_names

In [37]:
# Summary statistics of the training feature matrix.
# NOTE(review): the saved output shows a minimum of -1.0 for first_comm and
# last_comm — confirm whether that is expected from commonality_weight.
X.describe()

Unnamed: 0,res,bp,first_jaro,last_jaro,birth_year,immigration,first_comm,last_comm,marstat,mbp,fbp,rel
count,213217.0,213217.0,213217.0,213217.0,213217.0,213217.0,213217.0,213217.0,213217.0,213217.0,213217.0,213217.0
mean,0.58499,0.999991,0.919006,0.830371,0.481066,0.027688,0.078305,0.103442,0.73595,0.643457,0.624734,0.695615
std,0.317306,0.003063,0.155184,0.172934,0.443577,0.154307,0.029566,0.088566,0.440827,0.478979,0.484193,0.460147
min,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0
25%,0.332144,1.0,0.885714,0.666667,0.0,0.0,0.071743,0.08594,0.0,0.0,0.0,0.0
50%,0.656111,1.0,1.0,0.866667,0.5,0.0,0.076312,0.09683,1.0,1.0,1.0,1.0
75%,0.842718,1.0,1.0,1.0,1.0,0.0,0.082698,0.11568,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.442695,1.091357,1.0,1.0,1.0,1.0


## Loading the test data

In [38]:
# Read the held-out candidate pairs and give the columns fixed names.
val = pd.read_csv(testPath)
val.columns = ['ark1910','ark1920','true_ark1920']

# Label and pair index for the test split.
val['truth'] = val['ark1920'].eq(val['true_ark1920'])
pairs = pd.MultiIndex.from_arrays([val['ark1910'], val['ark1920']])

# Fetch the full records for every distinct ark (1920 side first, then 1910)
# and use each frame's 'index_' column as its row index.
test_arks_1920 = val['ark1920'].drop_duplicates().tolist()
recb = sql1920.get_records(test_arks_1920).set_index('index')
test_arks_1910 = val['ark1910'].drop_duplicates().tolist()
reca = sql1910.get_records(test_arks_1910).set_index('index')
for records in (reca, recb):
    records.index = records['index_']

# Featurize the test pairs with the same comparison engine as training.
test_X = c.compute(pairs, reca, recb)
test_X.columns = [
    'res','bp','first_jaro','last_jaro','birth_year','immigration',
    'first_comm','last_comm','marstat','mbp','fbp','rel',
]
test_y = val['truth']

test_X.describe()

Unnamed: 0,res,bp,first_jaro,last_jaro,birth_year,immigration,first_comm,last_comm,marstat,mbp,fbp,rel
count,91379.0,91379.0,91379.0,91379.0,91379.0,91379.0,91379.0,91379.0,91379.0,91379.0,91379.0,91379.0
mean,0.585929,0.999989,0.918454,0.830825,0.478956,0.028765,0.07848,0.102832,0.734315,0.646297,0.627617,0.691505
std,0.317238,0.003308,0.156634,0.172951,0.44312,0.157514,0.02742,0.089951,0.4417,0.478121,0.483442,0.461875
min,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0
25%,0.335166,1.0,0.883333,0.666667,0.0,0.0,0.071743,0.08594,0.0,0.0,0.0,0.0
50%,0.658113,1.0,1.0,0.866667,0.5,0.0,0.076323,0.096814,1.0,1.0,1.0,1.0
75%,0.843234,1.0,1.0,1.0,1.0,0.0,0.082756,0.115596,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.442695,1.442695,1.0,1.0,1.0,1.0


## Train using three algorithms

In [51]:
# Train on nearest centroid.
from sklearn.neighbors import NearestCentroid

# Impute missing features in both splits with the training-set column means,
# computed once.
train_means = X.mean()
X_filled = X.fillna(train_means)
test_X_filled = test_X.fillna(train_means)

model = NearestCentroid()
model.fit(X_filled, y)

# Evaluate on the held-out split built in the test-data cell (test_X / test_y).
y_pred_val = model.predict(test_X_filled)
y_pred = model.predict(X_filled)
print(f'train_recall: {recall_score(y,y_pred)}')
print(f'train_precision: {precision_score(y,y_pred)}\n')
print(f'val recall: {recall_score(test_y,y_pred_val)}')
print(f'val precision: {precision_score(test_y,y_pred_val)}\n')
print(f'train_f1_score: {f1_score(y,y_pred)}')
print(f'test_f1_score: {f1_score(test_y, y_pred_val)}')

train_recall: 0.8761970375473648
train_precision: 0.22924402465842317

val recall: 0.8615755627009646
val precision: 0.2272592341291718

train_f1_score: 0.36340776934836344
test_f1_score: 0.3596523606590383


In [47]:
# Train using Logistic Regression
from sklearn.linear_model import LogisticRegression

# Impute missing features in both splits with the training-set column means,
# computed once.
train_means = X.mean()
X_filled = X.fillna(train_means)

model = LogisticRegression(max_iter=300)
model.fit(X_filled, y)

# Predict
y_pred_val = model.predict(test_X.fillna(train_means))
y_pred = model.predict(X_filled)

# Print stats. Held-out predictions are scored against test_y, the labels
# that belong to test_X.
print(f'train_recall: {recall_score(y,y_pred)}')
print(f'train_precision: {precision_score(y,y_pred)}\n')
print(f'val recall: {recall_score(test_y,y_pred_val)}')
print(f'val precision: {precision_score(test_y,y_pred_val)}\n')
print(f'train_f1_score: {f1_score(y,y_pred)}')
print(f'test_f1_score: {f1_score(test_y, y_pred_val)}')

train_recall: 0.712090940406476
train_precision: 0.8290687414775006

val recall: 0.7016077170418007
val precision: 0.8271417740712661

train_f1_score: 0.7661403898895559
test_f1_score: 0.7592205984690327


In [52]:
# Train using XGB.
# Unlike the cells above, the features are passed without mean imputation.
model = XGBClassifier()
model.fit(X,y)

# Evaluate on the held-out split built in the test-data cell (test_X / test_y).
y_pred_val = model.predict(test_X)
y_pred = model.predict(X)

print(f'train_recall: {recall_score(y,y_pred)}')
print(f'train_precision: {precision_score(y,y_pred)}\n')
print(f'test_recall: {recall_score(test_y,y_pred_val)}')
print(f'test_precision: {precision_score(test_y,y_pred_val)}\n')
print(f'train_f1_score: {f1_score(y,y_pred)}')
print(f'test_f1_score: {f1_score(test_y, y_pred_val)}')


train_recall: 0.871098863244919
train_precision: 0.9102296450939458

test_recall: 0.8353697749196142
test_precision: 0.8788903924221921

train_f1_score: 0.890234457508977
test_f1_score: 0.8565776458951534


## Test hyperparameters for XGBoost

In [None]:
# Check the following hyperparameters.
learning_rates=[.3,.4,.5]
max_depth=[5,6,7]
alpha_vals = [0,0.5]
lambda_vals = [0,1]
n_jobs=16

# Exhaustive grid: fit one model per combination and print its F1 on the
# held-out split (test_X / test_y), followed by the settings used.
# NOTE(review): selecting hyperparameters on the same split that is reported
# as the test score makes that score optimistic — consider a separate
# validation split.
for lr in learning_rates:
    for depth in max_depth:
        for alph in alpha_vals:
            for lam in lambda_vals:
                model = XGBClassifier(
                    learning_rate=lr, max_depth=depth, n_jobs=n_jobs,
                    reg_alpha=alph, reg_lambda=lam)
                model.fit(X,y)
                y_pred_val = model.predict(test_X)
                print(f1_score(test_y, y_pred_val), lr, depth, alph, lam)

0.8544973544973544 0.3 5 0 0
0.8561287659925713 0.3 5 0 1
0.8568358406348157 0.3 5 0.5 0
0.8550449632868574 0.3 5 0.5 1
0.8612873980054397 0.3 6 0 0
0.8565776458951534 0.3 6 0 1
0.8581326294952162 0.3 6 0.5 0
0.85718983803338 0.3 6 0.5 1
0.8558262014483213 0.3 7 0 0
0.8556624722427831 0.3 7 0 1
0.8549542871262664 0.3 7 0.5 0
0.8584542250635194 0.3 7 0.5 1
0.8562711305351695 0.4 5 0 0
0.8568836096675739 0.4 5 0 1
0.8581525312294545 0.4 5 0.5 0
0.8567183889072301 0.4 5 0.5 1
0.8568604842694779 0.4 6 0 0


## Recreate and save our best model

In [None]:
# Instantiate an XGBoost classifier with default settings.
# NOTE(review): this cell neither fits nor saves the model, although the
# section heading says it should — confirm whether the fit/save steps are missing.
model = XGBClassifier()

In [57]:
# Confusion-matrix counts on the held-out split, scored against test_y.
# y_pred_val holds the predictions from whichever prediction cell ran last.
tn, fp, fn, tp = confusion_matrix(test_y,y_pred_val).ravel()
tn, fp, fn, tp

(84429, 730, 1042, 5178)