In [1]:
import pandas as pd
from splycer.blocker import BlockDB
from splycer.record_set import RecordDB
from splycer.pairs_set import PairsDB
from splycer.feature_engineer import FeatureEngineer
import recordlinkage as rl
import pyodbc
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import pickle as pkl
from tqdm import tqdm

# Set up a database connection
import turbodbc
conn = turbodbc.connect('rec_db')

# Resolve absolute paths to the train and test pair files, both of which
# live in the '2-split_train_test' subfolder of the project directory.
import os.path
basePath = r'R:\JoePriceResearch\record_linking\projects\deep_learning\paper_RR\CensusTree_2020\final'
trainPath, testPath = (
    os.path.abspath(os.path.join(basePath, '2-split_train_test', csv_name))
    for csv_name in ('train_1900_1920.csv', 'test_1900_1920.csv')
)

## Create the classes and helper for comparing features.

In [2]:
from recordlinkage.base import BaseCompareFeature

class eucledian_distance(BaseCompareFeature):
    """Compare feature returning the L2 distance between two column groups.

    `left_on` and `right_on` are forwarded to `BaseCompareFeature`. The
    number of left labels is stored so the flat positional input of
    `_compute_vectorized` can be split back into its two halves.
    """

    def __init__(self, left_on, right_on):
        super().__init__(left_on, right_on)
        # Number of leading positional inputs that form the first group.
        self.n = len(left_on)

    def _compute_vectorized(self, *args):
        """Return the 2-norm of (first n inputs - remaining inputs) along axis 0."""
        first_group = np.array(args[:self.n])
        second_group = np.array(args[self.n:])
        return np.linalg.norm(first_group - second_group, ord=2, axis=0)
    
class commonality_weight(BaseCompareFeature):
    """Compare feature: reciprocal of log1p of the mean of the two inputs."""

    def __init__(self, left_on, right_on):
        super().__init__(left_on, right_on)

    def _compute_vectorized(self, s1, s2):
        """Return 1 / log(1 + (s1 + s2) / 2)."""
        pair_mean = (s1 + s2) / 2
        return 1 / np.log1p(pair_mean)
    
def get_compare_engine(drop=()):
    """Build the `recordlinkage.Compare` object used to featurize record pairs.

    Parameters
    ----------
    drop : container of str, optional
        Feature keys to leave out. Recognized keys are 'res', 'bp',
        'first_jaro', 'last_jaro', 'birth_year', 'immigration', 'occ',
        'comm_first', 'comm_last', and the exact-match column names
        'marstat', 'mbp', 'fbp', 'rel', 'first_nysiis', 'last_nysiis'.
        Defaults to an empty tuple (nothing dropped); `None` is treated
        the same way.

    Returns
    -------
    recordlinkage.Compare
        Comparison object with the retained features registered in this
        order: res, bp, first_jaro, last_jaro, birth_year, immigration,
        occ, comm_first, comm_last, then the exact-match columns.
    """
    # An immutable default avoids the shared-mutable-default pitfall of `drop=[]`.
    if drop is None:
        drop = ()

    c = rl.Compare() # declare comparison object

    # Geographic similarity on (lat, lon) for residence and birthplace.
    if 'res' not in drop:
        c.geo('res_lat','res_lon','res_lat','res_lon',method = 'exp',scale=500)
    if 'bp' not in drop:
        c.geo('bp_lat','bp_lon','bp_lat','bp_lon', method = 'exp',scale=500)

    # Jaro-Winkler string similarity on the name fields.
    if 'first_jaro' not in drop:
        c.string('first','first',method = 'jarowinkler')
    if 'last_jaro' not in drop:
        c.string('last','last', method = 'jarowinkler')
    # Disabled alternative: q-gram similarity on 'first' / 'last'.
    #c.string('first','first',method = 'qgram')
    #c.string('last','last', method = 'qgram')

    # Linear numeric similarity on the year fields.
    if 'birth_year' not in drop:
        c.numeric('birth_year','birth_year', method = 'lin', scale = 1, offset = 1)
    if 'immigration' not in drop:
        c.numeric('immigration','immigration', method = 'lin', scale = 1, offset = 1)

    # Custom features defined in this notebook.
    if 'occ' not in drop:
        # The column list is only needed when the occupation feature is kept.
        vec_cols = [f'occ_vec{i}' for i in range(128)]
        c.add(eucledian_distance(vec_cols,vec_cols))
    if 'comm_first' not in drop:
        c.add(commonality_weight('first_comm','first_comm'))
    if 'comm_last' not in drop:
        c.add(commonality_weight('last_comm','last_comm'))

    # Exact-match indicators, registered last and in this fixed order.
    for col in ('marstat','mbp','fbp','rel','first_nysiis','last_nysiis'):
        if col not in drop:
            c.exact(col,col)
    return c

## Load the training data

In [3]:
# Read the training pairs from disk.
df = pd.read_csv(trainPath)

# Open one record source per census year, then fetch the full record for
# every distinct ark that appears in the training pairs.
sql1900 = RecordDB('compiled_1900','ark1900','rec_db')
sql1920 = RecordDB('compiled_1920','ark1920','rec_db')
train_arks1900 = df['ark1900'].drop_duplicates()
train_arks1920 = df['ark1920'].drop_duplicates()
rec1900 = sql1900.get_records(train_arks1900).set_index('index')
rec1920 = sql1920.get_records(train_arks1920).set_index('index')

In [4]:
# Candidate pairs as a (1900 ark, 1920 ark) MultiIndex.
pairs = pd.MultiIndex.from_arrays([df['ark1900'], df['ark1920']])

# A pair is labelled True when the candidate 1920 ark equals the true 1920 ark.
y = df['ark1920'].eq(df['true_ark_1920'])

# Class balance as fractions.
y.value_counts(normalize=True)

False    0.874904
True     0.125096
dtype: float64

In [5]:
# Replace each frame's index with its own `index_` attribute.
# NOTE(review): presumably `index_` is a column holding the ark ids that
# `pairs` refers to -- confirm against RecordDB.get_records.
rec1900.index, rec1920.index = rec1900.index_, rec1920.index_

In [6]:
# Build the comparison engine without the occupation-vector and NYSIIS
# features, then compute the features for every training pair.
c = get_compare_engine(drop=['occ','first_nysiis','last_nysiis'])
X = c.compute(pairs,rec1900,rec1920)

# Name the feature columns; the order has to follow the order in which
# get_compare_engine registers the retained features.
train_feature_names = [
    'res', 'bp', 'first_jaro', 'last_jaro', 'birth_year', 'immigration',
    'first_comm', 'last_comm', 'marstat', 'mbp', 'fbp', 'rel',
]
X.columns = train_feature_names

In [7]:
# Summary statistics for the training feature matrix. A per-column count
# below the row count indicates missing values in that feature.
X.describe()

Unnamed: 0,res,bp,first_jaro,last_jaro,birth_year,immigration,first_comm,last_comm,marstat,mbp,fbp,rel
count,219303.0,219303.0,219303.0,219303.0,219303.0,219303.0,219282.0,219149.0,219303.0,219303.0,219303.0,219303.0
mean,0.584706,0.999986,0.9378,0.927219,0.615559,0.042653,0.078292,0.101102,0.617269,0.698394,0.681833,0.588888
std,0.329039,0.003699,0.154777,0.127898,0.412514,0.190296,0.017104,0.035528,0.486055,0.458956,0.465766,0.492037
min,0.0,0.0,0.0,0.0,0.0,0.0,0.069103,0.072581,0.0,0.0,0.0,0.0
25%,0.29832,1.0,1.0,0.9,0.0,0.0,0.070884,0.080839,0.0,0.0,0.0,0.0
50%,0.656961,1.0,1.0,1.0,0.5,0.0,0.074171,0.091911,1.0,1.0,1.0,1.0
75%,0.868014,1.0,1.0,1.0,1.0,0.0,0.08137,0.107847,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.442695,1.091357,1.0,1.0,1.0,1.0


## Load the test data

In [8]:
# Read the test pairs and give the three columns fixed names.
val = pd.read_csv(testPath)
val.columns = ['ark1900','ark1920','true_ark1920']

# Label each pair and build the pair index.
# NOTE(review): this rebinds `pairs`, replacing the training pair index;
# re-running the training feature cell after this one would use test pairs.
val['truth'] = val['ark1920'].eq(val['true_ark1920'])
pairs = pd.MultiIndex.from_arrays([val['ark1900'], val['ark1920']])

# Fetch the full records for every distinct ark in the test pairs and
# re-index each frame by its own `index_` attribute.
test_arks1920 = val['ark1920'].drop_duplicates().tolist()
test_arks1900 = val['ark1900'].drop_duplicates().tolist()
recb = sql1920.get_records(test_arks1920).set_index('index')
reca = sql1900.get_records(test_arks1900).set_index('index')
reca.index, recb.index = reca.index_, recb.index_

# Compute the test features with the same comparison engine as training.
test_X = c.compute(pairs, reca, recb)
test_y = val['truth']

test_feature_names = [
    'res', 'bp', 'first_jaro', 'last_jaro', 'birth_year', 'immigration',
    'first_comm', 'last_comm', 'marstat', 'mbp', 'fbp', 'rel',
]
test_X.columns = test_feature_names

test_X.describe()

Unnamed: 0,res,bp,first_jaro,last_jaro,birth_year,immigration,first_comm,last_comm,marstat,mbp,fbp,rel
count,93987.0,93987.0,93987.0,93987.0,93987.0,93987.0,93976.0,93931.0,93987.0,93987.0,93987.0,93987.0
mean,0.585779,0.999979,0.93704,0.926442,0.617479,0.043389,0.078301,0.101268,0.616809,0.697224,0.681552,0.59006
std,0.32852,0.004613,0.155697,0.128521,0.412936,0.191372,0.017017,0.036013,0.486167,0.459462,0.465877,0.491825
min,0.0,0.0,0.0,0.0,0.0,0.0,0.069103,0.072581,0.0,0.0,0.0,0.0
25%,0.300413,1.0,1.0,0.9,0.0,0.0,0.070884,0.08084,0.0,0.0,0.0,0.0
50%,0.658113,1.0,1.0,1.0,0.5,0.0,0.074171,0.091911,1.0,1.0,1.0,1.0
75%,0.869093,1.0,1.0,1.0,1.0,0.0,0.0814,0.108154,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.091357,1.091357,1.0,1.0,1.0,1.0


## Train using three algorithms

In [11]:
# Train on nearest centroid.
from sklearn.neighbors import NearestCentroid

# Fill missing feature values with the training-set column means; the same
# training means are applied to the test features.
train_means = X.mean()
X_imputed = X.fillna(train_means)
test_X_imputed = test_X.fillna(train_means)

model = NearestCentroid()
model.fit(X_imputed, y)

y_pred_val = model.predict(test_X_imputed)
y_pred = model.predict(X_imputed)

# Score both splits, then report.
train_recall, train_precision, train_f1 = (
    scorer(y, y_pred) for scorer in (recall_score, precision_score, f1_score)
)
val_recall, val_precision, val_f1 = (
    scorer(test_y, y_pred_val) for scorer in (recall_score, precision_score, f1_score)
)

print(f'train_recall: {train_recall}')
print(f'train_precision: {train_precision}\n')
print(f'val recall: {val_recall}')
print(f'val precision: {val_precision}\n')
print(f'train_f1_score: {train_f1}')
print(f'test_f1_score: {val_f1}')

train_recall: 0.8033462127287307
train_precision: 0.28021614748887474

val recall: 0.7983385606510129
val precision: 0.2783096926713948

train_f1_score: 0.41550092379623693
test_f1_score: 0.41273528058373693


In [13]:
# Train using Logistic Regression
from sklearn.linear_model import LogisticRegression

# Fill missing feature values with the training-set column means; the same
# training means are applied to the test features.
train_means = X.mean()
X_imputed = X.fillna(train_means)
test_X_imputed = test_X.fillna(train_means)

model = LogisticRegression(max_iter=300)
model.fit(X_imputed, y)

# Predict
y_pred_val = model.predict(test_X_imputed)
y_pred = model.predict(X_imputed)

# Compute the stats for both splits, then print them.
train_recall, train_precision, train_f1 = (
    scorer(y, y_pred) for scorer in (recall_score, precision_score, f1_score)
)
val_recall, val_precision, val_f1 = (
    scorer(test_y, y_pred_val) for scorer in (recall_score, precision_score, f1_score)
)

print(f'train_recall: {train_recall}')
print(f'train_precision: {train_precision}\n')
print(f'val recall: {val_recall}')
print(f'val precision: {val_precision}\n')
print(f'train_f1_score: {train_f1}')
print(f'test_f1_score: {val_f1}')

train_recall: 0.6241160603630531
train_precision: 0.8327821011673152

val recall: 0.6261761464779181
val precision: 0.8293477040529921

train_f1_score: 0.7135058548985289
test_f1_score: 0.7135819165378671


In [14]:
# Train using XGB with default settings; the features are passed as-is
# (no mean imputation in this cell).
model = XGBClassifier()
model.fit(X, y)

y_pred_val = model.predict(test_X)
y_pred = model.predict(X)

# Score both splits, then report.
train_recall, train_precision, train_f1 = (
    scorer(y, y_pred) for scorer in (recall_score, precision_score, f1_score)
)
test_recall, test_precision, test_f1 = (
    scorer(test_y, y_pred_val) for scorer in (recall_score, precision_score, f1_score)
)

print(f'train_recall: {train_recall}')
print(f'train_precision: {train_precision}\n')
print(f'test_recall: {test_recall}')
print(f'test_precision: {test_precision}\n')
print(f'train_f1_score: {train_f1}')
print(f'test_f1_score: {test_f1}')


train_recall: 0.8225924035867901
train_precision: 0.8801138801138801

test_recall: 0.78697974061202
test_precision: 0.8601074671113582

train_f1_score: 0.8503815355628828
test_f1_score: 0.8219202337213936


## Tune hyperparameters for XGBoost

In [17]:
# Grid-search the following XGBoost hyperparameters.
# NOTE(review): every candidate is scored on the test set, so the best score
# printed here is an optimistic estimate; consider selecting on a separate
# validation split or with cross-validation.
from itertools import product

learning_rates=[.3,.4]
max_depth=[5,6]
alpha_vals = [0,0.5]
lambda_vals = [0,1]
n_jobs=16

# product() advances the right-most list fastest, which is the same visiting
# order as four nested for-loops over these lists.
for lr, depth, alph, lam in product(learning_rates, max_depth, alpha_vals, lambda_vals):
    model = XGBClassifier(
        learning_rate=lr, max_depth=depth, n_jobs=n_jobs,
        reg_alpha=alph, reg_lambda=lam)
    model.fit(X,y)
    y_pred_val = model.predict(test_X)
    # One line per candidate: test F1 followed by the settings that produced it.
    print(f1_score(test_y, y_pred_val), lr, depth, alph, lam)

0.8178675884073385 0.3 5 0 0
0.8210078823841999 0.3 5 0 1
0.8192856509793495 0.3 5 0.5 0
0.819473334808586 0.3 5 0.5 1
0.8238571049606509 0.3 6 0 0
0.8219202337213936 0.3 6 0 1
0.8223027304055845 0.3 6 0.5 0
0.8228010436474593 0.3 6 0.5 1
0.8210917784566344 0.4 5 0 0
0.8227534307215583 0.4 5 0 1
0.8223910930458602 0.4 5 0.5 0
0.8223951883955423 0.4 5 0.5 1
0.8238985372751778 0.4 6 0 0
0.8264032805679262 0.4 6 0 1
0.8246721709567751 0.4 6 0.5 0
0.8263283108643935 0.4 6 0.5 1


## Recreate and save our best model

In [19]:
# Refit on the training data with the chosen hyperparameter values.
best_params = dict(learning_rate=0.4, max_depth=6, reg_alpha=0, reg_lambda=1)
model = XGBClassifier(n_jobs=n_jobs, **best_params)
model.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.4, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [20]:
# Predict on both splits with the refit model.
y_pred = model.predict(X)
y_pred_val = model.predict(test_X)

# Flatten the test-set confusion matrix and unpack its four cells.
test_confusion = confusion_matrix(test_y, y_pred_val)
tn, fp, fn, tp = test_confusion.ravel()
tn, fp, fn, tp

(80679, 1511, 2426, 9371)

In [22]:
# Save the model.
import pickle
# Write through a context manager so the file handle is flushed and closed
# as soon as the dump finishes, rather than being left open.
with open("model_1900_1920.dat", "wb") as model_file:
    pickle.dump(model, model_file)

In [23]:
# Load the model
# NOTE: unpickling can execute arbitrary code, so only load a model file
# that comes from a trusted source.
# The context manager closes the file handle once loading is done.
with open("model_1900_1920.dat", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Sanity check: the reloaded model can predict on the training features.
loaded_model.predict(X)

array([False, False, False, ...,  True, False, False])