In [3]:
import pandas as pd
from splycer.blocker import BlockDB
from splycer.record_set import RecordDB
from splycer.pairs_set import PairsDB
from splycer.feature_engineer import FeatureEngineer
import recordlinkage as rl
import pyodbc
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import pickle as pkl
from tqdm import tqdm

# Set up a database connection
import turbodbc
conn = turbodbc.connect('rec_db')  # 'rec_db' is presumably an ODBC DSN name -- TODO confirm

# Absolute paths of the train/test candidate-pair CSVs inside the split folder.
import os.path
basePath = r'R:\JoePriceResearch\record_linking\projects\deep_learning\paper_RR\CensusTree_2020\final'
trainPath, testPath = (
    os.path.abspath(os.path.join(basePath, '2-split_train_test', fileName))
    for fileName in ('train_1910_1920.csv', 'test_1910_1920.csv')
)

## Create the classes and compare engine for comparing features.

In [4]:
from recordlinkage.base import BaseCompareFeature

class eucledian_distance(BaseCompareFeature):
    """Row-wise Euclidean (L2) distance between two groups of columns.

    Parameters
    ----------
    left_on : str or sequence of str
        Column label(s) read from the left record set.
    right_on : str or sequence of str
        Column label(s) read from the right record set. It should name as
        many columns as ``left_on`` so the two groups line up.

    Notes
    -----
    A single string label is treated as one column. Taking ``len()`` of the
    string directly would count its characters and split the incoming
    columns at the wrong position.
    """

    def __init__(self, left_on, right_on):
        super().__init__(left_on, right_on)
        # Number of left-hand columns; used to split the flat argument list
        # that _compute_vectorized receives.
        left_labels = [left_on] if isinstance(left_on, str) else left_on
        self.n = len(left_labels)

    def _compute_vectorized(self, *args):
        """Return the L2 norm of (left - right) for every candidate pair.

        ``args`` holds the left columns first (``self.n`` of them), followed
        by the right columns.
        """
        left_cols = args[:self.n]
        right_cols = args[self.n:]
        # Stack each group as (n_columns, n_pairs) and reduce over the column
        # axis, giving one distance per pair.
        return np.linalg.norm(np.array(left_cols) - np.array(right_cols), ord=2, axis=0)
    
class commonality_weight(BaseCompareFeature):
    """Comparison feature equal to ``1 / log1p(mean(left, right))``.

    The larger the average of the two input columns, the smaller the weight.
    NOTE(review): the inputs are presumably name-frequency counts
    (``first_comm`` / ``last_comm``) -- confirm against the compiled tables.
    """

    def __init__(self, left_on, right_on):
        super().__init__(left_on, right_on)

    def _compute_vectorized(self, s1, s2):
        """Return the weight for each pair from the left (s1) and right (s2) values."""
        pair_mean = (s1 + s2) / 2
        return 1 / np.log1p(pair_mean)
    
def get_compare_engine(drop=()):
    """Build the ``recordlinkage.Compare`` object used to featurize record pairs.

    Parameters
    ----------
    drop : str, iterable of str, or None, optional
        Feature keys to leave out. Recognised keys are ``'res'``, ``'bp'``,
        ``'first_jaro'``, ``'last_jaro'``, ``'birth_year'``, ``'immigration'``,
        ``'occ'``, ``'comm_first'``, ``'comm_last'`` and the exact-match
        columns ``'marstat'``, ``'mbp'``, ``'fbp'``, ``'rel'``,
        ``'first_nysiis'``, ``'last_nysiis'``. A single string is treated as
        one key; ``None`` drops nothing.

    Returns
    -------
    recordlinkage.Compare
        Comparison object with the remaining features added in a fixed order:
        res, bp, first_jaro, last_jaro, birth_year, immigration, occ,
        comm_first, comm_last, then the exact-match columns in the order
        listed above. Callers that rename the output columns positionally
        rely on this order.
    """
    # Normalise `drop` so membership tests are exact matches. A bare string
    # would otherwise be matched by substring, and None would raise.
    if drop is None:
        dropped = set()
    elif isinstance(drop, str):
        dropped = {drop}
    else:
        dropped = set(drop)

    c = rl.Compare()  # declare comparison object

    # Geographic similarity of residence and birthplace coordinates.
    if 'res' not in dropped:
        c.geo('res_lat', 'res_lon', 'res_lat', 'res_lon', method='exp', scale=500)
    if 'bp' not in dropped:
        c.geo('bp_lat', 'bp_lon', 'bp_lat', 'bp_lon', method='exp', scale=500)

    # Jaro-Winkler similarity of first and last names.
    if 'first_jaro' not in dropped:
        c.string('first', 'first', method='jarowinkler')
    if 'last_jaro' not in dropped:
        c.string('last', 'last', method='jarowinkler')

    # Linear similarity of the year fields.
    if 'birth_year' not in dropped:
        c.numeric('birth_year', 'birth_year', method='lin', scale=1, offset=1)
    if 'immigration' not in dropped:
        c.numeric('immigration', 'immigration', method='lin', scale=1, offset=1)

    # Distance between the 128 occupation-vector columns occ_vec0..occ_vec127.
    if 'occ' not in dropped:
        vec_cols = [f'occ_vec{i}' for i in range(128)]
        c.add(eucledian_distance(vec_cols, vec_cols))

    # Name-commonality weights.
    if 'comm_first' not in dropped:
        c.add(commonality_weight('first_comm', 'first_comm'))
    if 'comm_last' not in dropped:
        c.add(commonality_weight('last_comm', 'last_comm'))

    # Exact-match indicators, added last.
    for col in ('marstat', 'mbp', 'fbp', 'rel', 'first_nysiis', 'last_nysiis'):
        if col not in dropped:
            c.exact(col, col)
    return c

## Load the training data

In [5]:
# Get the training set (candidate pairs; columns used later: ark1910, ark1920, true_ark_1920).
df = pd.read_csv(trainPath)

# Get the full data using SQL: one RecordDB handle per census year.
sql1910 = RecordDB('compiled_1910', 'ark1910', 'rec_db')
sql1920 = RecordDB('compiled_1920', 'ark1920', 'rec_db')

# Fetch each distinct record once and index the result by its 'index' column.
rec1910 = (
    sql1910
    .get_records(df['ark1910'].drop_duplicates())
    .set_index('index')
)
rec1920 = (
    sql1920
    .get_records(df['ark1920'].drop_duplicates())
    .set_index('index')
)

In [6]:
# Candidate pairs as a two-level (ark1910, ark1920) index.
pairs = pd.MultiIndex.from_arrays([df['ark1910'], df['ark1920']])

# Create the truth value: a pair is a match when its 1920 ark equals the labelled one.
y = df['ark1920'].eq(df['true_ark_1920'])

# Share of non-matches vs matches in the training pairs.
y.value_counts(normalize=True)

False    0.931924
True     0.068076
dtype: float64

In [7]:
# Replace the index of each record frame, in place, with the values of `index_`.
# The frames were indexed by their 'index' column when loaded.
# NOTE(review): `index_` is presumably a column returned by get_records that
# holds the ark ids used in `pairs` -- confirm against RecordDB.
rec1910.index = rec1910.index_
rec1920.index = rec1920.index_

In [9]:
# Build the comparison engine without the occupation, NYSIIS and residence features.
c = get_compare_engine(drop=['occ', 'first_nysiis', 'last_nysiis', 'res'])

# One row of comparison features per candidate pair.
X = c.compute(pairs, rec1910, rec1920)

# Positional rename: the order must match the order in which
# get_compare_engine adds the remaining features.
X.columns = [
    'bp', 'first_jaro', 'last_jaro', 'birth_year', 'immigration',
    'first_comm', 'last_comm', 'marstat', 'mbp', 'fbp', 'rel',
]

In [10]:
# Summary statistics of the training features. A column whose count is lower
# than the others contains missing values.
X.describe()

Unnamed: 0,bp,first_jaro,last_jaro,birth_year,immigration,first_comm,last_comm,marstat,mbp,fbp,rel
count,213217.0,213217.0,213217.0,213217.0,213217.0,213099.0,212194.0,213217.0,213217.0,213217.0,213217.0
mean,0.999991,0.919006,0.830371,0.481066,0.027688,0.078902,0.108761,0.73595,0.643457,0.624734,0.695615
std,0.003063,0.155184,0.172934,0.443577,0.154307,0.015181,0.044535,0.440827,0.478979,0.484193,0.460147
min,0.0,0.0,0.0,0.0,0.0,0.06896,0.072193,0.0,0.0,0.0,0.0
25%,1.0,0.885714,0.666667,0.0,0.0,0.071743,0.086103,0.0,0.0,0.0,0.0
50%,1.0,1.0,0.866667,0.5,0.0,0.076312,0.096981,1.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,0.0,0.082698,0.115864,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.442695,1.091357,1.0,1.0,1.0,1.0


## Loading the test data

In [11]:
# Load in the test data.
val = pd.read_csv(testPath)
# Positional rename; the assignment fails unless the CSV has exactly three columns.
val.columns = ['ark1910','ark1920','true_ark1920']

# A candidate pair is a true match when its 1920 ark equals the labelled one.
val['truth'] = val['ark1920']==val['true_ark1920']

# Use a dedicated name so the training `pairs` index is not overwritten.
# Otherwise, re-running the training feature cell after this one would
# compute X from the test pairs.
test_pairs = pd.MultiIndex.from_arrays((val['ark1910'],val['ark1920']))

# Fetch each distinct test record once, then re-index the frames by `index_`
# exactly as was done for the training records.
recb = sql1920.get_records(val['ark1920'].drop_duplicates().tolist()).set_index('index')
reca = sql1910.get_records(val['ark1910'].drop_duplicates().tolist()).set_index('index')
reca.index=reca.index_
recb.index=recb.index_

# Reuse the comparison engine `c` built for the training features so train and
# test share the same feature set and column order.
test_X=c.compute(test_pairs,reca,recb)

test_y = val['truth']
test_X.columns=['bp','first_jaro','last_jaro','birth_year','immigration','first_comm',
           'last_comm','marstat','mbp','fbp','rel']

test_X.describe()

Unnamed: 0,bp,first_jaro,last_jaro,birth_year,immigration,first_comm,last_comm,marstat,mbp,fbp,rel
count,91379.0,91379.0,91379.0,91379.0,91379.0,91339.0,90916.0,91379.0,91379.0,91379.0,91379.0
mean,0.999989,0.918454,0.830825,0.478956,0.028765,0.078953,0.108448,0.734315,0.646297,0.627617,0.691505
std,0.003308,0.156634,0.172951,0.44312,0.157514,0.015576,0.043669,0.4417,0.478121,0.483442,0.461875
min,0.0,0.0,0.0,0.0,0.0,0.06896,0.072193,0.0,0.0,0.0,0.0
25%,1.0,0.883333,0.666667,0.0,0.0,0.071743,0.086026,0.0,0.0,0.0,0.0
50%,1.0,1.0,0.866667,0.5,0.0,0.076323,0.096981,1.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,0.0,0.082766,0.115759,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.442695,1.442695,1.0,1.0,1.0,1.0


## Train using three algorithms

In [14]:
# Train on nearest centroid.
from sklearn.neighbors import NearestCentroid

# Missing features are imputed with the TRAINING column means, for the test set too.
train_means = X.mean()
X_filled = X.fillna(train_means)

model = NearestCentroid()
model.fit(X_filled, y)

y_pred_val = model.predict(test_X.fillna(train_means))
y_pred = model.predict(X_filled)

# Report recall / precision / F1 on the training and held-out pairs.
print(
    f'train_recall: {recall_score(y, y_pred)}',
    f'train_precision: {precision_score(y, y_pred)}\n',
    f'val recall: {recall_score(test_y, y_pred_val)}',
    f'val precision: {precision_score(test_y, y_pred_val)}\n',
    f'train_f1_score: {f1_score(y, y_pred)}',
    f'test_f1_score: {f1_score(test_y, y_pred_val)}',
    sep='\n',
)

train_recall: 0.882190837065105
train_precision: 0.19571430754887126

val recall: 0.8731511254019293
val precision: 0.19463159403669725

train_f1_score: 0.3203572590127843
test_f1_score: 0.3183096940569687


In [15]:
# Train using Logistic Regression
from sklearn.linear_model import LogisticRegression

# Missing features are imputed with the TRAINING column means, for the test set too.
train_means = X.mean()
X_filled = X.fillna(train_means)

model = LogisticRegression(max_iter=300)
model.fit(X_filled, y)

# Predict
y_pred_val = model.predict(test_X.fillna(train_means))
y_pred = model.predict(X_filled)

# Print stats.
print(
    f'train_recall: {recall_score(y, y_pred)}',
    f'train_precision: {precision_score(y, y_pred)}\n',
    f'val recall: {recall_score(test_y, y_pred_val)}',
    f'val precision: {precision_score(test_y, y_pred_val)}\n',
    f'train_f1_score: {f1_score(y, y_pred)}',
    f'test_f1_score: {f1_score(test_y, y_pred_val)}',
    sep='\n',
)

train_recall: 0.5267654150878401
train_precision: 0.8222389504247769

val recall: 0.5290996784565917
val precision: 0.8186567164179105

train_f1_score: 0.642143277063912
test_f1_score: 0.6427734375


In [16]:
# Train using XGB.
# Unlike the two cells above, no imputation is applied: X and test_X are passed as-is.
model = XGBClassifier()
model.fit(X, y)

y_pred_val = model.predict(test_X)
y_pred = model.predict(X)

# Report recall / precision / F1 on the training and test pairs.
print(
    f'train_recall: {recall_score(y, y_pred)}',
    f'train_precision: {precision_score(y, y_pred)}\n',
    f'test_recall: {recall_score(test_y, y_pred_val)}',
    f'test_precision: {precision_score(test_y, y_pred_val)}\n',
    f'train_f1_score: {f1_score(y, y_pred)}',
    f'test_f1_score: {f1_score(test_y, y_pred_val)}',
    sep='\n',
)


train_recall: 0.7466758525663107
train_precision: 0.8477784730913642

test_recall: 0.7138263665594855
test_precision: 0.818735017517979

train_f1_score: 0.7940217590387927
test_f1_score: 0.7626900283432105


## Tune hyperparameters for XGBoost

In [17]:
# Check the following hyperparameters (exhaustive grid, 3 * 3 * 2 * 2 = 36 fits).
learning_rates = [.3, .4, .5]
max_depth = [5, 6, 7]
alpha_vals = [0, 0.5]
lambda_vals = [0, 1]
n_jobs = 16

# Every combination, with learning rate varying slowest and lambda fastest.
param_grid = [
    (lr, depth, alph, lam)
    for lr in learning_rates
    for depth in max_depth
    for alph in alpha_vals
    for lam in lambda_vals
]

# NOTE(review): each configuration is scored on test_X / test_y, so the chosen
# configuration is selected on the same data later used to report performance.
for lr, depth, alph, lam in param_grid:
    model = XGBClassifier(
        learning_rate=lr,
        max_depth=depth,
        n_jobs=n_jobs,
        reg_alpha=alph,
        reg_lambda=lam,
    )
    model.fit(X, y)
    y_pred_val = model.predict(test_X)
    # Printed as: f1, learning_rate, max_depth, reg_alpha, reg_lambda
    print(f1_score(test_y, y_pred_val), lr, depth, alph, lam)

0.7562705414288186 0.3 5 0 0
0.7579979360165119 0.3 5 0 1
0.7587987264435074 0.3 5 0.5 0
0.761105880338954 0.3 5 0.5 1
0.7593959155654711 0.3 6 0 0
0.7626900283432105 0.3 6 0 1
0.7614797013131921 0.3 6 0.5 0
0.7623379967384775 0.3 6 0.5 1
0.7633262260127933 0.3 7 0 0
0.7627380339680906 0.3 7 0 1
0.7651385562777969 0.3 7 0.5 0
0.7606888869848342 0.3 7 0.5 1
0.758086717136958 0.4 5 0 0
0.7600827300930714 0.4 5 0 1
0.7593226137529935 0.4 5 0.5 0
0.7597736625514403 0.4 5 0.5 1
0.7607435197817191 0.4 6 0 0
0.7601230138390569 0.4 6 0 1
0.7609272089641604 0.4 6 0.5 0
0.7607414367472453 0.4 6 0.5 1
0.7609864322894444 0.4 7 0 0
0.7594351732991014 0.4 7 0 1
0.7611927618723983 0.4 7 0.5 0
0.7631220177232448 0.4 7 0.5 1
0.7573264781491001 0.5 5 0 0
0.7601678225875503 0.5 5 0 1
0.7601988344189236 0.5 5 0.5 0
0.7584549356223177 0.5 5 0.5 1
0.760771649528342 0.5 6 0 0
0.7605103176641835 0.5 6 0 1
0.7581577158395649 0.5 6 0.5 0
0.7629692832764505 0.5 6 0.5 1
0.7572980312287849 0.5 7 0 0
0.763056999233

## Recreate and save our best model

In [18]:
# Refit the configuration chosen from the grid above
# (learning_rate=0.3, max_depth=7, reg_alpha=0.5, reg_lambda=0).
model = XGBClassifier(
    learning_rate=0.3,
    max_depth=7,
    n_jobs=n_jobs,
    reg_alpha=0.5,
    reg_lambda=0,
)
model.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0.5,
              reg_lambda=0, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [19]:
# Predictions of the refit model on the test and training pairs.
y_pred_val = model.predict(test_X)
y_pred = model.predict(X)

# Flatten the test-set confusion matrix into its four cells and display them.
test_confusion = confusion_matrix(test_y, y_pred_val)
tn, fp, fn, tp = test_confusion.ravel()
tn, fp, fn, tp

(84160, 999, 1747, 4473)

In [20]:
# Save the model.
import pickle

# The context manager flushes and closes the file even if pickling fails;
# a bare open(...) handle passed to pickle.dump is never closed explicitly.
with open("model_1910_1920_no_res.dat", "wb") as model_file:
    pickle.dump(model, model_file)

In [21]:
# Load the model
# NOTE: unpickling can execute arbitrary code; only load files this project wrote itself.
# The context manager closes the file handle once the model has been read.
with open("model_1910_1920_no_res.dat", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Check that the reloaded model predicts on the training features.
loaded_model.predict(X)

array([False, False, False, ..., False, False, False])