In [1]:
import pandas as pd
from splycer.blocker import BlockDB
from splycer.record_set import RecordDB
from splycer.pairs_set import PairsDB
from splycer.feature_engineer import FeatureEngineer
import recordlinkage as rl
import pyodbc
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score
import pickle as pkl
from tqdm import tqdm

import turbodbc
# Module-level turbodbc connection opened with the name 'rec_db'; run() obtains its
# cursor from this object, so this cell must execute before run() is called.
conn = turbodbc.connect('rec_db')

In [2]:
# Load the model
modelPath = r'R:\JoePriceResearch\record_linking\projects\deep_learning\paper_RR\CensusTree_2020\final\3-train_models\model_1900_1910.dat'
# NOTE: unpickling executes arbitrary code, so only load model files from a trusted location.
# The context manager closes the file handle as soon as the model has been read
# (the bare open() call used previously left it open for the life of the kernel).
with open(modelPath, "rb") as model_file:
    model = pkl.load(model_file)


In [3]:
# RecordDB handles for the 1910 and 1920 compiled tables.
# Presumably the positional arguments are (table name, id column, data source) -- TODO confirm
# against splycer.record_set.RecordDB.
# NOTE(review): sql1910 / sql1920 are not referenced by run() in this notebook section, which
# queries the tables directly through `conn`; confirm whether these handles are still needed.
extra_join = ''  # no additional JOIN clause is passed to either handle
sql1910 = RecordDB('compiled_1910','index','rec_db',extra_joins=extra_join)
sql1920 = RecordDB('compiled_1920','index','rec_db',extra_joins=extra_join)

def _fetch_frame(cursor, sql):
    """Execute ``sql`` on ``cursor`` and return the whole result set as a pandas DataFrame."""
    cursor.execute(sql)
    return cursor.fetchallarrow().to_pandas()


def run(outfile, chunksize=1000000, logfile='log5.txt', total=93000000):
    """Run the model on the full compare set, writing results to file.

    The candidate pairs in ``compares_1910_1920`` are processed in windows of
    ``index1910`` values: ``[0, chunksize)``, ``[chunksize, 2*chunksize)``, ...
    up to ``total``. For every window the matching 1910 and 1920 records are
    fetched, comparison vectors are computed with the module-level compare
    engine ``c``, the module-level ``model`` scores them, and the rows
    ``index1910, index1920, link_prob`` are appended to ``outfile``.

    Parameters
    ----------
    outfile : str
        CSV path. Rows are appended without a header, so re-running against an
        existing file adds duplicate rows.
    chunksize : int
        Width of each ``index1910`` window.
    logfile : str
        Unused; kept so existing callers that pass it keep working.
    total : int
        Exclusive upper bound on the ``index1910`` values to process.
    """
    # One cursor serves every query; the finally block releases it even if a chunk fails.
    cursor = conn.cursor()
    try:
        # range() only accepts ints, so the values interpolated into the SQL below are ints.
        for start in tqdm(range(0, total, chunksize)):
            print("start tqdm")

            # The same index1910 window selects the pairs and both record sets.
            window = (f"c19101920.index1910 < {start + chunksize} "
                      f"AND c19101920.index1910 >= {start}")
            curr_cte = f"""WITH curr AS (
            SELECT * FROM compares_1910_1920 c19101920 WHERE {window}
            ) """

            print("getting chunk1")
            rec1 = _fetch_frame(cursor, curr_cte + """
            SELECT DISTINCT "index" as index_,* FROM compiled_1910 where "index" in (SELECT index1910 FROM curr)""").set_index('index')

            print("getting chunk2")
            rec2 = _fetch_frame(cursor, curr_cte + """
            SELECT DISTINCT "index" as index_, * FROM compiled_1920 where "index" in (SELECT index1920 FROM curr)""").set_index('index')

            print("getting pairs")
            # Columns are named explicitly so that level 0 is always the 1920 id and
            # level 1 the 1910 id, independent of the table's physical column order.
            pairs_frame = _fetch_frame(
                cursor,
                "SELECT c19101920.index1920, c19101920.index1910 "
                f"FROM compares_1910_1920 c19101920 WHERE {window}")
            if pairs_frame.empty:
                # Nothing to score in this window; skip rather than feed empty input to the model.
                continue
            pairs = pd.MultiIndex.from_frame(pairs_frame)
            pairs.names = ['','']

            # Level 0 of `pairs` indexes rec2 (1920) and level 1 indexes rec1 (1910).
            comp_vecs = c.compute(pairs, rec2, rec1)
            # Columns are renamed positionally, so this list must follow the order in
            # which the compare engine `c` registered its features.
            comp_vecs.columns=['res','bp','first_jaro','last_jaro','birth_year','immigration','first_comm',
               'last_comm','marstat','mbp','fbp','rel']

            print("predicting")
            preds = model.predict_proba(comp_vecs) # predict matches with model
            print("Done predicting")

            # Pull the id levels out directly instead of looping over Python tuples.
            data = pd.DataFrame({'index1910': pairs.get_level_values(1).to_numpy(),
                                 'index1920': pairs.get_level_values(0).to_numpy(),
                                 'link_prob': preds[:,1]})
            print("saving")
            data.to_csv(outfile,mode='a',header=None,index=False)
    finally:
        cursor.close()
     

In [4]:
from recordlinkage.base import BaseCompareFeature

class eucledian_distance(BaseCompareFeature):
    """Compare feature returning the L2 distance between two groups of columns."""

    def __init__(self, left_on, right_on):
        super().__init__(left_on, right_on)
        # Size of the left-hand group; used to split the flat argument tuple.
        self.n = len(left_on)

    def _compute_vectorized(self, *args):
        """Split ``args`` after the first ``self.n`` entries and return the
        2-norm of the difference of the two groups, taken along axis 0."""
        width = self.n
        left_block = np.array(args[:width])
        right_block = np.array(args[width:])
        return np.linalg.norm(left_block - right_block, ord=2, axis=0)
    
class commonality_weight(BaseCompareFeature):
    """Compare feature: reciprocal of log1p of the mean of the two input values."""

    def __init__(self, left_on, right_on):
        super().__init__(left_on, right_on)

    def _compute_vectorized(self, s1, s2):
        """Return ``1 / log1p((s1 + s2) / 2)`` element-wise."""
        mean_value = (s1 + s2) / 2
        return 1 / np.log1p(mean_value)
    
def get_compare_engine(drop=[]):
    """Build the recordlinkage ``Compare`` object used to featurize record pairs.

    Features are registered in a fixed order: residence geo, birthplace geo,
    first/last name Jaro-Winkler, birth year, immigration year, occupation
    vector distance, first/last name commonality, then the exact-match columns.
    Any feature whose key appears in ``drop`` is left out.

    Parameters
    ----------
    drop : list of str
        Keys of features to skip: 'res', 'bp', 'first_jaro', 'last_jaro',
        'birth_year', 'immigration', 'occ', 'comm_first', 'comm_last', or the
        name of an exact-match column.

    Returns
    -------
    recordlinkage.Compare
        The configured comparison object.
    """
    engine = rl.Compare()

    # Geographic similarity on (lat, lon) pairs.
    for key, lat, lon in (('res', 'res_lat', 'res_lon'), ('bp', 'bp_lat', 'bp_lon')):
        if key not in drop:
            engine.geo(lat, lon, lat, lon, method='exp', scale=500)

    # Name similarity (q-gram variants of these comparisons are disabled).
    for key, column in (('first_jaro', 'first'), ('last_jaro', 'last')):
        if key not in drop:
            engine.string(column, column, method='jarowinkler')

    # Numeric year comparisons; the drop key is the column name itself.
    for column in ('birth_year', 'immigration'):
        if column not in drop:
            engine.numeric(column, column, method='lin', scale=1, offset=1)

    # Distance between the 128 occ_vec columns.
    if 'occ' not in drop:
        occ_columns = [f'occ_vec{i}' for i in range(128)]
        engine.add(eucledian_distance(occ_columns, occ_columns))

    # Name-commonality weights.
    for key, column in (('comm_first', 'first_comm'), ('comm_last', 'last_comm')):
        if key not in drop:
            engine.add(commonality_weight(column, column))

    # Exact-match features come last.
    for column in ('marstat', 'mbp', 'fbp', 'rel', 'first_nysiis', 'last_nysiis'):
        if column not in drop:
            engine.exact(column, column)

    return engine

# Module-level compare engine that run() calls via c.compute(...). Dropping the occupation
# vector distance and the two NYSIIS exact matches leaves 12 features, matching the 12
# column names run() assigns to the comparison vectors.
c = get_compare_engine(drop=['occ','first_nysiis','last_nysiis'])

In [5]:
# PairsDB handle on the candidate-pair table.
# NOTE(review): blocked_pairs is not passed to run(), which queries compares_1910_1920
# directly; confirm whether this handle is still needed.
blocked_pairs = PairsDB('compares_1910_1920',['index1910','index1920'],'rec_db')
# Destination CSV for the predictions. run() opens it in append mode, so an existing
# file from an earlier run will accumulate duplicate rows.
savePath = r"R:\JoePriceResearch\record_linking\projects\deep_learning\paper_RR\CensusTree_2020\final\4-predict\predictions_1910_1920_using_1900_1910.csv"
run(savePath)

  0%|                                                                               | 0/93 [00:00<?, ?it/s]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


  1%|▋                                                                    | 1/93 [00:59<1:30:31, 59.04s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


  2%|█▍                                                                   | 2/93 [02:05<1:32:57, 61.29s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


  3%|██▏                                                                  | 3/93 [03:19<1:37:49, 65.21s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


  4%|██▉                                                                  | 4/93 [04:24<1:36:14, 64.88s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


  5%|███▋                                                                 | 5/93 [06:44<2:08:25, 87.56s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


  6%|████▍                                                                | 6/93 [08:18<2:09:56, 89.62s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


  8%|█████                                                               | 7/93 [10:25<2:24:21, 100.72s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


  9%|█████▊                                                              | 8/93 [13:44<3:04:16, 130.08s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 10%|██████▌                                                             | 9/93 [15:39<2:56:03, 125.75s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 11%|███████▏                                                           | 10/93 [17:02<2:36:10, 112.90s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 12%|███████▉                                                           | 11/93 [18:20<2:19:57, 102.40s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 13%|████████▊                                                           | 12/93 [19:45<2:11:20, 97.29s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 14%|█████████▎                                                         | 13/93 [21:43<2:17:42, 103.29s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 15%|██████████                                                         | 14/93 [25:22<3:01:57, 138.20s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 16%|██████████▊                                                        | 15/93 [29:17<3:37:14, 167.11s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 17%|███████████▌                                                       | 16/93 [32:36<3:46:35, 176.56s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 18%|████████████▏                                                      | 17/93 [35:47<3:49:09, 180.91s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 19%|████████████▉                                                      | 18/93 [38:53<3:48:20, 182.68s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 20%|█████████████▋                                                     | 19/93 [41:31<3:35:59, 175.13s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 22%|██████████████▍                                                    | 20/93 [42:56<3:00:13, 148.14s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 23%|███████████████▏                                                   | 21/93 [44:43<2:42:52, 135.72s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 24%|███████████████▊                                                   | 22/93 [47:29<2:51:14, 144.71s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 25%|████████████████▌                                                  | 23/93 [50:14<2:56:02, 150.90s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 26%|█████████████████▎                                                 | 24/93 [53:05<3:00:25, 156.90s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 27%|██████████████████                                                 | 25/93 [55:42<2:57:53, 156.97s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 28%|██████████████████▋                                                | 26/93 [57:14<2:33:23, 137.36s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 29%|███████████████████▍                                               | 27/93 [58:44<2:15:37, 123.30s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 30%|███████████████████▌                                             | 28/93 [1:00:29<2:07:36, 117.79s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 31%|████████████████████▎                                            | 29/93 [1:01:41<1:50:57, 104.02s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 32%|████████████████████▉                                            | 30/93 [1:03:27<1:49:46, 104.55s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 33%|██████████████████████                                            | 31/93 [1:04:42<1:38:55, 95.74s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 34%|██████████████████████▋                                           | 32/93 [1:05:58<1:31:20, 89.85s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 35%|███████████████████████▍                                          | 33/93 [1:07:19<1:27:19, 87.32s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 37%|████████████████████████▏                                         | 34/93 [1:08:56<1:28:45, 90.27s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 38%|████████████████████████▊                                         | 35/93 [1:10:18<1:24:35, 87.51s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 39%|█████████████████████████▌                                        | 36/93 [1:11:59<1:27:11, 91.79s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 40%|██████████████████████████▎                                       | 37/93 [1:13:25<1:23:55, 89.92s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 41%|██████████████████████████▉                                       | 38/93 [1:14:41<1:18:37, 85.78s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 42%|███████████████████████████▋                                      | 39/93 [1:16:23<1:21:27, 90.51s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 43%|███████████████████████████▉                                     | 40/93 [1:19:34<1:46:40, 120.77s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 44%|████████████████████████████▋                                    | 41/93 [1:20:59<1:35:28, 110.16s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 45%|█████████████████████████████▎                                   | 42/93 [1:23:58<1:51:06, 130.72s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 46%|██████████████████████████████                                   | 43/93 [1:27:09<2:04:05, 148.91s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 47%|██████████████████████████████▊                                  | 44/93 [1:29:38<2:01:37, 148.93s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 48%|███████████████████████████████▍                                 | 45/93 [1:30:40<1:38:17, 122.85s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 49%|████████████████████████████████▏                                | 46/93 [1:31:43<1:22:08, 104.85s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 51%|█████████████████████████████████▎                                | 47/93 [1:32:54<1:12:36, 94.71s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 52%|█████████████████████████████████▌                               | 48/93 [1:35:05<1:19:02, 105.39s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 53%|██████████████████████████████████▏                              | 49/93 [1:38:29<1:39:03, 135.09s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 54%|██████████████████████████████████▉                              | 50/93 [1:42:07<1:54:34, 159.88s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 55%|███████████████████████████████████▋                             | 51/93 [1:45:25<2:00:03, 171.52s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 56%|████████████████████████████████████▎                            | 52/93 [1:47:22<1:45:56, 155.03s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 57%|█████████████████████████████████████                            | 53/93 [1:49:48<1:41:39, 152.48s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 58%|█████████████████████████████████████▋                           | 54/93 [1:52:58<1:46:22, 163.65s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 59%|██████████████████████████████████████▍                          | 55/93 [1:56:19<1:50:39, 174.73s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 60%|███████████████████████████████████████▏                         | 56/93 [1:59:19<1:48:43, 176.30s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 61%|███████████████████████████████████████▊                         | 57/93 [2:00:11<1:23:25, 139.04s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 62%|████████████████████████████████████████▌                        | 58/93 [2:01:43<1:12:58, 125.10s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 63%|█████████████████████████████████████████▏                       | 59/93 [2:04:20<1:16:10, 134.42s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 65%|█████████████████████████████████████████▉                       | 60/93 [2:06:34<1:13:56, 134.43s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 66%|██████████████████████████████████████████▋                      | 61/93 [2:08:21<1:07:16, 126.14s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 67%|███████████████████████████████████████████▎                     | 62/93 [2:11:19<1:13:11, 141.66s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 68%|████████████████████████████████████████████                     | 63/93 [2:13:51<1:12:25, 144.85s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 69%|████████████████████████████████████████████▋                    | 64/93 [2:16:44<1:14:05, 153.29s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 70%|█████████████████████████████████████████████▍                   | 65/93 [2:19:38<1:14:23, 159.41s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 71%|██████████████████████████████████████████████▏                  | 66/93 [2:22:26<1:12:54, 162.02s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 72%|██████████████████████████████████████████████▊                  | 67/93 [2:25:59<1:16:53, 177.45s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 73%|███████████████████████████████████████████████▌                 | 68/93 [2:29:46<1:20:09, 192.39s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 74%|████████████████████████████████████████████████▏                | 69/93 [2:33:41<1:22:03, 205.13s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 75%|████████████████████████████████████████████████▉                | 70/93 [2:37:24<1:20:42, 210.54s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 76%|█████████████████████████████████████████████████▌               | 71/93 [2:41:02<1:17:57, 212.63s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 77%|██████████████████████████████████████████████████▎              | 72/93 [2:44:47<1:15:40, 216.23s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 78%|███████████████████████████████████████████████████              | 73/93 [2:48:00<1:09:49, 209.50s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 80%|███████████████████████████████████████████████████▋             | 74/93 [2:51:11<1:04:30, 203.71s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 81%|████████████████████████████████████████████████████▍            | 75/93 [2:54:49<1:02:28, 208.26s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 82%|██████████████████████████████████████████████████████▊            | 76/93 [2:55:47<46:09, 162.92s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 83%|███████████████████████████████████████████████████████▍           | 77/93 [2:56:44<34:58, 131.16s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 84%|████████████████████████████████████████████████████████▏          | 78/93 [2:59:24<34:58, 139.93s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 85%|████████████████████████████████████████████████████████▉          | 79/93 [3:02:02<33:53, 145.26s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 86%|█████████████████████████████████████████████████████████▋         | 80/93 [3:04:58<33:28, 154.49s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 87%|██████████████████████████████████████████████████████████▎        | 81/93 [3:07:54<32:10, 160.87s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 88%|███████████████████████████████████████████████████████████        | 82/93 [3:10:50<30:22, 165.68s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 89%|███████████████████████████████████████████████████████████▊       | 83/93 [3:12:31<24:20, 146.05s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 90%|████████████████████████████████████████████████████████████▌      | 84/93 [3:13:41<18:30, 123.37s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 91%|█████████████████████████████████████████████████████████████▏     | 85/93 [3:16:07<17:20, 130.05s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 92%|█████████████████████████████████████████████████████████████▉     | 86/93 [3:20:32<19:53, 170.50s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 94%|██████████████████████████████████████████████████████████████▋    | 87/93 [3:24:31<19:06, 191.06s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 95%|███████████████████████████████████████████████████████████████▍   | 88/93 [3:28:24<16:58, 203.63s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 96%|████████████████████████████████████████████████████████████████   | 89/93 [3:32:21<14:14, 213.60s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 97%|████████████████████████████████████████████████████████████████▊  | 90/93 [3:36:16<11:00, 220.04s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 98%|█████████████████████████████████████████████████████████████████▌ | 91/93 [3:40:05<07:25, 222.85s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


 99%|██████████████████████████████████████████████████████████████████▎| 92/93 [3:44:12<03:50, 230.04s/it]

start tqdm
getting chunk1
getting chunk2
getting pairs
predicting
Done predicting
saving


100%|███████████████████████████████████████████████████████████████████| 93/93 [3:47:57<00:00, 147.07s/it]
