In [22]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, auc, roc_auc_score, roc_curve
import datetime
import joblib
from pathlib import Path

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
np.set_printoptions(suppress = True)

In [2]:
# data and model imports

val_df = pd.read_csv(r'C:\Users\caseyrya\Dropbox\foursquare_location_matching_data\data\model_training\val_features.csv')

print(val_df.shape)
val_df.head()

(114163, 28)


Unnamed: 0,id_1,id_2,match,name_ratio,city_ratio,state_ratio,zip_ratio,country_ratio,url_ratio,categories_ratio,name_ratio_part,city_ratio_part,state_ratio_part,zip_ratio_part,country_ratio_part,url_ratio_part,categories_ratio_part,proximity,full_address_1_lang,full_address_2_lang,same_lang,len_name_diff,len_address_diff,len_city_diff,len_state_diff,len_zip_diff,len_phone_diff,len_categories_diff
0,E_000008a8ba4f48,E_0a0413f6011f21,False,40,25,100,100,100,100,45,40,50,100,100,100,100,50,0.074396,tr,tr,1.0,-4,2,-2,0,0,,-13
1,E_00001d92066153,E_7e0d8e9138dd56,True,65,21,29,0,100,100,100,100,38,40,0,100,100,100,0.011176,es,es,1.0,12,-22,-12,4,-2,,0
2,E_00002a131a2bf6,E_b0acaf62ca34b4,False,9,100,100,100,100,100,100,12,100,100,100,100,100,100,0.311571,en,ar,0.0,-12,-3,0,0,0,,0
3,E_0001827d6b4ee2,E_902096602680c1,True,100,100,100,100,100,100,100,100,100,100,100,100,100,100,0.118222,tr,so,0.0,0,-2,0,0,0,0.0,0
4,E_0001a397f67ad5,E_9b7401384bf388,False,34,24,100,0,100,11,52,34,25,100,0,100,67,50,1.416378,en,fr,0.0,1,9,1,0,2,,-3


In [12]:
model = joblib.load(Path(r'C:\Users\caseyrya\Documents\foursquare_location_matching\models\xgb_classifier_full_train.sav'))

In [4]:
def jaccard_similarity(list1, list2):
    
    s1 = set(list1) # predictions
    s2 = set(list2) # actuals
    
    return float(len(s1.intersection(s2)) / len(s1.union(s2))) 

In [14]:
id_pairs_true = val_df[['id_1','id_2','match']].copy()
X = val_df.drop(['match','id_1','id_2','full_address_1_lang','full_address_2_lang'], axis = 1).copy()
y = val_df[['match']].copy()

preds_df = id_pairs_true.copy()
preds_df['prob_false'] = [i[0] for i in model.predict_proba(X)]
preds_df['prob_true'] = [i[1] for i in model.predict_proba(X)]

preds_df['model_pred'] = np.where(preds_df['prob_true']>=0.3, True, False)

# define required figure sets
actuals = preds_df['match'].copy()
preds = preds_df['model_pred'].copy()
pos_pred_probs = preds_df['prob_true'].copy()

# calculate classic metrics
model_acc = accuracy_score(actuals, preds)
model_prec = precision_score(actuals, preds)
model_rec = recall_score(actuals, preds)
model_f1 = f1_score(actuals, preds)

prec, rec, _ = precision_recall_curve(actuals, pos_pred_probs)
model_auc_pr = auc(rec, prec)

fpr, tpr, _ = roc_curve(actuals, pos_pred_probs)
model_roc_auc = roc_auc_score(actuals, pos_pred_probs)

In [15]:
preds_df.head()

Unnamed: 0,id_1,id_2,match,prob_false,prob_true,model_pred
0,E_000008a8ba4f48,E_0a0413f6011f21,False,0.998283,0.001717,False
1,E_00001d92066153,E_7e0d8e9138dd56,True,0.007358,0.992642,True
2,E_00002a131a2bf6,E_b0acaf62ca34b4,False,0.453611,0.546389,True
3,E_0001827d6b4ee2,E_902096602680c1,True,0.25757,0.74243,True
4,E_0001a397f67ad5,E_9b7401384bf388,False,0.999592,0.000408,False


In [None]:
# for each id:
    # 

In [19]:
len(preds_df.id_1.drop_duplicates().tolist())

114006

In [20]:
len(preds_df)

114163

In [28]:
# calculate competition specific metric (mean jaccard similarity score)
t0 = datetime.datetime.now()
print('cell started at: {}'.format(t0))

unique_ids = []
actual_matches = []
predicted_matches = []

count = 0

for unique_id in preds_df.id_1.drop_duplicates().tolist():

    id_actual_matches = []
    id_predicted_matches = []

    id_df = preds_df[preds_df['id_1']==unique_id].copy()

    for row in range(0, len(id_df)): # will iterate through the n rows (n defined in generate_pairs())

        if id_df['match'][row:row+1].values[0] == True:
            id_actual_matches.append(id_df['id_2'][row:row+1].values[0])
        else:
            pass

        if id_df['model_pred'][row:row+1].values[0] == True:
            id_predicted_matches.append(id_df['id_2'][row:row+1].values[0])
        else:
            pass

    id_actual_matches.append(unique_id)
    id_predicted_matches.append(unique_id)

    unique_ids.append(unique_id)
    actual_matches.append(id_actual_matches)
    predicted_matches.append(id_predicted_matches)
    
    count +=1
    
    if count % 10000 == 0:
        print('{} / {} complete. time: {}'.format(count, len(preds_df.id_1.drop_duplicates().tolist()), datetime.datetime.now()))


comp_metric_df = pd.DataFrame({'id': unique_ids,
                               'actual_matches': actual_matches,
                               'predicted_matches': predicted_matches})

comp_metric_df['jaccard_score'] = comp_metric_df.apply(lambda x: round(jaccard_similarity(x['predicted_matches'], x['actual_matches']),3), axis=1)

model_comp_score = round(comp_metric_df.jaccard_score.mean(), 3)

print('cell finished, time taken: {}'.format(datetime.datetime.now() - t0))

cell started at: 2022-07-07 09:12:10.731152
10000 / 114006 complete. time: 2022-07-07 09:13:11.300329
20000 / 114006 complete. time: 2022-07-07 09:14:13.849207
30000 / 114006 complete. time: 2022-07-07 09:15:18.450338
40000 / 114006 complete. time: 2022-07-07 09:16:21.684799
50000 / 114006 complete. time: 2022-07-07 09:17:25.402244
60000 / 114006 complete. time: 2022-07-07 09:18:28.071702
70000 / 114006 complete. time: 2022-07-07 09:19:30.656877
80000 / 114006 complete. time: 2022-07-07 09:20:34.508592
90000 / 114006 complete. time: 2022-07-07 09:21:38.625144
100000 / 114006 complete. time: 2022-07-07 09:22:43.775071
110000 / 114006 complete. time: 2022-07-07 09:23:53.625072
cell finished, time taken: 0:25:37.505401


In [37]:
print(f'model accuracy val: {model_acc}')
print(f'model precision val: {model_prec}')
print(f'model recall val: {model_rec}')
print(f'model f1-score val: {model_f1}')
print(f'model AUC PR val: {model_auc_pr}')
print(f'model AUC ROC val: {model_roc_auc}')
print(f'model competition score val: {model_comp_score}')
print()

model accuracy val: 0.9615900072703065
model precision val: 0.9625358472145952
model recall val: 0.9490053741878559
model f1-score val: 0.9557227242893925
model AUC PR val: 0.9893913568204551
model AUC ROC val: 0.9910304730267516
model competition score val: 0.981



In [33]:
# efficiency update
t0 = datetime.datetime.now()
print('cell started at: {}'.format(t0))

comp_metric_df_ = comp_metric_df.copy()

comp_metric_df_['jaccard_score_'] = comp_metric_df_.apply(lambda x: round(jaccard_similarity(x['predicted_matches'], x['actual_matches']),3), axis=1)

model_comp_score_ = round(comp_metric_df_.jaccard_score_.mean(), 3)

print(model_comp_score_)

print('cell finished, time taken: {}'.format(datetime.datetime.now() - t0))

comp_metric_df_.head()

cell started at: 2022-07-07 09:41:02.349923
0.981
cell finished, time taken: 0:00:01.352476


Unnamed: 0,id,actual_matches,predicted_matches,jaccard_score,jaccard_score_
0,E_000008a8ba4f48,[E_000008a8ba4f48],[E_000008a8ba4f48],1.0,1.0
1,E_00001d92066153,"[E_7e0d8e9138dd56, E_00001d92066153]","[E_7e0d8e9138dd56, E_00001d92066153]",1.0,1.0
2,E_00002a131a2bf6,[E_00002a131a2bf6],"[E_b0acaf62ca34b4, E_00002a131a2bf6]",0.5,0.5
3,E_0001827d6b4ee2,"[E_902096602680c1, E_0001827d6b4ee2]","[E_902096602680c1, E_0001827d6b4ee2]",1.0,1.0
4,E_0001a397f67ad5,[E_0001a397f67ad5],[E_0001a397f67ad5],1.0,1.0


In [36]:
comp_metric_df_[0:100]

Unnamed: 0,id,actual_matches,predicted_matches,jaccard_score,jaccard_score_
0,E_000008a8ba4f48,[E_000008a8ba4f48],[E_000008a8ba4f48],1.0,1.0
1,E_00001d92066153,"[E_7e0d8e9138dd56, E_00001d92066153]","[E_7e0d8e9138dd56, E_00001d92066153]",1.0,1.0
2,E_00002a131a2bf6,[E_00002a131a2bf6],"[E_b0acaf62ca34b4, E_00002a131a2bf6]",0.5,0.5
3,E_0001827d6b4ee2,"[E_902096602680c1, E_0001827d6b4ee2]","[E_902096602680c1, E_0001827d6b4ee2]",1.0,1.0
4,E_0001a397f67ad5,[E_0001a397f67ad5],[E_0001a397f67ad5],1.0,1.0
5,E_000260dd24cecf,"[E_919a37430bea7a, E_000260dd24cecf]","[E_919a37430bea7a, E_000260dd24cecf]",1.0,1.0
6,E_000288f2046ce3,"[E_23f8fce694c9a5, E_000288f2046ce3]","[E_23f8fce694c9a5, E_000288f2046ce3]",1.0,1.0
7,E_0002baf74f8aa3,[E_0002baf74f8aa3],[E_0002baf74f8aa3],1.0,1.0
8,E_0003e423fa55aa,[E_0003e423fa55aa],[E_0003e423fa55aa],1.0,1.0
9,E_000411fceef43c,"[E_8883e4a40e1935, E_000411fceef43c]","[E_8883e4a40e1935, E_000411fceef43c]",1.0,1.0


In [30]:
for i in [model_acc, model_prec, model_rec, model_f1, model_auc_pr, model_roc_auc, model_comp_score]:
    print(i)

0.9615900072703065
0.9625358472145952
0.9490053741878559
0.9557227242893925
0.9893913568204551
0.9910304730267516
0.981


In [None]:
# function which returns all classic metrics and comp specific metrics

def evaluate_model(
    data,
    model,
    pos_threshold = 0.3
):
    '''
    add function description here
    '''
    
    id_pairs_true = data[['id_1','id_2','match']].copy()
    X = data.drop(['match','id_1','id_2','full_address_1_lang','full_address_2_lang'], axis = 1).copy()
    y = data[['match']].copy()
    
    preds_df = id_pairs_true.copy()
    preds_df['prob_false'] = [i[0] for i in model.predict_proba(X)]
    preds_df['prob_true'] = [i[1] for i in model.predict_proba(X)]

    preds_df['model_pred'] = np.where(preds_df['prob_true']>=pos_threshold, True, False)
    
    # define required figure sets
    actuals = preds_df['match'].copy()
    preds = preds_df['model_pred'].copy()
    pos_pred_probs = preds_df['prob_true'].copy()
    
    # calculate classic metrics
    model_acc = accuracy_score(actuals, preds)
    model_prec = precision_score(actuals, preds)
    model_rec = recall_score(actuals, preds)
    model_f1 = f1_score(actuals, preds)
    
    prec, rec, _ = precision_recall_curve(actuals, pos_pred_probs)
    model_auc_pr = auc(rec, prec)
    
    fpr, tpr, _ = roc_curve(actuals, pos_pred_probs)
    model_roc_auc = roc_auc_score(actuals, pos_pred_probs)
    
    # calculate competition specific metric (mean jaccard similarity score)
    unique_ids = []
    actual_matches = []
    predicted_matches = []

    for unique_id in preds_df.id_1.drop_duplicates().tolist():

        id_actual_matches = []
        id_predicted_matches = []

        id_df = preds_df[preds_df['id_1']==unique_id].copy()

        for row in range(0, len(id_df)):

            if id_df['match'][row:row+1].values[0] == True:
                id_actual_matches.append(id_df['id_2'][row:row+1].values[0])
            else:
                pass

            if id_df['model_pred'][row:row+1].values[0] == True:
                id_predicted_matches.append(id_df['id_2'][row:row+1].values[0])
            else:
                pass

        id_actual_matches.append(unique_id)
        id_predicted_matches.append(unique_id)

        unique_ids.append(unique_id)
        actual_matches.append(id_actual_matches)
        predicted_matches.append(id_predicted_matches)


    comp_metric_df = pd.DataFrame({'id': unique_ids,
                                   'actual_matches': actual_matches,
                                   'predicted_matches': predicted_matches})
    
    comp_metric_df['jaccard_score'] = comp_metric_df.apply(lambda x: round(jaccard_similarity(x['predicted_matches'], x['actual_matches']),3), axis=1)
    
    model_comp_score = round(comp_metric_df.jaccard_score.mean(), 3)
    
    return model_acc, model_prec, model_rec, model_f1, model_auc_pr, model_roc_auc, model_comp_score

In [None]:
model_acc, model_prec, model_rec, model_f1, model_auc_pr, model_roc_auc, model_comp_score = evaluate_model(val_df, clf)

In [None]:
for i in [model_acc, model_prec, model_rec, model_f1, model_auc_pr, model_roc_auc, model_comp_score]:
    print(i)