In [1]:
# Title : Data Cleaning + Base Model
# Author : Alex Bass
# Date : 27 March 2023

import pandas as pd
from thefuzz import fuzz
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model, ensemble, neighbors
import re
import pickle
import time

# - - - - - - - - - - - #
# Cleaning before modeling
# - - - - - - - - - - - #

def get_lev_distance_or_NA(data, columns, partial = False, sort = False):
    assert isinstance(data, pd.DataFrame)
    assert isinstance(columns, list)
    assert len(columns) == 2
    #If partial is True, First column will be fully matched, Second column partially matched
    assert isinstance(columns[0], str)
    assert isinstance(columns[1], str)
    
    out = []
    for i in range(data.shape[0]):
        val1 = data[columns[0]].iloc[i]
        val2 = data[columns[1]].iloc[i]
        if pd.isna(val1) or val1 == "" or pd.isna(val2) or val2 == "":
            out.append(np.nan)
        else:
            if partial and sort:
                out.append(fuzz.partial_token_sort_ratio(val1, val2))
            elif partial:
                out.append(fuzz.partial_ratio(val1, val2))
            elif sort:
                out.append(fuzz.token_sort_ratio(val1, val2))
            else:
                out.append(fuzz.ratio(val1, val2))
    return out

train = pd.read_csv("training_set_final.csv")
test = pd.read_csv("golden_set.csv")

#lev distance for title
train['title_match'] = get_lev_distance_or_NA(train, ['title_ia', 'title_wiki'])
test['title_match'] = get_lev_distance_or_NA(test, ['title_ia', 'title_wiki'])

# using partial matching because title_ia is usually more descriptive
train['title_match_partial'] = get_lev_distance_or_NA(train, ['title_wiki', 'title_ia'], partial = True, sort = True)
test['title_match_partial'] = get_lev_distance_or_NA(test, ['title_wiki', 'title_ia'], partial = True, sort = True)

#for author
train.author_ia = train.author_ia.astype('str')
train.author_wiki = train.author_wiki.astype('str')

test.author_ia = test.author_ia.astype('str')
test.author_wiki = test.author_wiki.astype('str')

for var in ['author_ia', 'author_wiki']:
    train[var] = train[var].replace({'nan': np.nan})
    test[var] = test[var].replace({'nan': np.nan})

train['author_match'] = get_lev_distance_or_NA(train, ['author_ia', 'author_wiki'])
test['author_match'] = get_lev_distance_or_NA(test, ['author_ia', 'author_wiki'])

train['author_sort'] = get_lev_distance_or_NA(train, ['author_ia', 'author_wiki'], sort = True)
test['author_sort'] = get_lev_distance_or_NA(test, ['author_ia', 'author_wiki'], sort = True)

#for publisher
train['publisher_match'] = get_lev_distance_or_NA(train, ['publisher_ia', 'publisher_wiki'])
test['publisher_match'] = get_lev_distance_or_NA(test, ['publisher_ia', 'publisher_wiki'])

# using partial matching because publisher_ia is usually more descriptive
train['publisher_match_partial'] = get_lev_distance_or_NA(train, ['publisher_wiki', 'publisher_ia'], partial = True)
test['publisher_match_partial'] = get_lev_distance_or_NA(test, ['publisher_wiki', 'publisher_ia'], partial = True)

#year
def clean(string):
    if pd.isna(string):
        return np.nan
    string = str(string)
    if any(char.isdigit() for char in string) == False:
        return np.nan
    string = re.subn(r'\.[0-9]+',"",string)[0]
    string = re.subn(r'\.',"",string)[0]
    if string:
        try:
            return int(''.join(filter(str.isdigit, string)))
        except:
            print(string)
    else:
        return np.nan

train.year_wiki = train.year_wiki.apply(clean)
train.date_ia = train.date_ia.apply(clean)
test.year_wiki = test.year_wiki.apply(clean)
test.date_ia = test.date_ia.apply(clean)

year_res = []
for i in range(train.shape[0]):
    val1 = train.date_ia.iloc[i]
    val2 = train.year_wiki.iloc[i]
    if val1 and val2:
        year_res.append(float(val1) == float(val2))
    else:
        year_res.append(np.nan)

train['year_match'] = year_res
        
year_res = []
for i in range(test.shape[0]):
    val1 = test.date_ia.iloc[i]
    val2 = test.year_wiki.iloc[i]
    if val1 and val2:
        year_res.append(float(val1) == float(val2))
    else:
        year_res.append(np.nan)
        
test['year_match'] = year_res

#year NA
train['year_NA'] = [np.where(pd.isna(train.year_wiki.iloc[i]) or pd.isna(train.date_ia.iloc[i]), 1, 0) for i in range(train.shape[0])]
test['year_NA'] = [np.where(pd.isna(test.year_wiki.iloc[i]) or pd.isna(test.date_ia.iloc[i]), 1, 0) for i in range(test.shape[0])]

#Author NA
train['author_NA'] = [np.where(pd.isna(train.author_ia.iloc[i]) or pd.isna(train.author_wiki.iloc[i]), 1, 0) for i in range(train.shape[0])]
test['author_NA'] = [np.where(pd.isna(test.author_ia.iloc[i]) or pd.isna(test.author_wiki.iloc[i]),1, 0) for i in range(test.shape[0])]

#Publisher NA
train['publisher_NA'] = [np.where(pd.isna(train.publisher_ia.iloc[i]) or pd.isna(train.publisher_wiki.iloc[i]), 1, 0) for i in range(train.shape[0])]
test['publisher_NA'] = [np.where(pd.isna(test.publisher_ia.iloc[i]) or pd.isna(test.publisher_wiki.iloc[i]), 1, 0) for i in range(test.shape[0])]

#downsampling
has_match = train.groupby('query_count')['citebook_match'].transform(lambda x : any(x == 1)).to_list()
train = train[has_match]

tmp_match = train.query('citebook_match == 1')
tmp_unmatch = train.query('citebook_match == 0') \
    .groupby('query_count') \
    .sample(1)
train = pd.concat([tmp_match, tmp_unmatch], ignore_index = True)

train = shuffle(train)

#store NA data for later
train_w_NAs = train
test_w_NAs = test

# replace missing values with mean
for var in ['title_match','author_match', 'publisher_match', 'year_match', 'title_match_partial', 'publisher_match_partial', 'author_sort']:
    train[var] = train[var].fillna(train[var].mean())
    test[var] = test[var].fillna(test[var].mean())

In [4]:
#splitting data
train_x = train[['title_match','author_match', 'publisher_match', 'year_match', 'year_NA', 'author_NA', 'publisher_NA', 'title_match_partial', 'publisher_match_partial', 'author_sort']]
train_y = train['citebook_match']
test_x = test[['title_match','author_match', 'publisher_match', 'year_match', 'year_NA', 'author_NA', 'publisher_NA', 'title_match_partial', 'publisher_match_partial', 'author_sort']]
test_y = test['citebook_match']

scaler = StandardScaler()

train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

def get_model_results(model_name, model, param_grid, cv, train_x, train_y, test_x, test_y, save = False):
    
    time1 = time.time()
    
    if cv == 1:
        search = model.fit(train_x, train_y)
        best_params = ""
        
    else:
        search = GridSearchCV(model, param_grid, cv=cv, scoring=('precision', 'recall', 'accuracy'), refit='accuracy')
        search.fit(train_x, train_y)

    y_train_hat = search.predict(train_x)
    y_train_hat_probs = search.predict_proba(train_x)[:,1]

    train_accuracy = accuracy_score(train_y, y_train_hat)*100
    train_precision = precision_score(train_y, y_train_hat)*100
    train_recall = recall_score(train_y, y_train_hat)*100
    train_auc_roc = roc_auc_score(train_y, y_train_hat_probs)*100

    y_test_hat = search.predict(test_x)
    y_test_hat_probs = search.predict_proba(test_x)[:,1]

    test_accuracy = accuracy_score(test_y, y_test_hat)*100
    test_precision = precision_score(test_y, y_test_hat)*100
    test_recall = recall_score(test_y, y_test_hat)*100
    test_auc_roc = roc_auc_score(test_y, y_test_hat_probs)*100
    
    if cv != 1:
        best_params = str(search.best_params_)

    res = {
        'model_name' : model_name,
        'Training Accuracy' : train_accuracy,
        'Training Precision' : train_precision,
        'Training Recall' : train_recall,
        'Test Accuracy' : test_accuracy,
        'Test Precision' : test_precision,
        'Test Recall' :test_recall,
        'Best Params' : best_params
    }
    
    time2 = time.time()
    time_elapsed = round((time2 - time1)/60, 2)
    
    print(f'For the {model_name} model, the time elapsed is {time_elapsed} minutes.')
    
    if save and cv == 1: # only save when applied to all the data
        filename = 'finalized_model.sav'
        pickle.dump(search, open(filename, 'wb'))
    
    return res

In [3]:
# - - - - - - - - - - - #
# Logistic Regression
# - - - - - - - - - - - #

model_name = "Logistic Regression"

model = linear_model.LogisticRegression(solver = 'saga', max_iter = 150) #saga supports all types of model penalties

param_grid = {
    'penalty': ['l2' ,'l1',None]
}

res = get_model_results(model_name, model, param_grid, 5, train_x, train_y, test_x, test_y)

res = pd.DataFrame(res, index=[0])

if 'final_res' not in locals():
    final_res = res
else:
    final_res = pd.concat([final_res, res])

For the Logistic Regression model, the time elapsed is 0.11 minutes.


In [4]:
# - - - - - - - - - - - #
# Random Forest
# - - - - - - - - - - - #

model = ensemble.RandomForestClassifier()
model_name = "Random Forest Clasifier"

param_grid = {
    'n_estimators': [10, 50, 100, 500],
    'max_depth' : [None, 3, 5],
    'max_features' : [2, 3, 4, 5, None]
}

res = get_model_results(model_name, model, param_grid, 5, train_x, train_y, test_x, test_y)

res = pd.DataFrame(res, index=[0])

if 'final_res' not in locals():
    final_res = res
else:
    final_res = pd.concat([final_res, res])

For the Random Forest Clasifier model, the time elapsed is 36.04 minutes.


In [5]:
# - - - - - - - - - - - #
# K Neighbors
# - - - - - - - - - - - #

model = neighbors.KNeighborsClassifier()
model_name = "KNeighbors"

param_grid = {
    'n_neighbors': [3, 5, 10, 25]
}

res = get_model_results(model_name, model, param_grid, 5, train_x, train_y, test_x, test_y)

res = pd.DataFrame(res, index=[0])

if 'final_res' not in locals():
    final_res = res
else:
    final_res = pd.concat([final_res, res])

For the KNeighbors model, the time elapsed is 1.56 minutes.


In [5]:
#bringing back data w NAs with GBT which handles these
train = train_w_NAs
test = test_w_NAs

In [6]:
# - - - - - - - - - - - #
# Gradient Boosted Tree Model w NAs
# - - - - - - - - - - - #

#Model with all data
train_x = train[['title_match','author_match', 'publisher_match', 'year_match', 'year_NA', 'author_NA', 'publisher_NA', 'title_match_partial', 'publisher_match_partial', 'author_sort']]
train_y = train['citebook_match']
test_x = test[['title_match','author_match', 'publisher_match', 'year_match', 'year_NA', 'author_NA', 'publisher_NA', 'title_match_partial', 'publisher_match_partial', 'author_sort']]
test_y = test['citebook_match']

model = ensemble.HistGradientBoostingClassifier()

param_grid = {
    'learning_rate': [0.1, 0.2, 0.5],
    'max_depth' : [None, 3, 5],
    'l2_regularization' : [0, 0.01, 0.1]
}

res = get_model_results("GBT", model, param_grid, 5, train_x, train_y, test_x, test_y)

res = pd.DataFrame(res, index=[0])

if 'final_res' not in locals():
    final_res = res
else:
    final_res = pd.concat([final_res, res])

For the GBT model, the time elapsed is 2.48 minutes.


In [8]:
param_grid = {}

res = get_model_results("GBT - default", model, param_grid, 5, train_x, train_y, test_x, test_y)

res = pd.DataFrame(res, index=[0])

if 'final_res' not in locals():
    final_res = res
else:
    final_res = pd.concat([final_res, res])

For the GBT - default model, the time elapsed is 0.09 minutes.


In [9]:
final_res.sort_values("Test Precision", ascending=False)

Unnamed: 0,model_name,Training Accuracy,Training Precision,Training Recall,Test Accuracy,Test Precision,Test Recall,Best Params
0,GBT - default,85.556595,84.16482,94.713135,98.943197,95.238095,95.238095,{}
0,Logistic Regression,83.185246,82.886433,92.119578,98.282695,94.936709,89.285714,{'penalty': 'l2'}
0,Random Forest Clasifier,92.307243,91.038527,97.267725,97.490092,94.520548,82.142857,"{'max_depth': None, 'max_features': 4, 'n_esti..."
0,Random Forest Clasifier,92.307243,91.038527,97.267725,97.490092,94.520548,82.142857,"{'max_depth': None, 'max_features': 4, 'n_esti..."
0,GBT,87.827595,86.226143,95.835347,98.414795,91.860465,94.047619,"{'l2_regularization': 0.1, 'learning_rate': 0...."
0,KNeighbors,87.939635,88.008691,93.436618,96.961691,85.882353,86.904762,{'n_neighbors': 5}


In [8]:
get_model_results("GBT - default", model, param_grid, 1, train_x, train_y, test_x, test_y, save=True)

For the GBT - default model, the time elapsed is 0.03 minutes.


{'model_name': 'GBT - default',
 'Training Accuracy': 85.54003234543364,
 'Training Precision': 84.07611624725342,
 'Training Recall': 94.82691438457582,
 'Test Accuracy': 99.20739762219286,
 'Test Precision': 97.5609756097561,
 'Test Recall': 95.23809523809523,
 'Best Params': ''}