# Model Selection

## A comparison of the NLP only, NLP+Features, and Features only approaches to identify the best model and approach, followed by hyperparameter tuning to obtain the final model

- RidgeClassifier, LogisticRegression, XGBoost, and SVC were found to yield the highest custom accuracy scores in initial analyses (which compared all 15 models suggested by PyCaret, but had a large runtime)
- Thus we consider these 4 models in our cross-validations here

In [17]:
import numpy as np
import pandas as pd
import fasttext
import random
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import recall_score

In [18]:
# Load the saved fastText model from a path relative to the working directory.
# Its get_sentence_vector method is used in later cells to turn each
# transaction `pattern` string into a numeric vector.
model_fasttext = fasttext.load_model("transactions_repmodel.bin")



In [19]:
def consumer_accuracy(test_frame, prediction_incorrect):
    """Return the share of consumers with no misclassified rows.

    A consumer counts as incorrect as soon as at least one of their rows has
    the flag column equal to ``True``.

    Parameters
    ----------
    test_frame : pandas.DataFrame
        Must contain a ``consumer_ref`` column and the flag column.
    prediction_incorrect : str
        Name of the column that marks misclassified rows.

    Returns
    -------
    float
        ``1 - consumers_with_an_error / consumers``. ``nan`` when the frame
        contains no consumers.
    """
    consumers = test_frame['consumer_ref']
    # Count distinct consumers directly. Summing every column per group just
    # to count the groups fails on columns that cannot be summed and on
    # frames that carry no column besides the key.
    tot = consumers.nunique()
    if tot == 0:
        return float('nan')
    is_incorrect = test_frame[prediction_incorrect] == True  # noqa: E712 (element-wise comparison)
    tot_incorrect = consumers[is_incorrect].nunique()
    return 1 - tot_incorrect / tot

In [20]:
# Load the modelling dataset from the working directory. Later cells read the
# columns consumer_ref, pattern, is_salary and the feature columns listed in
# `features` / `features_only`.
final_data = pd.read_csv("final_data.csv")

In [21]:
# Number of cross-validation folds used by every approach below.
CV = 5

# Unique consumers, in order of first appearance in final_data.
customer_list = list(final_data.consumer_ref.unique())
length = len(customer_list)

# Shuffle the consumers once with a fixed seed; the cross-validation functions
# slice this shuffled list into contiguous folds, so all models and approaches
# are compared on the same consumer splits.
random.seed(42)
cv_random_split_sample = random.sample(customer_list, length)

### *Note that the `cv` column in the cross-validation tables is the mean of the fold numbers (1 to 5), so it shows 3.0 even though 5 folds were run*

### NLP Only Approach

In [27]:
def cross_val_nlp_only(CV, model):
    """Consumer-grouped cross-validation of ``model`` on fastText vectors only.

    The shuffled consumer list ``cv_random_split_sample`` is cut into ``CV``
    contiguous folds. For each fold the model is fitted on the rows of all
    other consumers and evaluated on the held-out consumers, so a consumer
    never contributes rows to both sides of a split.

    Relies on the notebook globals ``final_data``, ``cv_random_split_sample``,
    ``model_fasttext`` and ``consumer_accuracy``.

    Parameters
    ----------
    CV : int
        Number of folds.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted on
        every fold, so it is left fitted on the last fold's training data.

    Returns
    -------
    dict
        Mean over the folds of each metric. The ``'cv'`` entry is the mean of
        the fold numbers ``1..CV`` (3.0 for five folds); it is kept so the
        result tables keep their existing columns.
    """
    metric_names = ['cv', 'train_accuracy_custom', 'test_accuracy_custom', 'balanced_accuracy_train',
                    'balanced_accuracy_test', 'recall_train', 'recall_test']
    consumers = list(cv_random_split_sample)
    base_size, remainder = divmod(len(consumers), CV)
    consumer_ref = final_data.consumer_ref
    in_sample = consumer_ref.isin(consumers)

    def sentence_vectors(frame):
        # One fastText sentence vector per row, stacked into a 2-D array.
        return np.vstack(frame.pattern.apply(model_fasttext.get_sentence_vector).tolist())

    fold_rows = []
    ub = 0
    for i in range(CV):
        # The first `remainder` folds take one extra consumer, so every
        # consumer is held out exactly once even when CV does not divide the
        # number of consumers. With an even division the folds are unchanged.
        lb, ub = ub, ub + base_size + (1 if i < remainder else 0)
        test_cons_list = consumers[lb:ub]

        # Boolean masks select all rows in one pass (no per-consumer concat)
        # and keep the rows in file order, independent of set iteration order.
        # .copy() because prediction columns are added to both frames below.
        is_test = consumer_ref.isin(test_cons_list)
        test_frame = final_data[is_test].copy()
        train_frame = final_data[in_sample & ~is_test].copy()

        X_train = sentence_vectors(train_frame)
        model.fit(X_train, train_frame["is_salary"])
        X_test = sentence_vectors(test_frame)

        test_frame["salary_predict"] = model.predict(X_test)
        test_frame['incorrect_predictor'] = test_frame['is_salary'] != test_frame['salary_predict']
        train_frame["salary_predict"] = model.predict(X_train)
        train_frame['incorrect_predictor'] = train_frame['is_salary'] != train_frame['salary_predict']

        fold_rows.append({
            'cv': i + 1,
            'train_accuracy_custom': consumer_accuracy(train_frame, 'incorrect_predictor'),
            'test_accuracy_custom': consumer_accuracy(test_frame, 'incorrect_predictor'),
            'balanced_accuracy_train': balanced_accuracy_score(train_frame.is_salary, train_frame.salary_predict),
            'balanced_accuracy_test': balanced_accuracy_score(test_frame.is_salary, test_frame.salary_predict),
            'recall_train': recall_score(train_frame.is_salary, train_frame.salary_predict),
            'recall_test': recall_score(test_frame.is_salary, test_frame.salary_predict),
        })

    # Build the per-fold table once instead of concatenating inside the loop.
    return pd.DataFrame(fold_rows, columns=metric_names).mean().to_dict()

In [28]:
# Accumulators for the NLP-only approach:
# results_cv_nlp_only maps a model's name to its dict of fold-averaged metrics;
# metrics_nlp_only starts as an empty table with the metric columns and is
# later rebound to the combined results table.
results_cv_nlp_only = {}
metrics_nlp_only = pd.DataFrame(columns = ['cv','train_accuracy_custom', 'test_accuracy_custom', 'balanced_accuracy_train', 
                                  'balanced_accuracy_test', 'recall_train', 'recall_test'])

In [29]:
def add_model_nlp_only(model):
    """Cross-validate ``model`` with the NLP-only approach and return the results table.

    The fold-averaged metrics are stored in the global ``results_cv_nlp_only``
    under the model's name (the text before the first ``(`` in ``str(model)``),
    replacing any earlier entry for that name.

    Parameters
    ----------
    model : estimator
        Passed through to ``cross_val_nlp_only``.

    Returns
    -------
    pandas.DataFrame
        One row per model, combining the rows already in ``metrics_nlp_only``
        with everything in ``results_cv_nlp_only``.
    """
    results_cv_nlp_only[str(model).split('(')[0]] = cross_val_nlp_only(CV, model)
    combined = pd.concat([metrics_nlp_only, pd.DataFrame.from_dict(results_cv_nlp_only).T])
    # Once metrics_nlp_only has been rebound to an earlier return value it
    # already holds these models; keep only the newest row per model so that
    # re-running a cell does not duplicate rows.
    return combined.loc[~combined.index.duplicated(keep='last')]

In [30]:
# Candidate classifiers, all created with their library-default settings.
# The same four instances are passed to each of the three approaches below
# and are refitted inside every cross-validation fold.
ridge = RidgeClassifier()
logistic = LogisticRegression()
xgboost = XGBClassifier()
svc = SVC()

In [31]:
add_model_nlp_only(ridge)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
RidgeClassifier,3.0,0.698,0.69,0.821944,0.810719,0.64582,0.623312


In [32]:
add_model_nlp_only(logistic)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
RidgeClassifier,3.0,0.698,0.69,0.821944,0.810719,0.64582,0.623312
LogisticRegression,3.0,0.7485,0.752,0.862803,0.849585,0.728444,0.704405


In [33]:
add_model_nlp_only(xgboost)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
RidgeClassifier,3.0,0.698,0.69,0.821944,0.810719,0.64582,0.623312
LogisticRegression,3.0,0.7485,0.752,0.862803,0.849585,0.728444,0.704405
XGBClassifier,3.0,0.815,0.728,0.940116,0.863262,0.882618,0.72999


In [34]:
metrics_nlp_only = add_model_nlp_only(svc)

In [37]:
metrics_nlp_only.sort_values(by = ['test_accuracy_custom'], ascending = False)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
SVC,3.0,0.817,0.778,0.924849,0.880458,0.851936,0.764901
LogisticRegression,3.0,0.7485,0.752,0.862803,0.849585,0.728444,0.704405
XGBClassifier,3.0,0.815,0.728,0.940116,0.863262,0.882618,0.72999
RidgeClassifier,3.0,0.698,0.69,0.821944,0.810719,0.64582,0.623312


### NLP+Features

In [44]:
features = ['wagesal_pattern']  # , 'large_amount', 'week_bin_4','num_similar_bool']
# wagesal_pattern was found to yield highest accuracy, other variables commented out are correlated but lowered
# custom accuracy when included

In [45]:
def cross_val_nlp_features(CV, features, model):
    """Consumer-grouped cross-validation on fastText vectors plus tabular features.

    The shuffled consumer list ``cv_random_split_sample`` is cut into ``CV``
    contiguous folds. For each fold the model is fitted on the rows of all
    other consumers and evaluated on the held-out consumers. The design matrix
    is the selected ``features`` columns followed by the fastText sentence
    vector of each row's ``pattern``.

    Relies on the notebook globals ``final_data``, ``cv_random_split_sample``,
    ``model_fasttext`` and ``consumer_accuracy``.

    Parameters
    ----------
    CV : int
        Number of folds.
    features : list of str or str
        Numeric column(s) of ``final_data`` to place in front of the vectors.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; refitted on every fold.

    Returns
    -------
    dict
        Mean over the folds of each metric. The ``'cv'`` entry is the mean of
        the fold numbers ``1..CV`` (3.0 for five folds); it is kept so the
        result tables keep their existing columns.
    """
    metric_names = ['cv', 'train_accuracy_custom', 'test_accuracy_custom', 'balanced_accuracy_train',
                    'balanced_accuracy_test', 'recall_train', 'recall_test']
    feature_cols = [features] if isinstance(features, str) else list(features)
    consumers = list(cv_random_split_sample)
    base_size, remainder = divmod(len(consumers), CV)
    consumer_ref = final_data.consumer_ref
    in_sample = consumer_ref.isin(consumers)

    def design_matrix(frame):
        # A plain float array avoids a DataFrame whose column names mix
        # strings (features) and integers (vector dimensions), which recent
        # scikit-learn versions refuse to accept.
        vectors = np.vstack(frame.pattern.apply(model_fasttext.get_sentence_vector).tolist())
        return np.hstack([frame[feature_cols].to_numpy(dtype=float), vectors])

    fold_rows = []
    ub = 0
    for i in range(CV):
        # The first `remainder` folds take one extra consumer, so every
        # consumer is held out exactly once even when CV does not divide the
        # number of consumers. With an even division the folds are unchanged.
        lb, ub = ub, ub + base_size + (1 if i < remainder else 0)
        test_cons_list = consumers[lb:ub]

        # Boolean masks select all rows in one pass (no per-consumer concat)
        # and keep the rows in file order, independent of set iteration order.
        # .copy() because prediction columns are added to both frames below.
        is_test = consumer_ref.isin(test_cons_list)
        test_frame = final_data[is_test].copy()
        train_frame = final_data[in_sample & ~is_test].copy()

        X_train = design_matrix(train_frame)
        model.fit(X_train, train_frame["is_salary"])
        X_test = design_matrix(test_frame)

        test_frame["salary_predict"] = model.predict(X_test)
        test_frame['incorrect_predictor'] = test_frame['is_salary'] != test_frame['salary_predict']
        train_frame["salary_predict"] = model.predict(X_train)
        train_frame['incorrect_predictor'] = train_frame['is_salary'] != train_frame['salary_predict']

        fold_rows.append({
            'cv': i + 1,
            'train_accuracy_custom': consumer_accuracy(train_frame, 'incorrect_predictor'),
            'test_accuracy_custom': consumer_accuracy(test_frame, 'incorrect_predictor'),
            'balanced_accuracy_train': balanced_accuracy_score(train_frame.is_salary, train_frame.salary_predict),
            'balanced_accuracy_test': balanced_accuracy_score(test_frame.is_salary, test_frame.salary_predict),
            'recall_train': recall_score(train_frame.is_salary, train_frame.salary_predict),
            'recall_test': recall_score(test_frame.is_salary, test_frame.salary_predict),
        })

    # Build the per-fold table once instead of concatenating inside the loop.
    return pd.DataFrame(fold_rows, columns=metric_names).mean().to_dict()

In [46]:
# Accumulators for the NLP+features approach:
# results_cv_nlp_features maps a model's name to its dict of fold-averaged
# metrics; metrics_nlp_features starts as an empty table with the metric
# columns and is later rebound to the combined results table.
results_cv_nlp_features = {}
metrics_nlp_features = pd.DataFrame(columns = ['cv','train_accuracy_custom', 'test_accuracy_custom', 'balanced_accuracy_train', 
                                  'balanced_accuracy_test', 'recall_train', 'recall_test'])

In [47]:
def add_model_nlp_features(model):
    """Cross-validate ``model`` with the NLP+features approach and return the results table.

    The fold-averaged metrics are stored in the global
    ``results_cv_nlp_features`` under the model's name (the text before the
    first ``(`` in ``str(model)``), replacing any earlier entry for that name.

    Parameters
    ----------
    model : estimator
        Passed through to ``cross_val_nlp_features`` together with the global
        ``CV`` and ``features``.

    Returns
    -------
    pandas.DataFrame
        One row per model, combining the rows already in
        ``metrics_nlp_features`` with everything in ``results_cv_nlp_features``.
    """
    results_cv_nlp_features[str(model).split('(')[0]] = cross_val_nlp_features(CV, features, model)
    combined = pd.concat([metrics_nlp_features, pd.DataFrame.from_dict(results_cv_nlp_features).T])
    # Once metrics_nlp_features has been rebound to an earlier return value it
    # already holds these models; keep only the newest row per model so that
    # re-running a cell does not duplicate rows.
    return combined.loc[~combined.index.duplicated(keep='last')]

In [48]:
add_model_nlp_features(ridge)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
RidgeClassifier,3.0,0.763,0.76,0.84887,0.846767,0.701081,0.697025


In [49]:
add_model_nlp_features(logistic)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
RidgeClassifier,3.0,0.763,0.76,0.84887,0.846767,0.701081,0.697025
LogisticRegression,3.0,0.769,0.762,0.856669,0.852172,0.716258,0.709798


In [50]:
add_model_nlp_features(xgboost)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
RidgeClassifier,3.0,0.763,0.76,0.84887,0.846767,0.701081,0.697025
LogisticRegression,3.0,0.769,0.762,0.856669,0.852172,0.716258,0.709798
XGBClassifier,3.0,0.844,0.77,0.941936,0.867897,0.885647,0.738674


In [51]:
metrics_nlp_features = add_model_nlp_features(svc)

In [52]:
metrics_nlp_features.sort_values(by = ['test_accuracy_custom'], ascending = False)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
SVC,3.0,0.821,0.786,0.916836,0.874895,0.835536,0.753104
XGBClassifier,3.0,0.844,0.77,0.941936,0.867897,0.885647,0.738674
LogisticRegression,3.0,0.769,0.762,0.856669,0.852172,0.716258,0.709798
RidgeClassifier,3.0,0.763,0.76,0.84887,0.846767,0.701081,0.697025


### Features Only

In [53]:
features_only = ['wagesal_pattern','large_amount', 'num_similar', 'pattern_is_similar_bool','week_bin_4']

In [54]:
def cross_val_features_only(CV, features, model):
    """Consumer-grouped cross-validation of ``model`` on tabular features only.

    The shuffled consumer list ``cv_random_split_sample`` is cut into ``CV``
    contiguous folds. For each fold the model is fitted on the rows of all
    other consumers and evaluated on the held-out consumers.

    Relies on the notebook globals ``final_data``, ``cv_random_split_sample``
    and ``consumer_accuracy``.

    Parameters
    ----------
    CV : int
        Number of folds.
    features : list of str or str
        Column(s) of ``final_data`` used as model inputs. This argument is
        honoured; the global ``features_only`` is no longer read here.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; refitted on every fold.

    Returns
    -------
    dict
        Mean over the folds of each metric. The ``'cv'`` entry is the mean of
        the fold numbers ``1..CV`` (3.0 for five folds); it is kept so the
        result tables keep their existing columns.
    """
    metric_names = ['cv', 'train_accuracy_custom', 'test_accuracy_custom', 'balanced_accuracy_train',
                    'balanced_accuracy_test', 'recall_train', 'recall_test']
    feature_cols = [features] if isinstance(features, str) else list(features)
    consumers = list(cv_random_split_sample)
    base_size, remainder = divmod(len(consumers), CV)
    consumer_ref = final_data.consumer_ref
    in_sample = consumer_ref.isin(consumers)

    fold_rows = []
    ub = 0
    for i in range(CV):
        # The first `remainder` folds take one extra consumer, so every
        # consumer is held out exactly once even when CV does not divide the
        # number of consumers. With an even division the folds are unchanged.
        lb, ub = ub, ub + base_size + (1 if i < remainder else 0)
        test_cons_list = consumers[lb:ub]

        # Boolean masks select all rows in one pass (no per-consumer concat)
        # and keep the rows in file order, independent of set iteration order.
        # .copy() because prediction columns are added to both frames below.
        is_test = consumer_ref.isin(test_cons_list)
        test_frame = final_data[is_test].copy()
        train_frame = final_data[in_sample & ~is_test].copy()

        X_train = train_frame[feature_cols]
        model.fit(X_train, train_frame["is_salary"])
        X_test = test_frame[feature_cols]

        test_frame["salary_predict"] = model.predict(X_test)
        test_frame['incorrect_predictor'] = test_frame['is_salary'] != test_frame['salary_predict']
        train_frame["salary_predict"] = model.predict(X_train)
        train_frame['incorrect_predictor'] = train_frame['is_salary'] != train_frame['salary_predict']

        fold_rows.append({
            'cv': i + 1,
            'train_accuracy_custom': consumer_accuracy(train_frame, 'incorrect_predictor'),
            'test_accuracy_custom': consumer_accuracy(test_frame, 'incorrect_predictor'),
            'balanced_accuracy_train': balanced_accuracy_score(train_frame.is_salary, train_frame.salary_predict),
            'balanced_accuracy_test': balanced_accuracy_score(test_frame.is_salary, test_frame.salary_predict),
            'recall_train': recall_score(train_frame.is_salary, train_frame.salary_predict),
            'recall_test': recall_score(test_frame.is_salary, test_frame.salary_predict),
        })

    # Build the per-fold table once instead of concatenating inside the loop.
    return pd.DataFrame(fold_rows, columns=metric_names).mean().to_dict()

In [55]:
# Accumulators for the features-only approach:
# results_cv_features_only maps a model's name to its dict of fold-averaged
# metrics; metrics_features_only starts as an empty table with the metric
# columns and is later rebound to the combined results table.
results_cv_features_only = {}
metrics_features_only = pd.DataFrame(columns = ['cv','train_accuracy_custom', 'test_accuracy_custom', 'balanced_accuracy_train', 
                                  'balanced_accuracy_test', 'recall_train', 'recall_test'])

In [56]:
def add_model_features_only(model):
    """Cross-validate ``model`` with the features-only approach and return the results table.

    The fold-averaged metrics are stored in the global
    ``results_cv_features_only`` under the model's name (the text before the
    first ``(`` in ``str(model)``), replacing any earlier entry for that name.

    Parameters
    ----------
    model : estimator
        Passed through to ``cross_val_features_only`` together with the global
        ``CV`` and ``features_only``.

    Returns
    -------
    pandas.DataFrame
        One row per model, combining the rows already in
        ``metrics_features_only`` with everything in ``results_cv_features_only``.
    """
    results_cv_features_only[str(model).split('(')[0]] = cross_val_features_only(CV, features_only, model)
    combined = pd.concat([metrics_features_only, pd.DataFrame.from_dict(results_cv_features_only).T])
    # Once metrics_features_only has been rebound to an earlier return value
    # it already holds these models; keep only the newest row per model so
    # that re-running a cell does not duplicate rows.
    return combined.loc[~combined.index.duplicated(keep='last')]

In [57]:
add_model_features_only(ridge)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
RidgeClassifier,3.0,0.754,0.754,0.842698,0.843471,0.688611,0.690212


In [58]:
add_model_features_only(logistic)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
RidgeClassifier,3.0,0.754,0.754,0.842698,0.843471,0.688611,0.690212
LogisticRegression,3.0,0.754,0.754,0.842698,0.843471,0.688611,0.690212


In [59]:
add_model_features_only(xgboost)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
RidgeClassifier,3.0,0.754,0.754,0.842698,0.843471,0.688611,0.690212
LogisticRegression,3.0,0.754,0.754,0.842698,0.843471,0.688611,0.690212
XGBClassifier,3.0,0.737,0.726,0.888101,0.883439,0.78406,0.775767


In [60]:
metrics_features_only = add_model_features_only(svc)

In [61]:
metrics_features_only.sort_values(by = ['test_accuracy_custom'], ascending = False)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
RidgeClassifier,3.0,0.754,0.754,0.842698,0.843471,0.688611,0.690212
LogisticRegression,3.0,0.754,0.754,0.842698,0.843471,0.688611,0.690212
SVC,3.0,0.754,0.754,0.842287,0.843128,0.687699,0.689432
XGBClassifier,3.0,0.737,0.726,0.888101,0.883439,0.78406,0.775767


## Looking at all three approaches side by side

In [66]:
metrics_nlp_only.sort_values(by = ['test_accuracy_custom'], ascending = False).head(1)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
SVC,3.0,0.817,0.778,0.924849,0.880458,0.851936,0.764901


In [67]:
metrics_nlp_features.sort_values(by = ['test_accuracy_custom'], ascending = False).head(1)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
SVC,3.0,0.821,0.786,0.916836,0.874895,0.835536,0.753104


In [68]:
metrics_features_only.sort_values(by = ['test_accuracy_custom'], ascending = False).head(1)

Unnamed: 0,cv,train_accuracy_custom,test_accuracy_custom,balanced_accuracy_train,balanced_accuracy_test,recall_train,recall_test
RidgeClassifier,3.0,0.754,0.754,0.842698,0.843471,0.688611,0.690212


### The SVC model applied to the NLP (fasttext)  + features (wagesal_pattern) approach yields the highest custom accuracy, so this is the model we will proceed with. 

## Hyperparameter Tuning for SVC (nlp + features)

In [70]:
from sklearn.model_selection import RandomizedSearchCV 
from scipy import stats

In [73]:
# Generate train and test frames

In [75]:
# Hold out 100 randomly chosen consumers (fixed seed) as the test set; all
# remaining consumers form the training set, so no consumer appears in both.
random.seed(42)
test_cons_list = random.sample(customer_list, 100)
train_cons_list = set(customer_list) - set(test_cons_list)

# Select each side with a single boolean mask instead of concatenating one
# consumer at a time. Rows stay in file order, so the result does not depend
# on set iteration order. .copy() because prediction columns are added to
# test_frame later in the notebook.
train_frame = final_data[final_data.consumer_ref.isin(train_cons_list)].copy()
test_frame = final_data[final_data.consumer_ref.isin(test_cons_list)].copy()

In [76]:
# Get X_train, y_train, X_test, and y_test

In [77]:
# Design matrices for the chosen approach: the wagesal_pattern feature followed
# by the fastText sentence vector of each row's pattern. The vector columns get
# an 'ft_' prefix so that every column name is a string; a mix of string and
# integer column names is rejected by recent scikit-learn versions.
train_vector = train_frame.pattern.apply(model_fasttext.get_sentence_vector)
X_train = pd.DataFrame(np.vstack(train_vector.tolist())).add_prefix('ft_')
# reset_index(drop=True) lines the feature up positionally with the vectors.
X_train = pd.concat([train_frame['wagesal_pattern'].reset_index(drop=True), X_train], axis = 1)
y_train = train_frame["is_salary"]

test_vector = test_frame.pattern.apply(model_fasttext.get_sentence_vector)
X_test = pd.DataFrame(np.vstack(test_vector.tolist())).add_prefix('ft_')
X_test = pd.concat([test_frame['wagesal_pattern'].reset_index(drop=True), X_test], axis = 1)
y_test = test_frame["is_salary"]

In [80]:
# Search space for the SVC. scipy's uniform(loc, scale) samples from
# [loc, loc + scale], so C is drawn from [2, 12] and gamma from [0.1, 1.1].
# NOTE(review): confirm these ranges are intended (rather than [2, 10] / [0.1, 1]).
param_rand = {"C": stats.uniform(2, 10),
             "gamma": stats.uniform(0.1, 1)}

# The search is scored on precision rather than on the consumer-level custom
# accuracy used elsewhere in this notebook.
# NOTE(review): cv = 3 is given as a plain integer and no consumer grouping is
# passed, so rows of one consumer can presumably fall into both the training
# and validation part of a fold, unlike the consumer-grouped folds used in the
# model comparison above. Confirm whether that is acceptable here.
rand_search = RandomizedSearchCV(svc, param_rand, refit = True, verbose = 3, scoring = 'precision', cv = 3, 
                                 random_state = 42, n_jobs = -1) 

In [81]:
rand_search.fit(X_train, y_train)


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 out of  30 | elapsed:   10.6s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   11.0s finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='rbf', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=10, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a2a67d390>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a2382ce10>},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring='precision', verbose=3)

In [82]:
print(rand_search.best_score_) 
print(rand_search.best_estimator_) 
print(rand_search.best_params_)

0.9511151407616123
SVC(C=2.2058449429580245, break_ties=False, cache_size=200, class_weight=None,
    coef0=0.0, decision_function_shape='ovr', degree=3,
    gamma=1.0699098521619943, kernel='rbf', max_iter=-1, probability=False,
    random_state=None, shrinking=True, tol=0.001, verbose=False)
{'C': 2.2058449429580245, 'gamma': 1.0699098521619943}


In [83]:
predictions = rand_search.predict(X_test) 

In [84]:
from sklearn.metrics import classification_report, confusion_matrix 
print(classification_report(y_test, predictions)) 

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      4220
           1       0.93      0.69      0.79       438

    accuracy                           0.97      4658
   macro avg       0.95      0.84      0.89      4658
weighted avg       0.96      0.97      0.96      4658



In [85]:
test_frame['prediction'] = predictions
test_frame['prediction_incorrect'] = test_frame['is_salary'] != test_frame['prediction']
consumer_accuracy(test_frame,'prediction_incorrect')

0.78

### Custom accuracy after hyperparameter tuning is 78%