In [9]:
import numpy as np
import pandas as pd
import fasttext
import random
from sklearn.svm import SVC
# Load the saved fastText model from disk; apply_fasttext later calls its
# get_sentence_vector method on each row's `pattern` value.
model_fasttext = fasttext.load_model("transactions_repmodel.bin")



In [2]:
# custom accuracy metric
def consumer_accuracy(test_frame, prediction_incorrect):
    """Return the share of consumers that have no incorrectly predicted row.

    A consumer counts as incorrect as soon as at least one of its rows has
    ``test_frame[prediction_incorrect] == True``.

    Parameters
    ----------
    test_frame : pandas.DataFrame
        Frame with a ``consumer_ref`` column and the flag column named by
        ``prediction_incorrect``. Rows with a missing ``consumer_ref`` are
        ignored.
    prediction_incorrect : str
        Name of the column that flags incorrectly predicted rows.

    Returns
    -------
    float
        ``1 - n_incorrect_consumers / n_consumers``, or NaN when the frame
        contains no consumers.
    """
    consumers = test_frame['consumer_ref']
    flagged = test_frame[prediction_incorrect] == True  # noqa: E712 - keeps the original truthiness test

    # Count distinct consumers directly instead of summing every column per
    # group: cheaper, and independent of the dtypes of unrelated columns.
    tot_incorrect = consumers[flagged].nunique()
    tot = consumers.nunique()

    if tot == 0:
        # No consumers to score; avoid dividing by zero.
        return float('nan')
    return 1 - tot_incorrect / tot

In [3]:
# Load the full dataset. Later cells read the columns consumer_ref, pattern,
# wagesal_pattern and is_salary from frames derived from it.
all_labels = pd.read_csv("final_data.csv")

### Generate Train and Test folds

In [7]:
# get train and test frames for cross validation folds
# Every distinct consumer in the data set.
customer_list = list(all_labels["consumer_ref"].unique())

# Seed the global RNG so the 500-consumer sample is the same on every run;
# get_train_test_frame slices this sample into the held-out folds.
random.seed(42)
cons_list = random.sample(customer_list, k=500)
def get_train_test_frame(section, fold_size=100):
    """Split ``all_labels`` into train and test frames for one fold.

    The test fold holds every row of the consumers in
    ``cons_list[fold_size * (section - 1):fold_size * section]``; the train
    frame holds the rows of all other consumers in ``customer_list``.

    Parameters
    ----------
    section : int
        1-based fold number.
    fold_size : int, default 100
        Number of sampled consumers held out per fold.

    Returns
    -------
    (train_frame, test_frame) : tuple of pandas.DataFrame
        Independent copies, rows in the same order as in ``all_labels``.
        No consumer appears in both frames.
    """
    ub = fold_size * section
    lb = ub - fold_size
    test_cons_list = cons_list[lb:ub]
    train_cons_list = set(customer_list) - set(test_cons_list)

    consumer_refs = all_labels['consumer_ref']
    # Rows without a consumer reference were never selected by the previous
    # per-consumer equality test, so keep excluding them.
    has_ref = consumer_refs.notna()

    # One vectorised membership test per frame replaces the per-consumer
    # concat loops (quadratic) and makes row order independent of set
    # iteration order. `.copy()` hands back frames callers can safely mutate.
    test_frame = all_labels[has_ref & consumer_refs.isin(test_cons_list)].copy()
    train_frame = all_labels[has_ref & consumer_refs.isin(train_cons_list)].copy()
    return train_frame, test_frame

In [8]:
# Build the five train/test splits; sections are numbered 1 through 5.
(
    (train_1, test_1),
    (train_2, test_2),
    (train_3, test_3),
    (train_4, test_4),
    (train_5, test_5),
) = (get_train_test_frame(section) for section in range(1, 6))

In [10]:
# get X_train, X_test, y_train, y_test (in that return order) for the cross-validation folds
def _build_feature_frame(frame, vectorizer):
    """Return ``wagesal_pattern`` followed by the embedding of ``pattern``, row-aligned by position."""
    embedded = frame['pattern'].apply(vectorizer)
    embedding_columns = pd.DataFrame.from_records(embedded.array)
    # drop=True discards the old index outright, so this also works when the
    # frame's index is named (reset_index() + drop('index') would raise then).
    indicator = frame['wagesal_pattern'].reset_index(drop=True)
    return pd.concat([indicator, embedding_columns], axis=1)


def apply_fasttext(train_frame, test_frame, vectorizer=None):
    """Build model inputs and targets for one cross-validation fold.

    Parameters
    ----------
    train_frame, test_frame : pandas.DataFrame
        Frames with ``pattern``, ``wagesal_pattern`` and ``is_salary`` columns.
    vectorizer : callable, optional
        Maps one ``pattern`` value to a 1-D vector. Defaults to
        ``model_fasttext.get_sentence_vector``.

    Returns
    -------
    (X_train, X_test, y_train, y_test)
        Each X has a ``wagesal_pattern`` column followed by integer-labelled
        embedding columns, on a fresh 0..n-1 index. Each y is the frame's
        ``is_salary`` column with the frame's original index.
    """
    # NOTE(review): X mixes a string column label with integer labels; recent
    # scikit-learn releases may reject mixed-type feature names - confirm
    # against the installed version.
    if vectorizer is None:
        vectorizer = model_fasttext.get_sentence_vector

    X_train = _build_feature_frame(train_frame, vectorizer)
    X_test = _build_feature_frame(test_frame, vectorizer)
    y_train = train_frame["is_salary"]
    y_test = test_frame["is_salary"]
    return X_train, X_test, y_train, y_test

In [11]:
# Build features and targets for each of the five folds.
(
    (X_train_1, X_test_1, y_train_1, y_test_1),
    (X_train_2, X_test_2, y_train_2, y_test_2),
    (X_train_3, X_test_3, y_train_3, y_test_3),
    (X_train_4, X_test_4, y_train_4, y_test_4),
    (X_train_5, X_test_5, y_train_5, y_test_5),
) = (
    apply_fasttext(train_part, test_part)
    for train_part, test_part in (
        (train_1, test_1),
        (train_2, test_2),
        (train_3, test_3),
        (train_4, test_4),
        (train_5, test_5),
    )
)

### Estimate final custom-accuracy through cross-validation

In [19]:
def cross_val_custom(X_train, y_train, X_test, test_frame):
    """Fit the tuned SVC on one fold and return its consumer-level accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``SVC.fit``.
    X_test
        Test features; must have one row per row of ``test_frame``.
    test_frame : pandas.DataFrame
        Test rows with ``is_salary`` and ``consumer_ref`` columns.
        Modified in place: gains ``prediction`` and ``prediction_incorrect``.

    Returns
    -------
    The value of ``consumer_accuracy(test_frame, 'prediction_incorrect')``.
    """
    # Best model obtained from hyperparameter tuning in the
    # "Comparison of Approaches and Model Selection" notebook.
    svc_params = dict(
        C=2.2058449429580245,
        break_ties=False,
        cache_size=200,
        class_weight=None,
        coef0=0.0,
        decision_function_shape='ovr',
        degree=3,
        gamma=1.0699098521619943,
        kernel='rbf',
        max_iter=-1,
        probability=False,
        random_state=42,
        shrinking=True,
        tol=0.001,
        verbose=False,
    )
    classifier = SVC(**svc_params)
    classifier.fit(X_train, y_train)

    # predict() returns an array, so the assignment is positional.
    test_frame['prediction'] = classifier.predict(X_test)
    test_frame['prediction_incorrect'] = test_frame['is_salary'] != test_frame['prediction']
    return consumer_accuracy(test_frame, 'prediction_incorrect')

In [20]:
# Train and score the tuned model on each fold, in fold order.
(
    consumer_accuracy_1,
    consumer_accuracy_2,
    consumer_accuracy_3,
    consumer_accuracy_4,
    consumer_accuracy_5,
) = (
    cross_val_custom(fold_X_train, fold_y_train, fold_X_test, fold_test_frame)
    for fold_X_train, fold_y_train, fold_X_test, fold_test_frame in (
        (X_train_1, y_train_1, X_test_1, test_1),
        (X_train_2, y_train_2, X_test_2, test_2),
        (X_train_3, y_train_3, X_test_3, test_3),
        (X_train_4, y_train_4, X_test_4, test_4),
        (X_train_5, y_train_5, X_test_5, test_5),
    )
)

In [21]:
# Per-fold consumer-level accuracies, one per line (folds 1 through 5).
print(
    consumer_accuracy_1,
    consumer_accuracy_2,
    consumer_accuracy_3,
    consumer_accuracy_4,
    consumer_accuracy_5,
    sep="\n",
)

0.78
0.81
0.75
0.79
0.78


In [22]:
# Unweighted mean of the five per-fold consumer accuracies.
final_custom_accuracy_avg = (consumer_accuracy_1+consumer_accuracy_2+consumer_accuracy_3+
                             consumer_accuracy_4+consumer_accuracy_5)/5



In [23]:
# Display the cross-validated mean consumer accuracy.
final_custom_accuracy_avg

0.782

# Final average custom-accuracy estimate (mean over the 5 cross-validation folds) = 78.2%