In [31]:
import numpy as np
import pandas as pd

# models
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# evaluation metrics 
from sklearn.metrics import accuracy_score, roc_auc_score

import warnings
warnings.filterwarnings("ignore")

In [32]:
X_train = pd.read_csv('Xtrain.csv')
X_val = pd.read_csv('Xval.csv')
X_test = pd.read_csv('Xtest.csv')

y_train = X_train.pop('final_status')
y_val = X_val.pop('final_status')
y_test = X_test.pop('final_status')

features = pd.read_csv('selected_features.csv', header=None)
features = features.iloc[:, 0].to_list()
features

['disable_communication',
 'backers_count',
 'AU',
 'GB',
 'US',
 'GBP',
 'USD',
 '2013',
 '2014',
 '3d',
 'action',
 'album',
 'android',
 'animated',
 'anthology',
 'app',
 'art',
 'band',
 'based',
 'burning man',
 'business',
 'card game',
 'cd',
 'clothing',
 'comedy',
 'create',
 'debut',
 'debut album',
 'detroit',
 'documentary',
 'dream',
 'edition',
 'energy',
 'ep',
 'explores',
 'fantasy',
 'fashion',
 'feature',
 'feature film',
 'festival',
 'film',
 'final',
 'first',
 'first full',
 'folk',
 'food',
 'food truck',
 'free',
 'full length',
 'game',
 'get',
 'girl',
 'help',
 'help us',
 'hip',
 'hip hop',
 'independent',
 'installation',
 'last',
 'length album',
 'life',
 'like',
 'little',
 'local',
 'looking',
 'man',
 'many',
 'mobile',
 'need help',
 'new',
 'new album',
 'night',
 'online',
 'painting',
 'paintings',
 'people',
 'piece',
 'platform',
 'playing cards',
 'pre order',
 'presents',
 'press',
 'print',
 'prints',
 'produce',
 'real',
 'reality',
 'recor

#### Subsetting to features selected

In [33]:
X_train = X_train[features]
X_val = X_val[features]
X_test = X_test[features]

In [34]:
X_train.shape

(70374, 114)

In [35]:
def evaluate_models(models, X_train, X_val, y_train, y_val):
    names = [i.__name__ for i in models]
    models_ev = pd.DataFrame(np.zeros((len(models), 4)), index=names, 
                             columns=['train_accuracy', 'train_roc_auc', 'val_accuracy', 'val_roc_auc'])
    
    for model in models:
        model = model()
        model.fit(X_train, y_train)
        train_preds = model.predict(X_train)
        val_preds = model.predict(X_val)
        
        models_ev.loc[model.__class__.__name__, 'train_accuracy'] = accuracy_score(y_train, train_preds)
        models_ev.loc[model.__class__.__name__, 'train_roc_auc'] = roc_auc_score(y_train, train_preds)
        
        models_ev.loc[model.__class__.__name__, 'val_accuracy'] = accuracy_score(y_val, val_preds)
        models_ev.loc[model.__class__.__name__, 'val_roc_auc'] = roc_auc_score(y_val, val_preds)
        
    return models_ev

In [36]:
models= [MultinomialNB, LinearSVC, LogisticRegression]
eval_mat = evaluate_models(models, X_train, X_val, y_train, y_val)
eval_mat.sort_values(by='val_roc_auc', ascending=False)

Unnamed: 0,train_accuracy,train_roc_auc,val_accuracy,val_roc_auc
LogisticRegression,0.775855,0.67886,0.77666,0.677217
LinearSVC,0.76956,0.663585,0.771556,0.6637
MultinomialNB,0.713204,0.54664,0.717939,0.548591


It seems like LinearSVC model has performed better with validation ROC-AUC score of about 78%. We will fine-tune this model to realize its utmost performance

In [37]:
param_grid = {'penalty':['l1', 'l2'],
              'loss':['hinge', 'squared_hinge'],
              'C':[1, 0.1, 0.01, 0.001, 0.0001],
             }
lsvc = LinearSVC(random_state=200)

from sklearn.model_selection import GridSearchCV

model_cv = GridSearchCV(lsvc, param_grid, cv=3, n_jobs=-1)
model_cv.fit(X_train, y_train)
model_cv.best_params_

{'C': 1, 'loss': 'hinge', 'penalty': 'l2'}

In [38]:
lsvc = LinearSVC(random_state=200, C=1, loss='hinge', penalty='l2')
lsvc.fit(X_train, y_train)

preds_test = lsvc.predict(X_test)

print("Test Accuracy Score = "+str(accuracy_score(y_test, preds_test)))
print("Test ROC-AUC Score = "+str(roc_auc_score(y_test, preds_test)))

Test Accuracy Score = 0.7783251231527094
Test ROC-AUC Score = 0.6771481977457874


This is not too far off the training and validation scores and this model will be selected for deployment for our application