In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import glob
import os

In [2]:
def latest_csv(pattern):
    """Return the path of the newest file matching the glob `pattern`.

    "Newest" is decided by os.path.getctime, which is the creation time on
    Windows but the last metadata-change time on Unix, so the chosen file can
    differ between platforms.

    Raises:
        FileNotFoundError: if no file matches `pattern` (instead of the opaque
            "max() ... empty" ValueError the bare max() call would give).
    """
    candidates = glob.glob(pattern)
    if not candidates:
        raise FileNotFoundError(f"No files match {pattern!r}")
    return max(candidates, key=os.path.getctime)


# Load the most recent train and test exports.
train_path = latest_csv('./data/train/*.csv')
df = pd.read_csv(train_path)
test_path = latest_csv('./data/test/*.csv')
test = pd.read_csv(test_path)


<h1>Get the data into the correct format</h1>

In [3]:
# Separate the `state` target column from the remaining feature columns,
# for the training frame and the test frame alike.
X_train = df.drop(columns='state')
y_train = df['state']

X_test = test.drop(columns='state')
y_test = test['state']

In [4]:
# Display the training features and the training target together as a tuple.
(X_train, y_train)

(       reward_tiers  min_reward  max_reward      goal  staff_pick  has_video  \
 0          0.041667    0.000167    0.004995  0.000090           1          1   
 1          0.058333    0.000067    0.004495  0.000050           1          1   
 2          0.058333    0.000033    0.000525  0.000006           0          1   
 3          0.041667    0.000133    0.010990  0.000340           0          1   
 4          0.066667    0.000007    0.004995  0.000200           1          1   
 ...             ...         ...         ...       ...         ...        ...   
 24751      0.233333    0.000007    0.002495  0.000015           0          0   
 24752      0.100000    0.000033    0.004995  0.000022           0          1   
 24753      0.083333    0.000133    0.000345  0.000005           0          0   
 24754      0.075000    0.000067    0.024995  0.000150           1          1   
 24755      0.000000    0.000667    0.000495  0.000035           0          1   
 
        rewards_0  rewards

In [5]:
# List the distinct dtypes that occur among the feature columns.
feature_dtypes = X_train.dtypes
feature_dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

<h3>Training using Logistic Regression Model</h3>

In [6]:
# Print the shape of each split, in the order X_train, y_train, X_test, y_test.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)


(24756, 575)
(24756,)
(4369, 575)
(4369,)


In [None]:
# Baseline logistic regression with default hyperparameters; only max_iter is
# raised far above the library default.
LRModel = LogisticRegression(max_iter = 200000)
LRModel.fit(X_train, y_train)

# Train set
LRModel_train_score = LRModel.score(X_train, y_train)
print('Train Set Accuracy Score: ', LRModel_train_score)
pred = LRModel.predict(X_train)
print("Train Set Confusion Matrix \n", confusion_matrix(y_train, pred))
print("Train Set",classification_report(y_train, pred))

# ROC AUC measures how well the model *ranks* samples, so it must be fed
# continuous scores. Passing the hard labels from predict() reduces the ROC
# curve to a single operating point and understates the AUC.
# Column 1 of predict_proba is the probability of LRModel.classes_[1]
# (assumes `state` is a binary target; confirm against the data).
train_proba = LRModel.predict_proba(X_train)[:, 1]
area_under_curve_train = metrics.roc_auc_score(y_train, train_proba)
print("Area Under Curve:", area_under_curve_train)

#Test Set
LRModel_test_score = LRModel.score(X_test, y_test)
print('Test Set Accuracy Score: ', LRModel_test_score)
test_pred = LRModel.predict(X_test)
print("Test Set Confusion Matrix \n", confusion_matrix(y_test, test_pred))
print("Test Set",classification_report(y_test, test_pred))

# Same as above: score the test set with probabilities, not hard labels.
test_proba = LRModel.predict_proba(X_test)[:, 1]
area_under_curve_test = metrics.roc_auc_score(y_test, test_proba)
print("Area Under Curve:", area_under_curve_test)

## Hyperparameter tuning

In [None]:
# Define the model and the candidate hyperparameters for Logistic Regression
model = LogisticRegression(max_iter=200000)
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'saga']
l1_solvers = ['liblinear', 'saga']                 # the only listed solvers that support l1
unpenalized_solvers = ['newton-cg', 'lbfgs', 'saga']  # liblinear cannot fit without a penalty
c_values = 10. ** np.arange(-3,8)

# Define the grid search.
# A single Cartesian grid would also try solver/penalty pairs that
# LogisticRegression rejects (e.g. l1 with lbfgs); each of those fits fails and
# is recorded as accuracy 0 because of error_score=0. Listing one sub-grid per
# penalty keeps only valid pairs and avoids the wasted fits.
# The unpenalized model ignores C, so it is not swept over c_values.
# NOTE: `None` is the scikit-learn >= 1.2 spelling of "no penalty"; the string
# 'none' was removed in 1.4. On scikit-learn < 1.2 use 'none' instead.
grid = [
    dict(penalty=['l2'], solver=solvers, C=c_values),
    dict(penalty=['l1'], solver=l1_solvers, C=c_values),
    dict(penalty=[None], solver=unpenalized_solvers),
]
# 10-fold stratified CV repeated 3 times -> 30 fits per candidate.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)

# Results: best candidate first, then mean (std) accuracy for every candidate
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

<h3> Testing with training model output </h3>