In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import glob
import os

In [2]:
# Collect every training CSV; the one with the greatest os.path.getctime value is loaded.
train_files = glob.glob('./data/train/*.csv')
if not train_files:
    # max() on an empty list only raises a generic ValueError about an empty
    # sequence; fail with a message that names the missing data instead.
    raise FileNotFoundError("No training CSV files found under './data/train/'")
train_path = max(train_files, key=os.path.getctime)
df = pd.read_csv(train_path)

## Get to the correct data format

In [3]:
# Columns removed from the feature matrix. 'state' is among them because it
# is the target and is extracted separately below.
to_drop = [
    'rewards', 'all_reward_amount', 'deadline', 'launched_at', 'rewards_processed',
    'description_processed', 'description_story_processed', 'description_risks_processed',
    'id', 'name', 'description', 'description_story', 'description_risks', 'video', 'state',
    'pledged', 'goal',
]

# Additional features that depend on time and on the final outcome.
to_drop_more = [*to_drop, 'staff_pick', 'spotlight', 'backers_count', 'update_count', 'faq_count']

# Features: whatever remains after the drop; labels: the 'state' column.
train_features = df.drop(columns=to_drop_more)
X_train = train_features.to_numpy()
y_train = df['state'].to_numpy()

## Grid Search with K-fold CV applied to SVC

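5-fold cross-validation:
- approx. 311 samples per split (62 + 249)
- approx. 62 samples held out for validation in each split
- approx. 249 samples used for training in each split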

### 1. Training

In [None]:
# Search space for the SVC: every combination of the values below is tried
# (4 values of C x 4 values of gamma x 3 kernels).
C = [0.1, 1, 10, 100]
gamma = [1, 0.1, 0.01, 0.001]
kernels = ['rbf', 'poly', 'sigmoid']

params = dict(C=C, gamma=gamma, kernel=kernels)

# refit=True retrains the best configuration on all of X_train, so `grid`
# can be used directly for prediction afterwards.
grid = GridSearchCV(estimator=SVC(), param_grid=params, refit=True, verbose=2)
grid.fit(X_train, y_train)



Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.5s


### 2. Best Hyper-parameters

In [None]:
# The estimator repr may omit parameters that equal their defaults
# (scikit-learn's print_changed_only display option), so the winning grid
# values and their cross-validation score are printed explicitly as well.
print(grid.best_estimator_)
print("Best parameters:", grid.best_params_)
print("Best CV score:", grid.best_score_)

### 3. Testing

In [None]:
# Collect every test CSV; the one with the greatest os.path.getctime value is loaded.
test_files = glob.glob('./data/test/*.csv')
if not test_files:
    # max() on an empty list only raises a generic ValueError about an empty
    # sequence; fail with a message that names the missing data instead.
    raise FileNotFoundError("No test CSV files found under './data/test/'")
test_path = max(test_files, key=os.path.getctime)
test = pd.read_csv(test_path)

# Same column drop as for the training data; 'state' is the label.
X_test, y_test = test.drop(to_drop_more, axis=1).to_numpy(), test['state'].to_numpy()

In [None]:
# Sanity check: show the shapes of the train and test features and labels, in that order.
for split_array in (X_train, y_train, X_test, y_test):
    print(split_array.shape)

In [None]:
# Hard class predictions from the refitted best estimator feed the confusion
# matrix and the per-class report.
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

# ROC AUC is a ranking metric and needs continuous scores. Passing hard labels
# collapses the ROC curve to a single operating point, so the signed distance
# to the decision boundary is used instead.
# NOTE(review): assumes 'state' is a binary target, as the original call did — confirm.
grid_scores = grid.decision_function(X_test)
auc = roc_auc_score(y_test, grid_scores)
print("AUC:", auc)

In [None]:
# List the names of the columns that remain as model features, one per line.
feature_columns = test.drop(columns=to_drop_more).columns
for column_name in feature_columns:
    print(column_name)