In [54]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import glob
import os

In [55]:
train_path = max(glob.glob('/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4222/Data/train/*.csv'), key=os.path.getctime) 
print(train_path)
df = pd.read_csv(train_path)

/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4222/Data/train/kickstarter_train_final_20221015-144713.csv


## Get to the correct data format

In [56]:
X_train, y_train = df.drop('state', axis=1).to_numpy(), df['state'].to_numpy()

## Grid Search with K-fold CV applied to SVC

5 folds 
- approx 311 per fold 
- 62 validation data 
- 249 train data

### 1. Training

In [None]:
# Hyper-parameters
C = [0.1, 1, 10, 100]
gamma = [1, 0.1, 0.01, 0.001]
kernels = ['rbf']  #poly takes forever to train 'sigmoid'

params = {'C': C, 'gamma': gamma, 'kernel': kernels}

grid = GridSearchCV(SVC(), params, refit=True, verbose=2)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 7.9min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 7.9min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 7.8min


### 2. Best Hyper-parameters

In [5]:
print(grid.best_estimator_)

SVC(C=1, gamma=0.001)


### 3. Testing

In [49]:
test_path = max(glob.glob('/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4222/Data/test/*.csv'), key=os.path.getctime) 
print(test_path)
test = pd.read_csv(test_path)

X_test, y_test = test.drop('state', axis=1).to_numpy(), test['state'].to_numpy()

/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4222/Data/test/kickstarter_test_final_20221015-012736.csv


In [7]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1557, 540)
(1557,)
(293, 540)
(293,)


In [8]:
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test,grid_predictions))
print(classification_report(y_test,grid_predictions))

auc = roc_auc_score(y_test, grid_predictions)
print("AUC:", auc)

[[ 30 114]
 [ 10 139]]
              precision    recall  f1-score   support

           0       0.75      0.21      0.33       144
           1       0.55      0.93      0.69       149

    accuracy                           0.58       293
   macro avg       0.65      0.57      0.51       293
weighted avg       0.65      0.58      0.51       293

AUC: 0.5706096196868009
