In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import glob
import os

In [24]:
train_path = max(glob.glob('/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4222/Data/train/*.csv'), key=os.path.getctime) 
df = pd.read_csv(train_path)

## Get to the correct data format

In [26]:
to_drop = ['rewards', 'deadline', 'launched_at', 'rewards_processed', 'created_at',
           'description_processed', 'description_story_processed','description_risks_processed',
           'id', 'name', 'description', 'description_story', 'description_risks', 'video', 'state',
          'location', 'category']

X_train, y_train = df.drop(to_drop, axis=1).to_numpy(), df['state'].to_numpy()

## Grid Search with K-fold CV applied to SVC

5 folds 
- approx 311 per fold 
- 62 validation data 
- 249 train data

### 1. Training

In [None]:
# Hyper-parameters
C = [0.1,1, 10, 100]
gamma = [1,0.1,0.01,0.001]
kernels = ['rbf'] #poly takes forever to train 'sigmoid'

params = {'C': C, 
          'gamma': gamma, 
          'kernel': kernels}

grid = GridSearchCV(SVC(), params, refit=True, verbose=2)
grid.fit(X_train, y_train)



Fitting 5 folds for each of 16 candidates, totalling 80 fits


### 2. Best Hyper-parameters

In [5]:
print(grid.best_estimator_)

SVC(C=1, gamma=0.001)


### 3. Testing

In [6]:
test_path = max(glob.glob('/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4222/Data/test/*.csv'), key=os.path.getctime) 
test = pd.read_csv(test_path)

X_test, y_test = test.drop(to_drop, axis=1).to_numpy(), test['state'].to_numpy()

In [7]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1557, 540)
(1557,)
(293, 540)
(293,)


In [8]:
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test,grid_predictions))
print(classification_report(y_test,grid_predictions))

auc = roc_auc_score(y_test, grid_predictions)
print("AUC:", auc)

[[ 30 114]
 [ 10 139]]
              precision    recall  f1-score   support

           0       0.75      0.21      0.33       144
           1       0.55      0.93      0.69       149

    accuracy                           0.58       293
   macro avg       0.65      0.57      0.51       293
weighted avg       0.65      0.58      0.51       293

AUC: 0.5706096196868009


In [9]:
for c in test.drop(to_drop_more, axis=1).columns: print(c)

reward_tiers
min_reward
max_reward
has_video
rewards_0
rewards_1
rewards_2
rewards_3
rewards_4
rewards_5
rewards_6
rewards_7
rewards_8
rewards_9
rewards_10
rewards_11
rewards_12
rewards_13
rewards_14
rewards_15
rewards_16
rewards_17
rewards_18
rewards_19
rewards_20
rewards_21
rewards_22
rewards_23
rewards_24
rewards_25
rewards_26
rewards_27
rewards_28
rewards_29
rewards_30
rewards_31
rewards_32
rewards_33
rewards_34
rewards_35
rewards_36
rewards_37
rewards_38
rewards_39
rewards_40
rewards_41
rewards_42
rewards_43
rewards_44
rewards_45
rewards_46
rewards_47
rewards_48
rewards_49
rewards_50
rewards_51
rewards_52
rewards_53
rewards_54
rewards_55
rewards_56
rewards_57
rewards_58
rewards_59
rewards_60
rewards_61
rewards_62
rewards_63
rewards_64
rewards_65
rewards_66
rewards_67
rewards_68
rewards_69
rewards_70
rewards_71
rewards_72
rewards_73
rewards_74
rewards_75
rewards_76
rewards_77
rewards_78
rewards_79
rewards_80
rewards_81
rewards_82
rewards_83
rewards_84
rewards_85
rewards_86
rewards_