In [1]:
import pandas as pd 
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import glob
import os

In [2]:
train_path = max(glob.glob('./data/train/*.csv'), key=os.path.getctime) 
df = pd.read_csv(train_path)
df['faq_count'] = df['faq_count'].apply(lambda x: float(x.replace(",", ""))) # transfer over to feature eng

## Get to the correct data format

In [3]:
to_drop = ['rewards', 'all_reward_amount', 'deadline', 'launched_at', 'rewards_processed',
           'description_processed', 'description_story_processed','description_risks_processed',
           'id', 'name', 'description', 'description_story', 'description_risks', 'video', 'state']

X_train, y_train = df.drop(to_drop, axis=1).to_numpy(), df['state'].to_numpy()

## Grid Search with K-fold CV applied to SVC

5 folds 
- approx 311 per fold 
- 62 validation data 
- 249 train data

### 1. Training

In [4]:
# Hyper-parameters
C = [0.1,1, 10, 100]
gamma = [1,0.1,0.01,0.001]
kernels = ['rbf', 'poly', 'sigmoid']

params = {'C': C, 
          'gamma': gamma, 
          'kernel': kernels}

grid = GridSearchCV(SVC(), params, refit=True, verbose=2)
grid.fit(X_train, y_train)



Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.5s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.5s
[CV] END .....................C=0.1, gamma=1, k

[CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time=   0.5s
[CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time=   0.5s
[CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time=   0.5s
[CV] END ....................C=1, gamma=0.01, kernel=sigmoid; total time=   0.5s
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=   0.5s
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=   0.5s
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=   0.5s
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=   0.5s
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=   0.5s
[CV] END ......................C=1, gamma=0.001, kernel=poly; total time=   0.0s
[CV] END ......................C=1, gamma=0.001, kernel=poly; total time=   0.0s
[CV] END ......................C=1, gamma=0.001, kernel=poly; total time=   0.0s
[CV] END ...................

[CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time=   0.5s
[CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time=   0.5s
[CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time=   0.5s
[CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time=   0.5s
[CV] END ...................C=100, gamma=0.1, kernel=sigmoid; total time=   0.5s
[CV] END ......................C=100, gamma=0.01, kernel=rbf; total time=   0.5s
[CV] END ......................C=100, gamma=0.01, kernel=rbf; total time=   0.5s
[CV] END ......................C=100, gamma=0.01, kernel=rbf; total time=   0.5s
[CV] END ......................C=100, gamma=0.01, kernel=rbf; total time=   0.5s
[CV] END ......................C=100, gamma=0.01, kernel=rbf; total time=   0.5s
[CV] END .....................C=100, gamma=0.01, kernel=poly; total time=   0.0s
[CV] END .....................C=100, gamma=0.01, kernel=poly; total time=   0.0s
[CV] END ...................

### 2. Best Hyper-parameters

In [5]:
print(grid.best_estimator_)

SVC(C=0.1, gamma=1, kernel='poly')


### 3. Testing

In [6]:
test_path = max(glob.glob('./data/test/*.csv'), key=os.path.getctime) 
test = pd.read_csv(test_path)
test['faq_count'] = test['faq_count'].apply(lambda x: float(x.replace(",", ""))) # transfer over to feature eng

to_drop = ['rewards', 'all_reward_amount', 'deadline', 'launched_at', 'rewards_processed',
           'description_processed', 'description_story_processed','description_risks_processed',
           'id', 'name', 'description', 'description_story', 'description_risks', 'video', 'state']

X_test, y_test = test.drop(to_drop, axis=1).to_numpy(), df['state'].to_numpy()

In [7]:
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test,grid_predictions))
print(classification_report(y_test,grid_predictions))

auc = roc_auc_score(y_test, grid_predictions)
print("AUC:", auc)

ValueError: X has 464 features, but SVC is expecting 545 features as input.