In [10]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import glob
import os
import time 

In [11]:
train_path = max(glob.glob('/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4222/Data/train/*.csv'), key=os.path.getctime) 
print(train_path)
df = pd.read_csv(train_path)

/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4222/Data/train/kickstarter_train_final_20221022-142838.csv


## Get to the correct data format

In [12]:
X_train, y_train = df.drop('state', axis=1).to_numpy(), df['state'].to_numpy()

## Randomized Search with K-fold CV applied to SVC

**Hyperparameter Tuning**

We will use RandomizedSearchCV for hyperparameter tuning.

### 1. Training

In [None]:
# Hyper-parameters
gamma = [1e-7,1e-4,1e-3,1e-1,1e0,1e1]
C = [1e-3,1e-2,1e0,1e1,1e3,1e5]
kernels = ['rbf', 'linear']  #poly takes forever to train 'sigmoid'

params = {'C': C, 'gamma': gamma, 'kernel': kernels}
cv_method = RepeatedStratifiedKFold(n_splits=5, 
                                    n_repeats=3, 
                                    random_state=2022)
svc_randomcv = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=params,
    n_iter=10, 
    cv=cv_method,
    verbose=2,
    random_state=2022,
    # n_jobs=-1, # use all processors,
    scoring='roc_auc'
)

svc_randomcv.fit(X_train, y_train)

Fitting 15 folds for each of 10 candidates, totalling 150 fits
[CV] END ..................C=100000.0, gamma=1.0, kernel=rbf; total time= 9.5min
[CV] END ..................C=100000.0, gamma=1.0, kernel=rbf; total time= 9.6min
[CV] END ..................C=100000.0, gamma=1.0, kernel=rbf; total time= 9.7min
[CV] END ..................C=100000.0, gamma=1.0, kernel=rbf; total time= 9.7min
[CV] END ..................C=100000.0, gamma=1.0, kernel=rbf; total time= 9.5min
[CV] END ..................C=100000.0, gamma=1.0, kernel=rbf; total time= 9.6min
[CV] END ..................C=100000.0, gamma=1.0, kernel=rbf; total time= 9.5min
[CV] END ..................C=100000.0, gamma=1.0, kernel=rbf; total time= 9.5min
[CV] END ..................C=100000.0, gamma=1.0, kernel=rbf; total time= 9.4min
[CV] END ..................C=100000.0, gamma=1.0, kernel=rbf; total time= 9.4min
[CV] END ..................C=100000.0, gamma=1.0, kernel=rbf; total time= 9.5min
[CV] END ..................C=100000.0, gamma=1

### 2. Best Hyper-parameters

In [None]:
print(svc_randomcv.best_params_)
print(svc_randomcv.best_score_)

### 3. Testing

In [9]:
test_path = max(glob.glob('/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4222/Data/test/*.csv'), key=os.path.getctime) 
print(test_path)
test = pd.read_csv(test_path)

X_test, y_test = test.drop('state', axis=1).to_numpy(), test['state'].to_numpy()

/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4222/Data/test/kickstarter_test_final_20221022-162938.csv


In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
best_clf = SVC(**svc_randomcv.best_params_)

start_time = time.time()
best_clf.fit(X_train, y_train)
y_pred_test = best_clf.predict(X_test)

time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds

print(classification_report(y_test, y_pred_test, digits=5))
roc_auc_score(y_test, y_pred_test)