In [16]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import glob
import os
import time

In [None]:
# Folder that holds the CSV files. The path is absolute and specific to one
# machine, and it must keep its trailing slash because file names are
# appended to it by plain string concatenation (here and in the test cell).
base = "/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4012/Data/"

# Load the training CSV into a DataFrame.
train_path = f"{base}twitter_data_train_multiclass.csv"
df = pd.read_csv(filepath_or_buffer=train_path)

## Prepare the data in the correct format

In [None]:
# Columns removed before building the feature matrix. 'account_type' is the
# target used below; the others are dropped as-is (presumably identifiers,
# free text and an alternative label -- confirm against the dataset).
to_drop = [
    'account_type', 'account_type_multi',
    'id', 'name', 'screen_name',
    'profile_image_url',
    'protected', 'verified',
    'description', 'description_processed',
    'tweets_list', 'tweets_list_processed',
    'url',
]

# Features: every remaining column, as a NumPy array.
X_train = df.drop(columns=to_drop).to_numpy()
# Target: the 'account_type' column, as a NumPy array.
y_train = df['account_type'].to_numpy()

In [None]:
# Data-quality checks on the training frame before fitting.

# Columns of the raw frame that contain at least one missing value.
# print() is required here: a notebook only displays the LAST expression of a
# cell, so a bare expression on this line would be silently discarded.
print(df.columns[df.isna().any()])

# Feature columns (everything not listed in to_drop) that contain at least one
# infinite value. np.isinf assumes the remaining columns are numeric.
a = df.drop(to_drop, axis=1)
a.columns.to_series()[np.isinf(a).any()]


## Randomized Search with Repeated Stratified K-fold CV applied to SVC

5 folds (each split uses approx. 311 samples in total):
- approx. 62 samples for validation
- approx. 249 samples for training

### 1. Training

In [None]:
# Hyper-parameter candidates for the randomized search.
# gamma covers nine powers of ten (1e-7 .. 1e1); C covers nine (1e-3 .. 1e5).
gamma = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1]
C = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5]
# 'poly' is left out because it takes forever to train; 'sigmoid' is also not searched.
kernels = ['rbf', 'linear']

params = dict(
    C=C,
    gamma=gamma,
    kernel=kernels,
    decision_function_shape=['ovr'],
)

# 5 stratified folds repeated 3 times, i.e. 15 train/validation splits per candidate.
cv_method = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=2022)

# Sample 10 parameter combinations, score each by weighted F1 across all
# splits, and run the fits on every available processor.
svc_randomcv = RandomizedSearchCV(
    SVC(),
    params,
    n_iter=10,
    scoring='f1_weighted',
    cv=cv_method,
    random_state=2022,
    n_jobs=-1,
    verbose=2,
)
svc_randomcv.fit(X_train, y_train)

### 2. Best Hyper-parameters

In [None]:
# Best parameter combination found by the search, followed by its mean
# cross-validated score (weighted F1, as configured in `scoring`).
print(svc_randomcv.best_params_, svc_randomcv.best_score_, sep="\n")

### 3. Testing

In [None]:
# Load the held-out test CSV from the same data folder as the training file.
test_path = f"{base}twitter_data_test_multiclass.csv"
test = pd.read_csv(filepath_or_buffer=test_path)

# Build features and target exactly as for the training split:
# drop the to_drop columns for X, use 'account_type' for y.
X_test = test.drop(columns=to_drop).to_numpy()
y_test = test['account_type'].to_numpy()

In [None]:
# Shapes of the train/test feature matrices and target vectors, one per line
# (X_train, y_train, X_test, y_test).
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

In [None]:
# Build a fresh SVC from the best hyper-parameters found by the randomized search.
best_clf = SVC(**svc_randomcv.best_params_)

# Time fitting on the full training set plus prediction on the test set.
start_time = time.time()
best_clf.fit(X_train, y_train)
y_pred_test = best_clf.predict(X_test)

time_taken = time.time() - start_time
print("Total time taken for the program execution", time_taken) # seconds

print(classification_report(y_test, y_pred_test, digits=5))
# ROC AUC is a ranking metric, so it is computed from the continuous decision
# values rather than from hard class predictions, and it is printed because a
# bare expression in the middle of a cell is never displayed.
# NOTE(review): assumes 'account_type' is a binary target -- confirm. A
# multiclass target would need the `multi_class` argument and per-class scores.
print("AUC:", roc_auc_score(y_test, best_clf.decision_function(X_test)))

# Cross-check with the estimator refitted inside the search object
# (RandomizedSearchCV refits the best candidate on the full training data by
# default, so predict/decision_function are delegated to it). This previously
# referenced an undefined name `grid`, which raises NameError on a fresh kernel.
grid_predictions = svc_randomcv.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

auc = roc_auc_score(y_test, svc_randomcv.decision_function(X_test))
print("AUC:", auc)