# SVM methods

In [1]:
import os
import joblib
import json
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

from sklearn.svm import SVC

In [2]:
base_dir = os.getcwd()
datafile = os.path.join(base_dir, "data", "data_cleaned.csv")

In [3]:
df = pd.read_csv(datafile, index_col=None)
df.head()

Unnamed: 0,label,tokens,numbers,has_url
0,0,"[""'hpl"", 'nom', 'may', 'see', 'attached', 'fil...",0.266667,False
1,0,"[""'nom"", 'actual', 'vols', 'th', 'forwarded', ...",0.267943,True
2,0,"[""'enron"", 'actuals', 'march', 'april', 'estim...",0.37931,False
3,0,"[""'hpl"", 'nom', 'may', 'see', 'attached', 'fil...",0.266667,False
4,0,"[""'hpl"", 'nom', 'june', 'see', 'attached', 'fi...",0.266667,False


In [4]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=33)
x_train = train_df['tokens']
x_test = test_df['tokens']
y_train = train_df['label']
y_test = test_df['label']

In [5]:
vectorizer = CountVectorizer(lowercase=True, stop_words='english', max_features=100000)
x_train_bow = vectorizer.fit_transform(x_train)
x_test_bow  = vectorizer.transform(x_test)

In [7]:
x_train_bow.shape

(57740, 100000)

## Train and grid-search SVM parameters

In [15]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}


In [16]:
svc = SVC()
grid = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid.fit(x_train_bow, y_train)


Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 5.7min
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 7.6min
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 7.6min
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 5.9min
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 5.8min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=22.1min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=22.2min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=22.2min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=22.2min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=22.2min
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 5.8min
[CV] END ....................C=0.1, gamma=0.1, 



[CV] END ......................C=1, gamma=0.1, kernel=linear; total time= 8.0min
[CV] END ......................C=1, gamma=0.1, kernel=linear; total time= 5.7min
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=57.3min
[CV] END ......................C=1, gamma=0.1, kernel=linear; total time= 7.7min
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=61.6min
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=62.9min
[CV] END .....................C=1, gamma=0.01, kernel=linear; total time= 7.5min
[CV] END .....................C=1, gamma=0.01, kernel=linear; total time= 5.6min
[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=36.8min
[CV] END .....................C=1, gamma=0.01, kernel=linear; total time= 7.7min
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=55.4min
[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=37.2min
[CV] END ...................

In [17]:
print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Accuracy:", grid.best_score_)

best_svc = grid.best_estimator_
y_pred = best_svc.predict(x_test_bow)
print(classification_report(y_test, y_pred))


Best Parameters: {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
Best Cross-Validation Accuracy: 0.982074818150329
              precision    recall  f1-score   support

           0       0.99      0.98      0.98     11951
           1       0.98      0.99      0.98     12795

    accuracy                           0.98     24746
   macro avg       0.98      0.98      0.98     24746
weighted avg       0.98      0.98      0.98     24746



### Save the model and best params

In [18]:
joblib.dump(best_svc, os.path.join(base_dir, 'models', 'SVC.joblib'))
with open(os.path.join(base_dir, 'models', 'SVC.json'), 'w') as f:
    f.write(json.dumps(grid.best_params_))