In [1]:
import pandas as pd
import numpy as np
from pyMetaheuristic.algorithm import salp_swarm_algorithm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE, RandomOverSampler

import sys
sys.path.append('../')
from visualization import visualization_table
import pickle
from tqdm import tqdm
from scipy.sparse import load_npz



# **Load Data**

In [2]:
X_train = load_npz('X_train.npz')
X_test  = load_npz('X_test.npz')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

X_train_ros= load_npz('X_train_ros.npz')
X_test_ros = load_npz('X_test_ros.npz')
y_train_ros = pd.read_csv('y_train_ros.csv')
y_test_ros = pd.read_csv('y_test_ros.csv')

X_train_smote = load_npz('X_train_smote.npz')
X_test_smote = load_npz('X_test_smote.npz')
y_train_smote = pd.read_csv('y_train_smote.csv')
y_test_smote = pd.read_csv('y_test_smote.csv')

In [3]:
y_train.drop(['Unnamed: 0'], inplace = True, axis = 1)
y_test.drop(['Unnamed: 0'], inplace = True, axis = 1)

y_train_ros.drop(['Unnamed: 0'], inplace = True, axis = 1)
y_test_ros.drop(['Unnamed: 0'], inplace = True, axis = 1)

y_train_smote.drop(['Unnamed: 0'], inplace = True, axis = 1)
y_test_smote.drop(['Unnamed: 0'], inplace = True, axis = 1)

In [4]:
X_train

<28321x28177 sparse matrix of type '<class 'numpy.float64'>'
	with 311311 stored elements in Compressed Sparse Row format>

In [5]:
X_test

<9441x28177 sparse matrix of type '<class 'numpy.float64'>'
	with 103055 stored elements in Compressed Sparse Row format>

# **Datasets  Prepation**

# **Modelling** + **Visualization**

## No Oversampling

In [6]:
param_grid = {
    'kernel' : ['linear', 'poly', 'rbf'],
    'C': [1, 5, 10, 50],
    'gamma': [0.0001, 0.001, 0.005],
    'degree': [2, 3, 4]
}   


y_train = y_train['label']
y_test = y_test['label']

model = SVC()
grid_search = GridSearchCV(model, param_grid, cv=2)

%time grid_search.fit(X_train, y_train)
print('\nBest SVM Parameter for Normal Dataset: ', grid_search.best_params_)

CPU times: user 1h 1min 49s, sys: 25.4 s, total: 1h 2min 15s
Wall time: 1h 2min 17s

Best SVM Parameter for Normal Dataset:  {'C': 1, 'degree': 2, 'gamma': 0.0001, 'kernel': 'linear'}


In [7]:
svc_best = grid_search.best_estimator_
svc_best.fit(X_train, y_train)
y_pred = svc_best.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.96      0.90      6704
           1       0.80      0.66      0.72      2183
           2       0.66      0.14      0.22       554

    accuracy                           0.84      9441
   macro avg       0.77      0.59      0.62      9441
weighted avg       0.83      0.84      0.82      9441



## ROS (Before Splitting)

In [11]:
param_grid = {
    'kernel' : ['linear', 'poly', 'rbf'],
    'C': [1, 5, 10, 50],
    'gamma': [0.0001, 0.001, 0.005],
    'degree': [2, 3, 4]
}   

y_train_ros = y_train_ros
y_test_ros = y_test_ros

model = SVC()
grid_search = GridSearchCV(model, param_grid, cv=2)

%time grid_search.fit(X_train_ros, y_train_ros)
print('\nBest SVM Parameter for ROS : ', grid_search.best_params_)

CPU times: user 9h 6min 34s, sys: 3min 7s, total: 9h 9min 42s
Wall time: 9h 14min 13s

Best SVM Parameter for ROS :  {'C': 50, 'degree': 2, 'gamma': 0.0001, 'kernel': 'linear'}


In [12]:
svc_best = grid_search.best_estimator_
svc_best.fit(X_train_ros, y_train_ros)
y_pred_ros = svc_best.predict(X_test_ros)

print(classification_report(y_test_ros, y_pred_ros))

              precision    recall  f1-score   support

           0       0.96      0.86      0.91      6750
           1       0.91      0.95      0.93      6650
           2       0.93      0.99      0.96      6687

    accuracy                           0.93     20087
   macro avg       0.94      0.93      0.93     20087
weighted avg       0.94      0.93      0.93     20087



## SMOTE (Before Splitting)

In [15]:
param_grid = {
    'kernel' : ['linear', 'poly', 'rbf'],
    'C': [1, 5, 10, 50],
    'gamma': [0.0001, 0.001, 0.005],
    'degree': [2, 3, 4]
}   

y_train_smote = y_train_smote
y_test_smote = y_test_smote

model = SVC()
grid_search = GridSearchCV(model, param_grid, cv=2)

%time grid_search.fit(X_train_smote, y_train_smote)
print('\nBest SVM Parameter for SMOTE : ', grid_search.best_params_)

CPU times: user 9h 2min 17s, sys: 2min 45s, total: 9h 5min 2s
Wall time: 9h 6min 2s

Best SVM Parameter for SMOTE :  {'C': 10, 'degree': 2, 'gamma': 0.0001, 'kernel': 'linear'}


In [18]:
svc_best = grid_search.best_estimator_
svc_best.fit(X_train_smote, y_train_smote)
y_pred_smote = svc_best.predict(X_test_smote)

print(classification_report(y_test_smote, y_pred_smote))

              precision    recall  f1-score   support

           0       0.95      0.85      0.90      6739
           1       0.91      0.94      0.93      6687
           2       0.92      0.99      0.96      6661

    accuracy                           0.93     20087
   macro avg       0.93      0.93      0.93     20087
weighted avg       0.93      0.93      0.93     20087



## ROS (After Splitting)

In [19]:
oversample = RandomOverSampler(sampling_strategy = 'auto')
Xtrain_over_ros, ytrain_over_ros = oversample.fit_resample(X_train, y_train)

In [20]:
svc_best = SVC(C = 50, degree = 2, gamma = 0.0001, kernel = 'linear')
svc_best.fit(Xtrain_over_ros, ytrain_over_ros)
y_pred_ros = svc_best.predict(X_test)

print(classification_report(y_test, y_pred_ros))

              precision    recall  f1-score   support

           0       0.88      0.87      0.87      6704
           1       0.68      0.66      0.67      2183
           2       0.30      0.40      0.35       554

    accuracy                           0.79      9441
   macro avg       0.62      0.64      0.63      9441
weighted avg       0.80      0.79      0.80      9441



## SMOTE (After Splitting)

In [21]:
oversample = SMOTE()
Xtrain_over_smote, ytrain_over_smote = oversample.fit_resample(X_train, y_train)

In [22]:
svc_best = SVC(C = 50, degree = 2, gamma = 0.0001, kernel = 'linear')
svc_best.fit(Xtrain_over_smote, ytrain_over_smote)
y_pred_smote = svc_best.predict(X_test)

print(classification_report(y_test, y_pred_smote))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87      6704
           1       0.68      0.65      0.66      2183
           2       0.29      0.41      0.34       554

    accuracy                           0.79      9441
   macro avg       0.62      0.64      0.63      9441
weighted avg       0.80      0.79      0.79      9441



# Summary

In [23]:
summary = {'Algorithm': ['SVM (No Oversample)', 'SVM (ROS)', 'SVM (SMOTE)', 'SVM (ROS)', 'SVM (SMOTE)'],
        'Splitting' : ['-', 'Before', 'Before', 'After', 'After'] ,
        'Accuracy': ['84%', '93%', '93%', '79%', '79%'] ,
        'Precision' : ['83%', '94%', '93%', '80%', '80%'],
        'Recall' : ['84%', '94%', '93%', '79%', '79%'],
        'f1-score' : ['62%', '93%', '93%', '80%', '79%'],
        'Support' :['9441', '20087', '20087', '9441', '9441'],
        'Time': ['1h 2min 15s', '9h 9min 42s', '9h 5mins 2s', '-', '-']}

summary_table = pd.DataFrame(summary)

summary_table

Unnamed: 0,Algorithm,Splitting,Accuracy,Precision,Recall,f1-score,Support,Time
0,SVM (No Oversample),-,84%,83%,84%,62%,9441,1h 2min 15s
1,SVM (ROS),Before,93%,94%,94%,93%,20087,9h 9min 42s
2,SVM (SMOTE),Before,93%,93%,93%,93%,20087,9h 5mins 2s
3,SVM (ROS),After,79%,80%,79%,80%,9441,-
4,SVM (SMOTE),After,79%,80%,79%,79%,9441,-
