In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("./Data/Preprocessed/Word2vec_scratch_no_weight_dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2859 entries, 0 to 2858
Columns: 101 entries, 0 to Target
dtypes: float64(100), int64(1)
memory usage: 2.2 MB


In [2]:

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from collections import Counter

# Split into X and y
X = df.drop('Target', axis=1)  # all columns except the 'Target' column
y = df['Target']              # the 'Target' column

X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.2, random_state=42, stratify=y)

print(f'Train split Shape: {Counter(y_train)}')
print(f'Test split Shape: {Counter(y_test)}')

Train split Shape: Counter({1: 1086, 0: 721, 2: 200, 5: 127, 4: 86, 3: 67})
Test split Shape: Counter({1: 272, 0: 180, 2: 50, 5: 32, 4: 22, 3: 16})


In [3]:
# Class Weights using compute_class_weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))
print(class_weights_dict)

{0: 0.5286638927415627, 1: 0.35098219766728056, 2: 1.9058333333333333, 3: 5.689054726368159, 4: 4.432170542635659, 5: 3.0013123359580054}


# SVC

In [8]:
from sklearn.metrics import make_scorer, f1_score
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid for SVC
param_svc = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['rbf', 'poly'],  # Kernel types
    'gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001],  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
    'degree': [2, 3, 4, 5],  # Degree of the polynomial kernel function ('poly' kernel)
    'coef0': [0.0, 0.1, 0.5, 1],  # Independent term in 'poly' and 'sigmoid' kernels
    'shrinking': [True, False],  # Use the shrinking heuristic
    'tol': [1e-3, 1e-4],  # Tolerance for stopping criterion
    'max_iter': [1000, 10000, -1],  # Maximum number of iterations (-1 for no limit)
}

# Initialize the SVC model with fixed parameters
svc = SVC(probability=True, class_weight=class_weights_dict, random_state=42)

# Define a scorer for least misclassification (e.g., F1-score weighted)
scorer = make_scorer(f1_score, average='weighted')

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=svc, param_distributions=param_svc,verbose=3, scoring=scorer, n_iter=200,random_state= 42, n_jobs=-1)

# Perform the grid search
random_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print("Best Parameters:", random_search.best_params_)
print("Best F1 Score:", random_search.best_score_)


Fitting 5 folds for each of 200 candidates, totalling 1000 fits




[CV 1/5] END C=0.01, coef0=0.1, degree=5, gamma=1, kernel=poly, max_iter=-1, shrinking=False, tol=0.001;, score=0.202 total time=   9.9s
[CV 5/5] END C=100, coef0=0.0, degree=2, gamma=0.01, kernel=rbf, max_iter=1000, shrinking=False, tol=0.0001;, score=0.232 total time=  11.1s
[CV 2/5] END C=10, coef0=0.1, degree=4, gamma=0.01, kernel=poly, max_iter=-1, shrinking=False, tol=0.001;, score=0.002 total time=  10.9s
[CV 1/5] END C=0.01, coef0=0.1, degree=5, gamma=0.001, kernel=poly, max_iter=1000, shrinking=False, tol=0.001;, score=0.002 total time=  10.6s
[CV 4/5] END C=0.1, coef0=0.5, degree=4, gamma=0.1, kernel=poly, max_iter=10000, shrinking=True, tol=0.0001;, score=0.130 total time=  10.4s
[CV 2/5] END C=10, coef0=1, degree=5, gamma=1, kernel=rbf, max_iter=1000, shrinking=True, tol=0.001;, score=0.310 total time=  10.1s
[CV 1/5] END C=0.1, coef0=0.0, degree=3, gamma=1, kernel=poly, max_iter=10000, shrinking=False, tol=0.001;, score=0.212 total time=   9.6s
[CV 2/5] END C=1, coef0=0.0,



[CV 1/5] END C=100, coef0=0.0, degree=2, gamma=0.01, kernel=rbf, max_iter=1000, shrinking=False, tol=0.0001;, score=0.253 total time=  12.6s
[CV 5/5] END C=0.1, coef0=0.1, degree=4, gamma=0.001, kernel=rbf, max_iter=-1, shrinking=False, tol=0.0001;, score=0.002 total time=  12.9s
[CV 4/5] END C=100, coef0=1, degree=2, gamma=0.1, kernel=poly, max_iter=-1, shrinking=True, tol=0.0001;, score=0.256 total time=   9.1s
[CV 2/5] END C=0.1, coef0=0.5, degree=4, gamma=0.1, kernel=poly, max_iter=10000, shrinking=True, tol=0.0001;, score=0.054 total time=  11.2s
[CV 5/5] END C=0.1, coef0=1, degree=3, gamma=auto, kernel=rbf, max_iter=10000, shrinking=True, tol=0.0001;, score=0.002 total time=  12.7s
[CV 3/5] END C=1, coef0=0.0, degree=4, gamma=0.1, kernel=rbf, max_iter=-1, shrinking=False, tol=0.001;, score=0.226 total time=  12.0s
[CV 1/5] END C=1, coef0=0.0, degree=5, gamma=auto, kernel=poly, max_iter=1000, shrinking=False, tol=0.0001;, score=0.002 total time=  10.2s
[CV 4/5] END C=10, coef0=0.5



[CV 3/5] END C=100, coef0=0.0, degree=2, gamma=0.01, kernel=rbf, max_iter=1000, shrinking=False, tol=0.0001;, score=0.250 total time=  12.3s
[CV 4/5] END C=0.1, coef0=0.1, degree=4, gamma=0.001, kernel=rbf, max_iter=-1, shrinking=False, tol=0.0001;, score=0.006 total time=  12.7s
[CV 3/5] END C=100, coef0=1, degree=2, gamma=0.1, kernel=poly, max_iter=-1, shrinking=True, tol=0.0001;, score=0.306 total time=   8.9s
[CV 5/5] END C=0.01, coef0=0.1, degree=5, gamma=0.001, kernel=poly, max_iter=1000, shrinking=False, tol=0.001;, score=0.002 total time=  10.8s
[CV 3/5] END C=0.1, coef0=1, degree=3, gamma=auto, kernel=rbf, max_iter=10000, shrinking=True, tol=0.0001;, score=0.006 total time=  12.6s
[CV 5/5] END C=10, coef0=1, degree=5, gamma=1, kernel=rbf, max_iter=1000, shrinking=True, tol=0.001;, score=0.280 total time=  10.0s
[CV 3/5] END C=0.1, coef0=0.0, degree=3, gamma=1, kernel=poly, max_iter=10000, shrinking=False, tol=0.001;, score=0.240 total time=  10.2s
[CV 1/5] END C=10, coef0=0.5,



[CV 2/5] END C=1, coef0=0.1, degree=2, gamma=auto, kernel=rbf, max_iter=-1, shrinking=True, tol=0.001;, score=0.022 total time=  11.7s
[CV 3/5] END C=10, coef0=0.5, degree=4, gamma=0.01, kernel=poly, max_iter=1000, shrinking=False, tol=0.001;, score=0.162 total time=  13.0s
[CV 1/5] END C=10, coef0=0.0, degree=4, gamma=scale, kernel=rbf, max_iter=10000, shrinking=True, tol=0.001;, score=0.248 total time=  14.2s
[CV 4/5] END C=0.1, coef0=0.0, degree=3, gamma=scale, kernel=rbf, max_iter=-1, shrinking=False, tol=0.0001;, score=0.152 total time=  16.8s
[CV 1/5] END C=10, coef0=0.5, degree=3, gamma=scale, kernel=poly, max_iter=1000, shrinking=False, tol=0.0001;, score=0.271 total time=  10.9s
[CV 4/5] END C=10, coef0=0.5, degree=3, gamma=scale, kernel=poly, max_iter=1000, shrinking=False, tol=0.0001;, score=0.277 total time=  10.5s
[CV 2/5] END C=1, coef0=0.0, degree=2, gamma=auto, kernel=rbf, max_iter=-1, shrinking=False, tol=0.0001;, score=0.022 total time=  16.1s
[CV 5/5] END C=0.01, coe



[CV 3/5] END C=10, coef0=0.0, degree=4, gamma=scale, kernel=rbf, max_iter=10000, shrinking=True, tol=0.001;, score=0.254 total time=  13.8s
[CV 1/5] END C=0.01, coef0=1, degree=4, gamma=scale, kernel=poly, max_iter=-1, shrinking=False, tol=0.0001;, score=0.129 total time=  13.0s
[CV 2/5] END C=1, coef0=0.1, degree=5, gamma=0.01, kernel=rbf, max_iter=-1, shrinking=True, tol=0.0001;, score=0.022 total time=  16.7s
[CV 1/5] END C=0.01, coef0=0.5, degree=4, gamma=1, kernel=rbf, max_iter=-1, shrinking=True, tol=0.001;, score=0.002 total time=  16.5s
[CV 5/5] END C=1, coef0=0.0, degree=2, gamma=auto, kernel=rbf, max_iter=-1, shrinking=False, tol=0.0001;, score=0.014 total time=  17.2s
[CV 2/5] END C=0.01, coef0=0.1, degree=5, gamma=0.001, kernel=rbf, max_iter=-1, shrinking=False, tol=0.0001;, score=0.002 total time=  16.1s
[CV 5/5] END C=0.01, coef0=0.0, degree=3, gamma=1, kernel=rbf, max_iter=-1, shrinking=True, tol=0.001;, score=0.002 total time=  16.6s
[CV 4/5] END C=0.01, coef0=1, degree



[CV 5/5] END C=1, coef0=0.1, degree=5, gamma=0.01, kernel=rbf, max_iter=-1, shrinking=True, tol=0.0001;, score=0.014 total time=  16.4s
[CV 1/5] END C=1, coef0=0.0, degree=2, gamma=auto, kernel=rbf, max_iter=-1, shrinking=False, tol=0.0001;, score=0.023 total time=  16.2s
[CV 3/5] END C=0.01, coef0=1, degree=3, gamma=1, kernel=poly, max_iter=1000, shrinking=False, tol=0.0001;, score=0.142 total time=  12.7s
[CV 4/5] END C=10, coef0=0.5, degree=4, gamma=auto, kernel=rbf, max_iter=1000, shrinking=True, tol=0.001;, score=0.250 total time=  15.0s
[CV 2/5] END C=0.01, coef0=0.0, degree=3, gamma=1, kernel=rbf, max_iter=-1, shrinking=True, tol=0.001;, score=0.002 total time=  16.7s
[CV 1/5] END C=0.01, coef0=1, degree=4, gamma=0.01, kernel=poly, max_iter=1000, shrinking=False, tol=0.0001;, score=0.002 total time=  13.4s
[CV 4/5] END C=100, coef0=1, degree=5, gamma=0.01, kernel=rbf, max_iter=10000, shrinking=False, tol=0.001;, score=0.209 total time=  15.0s
[CV 2/5] END C=0.1, coef0=0.1, degre



[CV 5/5] END C=1, coef0=0.1, degree=2, gamma=1, kernel=poly, max_iter=10000, shrinking=False, tol=0.001;, score=0.279 total time=   7.0s
[CV 3/5] END C=1, coef0=0.1, degree=2, gamma=auto, kernel=rbf, max_iter=-1, shrinking=True, tol=0.001;, score=0.151 total time=  13.4s
[CV 1/5] END C=1, coef0=1, degree=3, gamma=0.01, kernel=poly, max_iter=-1, shrinking=True, tol=0.0001;, score=0.126 total time=  13.1s
[CV 4/5] END C=10, coef0=0.0, degree=4, gamma=scale, kernel=rbf, max_iter=10000, shrinking=True, tol=0.001;, score=0.234 total time=  13.6s
[CV 2/5] END C=0.01, coef0=1, degree=4, gamma=scale, kernel=poly, max_iter=-1, shrinking=False, tol=0.0001;, score=0.132 total time=  12.6s
[CV 3/5] END C=1, coef0=0.1, degree=5, gamma=0.01, kernel=rbf, max_iter=-1, shrinking=True, tol=0.0001;, score=0.151 total time=  16.3s
[CV 2/5] END C=0.01, coef0=0.5, degree=4, gamma=1, kernel=rbf, max_iter=-1, shrinking=True, tol=0.001;, score=0.002 total time=  16.5s
[CV 4/5] END C=1, coef0=0.0, degree=2, gam



[CV 4/5] END C=1, coef0=0.1, degree=2, gamma=auto, kernel=rbf, max_iter=-1, shrinking=True, tol=0.001;, score=0.154 total time=  12.5s
[CV 2/5] END C=1, coef0=1, degree=3, gamma=0.01, kernel=poly, max_iter=-1, shrinking=True, tol=0.0001;, score=0.103 total time=  13.6s
[CV 5/5] END C=10, coef0=0.0, degree=4, gamma=scale, kernel=rbf, max_iter=10000, shrinking=True, tol=0.001;, score=0.260 total time=  13.9s
[CV 3/5] END C=0.01, coef0=1, degree=4, gamma=scale, kernel=poly, max_iter=-1, shrinking=False, tol=0.0001;, score=0.130 total time=  12.3s
[CV 4/5] END C=1, coef0=0.1, degree=5, gamma=0.01, kernel=rbf, max_iter=-1, shrinking=True, tol=0.0001;, score=0.154 total time=  16.8s
[CV 4/5] END C=0.01, coef0=0.5, degree=4, gamma=1, kernel=rbf, max_iter=-1, shrinking=True, tol=0.001;, score=0.006 total time=  16.4s
[CV 2/5] END C=0.01, coef0=1, degree=3, gamma=1, kernel=poly, max_iter=1000, shrinking=False, tol=0.0001;, score=0.142 total time=  12.0s
[CV 3/5] END C=10, coef0=0.5, degree=4, g



[CV 1/5] END C=0.1, coef0=0.0, degree=3, gamma=scale, kernel=rbf, max_iter=-1, shrinking=False, tol=0.0001;, score=0.177 total time=  15.9s
[CV 4/5] END C=0.01, coef0=1, degree=4, gamma=scale, kernel=poly, max_iter=-1, shrinking=False, tol=0.0001;, score=0.117 total time=  12.3s
[CV 2/5] END C=10, coef0=0.5, degree=3, gamma=scale, kernel=poly, max_iter=1000, shrinking=False, tol=0.0001;, score=0.308 total time=  10.1s
[CV 3/5] END C=0.01, coef0=0.5, degree=4, gamma=1, kernel=rbf, max_iter=-1, shrinking=True, tol=0.001;, score=0.014 total time=  16.1s
[CV 1/5] END C=0.01, coef0=1, degree=3, gamma=1, kernel=poly, max_iter=1000, shrinking=False, tol=0.0001;, score=0.121 total time=  11.7s
[CV 2/5] END C=10, coef0=0.5, degree=4, gamma=auto, kernel=rbf, max_iter=1000, shrinking=True, tol=0.001;, score=0.175 total time=  14.9s
[CV 5/5] END C=0.01, coef0=0.1, degree=5, gamma=0.001, kernel=rbf, max_iter=-1, shrinking=False, tol=0.0001;, score=0.002 total time=  16.1s
[CV 3/5] END C=100, coef0=



[CV 4/5] END C=1, coef0=1, degree=3, gamma=0.01, kernel=poly, max_iter=-1, shrinking=True, tol=0.0001;, score=0.159 total time=  13.3s
[CV 2/5] END C=0.1, coef0=0.0, degree=3, gamma=scale, kernel=rbf, max_iter=-1, shrinking=False, tol=0.0001;, score=0.064 total time=  16.1s
[CV 5/5] END C=0.01, coef0=1, degree=4, gamma=scale, kernel=poly, max_iter=-1, shrinking=False, tol=0.0001;, score=0.058 total time=  12.6s
[CV 3/5] END C=10, coef0=0.5, degree=3, gamma=scale, kernel=poly, max_iter=1000, shrinking=False, tol=0.0001;, score=0.274 total time=  10.2s
[CV 5/5] END C=0.01, coef0=0.5, degree=4, gamma=1, kernel=rbf, max_iter=-1, shrinking=True, tol=0.001;, score=0.002 total time=  16.3s
[CV 4/5] END C=0.01, coef0=1, degree=3, gamma=1, kernel=poly, max_iter=1000, shrinking=False, tol=0.0001;, score=0.117 total time=  12.6s
[CV 5/5] END C=10, coef0=0.5, degree=4, gamma=auto, kernel=rbf, max_iter=1000, shrinking=True, tol=0.001;, score=0.165 total time=  15.1s
[CV 3/5] END C=0.01, coef0=0.0, 



Best Parameters: {'tol': 0.0001, 'shrinking': True, 'max_iter': -1, 'kernel': 'poly', 'gamma': 1, 'degree': 5, 'coef0': 0.1, 'C': 100}
Best F1 Score: 0.4878322757181916
[CV 3/5] END C=0.01, coef0=1, degree=4, gamma=scale, kernel=poly, max_iter=-1, shrinking=True, tol=0.001;, score=0.130 total time=  13.5s
[CV 5/5] END C=0.1, coef0=0.5, degree=5, gamma=0.001, kernel=poly, max_iter=-1, shrinking=True, tol=0.0001;, score=0.002 total time=  13.7s
[CV 3/5] END C=10, coef0=0.0, degree=3, gamma=0.1, kernel=rbf, max_iter=1000, shrinking=False, tol=0.001;, score=0.214 total time=  14.8s
[CV 1/5] END C=0.1, coef0=0.1, degree=2, gamma=auto, kernel=poly, max_iter=-1, shrinking=False, tol=0.0001;, score=0.002 total time=  13.7s
[CV 4/5] END C=10, coef0=0.5, degree=3, gamma=0.1, kernel=poly, max_iter=10000, shrinking=False, tol=0.0001;, score=0.175 total time=  12.3s
[CV 5/5] END C=100, coef0=0.0, degree=4, gamma=0.001, kernel=rbf, max_iter=1000, shrinking=True, tol=0.0001;, score=0.177 total time= 