In [None]:
!pip uninstall scikit-learn imbalanced-learn -y
!pip install scikit-learn imbalanced-learn

Found existing installation: scikit-learn 1.3.2
Uninstalling scikit-learn-1.3.2:
  Successfully uninstalled scikit-learn-1.3.2
Found existing installation: imbalanced-learn 0.11.0
Uninstalling imbalanced-learn-0.11.0:
  Successfully uninstalled imbalanced-learn-0.11.0
Collecting scikit-learn
  Using cached scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
Collecting imbalanced-learn
  Using cached imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
Installing collected packages: scikit-learn, imbalanced-learn
Successfully installed imbalanced-learn-0.11.0 scikit-learn-1.3.2


## Plan
- plot validation curves to find the hyperparameter range in which the model neither overfits nor underfits
  - find the range of C and gamma for rbf_svm
  - results: C between 10^-3 and 10^2, gamma between 10^-3 and 10^-1
- plot learning curves to learn about the relationship between sample size and the training/test scores
- run a randomized search to find parameters
- run a grid search to find parameters within the range known from the validation curves


#### Load things from Data Processing

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer # for feature extraction
from imblearn.over_sampling import ADASYN # to do resampling
from sklearn import svm
from sklearn.svm import SVC

from sklearn.model_selection import validation_curve
from sklearn.model_selection import ValidationCurveDisplay # to plot validation curves
from sklearn.model_selection import LearningCurveDisplay # to plot learning curves

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, expon,loguniform

In [None]:
# Read the three cola_dataset TSV files into DataFrames using pandas.
# The files have no header row, so the column names are supplied explicitly.

def _read_cola_split(path):
    """Read one headerless, tab-separated file into a DataFrame with named columns."""
    return pd.read_csv(
        path,
        delimiter = '\t',
        header = None,
        names = ['sentence_source', 'label', 'label_original', 'sentence'],
    )


# training split
train_data = _read_cola_split('cola_dataset/in_domain_train.tsv')
train_texts = train_data['sentence']
train_labels = train_data['label']

# validation split
valid_data = _read_cola_split('cola_dataset/in_domain_dev.tsv')
val_texts = valid_data['sentence']
val_labels = valid_data['label']

# test split
test_data = _read_cola_split('cola_dataset/out_of_domain_dev.tsv')
test_texts = test_data['sentence']
test_labels = test_data['label']

In [None]:
# Feature extraction using CountVectorizer and n_grams
# NOTE(review): ngram_range=(3, 3) sets both the minimum and maximum n to 3, so only
# trigrams are counted. If unigrams, bigrams and trigrams were intended, the range
# would have to be (1, 3) — confirm before changing, since the recorded results
# were produced with the current setting.
vectorizer = CountVectorizer(ngram_range=(3, 3))  # trigrams only (min n = max n = 3)
# The vocabulary is fitted on the training texts only; the validation and test
# texts are transformed with that same fitted vocabulary.
train_vec = vectorizer.fit_transform(train_texts)
val_vec = vectorizer.transform(val_texts)
test_vec = vectorizer.transform(test_texts)

In [None]:
# Oversample the training features/labels with ADASYN (seeded for repeatable runs).
# Only the training split is resampled here; val_vec and test_vec are left as they are.
ada = ADASYN(random_state=42)
vec_resampled, labels_resampled = ada.fit_resample(train_vec, train_labels)

In [None]:
# Unfitted SVM classifiers, one per kernel; all other SVC settings stay at their defaults
linear_svm = SVC(kernel="linear")  # linear kernel
rbf_svm = SVC(kernel="rbf")  # RBF kernel

#### Radial Basis Function Kernel SVM: validation and learning curves

In [None]:
# Validation curve for the RBF SVM, varying C on the resampled training data.
# 11 log-spaced points over [1e-7, 1e3] place one sample on every power of ten;
# 10 points would give an exponent step of 10/9 and miss the decades.
# cv=5 and n_jobs=-1 are spelled out to match the search cells of this notebook.
ValidationCurveDisplay.from_estimator(rbf_svm, vec_resampled, labels_resampled,
                                      param_name="C", param_range=np.logspace(-7, 3, 11),
                                      cv=5, n_jobs=-1)

In [None]:
# Validation curve for the RBF SVM, varying gamma on the resampled training data.
# 11 log-spaced points over [1e-7, 1e3] place one sample on every power of ten;
# 10 points would give an exponent step of 10/9 and miss the decades.
# cv=5 and n_jobs=-1 are spelled out to match the search cells of this notebook.
ValidationCurveDisplay.from_estimator(rbf_svm, vec_resampled, labels_resampled,
                                      param_name="gamma", param_range=np.logspace(-7, 3, 11),
                                      cv=5, n_jobs=-1)

In [None]:
# Learning curve for the RBF SVM: scores at ten evenly spaced training-set
# fractions from 10% to 100% of the resampled data, with 5-fold cross-validation.
LearningCurveDisplay.from_estimator(
    rbf_svm,
    vec_resampled,
    labels_resampled,
    train_sizes=np.linspace(start=0.1, stop=1.0, num=10),
    cv=5,
)

#### Randomized search


In [None]:
# Randomized hyperparameter search for the RBF-kernel SVM.
# NOTE(review): the output saved under this cell reports a best gamma of about 1.49,
# which this search space (gamma fixed at 0.03) cannot produce — the saved output
# looks stale and should be regenerated by re-running the cell.
model = rbf_svm

# Search space: C is drawn log-uniformly from [1e-2, 10]; gamma has a single candidate.
param_distributions = dict(C=loguniform(1e-2, 10), gamma=[0.03])

# 100 sampled candidates, each scored with 5-fold cross-validation on all cores;
# random_state fixes which candidates are sampled.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=100,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(vec_resampled, labels_resampled)

# Report the winning candidate and its mean cross-validated score.
print("Best parameters found: ", random_search.best_params_)
print("Best score found: ", random_search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found:  {'C': 9.795846277645586, 'gamma': 1.4922453771381408}
Best score found:  0.7797724456518587


#### Grid search

In [None]:
# Hyperparameter tuning using an exhaustive grid search over both kernels.
model = SVC()

# Candidate values for C (the duplicated 0.001 entry has been removed: a repeated
# value only repeats identical fits).
c_grid = [0.001, 0.01, 0.1, 1, 10, 100]

# One sub-grid per kernel. gamma is searched only for the RBF kernel because the
# linear kernel does not use it; crossing gamma with 'linear' would refit the same
# linear model once per gamma value.
param_grid = [
    {'kernel': ['rbf'], 'C': c_grid, 'gamma': [0.001, 0.01, 0.1, 1]},
    {'kernel': ['linear'], 'C': c_grid},
]

# 5-fold cross-validated accuracy; n_jobs=-1 runs the fits on all cores, as the
# randomized search in this notebook already does.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# perform grid search
grid_search.fit(vec_resampled, labels_resampled)

# best parameters and best score ('gamma' is absent when the linear kernel wins)
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

#### Results of Randomized Search and Grid Search

In [None]:
from sklearn.metrics import matthews_corrcoef, roc_auc_score
from sklearn.metrics import accuracy_score, classification_report

# Final RBF SVM, trained on the resampled training data with fixed hyperparameters.
# NOTE(review): C and gamma are hard-coded rather than read from a search object.
# C equals np.linspace(0.1, 0.6, 20)[4], so they were presumably copied from one of
# the grid-search runs — confirm which run before relying on these numbers.
model = SVC(kernel='rbf', C=0.20526315789473684, gamma=0.04)
model.fit(vec_resampled, labels_resampled)

# Hard class predictions feed the classification report and MCC.
# ROC AUC is a ranking metric, so it is computed from the continuous
# decision_function scores; feeding it 0/1 predictions collapses the curve to a
# single operating point. Values will therefore differ from AUCs that were
# computed from hard predictions.
validation_pred = model.predict(val_vec)
validation_roc_auc = roc_auc_score(val_labels, model.decision_function(val_vec))
validation_mcc = matthews_corrcoef(val_labels, validation_pred)

test_pred = model.predict(test_vec)
test_roc_auc = roc_auc_score(test_labels, model.decision_function(test_vec))
test_mcc = matthews_corrcoef(test_labels, test_pred)

# Metrics
print("Validation:")
print(classification_report(val_labels, validation_pred))
print(f"Validation ROC_AUC: {validation_roc_auc}")
print(f"Validation MCC: {validation_mcc}")
print("Test:")
print(classification_report(test_labels, test_pred))
print(f"Test ROC_AUC: {test_roc_auc}")
print(f"Test MCC: {test_mcc}")

1. 'C': loguniform(1e-2, 1e0), 'gamma': loguniform(1e-3, 1e-1)
* Best parameters found:  {'C': 0.0533653306637961, 'gamma': 0.08781408196485979}
* Best score found:  0.7022315649339227
* Test ROC_AUC: 0.5129402301262547
* Test MCC: 0.037457219149803475

2. 'C': loguniform(1e-1, 10), 'gamma': loguniform(1e-2, 1e-1)
* Best parameters found:  {'C': 0.10994335574766201, 'gamma': 0.09330606024425668}
* Best score found:  0.7297635855049285
* Test ROC_AUC: 0.5127641766059845
* Test MCC: 0.036959313166822445

3. 'C': loguniform(1e-2, 5), 'gamma': loguniform(1e-4, 1e-2)
* Best parameters found:  {'C': 1.5109731920685008, 'gamma': 0.006197015748809144}
* Best score found:  0.6528668535781784
* Test ROC_AUC: 0.5126429545693002
* Test MCC: 0.051782133103726834

4. 'C': loguniform(0.01, 0.1), 'gamma': loguniform(0.01, 0.1)
* Best parameters found:  {'C': 0.02310093735409802, 'gamma': 0.09370916815598128}
* Best score found:  0.7218186434245144
* Test ROC_AUC: 0.5129402301262547
* Test MCC: 0.037457219149803475

5. 'C': loguniform(1e-1, 10), 'gamma': loguniform(1e-3, 1e-1)
* Best parameters found:  {'C': 0.14096175149815865, 'gamma': 0.09413993046829945}
* Best score found:  0.7333080514279762
* Test ROC_AUC: 0.5129402301262547
* Test MCC: 0.037457219149803475

6. ran with 80 iterations and 3 folds:
'C': loguniform(1e-2, 10), 'gamma': [0.03]
* Best parameters found:  {'C': 0.3752055855124282, 'gamma': 0.03}
* Best score found:  0.6746127297535919
* Test ROC_AUC: 0.5264176605984515
* Test MCC: 0.0906660628563406

7. 'C': loguniform(1e-2, 10), 'gamma': [0.03]
* Best parameters found:  {'C': 0.26100256506134767, 'gamma': 0.03}
* Best score found:  0.6974958325090712
* Test ROC_AUC: 0.5250052312199205
* Test MCC: 0.08679336705643383

8. 'C': loguniform(0.2, 0.4), 'gamma': [0.03]
* Best parameters found:  {'C': 0.2592852466099094, 'gamma': 0.03}
* Best score found:  0.6974958325090712
* Test ROC_AUC: 0.5250052312199205
* Test MCC: 0.08679336705643383

9. grid search -
  'C': np.linspace(0.2, 0.4, 20), 'gamma': [0.03]
* Best parameters: {'C': 0.26315789473684215, 'gamma': 0.03}
* Best score: 0.6975761215496171
* Test ROC_AUC: 0.5250052312199205
* Test MCC: 0.08679336705643383

10. grid search
* Best parameters: {'C': 10, 'gamma': 0.05}
* Best score: 0.6605730227535018
* Test ROC_AUC: 0.5234672822019375
* Test MCC: 0.07963104884022297

11. grid search
* 'C': np.linspace(0.1, 0.6, 20), 'gamma': [0.03]
* Best parameters: {'C': 0.3894736842105263, 'gamma': 0.03}
* Best score: 0.6745324664900876
* Test ROC_AUC: 0.5264176605984515
* Test MCC: 0.0906660628563406

12. grid search
* 'C': np.linspace(0.1, 0.6, 20), 'gamma': [0.02, 0.03, 0.04]
* Test ROC_AUC: 0.5071144590918603
* Test MCC: 0.02123039581451165
