In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import ADASYN
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from cuml.svm import SVC as cuSVC
from cuml.metrics import roc_auc_score
import cupy as cp

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_parquet('../data/processed/selected_features_df.parquet')

X = df.drop(columns=['target'])
y = df['target']

# Logistic Regression

SelectFromModel for selecting features for Logistic Regression

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

logisticRegression = LogisticRegression()

selector = SelectFromModel(estimator=logisticRegression).fit(X_train_scaled, y_train)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

Balancing the dataset using ADASYN

In [4]:
adasyn = ADASYN()
X_train_res, y_train_res = adasyn.fit_resample(X_train_selected, y_train)

Hyperparameter tuning using Optuna

In [5]:
def objective(trial):
    C = trial.suggest_float('C', 1e-4, 1, log = True)
    max_iter = trial.suggest_int('max_iter', 100, 1000)
    class_weight = trial.suggest_categorical('class_weight', ['balanced', None])
    logisticRegression = LogisticRegression(C=C, max_iter=max_iter, class_weight=class_weight)
    logisticRegression.fit(X_train_res, y_train_res)
    y_pred = logisticRegression.predict(X_test_selected)
    return roc_auc_score(y_test, y_pred)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-11-17 07:57:27,197] A new study created in memory with name: no-name-c47e2c26-c11b-4f72-8028-31388a3b1f8a
[I 2024-11-17 07:57:27,725] Trial 0 finished with value: 0.5096410512924194 and parameters: {'C': 0.1657464492866667, 'max_iter': 398, 'class_weight': None}. Best is trial 0 with value: 0.5096410512924194.
[I 2024-11-17 07:57:28,142] Trial 1 finished with value: 0.5084046721458435 and parameters: {'C': 0.005990284078731089, 'max_iter': 800, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.5096410512924194.
[I 2024-11-17 07:57:28,515] Trial 2 finished with value: 0.5096498727798462 and parameters: {'C': 0.08363229517387659, 'max_iter': 256, 'class_weight': None}. Best is trial 2 with value: 0.5096498727798462.
[I 2024-11-17 07:57:28,867] Trial 3 finished with value: 0.5092841386795044 and parameters: {'C': 0.0038872515841483503, 'max_iter': 893, 'class_weight': None}. Best is trial 2 with value: 0.5096498727798462.
[I 2024-11-17 07:57:29,293] Trial 4 finished with 

In [6]:
print(f"Best ROC AUC: {study.best_value}")
print(f"Best hyperparameters: {study.best_params}")

Best ROC AUC: 0.511711597442627
Best hyperparameters: {'C': 0.00010694612368649073, 'max_iter': 723, 'class_weight': 'balanced'}


# Support Vector Machines

SelectFromModel for selecting features for Support Vector Machines

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svc = cuSVC(kernel = "linear")

selector = SelectFromModel(estimator=svc).fit(X_train_scaled, y_train)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

[W] [07:58:02.462215] SVC with the linear kernel can be much faster using the specialized solver provided by LinearSVC. Consider switching to LinearSVC if tranining takes too long.


Balancing the dataset using ADASYN

In [8]:
adasyn = ADASYN()
X_train_res, y_train_res = adasyn.fit_resample(X_train_selected, y_train)

Hyperparameter tuning using Optuna

In [None]:
X_train_res_cp = cp.array(X_train_res)
y_train_res_cp = cp.array(y_train_res)
X_test_selected_cp = cp.array(X_test_selected)
y_test_cp = cp.array(y_test)

def objective(trial):
    C = trial.suggest_float('C', 1e-5, 1e5, log = True)
    gamma = trial.suggest_float('gamma', 1e-5, 1e5, log= True )
    kernel = trial.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid'])
    svm = cuSVC(C=C, gamma=gamma, kernel=kernel)
    svm.fit(X_train_res_cp, y_train_res_cp)
    y_pred = svm.predict(X_test_selected_cp) 
    return roc_auc_score(y_test_cp, y_pred)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2024-11-17 07:58:57,929] A new study created in memory with name: no-name-3a22408a-edfa-484c-b6d9-8b69b86829b6
[I 2024-11-17 08:00:56,498] Trial 0 finished with value: 0.5 and parameters: {'C': 0.0002943101979845989, 'gamma': 0.0004973045828976662, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.5.
[I 2024-11-17 08:03:07,775] Trial 1 finished with value: 0.5 and parameters: {'C': 0.21706116167670755, 'gamma': 2.0162253973954183e-05, 'kernel': 'rbf'}. Best is trial 0 with value: 0.5.
