In [1]:
!pip install hyperbox-brain
!pip install optuna



In [2]:
from preprocessing.tcw_builder import TCWBuilder
from preprocessing.svd_extractor import SVDExtractor
from sklearn.metrics import accuracy_score
import pandas as pd
from hbbrain.numerical_data.incremental_learner.iol_gfmm import ImprovedOnlineGFMM
import numpy as np

# Project preprocessing objects; both are fitted in later cells.
builder, extractor = TCWBuilder(), SVDExtractor()

# Processed SOCC dataset; later cells read its `text`, `label` and
# `confidence` columns.
socc = pd.read_csv(filepath_or_buffer="dataset/SOCC/processed/socc_method2.csv")



In [3]:
# Flat 1-D copies of the text column (model input) and the label column (target).
x = socc['text'].to_numpy(copy=True)
y = socc['label'].to_numpy(copy=True)

In [4]:
# Reload imported modules automatically before each cell executes, so edits to
# the project's .py files are picked up without restarting the kernel.
# NOTE(review): this cell runs after the import cell; enabling autoreload
# before the imports is the conventional order.
%load_ext autoreload
%autoreload 2

In [5]:
# Fit the builder on the raw text array; the next cell reads `builder.tcw`.
builder.fit_transform(x)

In [6]:
# Fit the extractor on `builder.tcw`; `extractor.features_matrix` is read later
# when the final feature matrix is assembled.
extractor.fit_transform(builder.tcw)

In [7]:
# Binary target: 0 where the label is 'normal', 1 for every other label.
labels = (y != 'normal').astype(int)
labels

array([0, 1, 0, ..., 0, 0, 0])

In [8]:
# Append the per-row `confidence` column to the extracted feature matrix.
# NOTE(review): hyperbox-based GFMM models generally expect inputs scaled to
# [0, 1] — confirm both parts are already in that range.
features = np.concatenate(
    (extractor.features_matrix, socc[['confidence']].to_numpy()),
    axis=1,
)

In [9]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import optuna

# Stratified 6-fold split, shuffled with a fixed seed so the folds are repeatable.
skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)

# Fold-averaged scores per trial, keyed by the "theta + gamma" string that
# `objective` builds.
accs, recalls, f1s, precisions = {}, {}, {}, {}

# Hyperparameters
is_draw = False
# Earlier hand-picked values, kept for reference:
# theta = 0.3
# gamma = 0.9293632816790488

# Candidate theta values: 0.1, 0.2, ..., 0.7
theta_values = [step / 10 for step in range(1, 8)]

def objective(trial):
    """Optuna objective: mean cross-validated recall of an ImprovedOnlineGFMM.

    Samples ``theta`` from ``theta_values`` and ``gamma`` from [0, 1], trains
    one classifier per fold of ``skf`` on ``features``/``labels`` and scores it
    on the held-out fold.

    Side effects:
        Stores the fold-averaged accuracy, recall, F1 and precision in the
        module-level dicts ``accs``, ``recalls``, ``f1s`` and ``precisions``
        under the key ``f'{theta} + {gamma}'``.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean recall over all folds, which is the value the study optimises.
    """
    theta = trial.suggest_categorical('theta', theta_values)
    gamma = trial.suggest_float('gamma', 0, 1)

    acc = []
    recall = []
    f1 = []
    precision = []

    for train_index, test_index in skf.split(features, labels):
        clf = ImprovedOnlineGFMM(theta=theta, gamma=gamma, is_draw=is_draw)
        clf.fit(features[train_index], labels[train_index])

        # Predict once per fold and reuse the result for every metric.
        y_true = labels[test_index]
        y_pred = clf.predict(features[test_index])

        acc.append(accuracy_score(y_true, y_pred))
        recall.append(recall_score(y_true, y_pred, average='binary'))
        # Each list gets its own metric; F1 and precision were previously
        # filled with recall_score, so all three columns were identical.
        f1.append(f1_score(y_true, y_pred, average='binary'))
        precision.append(precision_score(y_true, y_pred, average='binary'))

    _objective_value = np.mean(recall)

    trial_key = f'{theta} + {gamma}'
    accs[trial_key] = np.mean(acc)
    recalls[trial_key] = _objective_value
    f1s[trial_key] = np.mean(f1)
    precisions[trial_key] = np.mean(precision)

    return _objective_value


In [None]:
# Seed the sampler so the sequence of sampled hyperparameters is reproducible
# across runs. TPESampler is Optuna's default single-objective sampler, so the
# search strategy itself is unchanged.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5)

In [None]:
# Hyperparameters (theta, gamma) of the trial with the highest objective value.
study.best_params

In [None]:
# One row per trial (indexed by its "theta + gamma" key), one column per
# fold-averaged metric collected by `objective`.
results = pd.DataFrame({
    'Accuracy Score': accs,
    'Recall Score': recalls,
    'F1 Score': f1s,
    'Precision Score': precisions,
})

In [None]:
# Render the per-trial metric table.
display(results)

In [None]:
# Class balance as (count of label 1, count of label 0). Both counts are
# returned as one tuple because a cell only displays its last expression.
np.sum(labels == 1), np.sum(labels == 0)