In [1]:
!pip install hyperbox-brain
!pip install optuna



In [2]:
from preprocessing.tcw_builder import TCWBuilder
from preprocessing.svd_extractor import SVDExtractor
from sklearn.metrics import accuracy_score
import pandas as pd
from hbbrain.numerical_data.incremental_learner.iol_gfmm import ImprovedOnlineGFMM
import numpy as np

builder = TCWBuilder()
extractor = SVDExtractor()
socc = pd.read_csv('dataset/SOCC/processed/socc_method2.csv')



In [3]:
x = socc[['text']].to_numpy().flatten()
y = socc[['label']].to_numpy().flatten()

In [4]:
# auto reload modules
%load_ext autoreload
%autoreload 2

In [5]:
builder.fit_transform(x)

In [6]:
extractor.fit_transform(builder.tcw)

In [7]:
labels = np.where(y == 'normal', 0, 1)
labels

array([0, 0, 0, ..., 0, 0, 0])

In [8]:
features = np.hstack((extractor.features_matrix, socc[['confidence']].to_numpy()))

In [9]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import optuna

skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)
accs = dict()
recalls =dict()
f1s = dict()
precisions = dict()

# Hyperparameters
is_draw = False
# theta = 0.3
# gamma = 0.9293632816790488

theta_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]

def objective(trial):
    theta = trial.suggest_categorical('theta', theta_values)
    gamma = trial.suggest_float('gamma', 0, 1)

    acc = []
    recall = []
    f1 = []
    precision = []
    
    for train_index, test_index in skf.split(features, labels):
        clf = ImprovedOnlineGFMM(theta=theta, gamma=gamma, is_draw=is_draw)
    
        clf.fit(features[train_index], labels[train_index])
    
        acc.append(accuracy_score(labels[test_index], clf.predict(features[test_index])))
        recall.append(recall_score(labels[test_index], clf.predict(features[test_index]), average='binary'))
        f1.append(recall_score(labels[test_index], clf.predict(features[test_index]), average='binary'))
        precision.append(recall_score(labels[test_index], clf.predict(features[test_index]), average='binary'))

    _objective_value = np.mean(recall)
    
    accs[f'{theta} + {gamma}'] = np.mean(acc)
    recalls[f'{theta} + {gamma}'] = _objective_value
    f1s[f'{theta} + {gamma}'] = np.mean(f1)
    precisions[f'{theta} + {gamma}'] = np.mean(precision)

    return _objective_value


In [10]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

[I 2025-03-19 13:58:09,985] A new study created in memory with name: no-name-b7f56911-58b0-4350-b104-52025c70ad8f


>>> The testing sample 1 with the coordinate [ 1.16735772e-03 -3.03294444e-03  3.65485001e-04 ... -9.89952257e-17
 -1.30355216e-17  6.30800000e-01] is outside the range [0, 1]. Membership value = 0.964079. The prediction is more likely incorrect.
>>> The testing sample 2 with the coordinate [ 2.65789182e-03 -6.68299630e-03  1.33618753e-03 ...  5.57821982e-17
  6.29050323e-17  7.79900000e-01] is outside the range [0, 1]. Membership value = 0.988909. The prediction is more likely incorrect.
>>> The testing sample 3 with the coordinate [ 0.00871683 -0.02100923  0.00611657 ...  0.00290461 -0.00097477
  0.2579    ] is outside the range [0, 1]. Membership value = 0.949427. The prediction is more likely incorrect.
>>> The testing sample 4 with the coordinate [ 2.35457532e-02 -4.67622405e-02  9.15696902e-03 ... -3.25804161e-17
  1.26276398e-17  9.47100000e-01] is outside the range [0, 1]. Membership value = 0.990475. The prediction is more likely incorrect.
>>> The testing sample 5 with the co

[I 2025-03-19 14:57:57,184] Trial 4 finished with value: 0.4973316912972085 and parameters: {'theta': 0.2, 'gamma': 0.8545223317371856}. Best is trial 3 with value: 0.6007799671592774.


>>> The testing sample 199 with the coordinate [ 3.58420935e-02 -5.53438798e-02  1.26318249e-02 ...  3.03576608e-18
 -1.14925430e-17  1.00000000e+00] is outside the range [0, 1]. Membership value = 0.811920. The prediction is more likely incorrect.
>>> The testing sample 200 with the coordinate [ 1.13487281e-03 -2.23998751e-03  4.67078198e-04 ... -9.21707372e-17
 -4.43370926e-17  1.00000000e+00] is outside the range [0, 1]. Membership value = 0.560356. The prediction is more likely incorrect.
>>> The testing sample 201 with the coordinate [ 4.63012542e-03 -1.05398431e-02  3.30402102e-03 ...  4.66206934e-17
 -1.24683250e-18  1.00000000e+00] is outside the range [0, 1]. Membership value = 0.899778. The prediction is more likely incorrect.


In [11]:
study.best_params

{'theta': 0.3, 'gamma': 0.33147713591561423}

In [12]:
results = pd.DataFrame({
    'Accuracy Score': accs,
    'Recall Score': recalls,
    'F1 Score': f1s,
    'Precision Score': precisions,
})

In [13]:
display(results)

Unnamed: 0,Accuracy Score,Recall Score,F1 Score,Precision Score
0.2 + 0.09768578100601133,0.890822,0.497332,0.497332,0.497332
0.1 + 0.9226196882260475,0.875117,0.456691,0.456691,0.456691
0.5 + 0.3575385771923504,0.870996,0.399425,0.399425,0.399425
0.3 + 0.33147713591561423,0.906581,0.60078,0.60078,0.60078
0.2 + 0.8545223317371856,0.891651,0.497332,0.497332,0.497332


In [14]:
np.sum(labels == 1)
np.sum(labels == 0)

1036