# Compare predict-then-fit with normal training

In [2]:
# Change the syspath
import sys
import os

# Absolute path of the directory two levels above the current working directory.
root_path = os.path.abspath(os.path.join(os.pardir, os.pardir))

# Register it with the import system only once, so re-running the cell adds no duplicates.
already_registered = root_path in sys.path
if not already_registered:
    sys.path.append(root_path)

In [3]:
# Auto-reload imported modules so that edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
# Fetch dataset
import pandas as pd

def retrieve_dataset(path):
    """Read a CSV file located under ``root_path`` into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV file relative to ``root_path``, written with
        forward slashes (e.g. ``'dataset/SOCC/processed/socc.csv'``).

    Returns
    -------
    pandas.DataFrame
        Whatever ``pandas.read_csv`` parses from the file.
    """
    # Split on '/' and let os.path.join rebuild the path with the platform's separator.
    segments = path.split('/')
    return pd.read_csv(os.path.join(root_path, *segments))

# Datasets used in this comparison.
socc = retrieve_dataset('dataset/SOCC/processed/socc.csv')
# detox = retrieve_dataset('dataset/DETOX/processed/detox.csv')
hasoc = retrieve_dataset('dataset/HASOC/processed/en_train.csv')

# Disabled: languages other than English are not supported yet.
# hasoc_de = retrieve_dataset('dataset/HASOC/processed/de_train.csv')
# hasoc_hi = retrieve_dataset('dataset/HASOC/processed/hi_train.csv')

# trawling = retrieve_dataset('dataset/Trawling/processed/trawling.csv')

In [10]:
# Number of rows in the HASOC English training set loaded above.
len(hasoc)

3708

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import optuna
from hbbrain.numerical_data.incremental_learner.iol_gfmm import ImprovedOnlineGFMM
from preprocessing.tcw_builder import TCWBuilder
from preprocessing.svd_extractor import SVDExtractor
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [6]:
def preprocess(dataset, k=20):
    """Turn a labelled text dataset into scaled features and integer labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide a ``'text'`` column and a ``'label'`` column.
    k : int, optional
        Forwarded to ``SVDExtractor(k=...)``; presumably the number of SVD
        components kept (TODO confirm against ``SVDExtractor``). Defaults to
        20, the value that was previously hard-coded.

    Returns
    -------
    features
        Result of ``MinMaxScaler().fit_transform`` applied to the extractor's
        ``features_matrix``.
    y : numpy.ndarray
        1-D array holding the integer code of every row's label.
    label_values : list
        Distinct labels in order of first appearance; ``label_values[i]`` is
        the label encoded as ``i`` in ``y``.
    """
    x = dataset['text']
    labels = dataset['label']

    # Normalise the labels: map every distinct label to an integer code and
    # flatten the result to a 1-D numpy array.
    label_values = labels.unique().tolist()
    label_mapping = {label: i for i, label in enumerate(label_values)}
    y = labels.map(label_mapping).to_numpy().ravel()

    builder = TCWBuilder()
    extractor = SVDExtractor(k=k)
    scaler = MinMaxScaler()

    # The return values of these two calls are not used; their results are
    # read back from the `tcw` and `features_matrix` attributes instead.
    builder.fit_transform(x)
    extractor.fit_transform(builder.tcw)

    # NOTE(review): builder, extractor and scaler are all fitted on the whole
    # dataset, i.e. before normal_train / ptf_train split it into folds, so
    # the held-out rows influence the features they are later evaluated on.
    features = scaler.fit_transform(extractor.features_matrix)

    return features, y, label_values
    

In [7]:
# Per-dataset containers, keyed by dataset name ('socc', 'hasoc').
x = {}       # Feature matrices
y = {}       # Numerical labels of dataset
labels = {}  # Text labels of dataset

x['hasoc'], y['hasoc'], labels['hasoc'] = preprocess(hasoc)

x['socc'], y['socc'], labels['socc'] = preprocess(socc)
# SOCC also has a 'confidence' column: append it, as is, as one extra feature column.
x['socc'] = np.hstack((x['socc'], socc[['confidence']].to_numpy()))

In [8]:
# Text labels of SOCC; the list position of each label is its integer code in y['socc'].
labels['socc']

['normal', 'toxic']

In [9]:
import copy  # copy.deepcopy gives every skip setting in ptf_train its own copy of the fitted classifier.

is_draw = False  # Forwarded as `is_draw` to every ImprovedOnlineGFMM; presumably toggles hbbrain's plotting (TODO confirm).
skip_step = 5  # Increment between successive `skip` values evaluated in ptf_train.

def normal_train(x, y, theta, gamma):
    """Cross-validate a classifier that is fitted once per fold and never updated.

    Uses a 5-fold stratified split (shuffled, ``random_state=42``). In every
    fold a fresh ``ImprovedOnlineGFMM`` is fitted on the training rows and
    then predicts all held-out rows in a single call.

    Parameters
    ----------
    x : numpy.ndarray
        Feature matrix; rows are selected with the fold index arrays.
    y : numpy.ndarray
        Labels aligned with the rows of ``x``.
    theta, gamma
        Forwarded unchanged to ``ImprovedOnlineGFMM``.

    Returns
    -------
    tuple
        ``(accuracy, recall, precision, f1)``, each the mean over the folds.
        Recall, precision and F1 are computed with ``average='weighted'``.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = {'accuracy': [], 'recall': [], 'precision': [], 'f1': []}

    # fold_number is 1-based and only used for the progress messages.
    for fold_number, (train_index, test_index) in enumerate(splitter.split(x, y), start=1):
        print(f'\n\n=================\nStratifiedKFold at the number {fold_number}\n===============\n\n')
        model = ImprovedOnlineGFMM(theta=theta, gamma=gamma, is_draw=is_draw)
        model.fit(x[train_index], y[train_index])

        print(f'\n\n=================\nValidatating -> {fold_number}\n===============\n\n')
        y_true = y[test_index]
        y_pred = model.predict(x[test_index])

        fold_scores['accuracy'].append(accuracy_score(y_true, y_pred))
        fold_scores['f1'].append(f1_score(y_true, y_pred, average='weighted'))
        fold_scores['recall'].append(recall_score(y_true, y_pred, average='weighted'))
        fold_scores['precision'].append(precision_score(y_true, y_pred, average='weighted'))

    return tuple(
        np.mean(fold_scores[metric])
        for metric in ('accuracy', 'recall', 'precision', 'f1')
    )
        
        

def _append_fold_scores(accs, f1s, recalls, precisions, skip, y_true, y_pred):
    """Append one fold's accuracy and weighted F1/recall/precision under key ``skip``."""
    accs.setdefault(skip, []).append(accuracy_score(y_true, y_pred))
    f1s.setdefault(skip, []).append(f1_score(y_true, y_pred, average='weighted'))
    recalls.setdefault(skip, []).append(recall_score(y_true, y_pred, average='weighted'))
    precisions.setdefault(skip, []).append(precision_score(y_true, y_pred, average='weighted'))


# Predict then fit.
# Return dictionaries of metrics keyed by the skip value.
def ptf_train(x, y, theta, gamma):
    """Cross-validate the "predict then fit" protocol for a range of skip values.

    For every fold of a 5-fold stratified split (shuffled, ``random_state=42``)
    an ``ImprovedOnlineGFMM`` is fitted on the training rows. The held-out
    rows are then processed one at a time: each row is predicted first and,
    once the first ``skip`` rows have passed, the model is updated with the
    row's true label. This is repeated for ``skip = 0, skip_step,
    2 * skip_step, ...`` on a fresh deep copy of the fitted model, followed by
    a baseline run in which the model is never updated.

    Parameters
    ----------
    x : numpy.ndarray
        Feature matrix; rows are selected with the fold index arrays.
    y : numpy.ndarray
        Labels aligned with the rows of ``x``.
    theta, gamma
        Forwarded unchanged to ``ImprovedOnlineGFMM``.

    Returns
    -------
    tuple of dict
        ``(accs, recalls, precisions, f1s)``. Each dict maps a skip value to
        the metric's mean over all folds. The largest key is the size of the
        largest test fold and holds the no-update baseline. Recall, precision
        and F1 use ``average='weighted'``.

    Raises
    ------
    ValueError
        If the module-level ``skip_step`` is not positive (the skip values
        would never advance).
    """
    if skip_step <= 0:
        raise ValueError(f'skip_step must be a positive integer, got {skip_step!r}')

    accs = dict()
    f1s = dict()
    recalls = dict()
    precisions = dict()

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    folds = list(skf.split(x, y))

    # Test folds can differ in size by a few rows. All folds therefore share
    # one skip grid and one baseline key (the largest test-fold size), so that
    # every key is averaged over all folds and the baseline can never be filed
    # under a key that another fold uses for a run with updates. When all
    # folds have the same size the keys are the same as with per-fold sizes.
    baseline_skip = max(len(test_index) for _, test_index in folds)

    for count, (train_index, test_index) in enumerate(folds, start=1):
        print(f'\n\n=================\nStratifiedKFold at the number {count}\n===============\n\n')
        source_clf = ImprovedOnlineGFMM(theta=theta, gamma=gamma, is_draw=is_draw)
        source_clf.fit(x[train_index], y[train_index])
        print(f'\n\n=================\nValidatating -> {count}\n===============\n\n')

        x_test = x[test_index]
        y_test = y[test_index]

        for skip in range(0, baseline_skip, skip_step):
            # Every skip setting starts from the same fitted model; the deep
            # copy keeps source_clf itself untouched by the updates below.
            clf = copy.deepcopy(source_clf)
            y_pred = []  # Storing predicting results.

            print(f'\n\n=================\nThe skip number of -> {skip}\n===============\n\n')
            for position, (sample, true_label) in enumerate(zip(x_test, y_test)):
                # Predict before the model has seen this sample's label.
                y_pred.append(clf.predict(np.atleast_2d(sample))[0])

                # The first `skip` samples are predicted only, never learned.
                if position < skip:
                    print(f'\n\n=================\nSkipping the sample {position}\n===============\n\n')
                    continue

                # Update model
                clf.fit(np.atleast_2d(sample), np.array([true_label]))

            _append_fold_scores(accs, f1s, recalls, precisions, skip, y_test, np.array(y_pred))

        # Final test, no updates. source_clf was fitted on exactly this fold's
        # training rows and only copies of it were updated, so it is reused
        # instead of fitting an identical second model.
        print(f'\n\n=================\nThis time the model will not be updated\n===============\n\n')
        print(f'\n\n=================\nValidatating -> {count}\n===============\n\n')
        y_pred = source_clf.predict(x_test)
        _append_fold_scores(accs, f1s, recalls, precisions, baseline_skip, y_test, y_pred)

    # Collapse the per-fold lists into their means.
    for skip in recalls:
        accs[skip] = np.mean(accs[skip])
        f1s[skip] = np.mean(f1s[skip])
        recalls[skip] = np.mean(recalls[skip])
        precisions[skip] = np.mean(precisions[skip])

    return accs, recalls, precisions, f1s

In [None]:
# Hyperparameter grid: every theta is evaluated with the same gamma.
thetas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
gamma = 1

# Dictionary to store results: ptf[theta][dataset][metric] -> {skip: mean score}.
ptf = {
    theta: {
        dataset: {metric: dict() for metric in ('accuracy', 'recall', 'precision', 'f1')}
        for dataset in ('socc', 'hasoc')
    }
    for theta in thetas
}

for theta in thetas:
    print(f'\n\n=================\nComputing result theta {theta}\n===============\n\n')
    for dataset in ptf[theta]:
        print(f'\n\n=================\nLoading dataset {dataset}...\n===============\n\n')
        acc, rec, prec, f1 = ptf_train(x[dataset], y[dataset], theta, gamma)
        # Replace the empty placeholders with the per-skip results of this run.
        ptf[theta][dataset].update(accuracy=acc, recall=rec, precision=prec, f1=f1)
        






Computing result theta 0.1




Loading dataset socc...




StratifiedKFold at the number 1




Validatating -> 1




The skip number of -> 0


>>> The testing sample 1 with the coordinate [0.01748926 0.28717241 0.77654726 0.71617387 0.53561233 0.84411131
 0.52364958 0.39777854 0.77327015 0.6745703  0.53957215 0.20608947
 0.68521047 0.19994428 0.1926263  1.         1.         1.
 0.77984146 0.71429053 1.        ] is outside the range [0, 1]. Membership value = 0.329213. The prediction is more likely incorrect.


The skip number of -> 5




Skipping the sample 0




Skipping the sample 1




Skipping the sample 2




Skipping the sample 3




Skipping the sample 4


>>> The testing sample 1 with the coordinate [0.01748926 0.28717241 0.77654726 0.71617387 0.53561233 0.84411131
 0.52364958 0.39777854 0.77327015 0.6745703  0.53957215 0.20608947
 0.68521047 0.19994428 0.1926263  1.         1.         1.
 0.77984146 0.71429053 1.        ] is outside the range [0, 1]. Membership value = 0.32