### Initialization
Install the requirements, enable automatic reloading of changed code, import dependencies, and set some configuration options.

In [None]:
# Install the packages listed in ../requirements.txt into the environment of the running kernel.
%pip install -r ../requirements.txt

In [None]:
# Reload imported modules automatically before each cell is executed, so edits to
# local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

import sys
# Add the parent directory to the import path; presumably that is where the local
# `helpers` and `data_augmentaion` packages live — verify against the repo layout.
sys.path.append('..')

from helpers import utils, pipelines, models

from sklearn.model_selection import train_test_split

import multiprocessing

from data_augmentaion.data_augmentator import DataAugmentor 

import json

import time

# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

import warnings
# NOTE(review): this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Disable tqdm progress bars by default: tqdm.__init__ is replaced by a wrapper that
# pre-sets disable=True, so bars stay hidden unless a caller passes `disable` explicitly.
from tqdm import tqdm
from functools import partialmethod

tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)

### Define research parameters

In [None]:
# --- Task setup ---
regression_task = False     # forwarded to DataAugmentor as `regression`
# continuous_features = []
test_size_proportion = 0.2  # used as `test_size` in train_test_split
augment_sample = 0.5        # forwarded as `size` to DataAugmentor.augment(balance=False, ...)

# --- Scoring: one headline score name plus the full list of score names ---
metric = 'accuracy'
metrics = [
    'f1_weighted',
    'accuracy',
    'balanced_accuracy',
    'precision_weighted',
    'recall_weighted',
    'roc_auc_ovr_weighted',
    'roc_auc_ovo_weighted',
]

# --- Model search space ---
search_pipelines = pipelines.get_classification_pipelines()
search_parameters = models.parameters

# --- Augmentation settings, one dict per experiment ---
# Alternative configurations that are currently switched off:
#   cf_genetic (proximity_weight, diversity_weight, sparsity_weight):
#       (0.2, 5, 0.2), (0.2, 5, 1), (5, 0.2, 1), (1, 1, 0.2), (1, 1, 1)
#   cf_kdtree (sparsity_weight): 0.2, 1
settings = [
    dict(method='random'),
    dict(method='smote'),
    dict(method='cf_random'),
    dict(
        method='cf_genetic',
        kw_args=dict(proximity_weight=5, diversity_weight=0.2, sparsity_weight=0.2),
    ),
]

In [None]:
from sklearn.datasets import make_classification

# Shape of the synthetic classification problem
n_samples = 600                  # number of rows to generate
n_features = 12                  # number of feature columns
n_classes = 3                    # number of distinct target labels
class_weights = [0.5, 0.2, 0.3]  # passed as `weights`: one proportion per class

# Generate the imbalanced data with a fixed seed, then wrap the raw arrays
# in labelled pandas objects.
feature_matrix, target_vector = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=3,
    n_classes=n_classes,
    n_clusters_per_class=2,
    weights=class_weights,
    class_sep=0.5,
    random_state=42,
)
X = pd.DataFrame(feature_matrix, columns=[f"feature_{col}" for col in range(n_features)])
y = pd.Series(target_vector, name="target")

In [None]:
# Every generated column is passed on as a continuous feature.
continuous_features = list(X.columns)

#### Split data

In [None]:
# Split the full dataset into train and test parts; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size_proportion,
    random_state=42,
)

In [None]:
# Show how many training samples fall into each class.
display(y_train.value_counts())

In [None]:
# Start with an empty results table; later cells concatenate their score columns onto it.
full_results_df = pd.DataFrame()

### Whole dataset scores

In [None]:
# Baseline: fit and evaluate the search pipelines on the original (non-augmented)
# training split. It is scored with the same `metrics` list as the augmented runs,
# so the 'whole' columns line up with theirs in the combined results table and in
# the spider plots (which request every metric for 'whole').
print('whole synthetic dataset scores:')
whole_best_ests, whole_scores = utils.fit_and_evaluate(
    X_train, y_train, X_test, y_test,
    search_estimators=search_pipelines, search_params=search_parameters, scoring=metrics)
whole_dataset_result_df = pd.DataFrame.from_dict(whole_scores, orient='index')
# Nest the score columns under a top-level 'whole' label.
whole_dataset_result_df.columns = pd.MultiIndex.from_product([['whole'], whole_dataset_result_df.columns])
whole_dataset_result_df

In [None]:
# Add the baseline ('whole') columns to the combined results table.
full_results_df = pd.concat(
    objs=[full_results_df, whole_dataset_result_df],
    axis='columns',
)

### Running experiments

In [None]:
def _scores_to_frame(scores, label):
    """Build a score table from a `utils.fit_and_evaluate` score mapping.

    The mapping is read with ``orient='index'`` (its keys become the rows) and the
    resulting columns are nested under the top-level column label ``label``.
    """
    frame = pd.DataFrame.from_dict(scores, orient='index')
    frame.columns = pd.MultiIndex.from_product([[label], frame.columns])
    return frame


best_estimators = {}
best_scores = {}
experiment_frames = []  # one score table per setting, concatenated once after the loop
total_time = time.time()  # start timestamp of the whole run
for i, s in enumerate(settings):
    start = time.time()
    # Read the label from the 'method' key explicitly instead of relying on dict key order.
    # NOTE(review): two settings with the same method produce identically labelled columns.
    method_label = s['method']
    print(f'{i} / {len(settings)}, {s}', end=' ')
    augmentor = DataAugmentor(X_train, y_train, X_test, y_test,
                              method=method_label, regression=regression_task,
                              continuous_feats=continuous_features,
                              cf_scoring=metric,
                              kw_args=s.get('kw_args', {})
                              )

    # Run 1: augmentation requested with balance=True.
    X_train_augmented_balanced, y_train_augmented_balanced = augmentor.augment(balance=True)
    best_estimators[f'{i}_balanced'], best_scores[f'{i}_balanced'] = \
        utils.fit_and_evaluate(X_train_augmented_balanced, y_train_augmented_balanced, X_test, y_test,
                               search_estimators=search_pipelines, search_params=search_parameters, scoring=metrics)
    # NOTE(review): the balanced table is built but never added to `full_results_df`;
    # its scores remain available in `best_scores`. Confirm this omission is intended.
    result_df_balanced = _scores_to_frame(best_scores[f'{i}_balanced'], f'{method_label} balanced')

    # Run 2: augmentation with balance=False and size=augment_sample.
    X_train_augmented, y_train_augmented = augmentor.augment(balance=False, size=augment_sample)
    best_estimators[f'{i}'], best_scores[f'{i}'] = \
        utils.fit_and_evaluate(X_train_augmented, y_train_augmented, X_test, y_test,
                               search_estimators=search_pipelines, search_params=search_parameters, scoring=metrics)
    result_df = _scores_to_frame(best_scores[f'{i}'], method_label)
    experiment_frames.append(result_df)

    print(f'{time.time() - start} seconds for settings {i}')
print(f'\nTotal time: {time.time() - total_time}')

# Re-running this cell must not duplicate columns: drop whatever an earlier run of
# the same settings added, then attach the fresh tables in a single concat.
experiment_labels = [s['method'] for s in settings]
stale_columns = full_results_df.columns.get_level_values(0).isin(experiment_labels)
full_results_df = pd.concat([full_results_df.loc[:, ~stale_columns], *experiment_frames], axis=1)


In [None]:
# Display the combined results table (baseline plus one column group per augmentation method).
full_results_df

In [None]:
# Show the output of utils.get_best_methods for the combined results table.
print('best methods:')
display(utils.get_best_methods(full_results_df))

In [None]:
# Hand the combined results to the LaTeX export helper.
# NOTE(review): `task_name` presumably tags the generated output — confirm in helpers.utils.
utils.save_results_as_latex_tables(full_results_df, task_name="artifical")

In [None]:
# Column groups of the results table to include in each plot.
wanted_cols = ['whole', 'random', 'smote', 'cf_random', 'cf_genetic']

# Score name -> label shown on the plot.
metric_names = dict(
    f1_weighted='F1',
    accuracy='Accuracy',
    balanced_accuracy='Balanced accuracy',
    precision_weighted='Precision',
    recall_weighted='Recall',
    roc_auc_ovr_weighted='ROC AUC OVR',
    roc_auc_ovo_weighted='ROC AUC OVO',
)

# One spider plot per model: (row key in the results table, plot title).
plotted_models = (
    ('lg', 'Logistic regression'),
    ('rf', 'Random Forest'),
    ('xgb', 'XGBoost'),
)
for model_key, plot_title in plotted_models:
    utils.spider_plot(full_results_df, model_key, wanted_cols, metric_names, plot_title,
                      save_task_name='artifical')

In [None]:
# Persist the combined results table as CSV.
full_results_df.to_csv('../log/experiment_multiclass_artifical.csv')