In [1]:
#!pip install scikit-learn

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR, SVC
from sklearn.neural_network import MLPRegressor, MLPClassifier
import scipy.stats as stats
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, r2_score, hamming_loss, classification_report, f1_score, precision_score, recall_score
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram
import matplotlib.pyplot as plt
from pathlib import Path 
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import BinaryRelevance, LabelPowerset
from skmultilearn.adapt import MLkNN


from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [3]:
%run Util.ipynb

In [4]:
def  read_dataset(feat_select, # _all, _rf ...
                  feat_type, #OG,HG,
                  scale, #standar, minMax
                  feature_path,
                  ranks_path):
    """
    """
    data = read_output(sufix='_1min_8alg', dir_path = ranks_path)
    data = data.drop(columns=["Problem"])
    
    df_input = pd.read_csv(f'{feature_path}/{feat_type}/features_{feat_select}_{scale}.csv')
    df_input = df_input.drop(columns=["problem"])
    return df_input, data

In [5]:
df_input, data = read_dataset(feat_select='all',feat_type='HG-F', scale="standard",
                             feature_path='../datasets/results/features/ianuarie2025-v2/',
                             ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

In [6]:
data

Unnamed: 0,TS-Ei-LM,SA-Ei-LM,TSL-Ei-LM,SAL-Ei-LM,TS-Si-LM,SA-Si-LM,TSL-Si-LM,SAL-Si-LM
0,0,0,0,1,0,0,0,0
1,1,0,0,1,0,0,0,0
2,0,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0
4,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
291,0,0,0,1,0,0,0,0
292,0,1,0,1,0,0,0,0
293,0,1,0,1,0,0,0,0
294,0,1,0,1,0,0,0,0


In [7]:
#not used
def get_fold(index, number_folds, test_fold_index):
    data_size = len(index)
    fold_size = data_size//number_folds
    start_index_test = test_fold_index*fold_size
    if test_fold_index != number_folds-1:
        end_index_test = test_fold_index*fold_size + fold_size
    else:
        end_index_test = data_size
    test_index = index[start_index_test:end_index_test]
    if start_index_test == 0:
        train_index = index[end_index_test:]
    elif end_index_test == data_size:
        train_index = index[:start_index_test]
    else:
        train_index = np.concatenate((index[:start_index_test], index[end_index_test:]))
    return train_index, test_index    
        

In [8]:
#not used
def cross_validation(number_folds, feat_select='_all', feat_type='op', reg_type='rfr'):    
    df_all_input, df_all_output = read_dataset(feat_select=feat_select, feat_type=feat_type)
    index = np.random.permutation(len(df_all_input))
    scores_train = []
    scores_test = []
    for f in range(number_folds):
        train_index, test_index = get_fold(index, number_folds, f)
        train_scaler = MinMaxScaler()
        x_train_scaled = train_scaler.fit_transform(df_all_input.iloc[train_index])
        y_train = df_all_output.iloc[train_index]
        #y_train = train_scaler.fit_transform(df_all_output.iloc[train_index])
        # unscaled output
        test_scaler = MinMaxScaler()
        x_test_scaled = test_scaler.fit_transform(df_all_input.iloc[test_index])
        y_test = df_all_output.iloc[test_index]
        #print("x_train_scaled=",x_train_scaled)
        #print("x_test_scaled=",x_test_scaled)
        if reg_type == 'gbr':
            reg = MultiOutputRegressor(GradientBoostingRegressor(n_estimators=50,random_state=0)).fit(x_train_scaled, y_train)
        if reg_type == 'hgbr':
            reg = MultiOutputRegressor(HistGradientBoostingRegressor(max_iter=50,random_state=0)).fit(x_train_scaled, y_train)    
        if reg_type == 'rfr':
            reg = MultiOutputRegressor(RandomForestRegressor(n_estimators=20, random_state=0)).fit(x_train_scaled, y_train)
        if reg_type == 'svr':    
            reg = MultiOutputRegressor(SVR(C=5.0, epsilon=0.3, kernel='rbf')).fit(x_train_scaled, y_train) #  epsilon=0.3,
        if reg_type == 'mlp':    
            reg = MultiOutputRegressor(MLPRegressor(hidden_layer_sizes=(20,),max_iter=5000)).fit(x_train_scaled, y_train)
        score = reg.score(x_test_scaled, y_test)
        scores_test.append(score)
        score = reg.score(x_train_scaled, y_train)
        scores_train.append(score)
    return reg, scores_train, scores_test

### Regression model tuning using Grid Search and Randomized Search

### Classification models tuning using Bayesian Optimization

In [9]:
#!pip install scikit-optimize

In [10]:
from skopt import BayesSearchCV
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram
import matplotlib.pyplot as plt

In [11]:
# Hyperparameter tuning based on Bayesian Optimization
import sys
import csv

target_names = ['TS-Ei', 'SA-Ei', 'TSL-Ei', 'SAL-Ei', 'TS-Si', 'SA-Si', 'TSL-Si', 'SAL-Si', ]

def classification_tuning(out_dir_path, ft_type, ft_select, classification_types, feature_path, ranks_path):
    #varianta veche de generarare, data_size  veenea ca parametru
    #index = np.random.permutation(data_size)
    # idx_X_train, idx_X_test, idx_y_train, idx_y_test = train_test_split(index, index, train_size = 0.8, test_size = 0.2, random_state=0)
    # print(data_size)
    # print('idx_X_train=',idx_X_train)
    # print('idx_X_test=',idx_X_test)
    Path(out_dir_path).mkdir(exist_ok=True)

    #generate in  fisierul DatasetsRanking
    idx_X_test = [99, 161, 95, 175, 45, 166, 224, 50, 249, 102, 107, 284, 177, 121, 136, 
                  241, 34, 260, 119, 199, 40, 24, 264, 169, 280, 51, 152, 110, 290, 4, 
                  6, 55, 77, 167, 217, 124, 278, 56, 144, 91, 170, 74, 159, 232, 27, 41, 
                  245, 164, 214, 230, 246, 263, 101, 18, 292, 92, 147, 115, 277, 15, 254, 62]

    idx_X_train = [30, 194, 248, 17, 81, 16, 58, 188, 160, 287, 129, 240, 213, 130, 256, 
                   79, 282, 84, 145, 36, 257, 220, 28, 134, 265, 286, 142, 195, 201, 66, 
                   273, 157, 128, 279, 150, 125, 109, 96, 26, 29, 227, 259, 276, 61, 73, 
                   209, 178, 215, 11, 80, 218, 163, 98, 253, 20, 225, 168, 205, 104, 200, 
                   197, 94, 106, 105, 118, 22, 187, 112, 202, 60, 237, 153, 75, 7, 294, 219, 
                   285, 151, 204, 222, 196, 156, 90, 193, 10, 72, 155, 1, 247, 57, 13, 131, 
                   113, 35, 5, 266, 139, 182, 38, 47, 12, 141, 207, 233, 123, 43, 88, 180, 
                   165, 46, 267, 203, 179, 242, 184, 3, 198, 25, 39, 281, 87, 234, 138, 132, 
                   126, 149, 68, 173, 216, 33, 171, 100, 86, 44, 255, 231, 23, 174, 71, 235, 
                   172, 283, 250, 89, 192, 143, 8, 14, 65, 78, 93, 146, 275, 272, 82, 293, 
                   262, 261, 133, 228, 212, 236, 70, 148, 116, 189, 226, 190, 210, 49, 52, 
                   67, 186, 103, 181, 221, 85, 42, 239, 140, 21, 223, 76, 63, 206, 291, 274, 
                   48, 0, 32, 238, 295, 185, 37, 288, 117, 162, 83, 137, 252, 64, 54, 251, 
                   53, 208, 108, 191, 229, 183, 154, 59, 270, 271, 244, 122, 31, 19, 69, 114,
                   135, 176, 111, 258, 158, 9, 120, 268, 2, 289, 97, 243, 211, 127, 269] 
      
    idx_y_train = idx_X_train
    idx_y_test = idx_X_test
    if not sys.warnoptions:
        import os, warnings
        warnings.simplefilter("ignore")
        os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

    classificators_format = ['mlknn', 'lp_rfc', 'br_rfc']
    for classification_type in classification_types:
        #print("classification_type", classification_type)
        test_results_score={}
        test_results_metaheuristic={}
        test_results={}
        for fs in ft_select:
            for ft in ft_type:  
                df_all_input, df_all_output = read_dataset(feat_select=fs, feat_type=ft, scale="standard",
                                                           feature_path=feature_path, ranks_path=ranks_path)
                #df_all_input=df_all_input.filter(regex='^(?!.*gini)(?!.*q25)(?!.*q50)(?!.*q75)(?!.*min)(?!.*max)')

                print(df_all_input.shape, df_all_output.shape)
                index = np.random.permutation(len(df_all_input))
                X_train = df_all_input.iloc[idx_X_train]
                X_test = df_all_input.iloc[idx_X_test]
                
                y_train = df_all_output.iloc[idx_y_train]
                y_test = df_all_output.iloc[idx_y_test]
                #print("y_test", y_test)
                
                train_scaler = MinMaxScaler()
                X_train = train_scaler.fit_transform(X_train)
                X_test = train_scaler.fit_transform(X_test)
                
                #y = df_all_output.iloc[index]
                #X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state=0)
                
                
                if classification_type == 'svc':
                    classification_model = MultiOutputClassifier(SVC())
                    param = {
                            'estimator__C': (1e-2, 1e+2, 'uniform'), # (1e-6, 1e+6, 'log-uniform'),
                            'estimator__gamma': (1e-2, 1e+1, 'uniform'),
                            'estimator__degree': (2, 4),  # integer valued parameter
                            'estimator__kernel': ['linear', 'poly', 'rbf'],  # categorical parameter
                            }
                if classification_type == 'gbc':
                    classification_model = MultiOutputClassifier(GradientBoostingClassifier())
                    param = {'estimator__n_estimators':[10,25,50,75,100,125,150],
                             'estimator__loss':[ 'log_loss'],#'exponential',
                             'estimator__min_samples_split':(0.1, 0.99, 'uniform'),
                             'estimator__min_samples_leaf':(0.01, 0.99, 'uniform')
                             }
                if classification_type == 'rfc':
                    classification_model = MultiOutputClassifier(RandomForestClassifier())
                    param = {'estimator__n_estimators':[10,25,50,75,100,125,150],
                             'estimator__criterion':['entropy', 'gini', 'log_loss'],
                             'estimator__min_samples_split':(0.05, 0.2, 'uniform'),
                             'estimator__max_features':['sqrt', 'log2', 1]
                             }
                if classification_type == 'mlp':
                    classification_model = MultiOutputClassifier(MLPClassifier())
                    param = {
                             'estimator__hidden_layer_sizes':[(64,),(32,),(40,)],#[(40,20), (64,32), (64,64)]), # (20,),(30,),(40,), (64,),
                             'estimator__activation':['logistic', 'tanh', 'relu'],
                             'estimator__max_iter':[500, 1000, 2000, 5000]
                             }
                if classification_type == 'mcc':
                    classification_model = ClassifierChain(LogisticRegression())

                    param = {
                        'base_estimator__C': (1e-6, 1e+6, 'log-uniform'),
                        'base_estimator__penalty': [None, 'l2']
                    }
                if classification_type == 'br_rfc':
                    classification_model = BinaryRelevance(RandomForestClassifier())
                    param = {'classifier__n_estimators':[10,25,50,75,100,125,150],
                             'classifier__criterion':['entropy', 'gini', 'log_loss'],
                             'classifier__min_samples_split':(0.05, 0.2, 'uniform'),
                             'classifier__max_features':['sqrt', 'log2', 1]
                             }
                if classification_type == 'lp_rfc':
                    classification_model = LabelPowerset(RandomForestClassifier())
                    param = {'classifier__n_estimators':[10,25,50,75,100,125,150],
                             'classifier__criterion':['entropy', 'gini', 'log_loss'],
                             'classifier__min_samples_split':(0.05, 0.2, 'uniform'),
                             'classifier__max_features':['sqrt', 'log2', 1]
                             }
                if classification_type == 'mcc_rfc':
                    classification_model = ClassifierChain(RandomForestClassifier())
                    param = {'base_estimator__n_estimators':[10,25,50,75,100,125,150],
                             'base_estimator__criterion':['entropy', 'gini', 'log_loss'],
                             'base_estimator__min_samples_split':(0.05, 0.2, 'uniform'),
                             'base_estimator__max_features':['sqrt', 'log2', 1]
                            }
                if classification_type == 'mlknn':
                    classification_model = MLkNN()
                    param = {
                             'k': [2, 3, 4, 5, 8, 10, 15,23],
                             's': [0.5, 0.7, 1.0]
                             }
                   
                opt = BayesSearchCV(classification_model,param, n_iter=32, # default value: 32, 50
                               cv=KFold(n_splits=5, random_state=100, shuffle=True),#5, # default value: 3
                               return_train_score = True, verbose = 0)

                
                opt.fit(np.array(X_train), np.array(y_train))
                #opt.fit(X_train, y_train)
                
                df_results=pd.DataFrame.from_dict(opt.cv_results_)
                df_results.to_csv(f'{out_dir_path}/feat_{fs}_{ft}_regr_{classification_type}.csv', sep=',')
        
                print("******* Feature select:",fs," Feature type:",ft, "Classification model:", classification_type, "******")
                        
                print('Test score: ', opt.score(X_test, y_test))
                best = opt.best_estimator_
                results_test  = best.predict(X_test)
                results_train = best.predict(X_train)
                print('Best estimator:', best)
                
                #print('Accurecy of the best estimator (training dataset): ', accuracy_score(y_pred = results_train, y_true = y_train))
                print('Accuracy Best score (validation) val. score : ', opt.best_score_)
                #print('R2 Best score (validation) test score: ', opt.score(X_test, y_test))
                test_score = None#accuracy_score(y_pred = results_test, y_true = y_test)
                #print('Hamming loss: {0}'.format(hamming_loss(y_pred = results_test, y_true = y_test)))
                #print('Accuracy of the best estimator (testing dataset): ',test_score)

                #br_f1=f1_score(y_pred = results_test, y_true = y_test, average='micro')
                #print('F1-score:',round(br_f1,3))
                y_true = np.array(y_test)
                y_pred = results_test


                # print('y_true.size', y_true.size)
                # for i in range(y_pred.shape[0]):
                #     print('true', y_true[i], 'pred', y_pred[i], np.not_equal(y_true[i], y_pred[i]), np.sum(np.not_equal(y_true[i], y_pred[i])))
                #print('Hamming loss :',  np.sum(np.not_equal(y_true, y_pred)),float(y_true.size))
                print("Hamming loss test:",   np.sum(np.not_equal(y_true, y_pred))/float(y_true.size))
                print('Hamming loss train:',  np.sum(np.not_equal(y_train, results_train))/float(y_train.size))
                #print("Hamming loss train:", np.sum(np.not_equal(y_train, results_train))/float(y_train.size))

                # print("Precision score:", precision_score(y_true, y_pred))
                # print("Recall score:", recall_score(y_true, y_pred))

                      
                # print(classification_report(y_pred = results_test, y_true = y_test, target_names=target_names))

                classes_identified_train =[]
                for i in range(results_train.shape[0]):
                     if classification_type in classificators_format:
                         classes_identified_train.append(np.count_nonzero(results_train[i].toarray()[0] == 1))
                     else:
                         classes_identified_train.append(np.count_nonzero(results_train[i] == 1))
                #print(classes_identified_train)
                print ("data size", results_train.shape[0], "found 0 alg", np.count_nonzero(np.array(classes_identified_train) == 0))
                
                metaheuristic=[]
                classes_identified_test = []
                test_results[f'res_{classification_type}_{fs}_{ft}'] = {}
                
                for i in range(results_test.shape[0]):
                    test_results[f'res_{classification_type}_{fs}_{ft}'][i] = results_test[i]
                    idx_best=np.argmin(results_test[i])
                    #print(idx_best, results_test[i].shape)
                    if classification_type in classificators_format:
                        metaheuristic.append(results_test[i].toarray()[0])
                        classes_identified_test.append(np.count_nonzero(results_test[i].toarray()[0] == 1))
                    else:
                        metaheuristic.append(results_test[i])
                        classes_identified_test.append(np.count_nonzero(results_test[i] == 1))
                #print("Results test:",metaheuristic)
                print("classes_identified_test test:",classes_identified_test,  np.count_nonzero(np.array(classes_identified_test) == 0))
                

                
                test_results_score[f'Acc_{classification_type}_{fs}_{ft}']=[test_score]
                test_results_metaheuristic[f'meta_{classification_type}_{fs}_{ft}']=metaheuristic
    # save results on test instances 
        df_results = pd.DataFrame(test_results)
        df_results.to_csv(f'{out_dir_path}/results_{classification_type}_test.csv')
        df_results_score = pd.DataFrame(test_results_score)
        df_results_score.to_csv(f'{out_dir_path}/results_{classification_type}_test_scores.csv')
        df_results_metaheuristic = pd.DataFrame(test_results_metaheuristic)
        df_results_metaheuristic.to_csv(f'{out_dir_path}/results_{classification_type}_test_metaheuristic.csv')
        

In [12]:
ft_type=['ST-F']#['HG']#'OG', 'OGQ', 'OGSM', 'OGET']#,'DG','HG']
classification_types=['rfc']#['rfc','svc','gbc']
ft_select=['all','LR10', 'LR20', 'LR30', 'LR40', 'RF', 'PCA99','PCA999','PCA9999','FS_S','FS_P','FS_K', 'LASSO', 'ELASTICNET']


dir_path = '../datasets/results/classification/februarie2025-v4/'
classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/februarie2025-v4/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

(296, 90) (296, 8)
******* Feature select: all  Feature type: ST-F Classification model: rfc ******
Test score:  0.08064516129032258
Best estimator: MultiOutputClassifier(estimator=RandomForestClassifier(criterion='entropy',
                                                       min_samples_split=0.05,
                                                       n_estimators=10))
Accuracy Best score (validation) val. score :  0.2053654024051804
Hamming loss test: 0.14919354838709678
Hamming loss train: TS-Ei-LM     0.007479
SA-Ei-LM     0.010150
TSL-Ei-LM    0.006410
SAL-Ei-LM    0.012821
TS-Si-LM     0.009081
SA-Si-LM     0.008547
TSL-Si-LM    0.006410
SAL-Si-LM    0.006410
dtype: float64
data size 234 found 0 alg 72
classes_identified_test test: [0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 2, 0, 1, 0, 1, 3, 2, 1, 0, 0, 0, 2, 2, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 2, 0, 0, 2, 0, 1, 1, 2, 0, 0, 0] 36
(296, 10) (296, 8)
******* Feature select: LR10  Feature t

In [51]:
ft_type=['ST-F']#['HG']#'OG', 'OGQ', 'OGSM', 'OGET']#,'DG','HG']
classification_types=['rfc']#['rfc','svc','gbc']
ft_select=['all','LR10', 'LR20', 'LR30', 'LR40', 'RF', 'PCA99','PCA999','PCA9999','FS_S','FS_P','FS_K']


dir_path = '../datasets/results/classification/februarie2025-v3/'
classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/februarie2025-v3/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_agregated_multilabel_filtered')

(296, 91) (296, 3)
******* Feature select: all  Feature type: ST-F Classification model: rfc ******
Test score:  0.2903225806451613
Best estimator: MultiOutputClassifier(estimator=RandomForestClassifier(max_features='log2',
                                                       min_samples_split=0.05))
Accuracy Best score (validation) val. score :  0.30786308973172993
Hamming loss test: 0.3172043010752688
Hamming loss train: stategy         0.024217
init            0.025641
perturbation    0.021368
dtype: float64
data size 234 found 0 alg 11
classes_identified_test test: [2, 3, 3, 1, 3, 0, 2, 2, 2, 2, 2, 3, 1, 3, 3, 3, 1, 3, 3, 2, 2, 3, 3, 2, 3, 1, 2, 2, 3, 3, 3, 3, 2, 1, 3, 3, 3, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 3, 2, 3, 3, 3, 3, 3, 3, 2, 3, 2] 1
(296, 10) (296, 3)
******* Feature select: LR10  Feature type: ST-F Classification model: rfc ******
Test score:  0.1935483870967742
Best estimator: MultiOutputClassifier(estimator=RandomForestClassifier(max_features='log2',
        

In [32]:
ft_type=['OG-F']#['HG']#'OG', 'OGQ', 'OGSM', 'OGET']#,'DG','HG']
classification_types=['mlknn']
ft_select=['LR40']

dir_path = '../datasets/results/classification/ianuarie2025/'
classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

******* Feature select: LR40  Feature type: OG-F Classification model: mlknn ******
Test score:  0.14516129032258066
Best estimator: MLkNN(k=5)
Accurecy of the best estimator (training dataset):  0.32905982905982906
Accuracy Best score (validation) val. score :  0.1666049953746531
Hamming loss: 0.16532258064516128
Accuracy of the best estimator (testing dataset):  0.14516129032258066
F1-score: 0.369
              precision    recall  f1-score   support

       TS-Ei       0.00      0.00      0.00         3
       SA-Ei       0.71      0.45      0.56        22
      TSL-Ei       0.00      0.00      0.00         7
      SAL-Ei       0.46      0.57      0.51        21
       TS-Si       0.00      0.00      0.00         6
       SA-Si       0.00      0.00      0.00         8
      TSL-Si       0.25      0.33      0.29         3
      SAL-Si       0.17      0.17      0.17         6

   micro avg       0.44      0.32      0.37        76
   macro avg       0.20      0.19      0.19        76
w

In [None]:
ft_type=['OG-F']#['HG']#'OG', 'OGQ', 'OGSM', 'OGET']#,'DG','HG']
classification_types=['mcc_rfc', 'mcc']
ft_select=['LR40']

dir_path = '../datasets/results/classification/ianuarie2025/'
classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

In [200]:
ft_type=['OG-F']#['HG']#'OG', 'OGQ', 'OGSM', 'OGET']#,'DG','HG']
classification_types=['rfc', 'mcc_rfc']
ft_select=['LR40']

dir_path = '../datasets/results/classification/ianuarie2025/'
classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

******* Feature select: LR40  Feature type: OG-F Classification model: rfc ******
Test score:  0.0967741935483871
Best estimator: MultiOutputClassifier(estimator=RandomForestClassifier(min_samples_split=0.05379679926080017,
                                                       n_estimators=10))
Accurecy of the best estimator (training dataset):  0.5170940170940171
Accuracy Best score (validation) val. score :  0.17946345975948197
Hamming loss: 0.2056451612903226
Accuracy of the best estimator (testing dataset):  0.0967741935483871
              precision    recall  f1-score   support

       TS-Ei       0.00      0.00      0.00         3
       SA-Ei       0.69      0.50      0.58        22
      TSL-Ei       0.23      0.43      0.30         7
      SAL-Ei       0.50      0.38      0.43        21
       TS-Si       0.33      0.17      0.22         6
       SA-Si       0.00      0.00      0.00         8
      TSL-Si       0.12      0.33      0.18         3
      SAL-Si       0.05      

In [201]:
ft_type=['OG-F']#['HG']#'OG', 'OGQ', 'OGSM', 'OGET']#,'DG','HG']
classification_types=['br_rfc', 'lp_rfc', 'mcc_rfc']
ft_select=['LR40']

dir_path = '../datasets/results/classification/ianuarie2025/'
classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

******* Feature select: LR40  Feature type: OG-F Classification model: br_rfc ******
Test score:  0.14516129032258066
Best estimator: BinaryRelevance(classifier=RandomForestClassifier(min_samples_split=0.05409556809735245,
                                                  n_estimators=75),
                require_dense=[True, True])
Accurecy of the best estimator (training dataset):  0.5512820512820513
Accuracy Best score (validation) val. score :  0.18371877890841812
Hamming loss: 0.18346774193548387
Accuracy of the best estimator (testing dataset):  0.14516129032258066
              precision    recall  f1-score   support

       TS-Ei       0.00      0.00      0.00         3
       SA-Ei       0.71      0.45      0.56        22
      TSL-Ei       0.25      0.29      0.27         7
      SAL-Ei       0.50      0.38      0.43        21
       TS-Si       1.00      0.17      0.29         6
       SA-Si       0.00      0.00      0.00         8
      TSL-Si       0.50      0.67      0.57

In [19]:
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder
from skmultilearn.cluster.networkx import NetworkXLabelGraphClusterer


# we define a helper function for visualization purposes
def to_membership_vector(partition):
    return {
        member :  partition_id
        for partition_id, members in enumerate(partition)
        for member in members
    }
    
def classification_graph(out_dir_path, ft_type, ft_select, classification_types, feature_path, ranks_path):
    Path(out_dir_path).mkdir(exist_ok=True)

    #generate in  fisierul DatasetsRanking
    idx_X_test = [99, 161, 95, 175, 45, 166, 224, 50, 249, 102, 107, 284, 177, 121, 136, 
                  241, 34, 260, 119, 199, 40, 24, 264, 169, 280, 51, 152, 110, 290, 4, 
                  6, 55, 77, 167, 217, 124, 278, 56, 144, 91, 170, 74, 159, 232, 27, 41, 
                  245, 164, 214, 230, 246, 263, 101, 18, 292, 92, 147, 115, 277, 15, 254, 62]

    idx_X_train = [30, 194, 248, 17, 81, 16, 58, 188, 160, 287, 129, 240, 213, 130, 256, 
                   79, 282, 84, 145, 36, 257, 220, 28, 134, 265, 286, 142, 195, 201, 66, 
                   273, 157, 128, 279, 150, 125, 109, 96, 26, 29, 227, 259, 276, 61, 73, 
                   209, 178, 215, 11, 80, 218, 163, 98, 253, 20, 225, 168, 205, 104, 200, 
                   197, 94, 106, 105, 118, 22, 187, 112, 202, 60, 237, 153, 75, 7, 294, 219, 
                   285, 151, 204, 222, 196, 156, 90, 193, 10, 72, 155, 1, 247, 57, 13, 131, 
                   113, 35, 5, 266, 139, 182, 38, 47, 12, 141, 207, 233, 123, 43, 88, 180, 
                   165, 46, 267, 203, 179, 242, 184, 3, 198, 25, 39, 281, 87, 234, 138, 132, 
                   126, 149, 68, 173, 216, 33, 171, 100, 86, 44, 255, 231, 23, 174, 71, 235, 
                   172, 283, 250, 89, 192, 143, 8, 14, 65, 78, 93, 146, 275, 272, 82, 293, 
                   262, 261, 133, 228, 212, 236, 70, 148, 116, 189, 226, 190, 210, 49, 52, 
                   67, 186, 103, 181, 221, 85, 42, 239, 140, 21, 223, 76, 63, 206, 291, 274, 
                   48, 0, 32, 238, 295, 185, 37, 288, 117, 162, 83, 137, 252, 64, 54, 251, 
                   53, 208, 108, 191, 229, 183, 154, 59, 270, 271, 244, 122, 31, 19, 69, 114,
                   135, 176, 111, 258, 158, 9, 120, 268, 2, 289, 97, 243, 211, 127, 269] 
      
    idx_y_train = idx_X_train
    idx_y_test = idx_X_test
    
    classificators_format = ['mlknn', 'lp_rfc', 'br_rfc']
    for classification_type in classification_types:
        #print("classification_type", classification_type)
        test_results_score={}
        test_results_metaheuristic={}
        test_results={}
        for fs in ft_select:
            for ft in ft_type:  
                df_all_input, df_all_output = read_dataset(feat_select=fs, feat_type=ft, scale="standard",
                                                           feature_path=feature_path, ranks_path=ranks_path)
                #print(df_all_output.sum(axis=0))
                index = np.random.permutation(len(df_all_input))
                X_train = df_all_input.iloc[idx_X_train]
                X_test = df_all_input.iloc[idx_X_test]
                
                y_train = df_all_output.iloc[idx_y_train]
                y_test = df_all_output.iloc[idx_y_test]
                
                train_scaler = MinMaxScaler()
                X_train = train_scaler.fit_transform(X_train)
                X_test = train_scaler.fit_transform(X_test)

                from skmultilearn.problem_transform import BinaryRelevance
                from sklearn.naive_bayes import GaussianNB
                
                
                # initialize Binary Relevance multi-label classifier
                # with a gaussian naive bayes base classifier
                classifier = BinaryRelevance(
                    classifier = GaussianNB(),
                    require_dense = [True, True]
                )
                # train
                classifier.fit(X_train, y_train)
                # predict
                predictions = classifier.predict(X_test)
                
                from sklearn.metrics import accuracy_score, f1_score, recall_score
                print(f1_score(y_test, predictions, average='weighted'), accuracy_score(y_test, predictions), recall_score(y_test, predictions, average='weighted'))

                from skmultilearn.problem_transform import ClassifierChain


                # initialize Classifier Chain multi-label classifier
                # with a gaussian naive bayes base classifier
                classifier = ClassifierChain(
                    classifier = GaussianNB(),
                    require_dense = [True, True]
                )
                # train
                classifier.fit(X_train, y_train)
                # predict
                predictions = classifier.predict(X_test)
                print(f1_score(y_test, predictions, average='weighted'), accuracy_score(y_test, predictions), recall_score(y_test, predictions, average='weighted'))

                from scipy import sparse

                ly_train=sparse.lil_matrix((y_train.shape[0],y_train.shape[1]))
                for i in range(y_train.shape[0]):
                    for j in range(len(y_train.iloc[i])):
                        if y_train.iloc[i,j] == 1:
                            ly_train[i,j]=1

                ##########NetworkX - nu merge doar daca numarul de noduri din clustere este egal
               
                # graph_builder = LabelCooccurrenceGraphBuilder(weighted=True,
                #                                               include_self_edges=False)
                
                # label_names=target_names#[i for i in range(nlabel)]
                # edge_map = graph_builder.transform(ly_train)#np.array(y_train))
                # print("{} labels, {} edges".format(len(label_names), len(edge_map)))

                # clusterer = NetworkXLabelGraphClusterer(graph_builder, method='louvain')

                # print(np.array(X_train).shape, np.array(y_train).shape, ly_train.shape[1],type(ly_train))
                # # for el in ly_train:
                # #     print("!!!!!!!!! shape", el.shape[1]) 
                # #partition = clusterer.fit_predict(_,ly_train[:4])

                # y=ly_train
                # edge_map = graph_builder.transform(y)

                # if graph_builder.is_weighted:
                #     weights_ = dict(weight=list(edge_map.values()))
                # else:
                #     weights_ = dict(weight=None)

                # import community
                # import networkx as nx
                # from networkx.algorithms.community import asyn_lpa_communities

                # graph_ = nx.Graph()
                # for n in range(y.shape[1]):
                #     graph_.add_node(n)
        
                # for e, w in edge_map.items():
                #     graph_.add_edge(e[0], e[1], weight=w)

                # ret_list_of_members=[]
                # if True:#self.method == 'louvain':
                #     partition_dict = community.best_partition(graph_)
                #     memberships = [partition_dict[i] for i in range(y.shape[1])]
                #     print('memberships',memberships)
                #     list_of_members = [[] for _ in range(1 + max(memberships))]
                #     for vertex_id, community_id in enumerate(memberships):
                #         list_of_members[community_id].append(vertex_id)
                #     print(list_of_members)
                #     ret_list_of_members = list_of_members
                
                # else:
                #     ret_list_of_members = [list(i) for i in asyn_lpa_communities(graph_, 'weight')]

                # membership_vector = to_membership_vector(ret_list_of_members)
                # print('There are', len(ret_list_of_members),'clusters')

                # import networkx as nx
                # names_dict = dict(enumerate(x for x in label_names))
                # import matplotlib.pyplot as plt
                # %matplotlib inline
                # nx.draw(
                #     graph_,
                #     pos=nx.spring_layout(graph_,k=4),
                #     labels=names_dict,
                #     with_labels = True,
                #     width = [10*x/y_train.shape[0] for x in edge_map.values()],
                #     node_color = [membership_vector[i] for i in range(y_train.shape[1])],
                #     cmap=plt.cm.viridis,
                #     node_size=550,
                #     font_size=6,
                #     font_color='red',
                #     alpha=0.8
                # )

                ###############iGraph
                # from skmultilearn.cluster import IGraphLabelGraphClusterer
                # import igraph as ig
                # graph_builder = LabelCooccurrenceGraphBuilder(weighted=True,
                #                                                include_self_edges=False)
                
                # label_names=target_names#[i for i in range(nlabel)]
                # edge_map = graph_builder.transform(ly_train)#np.array(y_train))
                # print("{} labels, {} edges".format(len(label_names), len(edge_map)))

                # clusterer_igraph = IGraphLabelGraphClusterer(graph_builder=graph_builder, method='walktrap')
                # partition = clusterer_igraph.fit_predict(X_train, ly_train)
                # partition

                ################Stochastic Blockmodel from graph-tool
                # import graph_tool - dificil de instalat pe windows
                # from skmultilearn.cluster.graphtool import GraphToolLabelGraphClusterer, StochasticBlockModel
                # model = StochasticBlockModel(nested=False, use_degree_correlation=True, allow_overlap=False, weight_model='real-normal')
                # clusterer_graphtool = GraphToolLabelGraphClusterer(graph_builder=graph_builder, model=model)
                # clusterer_graphtool.fit_predict(None, ly_train)
                import sys
                import joblib
                #import openne
                sys.modules['sklearn.externals.joblib'] = joblib
                
                #from skmultilearn.embedding import OpenNetworkEmbedder
                from skmultilearn.cluster import LabelCooccurrenceGraphBuilder
                graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
                # openne_line_params = dict(batch_size=1000, order=3)
                # embedder = OpenNetworkEmbedder(
                #     graph_builder,
                #     'LINE',
                #     dimension = 5*ly_train.shape[1],
                #     aggregation_function = 'add',
                #     normalize_weights=True,
                #     param_dict = openne_line_params
                # )
                # from skmultilearn.embedding import EmbeddingClassifier
                # from sklearn.ensemble import RandomForestRegressor
                # from skmultilearn.adapt import MLkNN
                # clf = EmbeddingClassifier(
                #     embedder,
                #     RandomForestRegressor(n_estimators=10),
                #     MLkNN(k=5)
                # )
                
                # clf.fit(X_train, y_train)
                
                # predictions = clf.predict(X_test)
                # print(predictions)
                # from skmultilearn.embedding import CLEMS, EmbeddingClassifier
                # from sklearn.ensemble import RandomForestRegressor
                # from skmultilearn.adapt import MLkNN
                
                # dimensional_scaler_params = {'n_jobs': -1}
                
                # clf = EmbeddingClassifier(
                #     CLEMS(metrics.jaccard_similarity_score, is_score=True, params=dimensional_scaler_params),
                #     RandomForestRegressor(n_estimators=10, n_jobs=-1),
                #     MLkNN(k=1),
                #     regressor_per_dimension= True
                # )
                
                # clf.fit(X_train, y_train)
                
                # predictions = clf.predict(X_test)
                from skmultilearn.embedding import SKLearnEmbedder, EmbeddingClassifier
                from sklearn.manifold import SpectralEmbedding
                from sklearn.ensemble import RandomForestRegressor
                from skmultilearn.adapt import MLkNN

                from skmultilearn.ensemble import MajorityVotingClassifier
                from skmultilearn.cluster import FixedLabelSpaceClusterer
                from skmultilearn.problem_transform import ClassifierChain
                from sklearn.naive_bayes import GaussianNB
                
                classifier = MajorityVotingClassifier(
                    clusterer = FixedLabelSpaceClusterer(clusters = [[1,3,4], [0, 2, 5]]),
                    classifier = ClassifierChain(classifier=GaussianNB())
                )
                classifier.fit(X_train,y_train)
                predictions = classifier.predict(X_test)
                
                clf = EmbeddingClassifier(
                    SKLearnEmbedder(SpectralEmbedding(n_components = 10)),
                    RandomForestRegressor(n_estimators=10),
                    MLkNN(k=5)
                )
                
                clf.fit(X_train, y_train)
                
                predictions = clf.predict(X_test)

                  
ft_type=['ST-F']#['HG']#'OG', 'OGQ', 'OGSM', 'OGET']#,'DG','HG']
classification_types=['br_rfc']
ft_select=['LR40']

dir_path = '../datasets/results/classification/ianuarie2025/'
classification_graph(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

0.3760781482913213 0.0 0.8026315789473685
0.3866226627924744 0.0 0.8289473684210527


ModuleNotFoundError: No module named 'openne'

In [15]:
from skmultilearn.dataset import load_dataset
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder
from skmultilearn.cluster import NetworkXLabelGraphClusterer


X_train, y_train, feature_names, label_names = load_dataset('emotions', 'train')
X_test, y_test, _, _ = load_dataset('emotions', 'test')
print(X_test.shape, y_test.shape, label_names)
graph_builder = LabelCooccurrenceGraphBuilder(weighted=True,
                                              include_self_edges=False)

edge_map = graph_builder.transform(y_train)
print("{} labels, {} edges".format(len(label_names), len(edge_map)))
print(X_test[0].shape, y_test.shape, "y_test", y_test[10])

# print(type(y_test))
# for el in y_test:
#     print("!!!!!!!!! shape", el.shape, el) 

# we define a helper function for visualization purposes
def to_membership_vector(partition):
    return {
        member :  partition_id
        for partition_id, members in enumerate(partition)
        for member in members
    }
clusterer = NetworkXLabelGraphClusterer(graph_builder, method='louvain')
partition = clusterer.fit_predict(_,y_train[:6])
membership_vector = to_membership_vector(partition)
print('There are', len(partition),'clusters')

emotions:train - exists, not redownloading
emotions:test - exists, not redownloading
(202, 72) (202, 6) [('amazed-suprised', ['0', '1']), ('happy-pleased', ['0', '1']), ('relaxing-calm', ['0', '1']), ('quiet-still', ['0', '1']), ('sad-lonely', ['0', '1']), ('angry-aggresive', ['0', '1'])]
6 labels, 14 edges
(1, 72) (202, 6) y_test   (0, 1)	1
  (0, 2)	1
  (0, 3)	1


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (4,) + inhomogeneous part.

In [69]:
# Bayesian optimization for all representations, feature types and regression models
ft_type=['OG-F']#['HG']#'OG', 'OGQ', 'OGSM', 'OGET']#,'DG','HG']

ft_select=['all','LR10', 'LR20', 'LR30', 
    'LR40', 'RF', 'PCA99','PCA999','PCA9999','FS_S','FS_P','FS_K']

dir_path = '../datasets/results/classification/ianuarie2025/'
#https://towardsdatascience.com/journey-to-the-center-of-multi-label-classification-384c40229bff

In [70]:
classification_types = ['rfr','gbr']
ft_type=['ST-F']

classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

UnboundLocalError: cannot access local variable 'classification_model' where it is not associated with a value

In [127]:
classification_types = ['rfr','svr','gbr']
#'ft_select=['LR10']

classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

******* Feature select: all  Feature type: OG-F Classification model: rfr ******
Accurecy of the best estimator (training dataset):  0.6452991452991453
Accuracy Best score (validation) val. score :  0.20536540240518036
Hamming loss: 0.1431451612903226
Accuracy of the best estimator (testing dataset):  0.11290322580645161
******* Feature select: LR10  Feature type: OG-F Classification model: rfr ******
Accurecy of the best estimator (training dataset):  0.5555555555555556
Accuracy Best score (validation) val. score :  0.17530064754856614
Hamming loss: 0.17943548387096775
Accuracy of the best estimator (testing dataset):  0.06451612903225806
******* Feature select: LR20  Feature type: OG-F Classification model: rfr ******
Accurecy of the best estimator (training dataset):  0.5299145299145299
Accuracy Best score (validation) val. score :  0.1752081406105458
Hamming loss: 0.1774193548387097
Accuracy of the best estimator (testing dataset):  0.1774193548387097
******* Feature select: LR30  

In [128]:
classification_types = ['br_rfc']
ft_select=['LR10']
target_names = ['TS-Ei', 'SA-Ei', 'TSL-Ei', 'SAL-Ei', 'TS-Si', 'SA-Si', 'TSL-Si', 'SAL-Si', ]

classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

******* Feature select: LR10  Feature type: OG-F Classification model: br_rfc ******
Accurecy of the best estimator (training dataset):  0.5341880341880342
Accuracy Best score (validation) val. score :  0.1624421831637373
Hamming loss: 0.16532258064516128
Accuracy of the best estimator (testing dataset):  0.06451612903225806


In [103]:
classification_types = ['lp_rfc']
ft_select=['LR10']
target_names = ['TS-Ei', 'SA-Ei', 'TSL-Ei', 'SAL-Ei', 'TS-Si', 'SA-Si', 'TSL-Si', 'SAL-Si', ]

classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

classification_type lp_rfc
TS-Ei-LM      21
SA-Ei-LM      87
TSL-Ei-LM     45
SAL-Ei-LM    113
TS-Si-LM      27
SA-Si-LM      29
TSL-Si-LM     31
SAL-Si-LM     31
dtype: int64
******* Feature select: LR10  Feature type: HG-F Classification model: lp_rfc ******
Accurecy of the best estimator (training dataset):  0.5384615384615384
Accuracy Best score (validation) val. score :  0.2605920444033303
Hamming loss: 0.18346774193548387
Accuracy of the best estimator (testing dataset):  0.25806451612903225
              precision    recall  f1-score   support

       TS-Ei       0.33      0.33      0.33         3
       SA-Ei       0.75      0.27      0.40        22
      TSL-Ei       0.25      0.14      0.18         7
      SAL-Ei       0.37      0.76      0.50        21
       TS-Si       0.00      0.00      0.00         6
       SA-Si       0.00      0.00      0.00         8
      TSL-Si       0.33      0.33      0.33         3
      SAL-Si       0.00      0.00      0.00         6

   micro 

In [70]:
classification_types = ['mcc']
ft_select=['LR10']
target_names = ['TS-Ei', 'SA-Ei', 'TSL-Ei', 'SAL-Ei', 'TS-Si', 'SA-Si', 'TSL-Si', 'SAL-Si', ]

classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

classification_type mcc
TS-Ei-LM      21
SA-Ei-LM      87
TSL-Ei-LM     45
SAL-Ei-LM    113
TS-Si-LM      27
SA-Si-LM      29
TSL-Si-LM     31
SAL-Si-LM     31
dtype: int64
******* Feature select: LR10  Feature type: HG-F Classification model: mcc ******
Accurecy of the best estimator (training dataset):  0.1282051282051282
Accuracy Best score (validation) val. score :  0.10666049953746529
Hamming loss: 0.16532258064516128
Accuracy of the best estimator (testing dataset):  0.11290322580645161
              precision    recall  f1-score   support

       TS-Ei       0.00      0.00      0.00         3
       SA-Ei       0.67      0.09      0.16        22
      TSL-Ei       0.50      0.14      0.22         7
      SAL-Ei       0.38      0.52      0.44        21
       TS-Si       0.00      0.00      0.00         6
       SA-Si       0.00      0.00      0.00         8
      TSL-Si       0.00      0.00      0.00         3
      SAL-Si       0.00      0.00      0.00         6

   micro avg  

In [111]:
classification_types = ['mcc_rfc']
ft_select=['LR10']
target_names = ['TS-Ei', 'SA-Ei', 'TSL-Ei', 'SAL-Ei', 'TS-Si', 'SA-Si', 'TSL-Si', 'SAL-Si', ]

classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

classification_type mcc_rfc
TS-Ei-LM      21
SA-Ei-LM      87
TSL-Ei-LM     45
SAL-Ei-LM    113
TS-Si-LM      27
SA-Si-LM      29
TSL-Si-LM     31
SAL-Si-LM     31
dtype: int64
******* Feature select: LR10  Feature type: HG-F Classification model: mcc_rfc ******
Accurecy of the best estimator (training dataset):  0.5085470085470085
Accuracy Best score (validation) val. score :  0.18362627197039777
Hamming loss: 0.1532258064516129
Accuracy of the best estimator (testing dataset):  0.11290322580645161
              precision    recall  f1-score   support

       TS-Ei       0.00      0.00      0.00         3
       SA-Ei       0.82      0.41      0.55        22
      TSL-Ei       0.00      0.00      0.00         7
      SAL-Ei       0.39      0.52      0.45        21
       TS-Si       0.00      0.00      0.00         6
       SA-Si       0.00      0.00      0.00         8
      TSL-Si       0.00      0.00      0.00         3
      SAL-Si       0.00      0.00      0.00         6

   micr

In [116]:
classification_types = ['mlknn']
ft_select=['LR10']
target_names = ['TS-Ei', 'SA-Ei', 'TSL-Ei', 'SAL-Ei', 'TS-Si', 'SA-Si', 'TSL-Si', 'SAL-Si', ]

classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

classification_type mlknn
TS-Ei-LM      21
SA-Ei-LM      87
TSL-Ei-LM     45
SAL-Ei-LM    113
TS-Si-LM      27
SA-Si-LM      29
TSL-Si-LM     31
SAL-Si-LM     31
dtype: int64


AttributeError: 'DataFrame' object has no attribute 'getformat'

In [45]:
classification_types = ['rfr']
ft_select=['LR10']
target_names = ['TS-Ei', 'SA-Ei', 'TSL-Ei', 'SAL-Ei', 'TS-Si', 'SA-Si', 'TSL-Si', 'SAL-Si', ]

classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

classification_type rfr
******* Feature select: LR10  Feature type: HG-F Classification model: rfr ******
Accurecy of the best estimator (training dataset):  0.49572649572649574
Accuracy Best score (validation) val. score :  0.17086031452358927
Hamming loss: 0.14717741935483872
Accuracy of the best estimator (testing dataset):  0.11290322580645161
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.89      0.36      0.52        22
           2       0.00      0.00      0.00         7
           3       0.48      0.52      0.50        21
           4       0.00      0.00      0.00         6
           5       0.00      0.00      0.00         8
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         6

   micro avg       0.54      0.25      0.34        76
   macro avg       0.17      0.11      0.13        76
weighted avg       0.39      0.25      0.29        76


In [29]:

classification_types = ['gbr','rfr','svr']
#'ft_select=['LR10']

classification_tuning(dir_path, ft_type, ft_select, classification_types, 
                  feature_path='../datasets/results/features/ianuarie2025-v2/',
                  ranks_path='../datasets/results/ranks/runTime_1min_sa_ts_multilabel_filtered')

classification_type gbr
******* Feature select: all  Feature type: HG-F Classification model: gbr ******
Accurecy of the best estimator (training dataset):  0.9401709401709402
Accuracy Best score (validation) val. score :  0.2650323774283071
Hamming loss: 0.15725806451612903
Accuracy of the best estimator (testing dataset):  0.12903225806451613
Selected metaheuristics: ['TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'SA-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS-Ei-LM', 'TS