In [1]:
import pandas as pd
import numpy as np
import missingno as msno
from sklearn import preprocessing

# Loading Metafeatures

Here we will load the metafeatures we previously extracted using the PyMFE and ImbCoL libraries.

## Load PyMFE metafeatures

In [2]:
# Read the PyMFE meta-features and give the unnamed CSV column ('Unnamed: 0') a usable name.
pymfe_mf = (
    pd.read_csv('./Meta_features_extraction/pymfe_meta_features.csv')
    .rename(columns={'Unnamed: 0': 'ds_name2'})
)

# Turn the zero-based value into a dataset label: 0 -> D1, 1 -> D2, ..., 299 -> D300.
pymfe_mf['ds_name2'] = pymfe_mf['ds_name2'].map(lambda position: 'D{}'.format(position + 1))

In [3]:
#pymfe_mf

## Loading ImbCoL meta-features

In [4]:
# Read the previously extracted ImbCoL meta-features from CSV.
imbcol_mf = pd.read_csv('./Meta_features_extraction/imbcol_meta_features.csv')

In [5]:
#imbcol_mf

# Merging meta-features

In [6]:
# The two tables are placed side by side and paired by row position, so a row-count
# mismatch would silently produce NaN-padded rows. Fail loudly instead.
# NOTE: only the number of rows is checked here, not that both files list the
# datasets in the same order.
if len(pymfe_mf) != len(imbcol_mf):
    raise ValueError(
        f'Cannot merge meta-features by position: PyMFE has {len(pymfe_mf)} rows '
        f'but ImbCoL has {len(imbcol_mf)}.'
    )
mf_full = pd.concat([pymfe_mf, imbcol_mf], axis=1)

In [7]:
# Inspect the column at position 113: it is a second dataset-name column next to 'ds_name2'
# (presumably contributed by the ImbCoL table -- confirm against the CSV).
mf_full.columns[113]

'ds_name'

In [8]:
# Remove the repeated 'ds_name' column (position 113); 'ds_name2' is kept as the dataset label.
mf_full = mf_full.drop(columns=['ds_name'])

In [9]:
# Give the remaining dataset-label column its final name, then preview the table.
mf_full = mf_full.rename({'ds_name2': 'ds_name'}, axis='columns')
mf_full

Unnamed: 0,ds_name,attr_conc.mean,attr_ent.mean,attr_to_inst,best_node.mean,best_node.mean.relative,c1,c2,can_cor.mean,cat_to_num,...,neighborhood.N4_partial.0,neighborhood.N4_partial.1,neighborhood.T1_partial.0,neighborhood.T1_partial.1,linearity.class.L1_partial.0,linearity.class.L1_partial.1,linearity.class.L2_partial.0,linearity.class.L2_partial.1,linearity.class.L3_partial.0,linearity.class.L3_partial.1
0,D1,0.019398,2.584913,0.066445,0.494839,5.0,0.999928,0.000199,0.202900,0.0,...,0.033223,0.039867,1.000000,1.000000,0.166374,0.188615,0.427632,0.476510,0.365449,0.448505
1,D2,0.037477,2.454799,0.034632,0.558514,3.0,0.997714,0.006314,0.442222,0.0,...,0.129870,0.186147,0.885246,0.908257,0.144262,0.208822,0.229508,0.403670,0.203463,0.359307
2,D3,0.018884,2.584927,0.062696,0.448185,4.0,0.999823,0.000491,0.193009,0.0,...,0.059561,0.031348,1.000000,1.000000,0.200284,0.171473,0.445860,0.450617,0.463950,0.470219
3,D4,0.022152,2.584918,0.066445,0.571290,6.0,0.999992,0.000022,0.261264,0.0,...,0.033223,0.046512,1.000000,1.000000,0.191736,0.168479,0.377483,0.393333,0.335548,0.325581
4,D5,0.018890,2.584963,0.066667,0.550000,7.0,0.997402,0.007174,0.209018,0.0,...,0.023333,0.040000,1.000000,1.000000,0.216616,0.161922,0.439716,0.339623,0.423333,0.330000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,D296,0.039737,2.453230,0.034188,0.594565,5.0,0.996625,0.009307,0.425928,0.0,...,0.192308,0.123932,0.856000,0.889908,0.124541,0.203828,0.256000,0.467890,0.196581,0.414530
296,D297,0.044615,2.427931,0.034632,0.631522,4.0,0.999986,0.000037,0.468845,0.0,...,0.155844,0.142857,0.834783,0.862069,0.159092,0.157423,0.260870,0.336207,0.199134,0.285714
297,D298,0.019225,2.584904,0.066445,0.597957,6.0,0.996486,0.009688,0.391580,0.0,...,0.039867,0.063123,1.000000,1.000000,0.173652,0.170869,0.357143,0.354037,0.265781,0.332226
298,D299,0.017539,2.584963,0.066667,0.690000,5.0,1.000000,0.000000,0.187479,0.0,...,0.013333,0.070000,1.000000,1.000000,0.097212,0.293950,0.280000,0.493333,0.250000,0.496667


# Loading ST Performances

Here we will load the results from the classification performance over the 300 datasets using different classification algorithms and six different scaling techniques (STs).

In [10]:
# Read the per-fold and aggregated classification results (one row per dataset x ST x model).
performances = pd.read_csv('../results/csv_tabs/results_ST_perfs.csv')
#performances

In [11]:
# Sanity check: (rows, columns) of the performance table.
print(performances.shape)

(21600, 38)


In [12]:
# Expected number of rows: 300 datasets x 6 STs x 12 models.
# This should match the first element of performances.shape printed above.
300*6*12

21600

In [13]:
# Align the key column with the meta-feature table: 'Dataset' becomes 'ds_name',
# and numeric ids become labels (1.0 -> D1, 2.0 -> D2, and so on).
performances = performances.rename({'Dataset': 'ds_name'}, axis='columns')
performances['ds_name'] = performances['ds_name'].map(lambda ds_id: 'D' + str(int(ds_id)))
performances

Unnamed: 0,ds_name,Scaling technique,Model,acc_fold1,acc_fold2,acc_fold3,acc_fold4,acc_fold5,acc_mean,acc_stddev,...,f1_fold5,f1_mean,f1_stddev,gmean_fold1,gmean_fold2,gmean_fold3,gmean_fold4,gmean_fold5,gmean_mean,gmean_stddev
0,D1,NS,Bagging,0.508197,0.533333,0.500000,0.516667,0.500000,0.511639,0.012480,...,0.516129,0.440079,0.162779,0.249731,0.524166,0.489898,0.514242,0.498888,0.455385,0.103510
1,D1,SS,Bagging,0.442623,0.516667,0.516667,0.466667,0.500000,0.488525,0.029327,...,0.545455,0.466025,0.047304,0.439941,0.494688,0.514242,0.465475,0.489898,0.480849,0.025684
2,D1,MMS,Bagging,0.475410,0.550000,0.533333,0.500000,0.416667,0.495082,0.046988,...,0.363636,0.394689,0.055504,0.451997,0.505806,0.523874,0.422953,0.408248,0.462576,0.045299
3,D1,MAS,Bagging,0.508197,0.516667,0.516667,0.550000,0.483333,0.514973,0.021352,...,0.311111,0.303649,0.133215,0.401610,0.182676,0.494413,0.505525,0.413656,0.399576,0.116161
4,D1,RS,Bagging,0.459016,0.516667,0.500000,0.366667,0.500000,0.468470,0.054341,...,0.531250,0.449671,0.094074,0.459078,0.510185,0.498888,0.341565,0.495536,0.461050,0.062154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21595,D300,SS,SVM_lin,0.475410,0.557377,0.450000,0.466667,0.533333,0.496557,0.041376,...,0.416667,0.347853,0.063527,0.456937,0.478091,0.352753,0.425436,0.498867,0.442417,0.050987
21596,D300,MMS,SVM_lin,0.508197,0.573770,0.483333,0.483333,0.483333,0.506393,0.035038,...,0.060606,0.121659,0.070488,0.344502,0.193294,0.177972,0.336336,0.177972,0.246015,0.077326
21597,D300,MAS,SVM_lin,0.573770,0.573770,0.566667,0.566667,0.566667,0.569508,0.003480,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
21598,D300,RS,SVM_lin,0.459016,0.540984,0.450000,0.466667,0.533333,0.490000,0.038940,...,0.416667,0.326279,0.072364,0.433488,0.447214,0.322603,0.425436,0.498867,0.425522,0.057456


# Building the target column ('Best_ST')

Let's focus on a single performance metric: F1.

In [14]:
# Keep only the identifying columns plus the mean F1 score.
scaling_tech_performances = performances.loc[
    :, ['ds_name', 'Scaling technique', 'Model', 'f1_mean']
]
scaling_tech_performances

Unnamed: 0,ds_name,Scaling technique,Model,f1_mean
0,D1,NS,Bagging,0.440079
1,D1,SS,Bagging,0.466025
2,D1,MMS,Bagging,0.394689
3,D1,MAS,Bagging,0.303649
4,D1,RS,Bagging,0.449671
...,...,...,...,...
21595,D300,SS,SVM_lin,0.347853
21596,D300,MMS,SVM_lin,0.121659
21597,D300,MAS,SVM_lin,0.000000
21598,D300,RS,SVM_lin,0.326279


In [15]:
# Distinct datasets, scaling techniques and models, in order of first appearance.
ds_names = scaling_tech_performances['ds_name'].unique()
st_names = scaling_tech_performances['Scaling technique'].unique()
model_names = scaling_tech_performances['Model'].unique()

print(f'Lenght of ds_names: {len(ds_names)}')
print(f'Lenght of st_names: {len(st_names)}')
print(f'Lenght of model_names: {len(model_names)}')

Lenght of ds_names: 300
Lenght of st_names: 6
Lenght of model_names: 12


In [16]:
# Show the distinct scaling-technique codes found in the performance table.
st_names

array(['NS', 'SS', 'MMS', 'MAS', 'RS', 'QT'], dtype=object)

In [17]:
# Worked example: mean F1 of one (dataset, model, scaling technique) combination.
model = 'SVM_lin'
dataset = 'D1'

# Narrow down step by step: first by dataset, then by model.
perform_filtered_by_ds = scaling_tech_performances.loc[
    scaling_tech_performances['ds_name'] == dataset
]
perform_filtered_by_ds_and_model = perform_filtered_by_ds.loc[
    perform_filtered_by_ds['Model'] == model
]

# Boolean mask selecting the row of the scaling technique of interest.
row_for_this_st = perform_filtered_by_ds_and_model['Scaling technique'] == 'RS'
perform_filtered_by_ds_and_model.loc[row_for_this_st, 'f1_mean'].iloc[0]


0.4636285425101215

In [18]:
# Build one row per (model, dataset) pair holding:
#   - the mean F1 reached with each scaling technique (one column per ST),
#   - the best mean F1 ('Max_F1_perf'),
#   - every ST attaining that best value ('Best_STs', a NumPy array),
#   - a single target label ('Best_ST').
# The ST columns are derived from st_names instead of being hard-coded, so the
# dictionary keys can never disagree with the techniques present in the data.
performances_per_model_and_ds_dict = {
    'Model': [], 'Dataset': [],
    **{st: [] for st in st_names},
    'Max_F1_perf': [], 'Best_STs': [], 'Best_ST': [],
}

# Split the table once by (model, dataset) instead of re-filtering every row of
# the full frame for each of the model x dataset pairs.
perf_by_model_and_ds = {
    key: group
    for key, group in scaling_tech_performances.groupby(['Model', 'ds_name'], sort=False)
}

# Same iteration order as before: all datasets for the first model, then the next model, ...
for model in model_names:
    for dataset in ds_names:
        pair_perfs = perf_by_model_and_ds.get((model, dataset))
        if pair_perfs is None:
            raise ValueError(
                f'No performance rows found for model {model!r} and dataset {dataset!r}.'
            )

        max_perf = pair_perfs['f1_mean'].max()
        # There may be more than one scaling technique attaining max_perf.
        # For now, just select the first one. TODO: REVISIT this later.
        best_sts = pair_perfs.loc[pair_perfs['f1_mean'] == max_perf, 'Scaling technique'].values
        if len(best_sts) == 0:
            # Happens when every f1_mean of the pair is NaN (max_perf is NaN too).
            raise ValueError(
                f'No valid f1_mean for model {model!r} and dataset {dataset!r}.'
            )
        best_st = best_sts[0]

        # First recorded f1_mean per scaling technique for this pair.
        f1_by_st = (
            pair_perfs.drop_duplicates(subset='Scaling technique')
            .set_index('Scaling technique')['f1_mean']
        )
        missing_sts = [st for st in st_names if st not in f1_by_st.index]
        if missing_sts:
            raise ValueError(
                f'Model {model!r} / dataset {dataset!r} has no result for: {missing_sts}.'
            )

        performances_per_model_and_ds_dict['Model'].append(model)
        performances_per_model_and_ds_dict['Dataset'].append(dataset)
        performances_per_model_and_ds_dict['Max_F1_perf'].append(max_perf)
        performances_per_model_and_ds_dict['Best_STs'].append(best_sts)
        performances_per_model_and_ds_dict['Best_ST'].append(best_st)
        for st in st_names:
            performances_per_model_and_ds_dict[st].append(f1_by_st.loc[st])

performances_per_model_and_ds = pd.DataFrame(performances_per_model_and_ds_dict)
performances_per_model_and_ds

Unnamed: 0,Model,Dataset,NS,SS,MMS,MAS,RS,QT,Max_F1_perf,Best_STs,Best_ST
0,Bagging,D1,0.440079,0.466025,0.394689,0.303649,0.449671,0.451235,0.466025,[SS],SS
1,Bagging,D2,0.443400,0.627831,0.612080,0.614462,0.620108,0.625778,0.627831,[SS],SS
2,Bagging,D3,0.450370,0.397866,0.366334,0.417601,0.360426,0.400698,0.450370,[NS],NS
3,Bagging,D4,0.266380,0.481713,0.464659,0.477194,0.463736,0.500806,0.500806,[QT],QT
4,Bagging,D5,0.584809,0.500638,0.459579,0.498007,0.510122,0.514078,0.584809,[NS],NS
...,...,...,...,...,...,...,...,...,...,...,...
3595,SVM_lin,D296,0.529007,0.533769,0.543091,0.532463,0.533769,0.541710,0.543091,[MMS],MMS
3596,SVM_lin,D297,0.680718,0.655115,0.647179,0.649608,0.654405,0.663372,0.680718,[NS],NS
3597,SVM_lin,D298,0.616324,0.579542,0.604518,0.636174,0.593537,0.581481,0.636174,[MAS],MAS
3598,SVM_lin,D299,0.357417,0.387720,0.374748,0.404737,0.367981,0.394571,0.404737,[MAS],MAS


# Merging data

In [19]:
# Attach each dataset's meta-features to every (model, dataset) performance row.
merged_dfs = []
pfd_cols = performances_per_model_and_ds.columns
for name in ds_names:
    # Performance rows of this dataset, re-indexed from 0 so they line up with mf_df below.
    perf_filt_by_ds = (
        performances_per_model_and_ds[performances_per_model_and_ds['Dataset'] == name]
        .reset_index(drop=True)
    )
    length = len(perf_filt_by_ds)

    # The side-by-side concat below pairs rows by position, which is only correct when
    # the dataset has exactly one meta-feature row; otherwise it would silently
    # produce NaN-filled or misaligned rows.
    mf_rows = mf_full[mf_full['ds_name'] == name]
    if len(mf_rows) != 1:
        raise ValueError(
            f'Expected exactly one meta-feature row for dataset {name!r}, found {len(mf_rows)}.'
        )
    # Repeat the single meta-feature row once per performance row.
    mf_df = pd.concat([mf_rows] * length, ignore_index=True)

    # Insert mf_df after the first two columns ('Model', 'Dataset') of perf_filt_by_ds.
    merged_df = pd.concat(
        [perf_filt_by_ds[pfd_cols[:2]], mf_df, perf_filt_by_ds[pfd_cols[2:]]],
        axis=1,
    )
    merged_dfs.append(merged_df)

full_df = pd.concat(merged_dfs, ignore_index=True)

In [20]:
# 'ds_name' duplicates the 'Dataset' column after the merge, so drop it.
full_df = full_df.drop('ds_name', axis='columns')

In [21]:
# Spot check: all rows of a single model.
full_df.loc[full_df['Model'].eq('SVM_RBF')]

Unnamed: 0,Model,Dataset,attr_conc.mean,attr_ent.mean,attr_to_inst,best_node.mean,best_node.mean.relative,c1,c2,can_cor.mean,...,linearity.class.L3_partial.1,NS,SS,MMS,MAS,RS,QT,Max_F1_perf,Best_STs,Best_ST
10,SVM_RBF,D1,0.019398,2.584913,0.066445,0.494839,5.0,0.999928,0.000199,0.202900,...,0.448505,0.448571,0.445105,0.453572,0.451694,0.448148,0.409987,0.453572,[MMS],MMS
22,SVM_RBF,D2,0.037477,2.454799,0.034632,0.558514,3.0,0.997714,0.006314,0.442222,...,0.359307,0.602960,0.587516,0.624688,0.620223,0.599870,0.605445,0.624688,[MMS],MMS
34,SVM_RBF,D3,0.018884,2.584927,0.062696,0.448185,4.0,0.999823,0.000491,0.193009,...,0.470219,0.491877,0.428666,0.451279,0.478030,0.423347,0.422379,0.491877,[NS],NS
46,SVM_RBF,D4,0.022152,2.584918,0.066445,0.571290,6.0,0.999992,0.000022,0.261264,...,0.325581,0.125000,0.437898,0.448221,0.479342,0.435538,0.446035,0.479342,[MAS],MAS
58,SVM_RBF,D5,0.018890,2.584963,0.066667,0.550000,7.0,0.997402,0.007174,0.209018,...,0.330000,0.515542,0.566138,0.535946,0.530794,0.577659,0.543525,0.577659,[RS],RS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3550,SVM_RBF,D296,0.039737,2.453230,0.034188,0.594565,5.0,0.996625,0.009307,0.425928,...,0.414530,0.639417,0.534791,0.562115,0.558278,0.536777,0.570312,0.639417,[NS],NS
3562,SVM_RBF,D297,0.044615,2.427931,0.034632,0.631522,4.0,0.999986,0.000037,0.468845,...,0.285714,0.600483,0.684449,0.640281,0.641829,0.662535,0.664578,0.684449,[SS],SS
3574,SVM_RBF,D298,0.019225,2.584904,0.066445,0.597957,6.0,0.996486,0.009688,0.391580,...,0.332226,0.696947,0.596224,0.605506,0.612349,0.604742,0.583183,0.696947,[NS],NS
3586,SVM_RBF,D299,0.017539,2.584963,0.066667,0.690000,5.0,1.000000,0.000000,0.187479,...,0.496667,0.403745,0.631493,0.622763,0.630959,0.812755,0.547608,0.812755,[RS],RS


# Dataset cleanup

Some metafeatures could not be calculated for most datasets; let's remove these metafeatures.

In [22]:
# Total number of missing values across the whole DataFrame
# (per-column counts first, then their sum).
full_df.isna().sum().sum()

3684

In [23]:
# Count the missing values of every column in one pass, then report the affected columns.
nan_counts = full_df.isnull().sum()

cols_with_nans = []
for col, nan_qty in nan_counts.items():
    if not nan_qty > 0:
        continue
    print(f'There are {nan_qty} NaNs in column {col}.')
    cols_with_nans.append(col)

print(f'****\nA total of {len(cols_with_nans)} columns contain NaNs.')

There are 3600 NaNs in column num_to_cat.
There are 84 NaNs in column sd_ratio.
****
A total of 2 columns contain NaNs.


In [24]:
# (rows, columns) of the merged table, for comparison with the NaN counts above.
full_df.shape

(3600, 145)

Since only the column 'num_to_cat' contains a significant number of NaNs, we are going to remove only this column. 
The column sd_ratio contains only 84 NaNs, therefore we will keep it and then deal with it later, during the experiment (probably KNN imputation).

Note: 'num_to_cat' is the ratio of the number of numerical features to the number of categorical features. Since most of our datasets contain no categorical features, computing 'num_to_cat' yields a division by zero, hence NaN.

In [25]:
# Drop the one meta-feature that is NaN in a significant number of rows; 'sd_ratio' is kept.
cleaned_df = full_df.drop('num_to_cat', axis='columns')

In [26]:
# Preview the cleaned meta-dataset.
cleaned_df

Unnamed: 0,Model,Dataset,attr_conc.mean,attr_ent.mean,attr_to_inst,best_node.mean,best_node.mean.relative,c1,c2,can_cor.mean,...,linearity.class.L3_partial.1,NS,SS,MMS,MAS,RS,QT,Max_F1_perf,Best_STs,Best_ST
0,Bagging,D1,0.019398,2.584913,0.066445,0.494839,5.0,0.999928,0.000199,0.202900,...,0.448505,0.440079,0.466025,0.394689,0.303649,0.449671,0.451235,0.466025,[SS],SS
1,GLVQ,D1,0.019398,2.584913,0.066445,0.494839,5.0,0.999928,0.000199,0.202900,...,0.448505,0.434862,0.462094,0.418003,0.445430,0.469747,0.487364,0.487364,[QT],QT
2,GP,D1,0.019398,2.584913,0.066445,0.494839,5.0,0.999928,0.000199,0.202900,...,0.448505,0.362818,0.474876,0.384721,0.000000,0.484660,0.441929,0.484660,[RS],RS
3,KNORAE,D1,0.019398,2.584913,0.066445,0.494839,5.0,0.999928,0.000199,0.202900,...,0.448505,0.435348,0.447054,0.492395,0.450039,0.432771,0.413654,0.492395,[MMS],MMS
4,KNORAU,D1,0.019398,2.584913,0.066445,0.494839,5.0,0.999928,0.000199,0.202900,...,0.448505,0.443713,0.468347,0.487077,0.432025,0.478051,0.505159,0.505159,[QT],QT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3595,MLP,D300,0.019184,2.584899,0.066225,0.569462,6.5,0.986003,0.037949,0.250186,...,0.572848,0.601871,0.431757,0.601871,0.601871,0.438325,0.415073,0.601871,"[NS, MMS, MAS]",NS
3596,OLA,D300,0.019184,2.584899,0.066225,0.569462,6.5,0.986003,0.037949,0.250186,...,0.572848,0.414099,0.445926,0.386136,0.446185,0.376316,0.497744,0.497744,[QT],QT
3597,Percep,D300,0.019184,2.584899,0.066225,0.569462,6.5,0.986003,0.037949,0.250186,...,0.572848,0.265806,0.434663,0.237405,0.267137,0.377495,0.421451,0.434663,[SS],SS
3598,SVM_RBF,D300,0.019184,2.584899,0.066225,0.569462,6.5,0.986003,0.037949,0.250186,...,0.572848,0.186589,0.190331,0.201437,0.179149,0.217621,0.202262,0.217621,[RS],RS


In [27]:
# Persist the meta-dataset. It still carries some helper columns that will be removed later.
cleaned_df.to_csv(
    'metafeat_pymfe+imbcol_and_ST_perform_for_pairs_of_dataset_and_model.csv',
    index=False,
    encoding='utf8',
)

In [28]:
def _count_best_sts(best_sts):
    """Return how many scaling techniques are tied for best in one 'Best_STs' entry."""
    if isinstance(best_sts, str):
        # Fallback for a stringified entry (presumably what a CSV round trip would
        # give -- confirm): each label is wrapped in a pair of single quotes.
        return best_sts.count("'") // 2
    # Normal case: the entry is the array of tied labels itself, so count its elements
    # instead of counting apostrophes in its string representation.
    return len(best_sts)


# Summarise how many rows have a unique best scaling technique and how many have ties.
total_rows = len(cleaned_df)
label_counts = cleaned_df['Best_STs'].map(_count_best_sts)

rows_with_1_label = int((label_counts == 1).sum())
# Same bucketing as before: every count other than 1 that is below 6 lands here.
rows_with_2to5_labels = int(((label_counts != 1) & (label_counts < 6)).sum())
rows_with_6_labels = int((label_counts >= 6).sum())

print(f'Number of rows with a single label = {rows_with_1_label}')
print(f'Number of rows with multiple labels = {rows_with_2to5_labels+rows_with_6_labels}, of which:')
print(f'-> 2 to 5 labels = {rows_with_2to5_labels}')
print(f'-> 6 labels = {rows_with_6_labels}')


Number of rows with a single label = 2916
Number of rows with multiple labels = 684, of which:
-> 2 to 5 labels = 245
-> 6 labels = 439
