In [43]:
import pandas as pd
import numpy as np
import missingno as msno
from sklearn import preprocessing

# Loading Metafeatures

Here we load the metafeatures that were previously extracted with the PyMFE and ImbCoL libraries.

## Load PyMFE metafeatures

In [45]:
# Read the PyMFE meta-features; the CSV's unnamed first column (a 0-based
# dataset counter) becomes the provisional key column 'ds_name2'.
pymfe_mf = (
    pd.read_csv('./Meta_features_extraction/pymfe_meta_features.csv')
    .rename(columns={'Unnamed: 0': 'ds_name2'})
)

# Relabel the counter as dataset names: 0 -> D1, 1 -> D2, ..., 299 -> D300.
pymfe_mf['ds_name2'] = (pymfe_mf['ds_name2'] + 1).map('D{}'.format)

In [None]:
#pymfe_mf

## Loading ImbCoL meta-features

In [46]:
# Read the ImbCoL meta-features CSV. It appears to carry its own 'ds_name'
# column, which is dropped as a duplicate after the merge below.
imbcol_mf = pd.read_csv('./Meta_features_extraction/imbcol_meta_features.csv')

In [None]:
#imbcol_mf

# Merging meta-features

In [47]:
# Place the two meta-feature tables side by side. Rows are aligned on the
# index, so both files are expected to list the datasets in the same order
# (NOTE(review): not verified here -- confirm against the extraction step).
mf_full = pd.concat((pymfe_mf, imbcol_mf), axis='columns')

In [48]:
# Inspect the label at position 113 to confirm it is the duplicated 'ds_name' column.
mf_full.columns[113]

'ds_name'

In [50]:
# The concatenation left a second dataset-name column ('ds_name', position 113);
# discard it and keep 'ds_name2' as the key.
mf_full = mf_full.drop(columns=['ds_name'])

In [54]:
# Give the provisional key column its final name, then display the table.
mf_full = mf_full.rename({'ds_name2': 'ds_name'}, axis='columns')
mf_full

Unnamed: 0,ds_name,attr_conc.mean,attr_ent.mean,attr_to_inst,best_node.mean,best_node.mean.relative,c1,c2,can_cor.mean,cat_to_num,...,neighborhood.N4_partial.0,neighborhood.N4_partial.1,neighborhood.T1_partial.0,neighborhood.T1_partial.1,linearity.class.L1_partial.0,linearity.class.L1_partial.1,linearity.class.L2_partial.0,linearity.class.L2_partial.1,linearity.class.L3_partial.0,linearity.class.L3_partial.1
0,D1,0.019398,2.584913,0.066445,0.494839,5.0,0.999928,0.000199,0.202900,0.0,...,0.033223,0.039867,1.000000,1.000000,0.166374,0.188615,0.427632,0.476510,0.365449,0.448505
1,D2,0.037477,2.454799,0.034632,0.558514,3.0,0.997714,0.006314,0.442222,0.0,...,0.129870,0.186147,0.885246,0.908257,0.144262,0.208822,0.229508,0.403670,0.203463,0.359307
2,D3,0.018884,2.584927,0.062696,0.448185,4.0,0.999823,0.000491,0.193009,0.0,...,0.059561,0.031348,1.000000,1.000000,0.200284,0.171473,0.445860,0.450617,0.463950,0.470219
3,D4,0.022152,2.584918,0.066445,0.571290,6.0,0.999992,0.000022,0.261264,0.0,...,0.033223,0.046512,1.000000,1.000000,0.191736,0.168479,0.377483,0.393333,0.335548,0.325581
4,D5,0.018890,2.584963,0.066667,0.550000,7.0,0.997402,0.007174,0.209018,0.0,...,0.023333,0.040000,1.000000,1.000000,0.216616,0.161922,0.439716,0.339623,0.423333,0.330000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,D296,0.039737,2.453230,0.034188,0.594565,5.0,0.996625,0.009307,0.425928,0.0,...,0.192308,0.123932,0.856000,0.889908,0.124541,0.203828,0.256000,0.467890,0.196581,0.414530
296,D297,0.044615,2.427931,0.034632,0.631522,4.0,0.999986,0.000037,0.468845,0.0,...,0.155844,0.142857,0.834783,0.862069,0.159092,0.157423,0.260870,0.336207,0.199134,0.285714
297,D298,0.019225,2.584904,0.066445,0.597957,6.0,0.996486,0.009688,0.391580,0.0,...,0.039867,0.063123,1.000000,1.000000,0.173652,0.170869,0.357143,0.354037,0.265781,0.332226
298,D299,0.017539,2.584963,0.066667,0.690000,5.0,1.000000,0.000000,0.187479,0.0,...,0.013333,0.070000,1.000000,1.000000,0.097212,0.293950,0.280000,0.493333,0.250000,0.496667


# Loading ST Performances

Here we load the classification performance results obtained on the 300 datasets with different classification algorithms and six different scaling techniques (STs).

In [12]:
# Load the per-fold scores and their mean/stddev (acc, f1, gmean, ...) recorded
# for each (Dataset, Scaling technique, Model) combination.
performances = pd.read_csv('../results/csv_tabs/results_ST_perfs.csv')
performances

Unnamed: 0,Dataset,Scaling technique,Model,acc_fold1,acc_fold2,acc_fold3,acc_fold4,acc_fold5,acc_mean,acc_stddev,...,f1_fold5,f1_mean,f1_stddev,gmean_fold1,gmean_fold2,gmean_fold3,gmean_fold4,gmean_fold5,gmean_mean,gmean_stddev
0,1.0,NS,SVM_lin,0.475410,0.533333,0.516667,0.483333,0.466667,0.495082,0.025540,...,0.515152,0.474213,0.029015,0.468353,0.516685,0.514242,0.483046,0.455826,0.487630,0.024316
1,1.0,SS,SVM_lin,0.459016,0.483333,0.550000,0.466667,0.433333,0.478470,0.039231,...,0.468750,0.466712,0.042593,0.457905,0.469302,0.549747,0.466667,0.428174,0.474359,0.040427
2,1.0,MMS,SVM_lin,0.459016,0.533333,0.483333,0.433333,0.466667,0.475137,0.033266,...,0.515152,0.457443,0.032417,0.454369,0.516685,0.480740,0.433333,0.455826,0.468191,0.028524
3,1.0,MAS,SVM_lin,0.491803,0.533333,0.466667,0.533333,0.500000,0.505027,0.025590,...,0.500000,0.458244,0.050476,0.450806,0.506904,0.465475,0.529150,0.500000,0.490467,0.028478
4,1.0,RS,SVM_lin,0.442623,0.500000,0.550000,0.466667,0.433333,0.478525,0.042525,...,0.468750,0.463629,0.036131,0.442379,0.482162,0.547723,0.466667,0.428174,0.473421,0.041602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21523,299.0,SS,KNORAU,0.416667,0.533333,0.450000,0.400000,0.583333,0.476667,0.070396,...,0.509804,0.416705,0.072160,0.408248,0.529150,0.424264,0.394405,0.563718,0.463957,0.068875
21524,299.0,MMS,KNORAU,0.500000,0.566667,0.483333,0.516667,0.550000,0.523333,0.030912,...,0.425532,0.385779,0.040783,0.458258,0.500000,0.413656,0.483046,0.505525,0.472097,0.033534
21525,299.0,MAS,KNORAU,0.583333,0.666667,0.550000,0.600000,0.666667,0.613333,0.046428,...,0.565217,0.522697,0.068195,0.563718,0.645497,0.489898,0.584998,0.624500,0.581722,0.054155
21526,299.0,RS,KNORAU,0.500000,0.716667,0.583333,0.533333,0.666667,0.600000,0.080966,...,0.629630,0.558922,0.087534,0.495536,0.711805,0.571548,0.523874,0.659124,0.592377,0.081514


In [None]:
# TODO: rename performances['Dataset'] to 'ds_name' and map 1.0 -> 'D1', 2.0 -> 'D2', and so on.

# Merging data

In [None]:
# Attach each dataset's meta-features to its performance rows.
#
# NOTE(review): `ds_names` and `performances_per_model_and_ds` are not defined
# in the visible part of this notebook; they must exist before this cell runs.
# This cell assumes that `ds_names` (and the 'Dataset' column) use the same
# identifiers as mf_full['ds_name'] (e.g. 'D1') -- TODO confirm.
#
# mf_full holds one dataset per *row*, so the meta-features are looked up by
# row label on 'ds_name' (a plain `mf_full[name]` would be a column lookup).
# DataFrame.append is avoided because it was removed in pandas 2.0.
mf_by_ds = mf_full.set_index('ds_name')

merged_dfs = []
for name in ds_names:
    # Performance rows of this dataset, re-indexed from 0 so that they line up
    # with the repeated meta-feature rows in the column-wise concat below.
    perf_filt_by_ds = performances_per_model_and_ds[
        performances_per_model_and_ds['Dataset'] == name
    ].reset_index(drop=True)
    length = len(perf_filt_by_ds)
    # Repeat this dataset's meta-feature row once per performance row
    # (raises KeyError if the dataset has no meta-features).
    mf_df = mf_by_ds.loc[[name] * length].reset_index(drop=True)
    pfd_cols = perf_filt_by_ds.columns
    # Inserting mf_df after the second column of perf_filt_by_ds
    merged_df = pd.concat(
        [perf_filt_by_ds[pfd_cols[:2]], mf_df, perf_filt_by_ds[pfd_cols[2:]]],
        axis=1,
    )
    merged_dfs.append(merged_df)

full_df = pd.concat(merged_dfs, ignore_index=True)

In [None]:
# Show only the rows whose Model is 'SVM_RBF'.
is_svm_rbf = full_df['Model'].eq('SVM_RBF')
full_df.loc[is_svm_rbf]

# Dataset cleanup

Some metafeatures could not be calculated for most datasets, so let's remove them.

In [None]:
# Total number of missing values over every cell of the merged table.
full_df.isna().sum().sum()

In [None]:
# Report the NaN count of every column that has at least one missing value and
# remember those column names for the plot and clean-up steps that follow.
cols_with_nans = []
for col in full_df.columns:
    nan_qty = full_df[col].isna().sum()
    if not nan_qty > 0:
        continue  # column is complete, nothing to report
    print(f'There are {nan_qty} NaNs in column {col}.')
    cols_with_nans.append(col)
print(f'****\nA total of {len(cols_with_nans)} columns contain NaNs.')

In [None]:
# (rows, columns) of the merged table, before any meta-feature is removed.
full_df.shape

In [None]:
# Visualise where the missing values fall, restricted to the columns that contain NaNs.
msno.matrix(full_df[cols_with_nans], fontsize=16)

We are going to remove the following metafeatures: 'num_to_cat' and 'sd_ratio'. The remaining metafeatures with missing values, which could not be calculated for at most 77 rows (7 datasets), will be kept and dealt with later, during the experiment.

Note: 'num_to_cat' is the ratio of the number of numerical features to the number of categorical features. Since most of our datasets contain no categorical features, computing 'num_to_cat' results in a division by zero, hence NaN.

'sd_ratio': **TODO** explain why this metafeature could not be calculated for most datasets.

In [None]:
# Discard the two meta-features that could not be calculated for most datasets.
cleaned_df = full_df.drop(['num_to_cat', 'sd_ratio'], axis='columns')

In [None]:
# Display the cleaned table for a visual check.
cleaned_df

In [None]:
# Persist the cleaned table as CSV; the row index is not written to the file.
cleaned_df.to_csv(
    'metafeat_pymfe+imbcol_and_ST_perform_for_pairs_of_dataset_and_model.csv',
    index=False,
    encoding='utf8',
)