In [None]:
# Installations
#!pip install -U pymfe
#!pip install missingno
#!pip install numpy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.io.arff import loadarff
from pymfe.mfe import MFE 
import missingno as msno
from sklearn import preprocessing
import time

In [None]:
import pymfe
# Show the installed pymfe version as this cell's output.
pymfe.__version__

# Loading datasets in order to calculate the meta-features

In [None]:
# Read the training splits D1..D300 from ARFF files into pandas DataFrames.
# `loadarff` returns a (records, metadata) pair; only the records are kept.
datasets = []
for i in range(1, 301):
    arff_records, _arff_meta = loadarff(f'../../data/S1/D{i}-trn.arff')
    datasets.append(pd.DataFrame(arff_records))

In [None]:
# Replace every dataset's 'class' column (in place) with integer codes
# produced by a fresh LabelEncoder fitted on that dataset alone.
for ds in datasets:
    label_encoder = preprocessing.LabelEncoder()
    ds['class'] = label_encoder.fit_transform(ds['class'])

In [None]:
# Display the dataset at list index 299, i.e. the last of the 300 loaded (D300).
datasets[299]

# Extracting metafeatures

In [None]:
# Extract the pymfe meta-features (all groups, summarised by their mean) of
# every dataset and record how long each extraction takes.
metafeatures = []   # One single-row DataFrame of meta-features per dataset.
elapsed_times = []  # Extraction time in seconds, aligned with `datasets`.
for i, current_ds in enumerate(datasets, start=1):  # i is the 1-based dataset number
    print(f'Extracting metafeatures for dataset {i}.')
    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # which can jump if the system clock is adjusted during a long run.
    start_time = time.perf_counter()  # Count time from here..
    mfe = MFE(groups='all', summary='mean')
    # Features are every column but the last; the target is the last column.
    # NOTE(review): assumes 'class' is the last column of each dataset — confirm.
    mfe.fit(current_ds.iloc[:, :-1].values, current_ds.iloc[:, -1].values)
    ft = mfe.extract()
    end_time = time.perf_counter()  # to here.
    elapsed_time = end_time - start_time
    elapsed_times.append(elapsed_time)
    print("Elapsed time: ", elapsed_time)
    # ft[0] holds the meta-feature names and ft[1] their values.
    metafeatures.append(pd.DataFrame(dict(zip(ft[0], ft[1])), index=[0]))

In [None]:
# Write the recorded extraction times (seconds) to a CSV file in the working directory.
pd.DataFrame(elapsed_times).to_csv('pymfe_computing_times.csv')

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the extraction times.
pd.DataFrame(elapsed_times).describe()

In [None]:
# Find the slowest extraction. max() returns the first index holding the
# largest value, matching a strict '>' scan from the left.
max_t_pos = max(range(len(elapsed_times)), key=elapsed_times.__getitem__)
# `max_t_pos` is a 0-based list index, while datasets are named D1, D2, ...
# (see the file names used when loading), hence the +1 in the reported name.
print(f'Maximum time elapsed was {elapsed_times[max_t_pos]} for dataset D{max_t_pos + 1}.')

In [None]:
# Inspect the single-row meta-feature frame at list index 298 (the 299th dataset processed).
metafeatures[298]

In [None]:
# Stack the per-dataset single-row frames into one table (one row per dataset),
# renumbering the index 0..n-1, and display it.
pymfe_metafeatures_df = pd.concat(metafeatures, ignore_index=True)
pymfe_metafeatures_df

In [None]:
# Saving:
#pymfe_metafeatures_df.to_csv('pymfe_meta_features.csv')

In [None]:
# Tabulate the shape of every dataset: its name (D1, D2, ... in load order),
# its number of columns and its number of rows.
summary_of_dfs = pd.DataFrame({
    'name': [f'D{position}' for position in range(1, len(datasets) + 1)],
    'columns': [frame.shape[1] for frame in datasets],
    'rows': [frame.shape[0] for frame in datasets],
})

In [None]:
# Display the per-dataset shape table.
summary_of_dfs

In [None]:
# Summary statistics of the numeric columns (number of columns and rows per dataset).
summary_of_dfs.describe()

# Analysis of time complexity of meta-feature extraction:

We will plot the meta-feature extraction time against the number of instances. For that we will build random datasets with 21 columns (which is the maximum number of columns of our real datasets) and a varying number of rows (from 100 to 10,000, in steps of 100).

In [None]:
from sklearn.datasets import make_classification

In [None]:
# Time the meta-feature extraction on synthetic classification datasets of
# growing size (100 to 10,000 rows in steps of 100, 21 features each).
n_instances = []    # Number of rows of each synthetic dataset.
elapsed_times = []  # Matching extraction time in seconds.
for n_rows in range(100, 10001, 100):
    X, y = make_classification(n_samples=n_rows, n_features=21, class_sep=0.8, random_state=13)
    print(f'Extracting metafeatures for dataset with {n_rows} rows.')
    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # which can jump if the system clock is adjusted during the run.
    start_time = time.perf_counter()  # Count time from here..
    mfe = MFE(groups='all', summary='mean')
    mfe.fit(X, y)
    mfe.extract()  # Result discarded: only the duration matters here.
    end_time = time.perf_counter()  # to here.
    elapsed_times.append(end_time - start_time)
    n_instances.append(n_rows)


In [None]:
# Plot extraction time against dataset size and keep the figure for saving.
# DataFrame.plot() returns the Axes; hold on to it explicitly instead of
# chaining through set_xlabel(), whose return value is the label's Text
# artist rather than the Axes.
ax = pd.DataFrame({'Time in seconds': elapsed_times}, index=n_instances).plot()
ax.set_xlabel("Number of instances")
fig = ax.get_figure()


In [None]:
# Save the figure currently held in `fig` as a PDF in the working directory.
fig.savefig('pymfe_extraction_times.pdf')

Is that quadratic?

In [None]:
# Least-squares fit of a degree-2 polynomial to the measured times, drawn
# over the raw measurements; the coefficients (highest power first) are printed.
coef = np.polyfit(x=n_instances, y=elapsed_times, deg=2)
fitted_curve = np.poly1d(coef)
plt.scatter(n_instances, elapsed_times, label='data')
plt.plot(
    n_instances,
    fitted_curve(n_instances),
    color='red',
    label='quadratical polynomial fit',
)
plt.legend()
plt.show()
print(coef)

Now, how long would it take to actually test all the STs with these random datasets?

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
#from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import cross_validate

In [None]:
# For synthetic datasets of growing size, compare the cost of meta-feature
# extraction with the cost of trying every scaling option followed by a
# cross-validated classifier.
clf = GaussianProcessClassifier
# One factory per scaling option, in evaluation order; None means
# "use X unscaled", which also needs to be tested.
scaler_factories = [
    StandardScaler,
    MinMaxScaler,
    MaxAbsScaler,
    RobustScaler,
    QuantileTransformer,
    None,
]
n_instances_2 = []                # Number of rows of each synthetic dataset.
mf_extraction_times = []          # Meta-feature extraction time in seconds.
st_and_classification_times = []  # Total scaling + cross-validation time in seconds.
for n_rows in range(100, 10001, 300):
    X, y = make_classification(n_samples=n_rows, n_features=21, class_sep=0.8, random_state=13)
    # MF extraction:
    print(f'Extracting metafeatures for dataset with {n_rows} rows.')
    # perf_counter() is monotonic and high-resolution, unlike time.time().
    start_time = time.perf_counter()  # Count time from here..
    mfe = MFE(groups='all', summary='mean')
    mfe.fit(X, y)
    ft = mfe.extract()
    end_time = time.perf_counter()  # to here.
    mf_extraction_times.append(end_time - start_time)
    # Scaling + Classification
    print(f'Scaling and Classifying for dataset with {n_rows} rows.')
    start_time = time.perf_counter()  # Count time from here..
    for make_scaler in scaler_factories:
        scaled_X = X if make_scaler is None else make_scaler().fit_transform(X)
        cross_validate(estimator=clf(), X=scaled_X, y=y)  # 5-fold CV
    end_time = time.perf_counter()  # to here.
    st_and_classification_times.append(end_time - start_time)
    n_instances_2.append(n_rows)

In [None]:
# Draw both timing series against the dataset size on one set of axes and
# keep the resulting figure for saving.
timing_comparison = pd.DataFrame(
    {
        'Meta-feature extraction time': mf_extraction_times,
        'Scaling+classification time': st_and_classification_times,
    },
    index=n_instances_2,
)
ax = timing_comparison.plot()
ax.set_xlabel("Number of instances")
ax.set_ylabel('Time (s)')
fig = ax.get_figure()


In [None]:
# Save the figure currently held in `fig` as a PDF in the working directory.
fig.savefig('pymfe_extraction_times_vs_ST+classifier_times.pdf')

In [None]:
# Box plot of the computing times stored in 'comp_times.csv'
# (semicolon-separated), one box per column of that file.
# NOTE(review): this file is not written by any cell visible here — confirm
# how it is produced before relying on a clean re-run.
comp_times = pd.read_csv('comp_times.csv', sep=';')
ax = comp_times.boxplot()
ax.set(xlabel="Meta-feature set", ylabel='Time (s)')
fig = ax.get_figure()

In [None]:
# Save the figure currently held in `fig` as a PDF in the working directory.
fig.savefig('boxplot_mf_extraction_times_.pdf')