# auto-sklearn

# Installation

In [None]:
!sudo apt-get install build-essential swig
!pip install auto-sklearn==0.14.3

Reading package lists... Done
Building dependency tree       
Reading state information... Done
build-essential is already the newest version (12.4ubuntu1).
Suggested packages:
  swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  swig swig3.0
0 upgraded, 2 newly installed, 0 to remove and 37 not upgraded.
Need to get 1,100 kB of archives.
After this operation, 5,822 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig3.0 amd64 3.0.12-1 [1,094 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig amd64 3.0.12-1 [6,460 B]
Fetched 1,100 kB in 13s (85.9 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 2.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (T

***Restart the runtime before proceeding further***

In [None]:
# Confirm which auto-sklearn release the kernel actually picked up.
import autosklearn
print(f'autosklearn: {autosklearn.__version__}')

autosklearn: 0.14.3


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tensorflow.keras.utils import plot_model

# Setting up datasets

## Superconductors dataset (regression task)

Source: https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data


The dataset contains 81 numerical features of 21263 superconductors. The label corresponds to their critical temperature measured in Kelvin.

In [None]:
!wget 'https://raw.githubusercontent.com/abcom-mltutorials/automl/main/superconductors.csv'

--2022-02-02 12:09:18--  https://raw.githubusercontent.com/abcom-mltutorials/automl/main/superconductors.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23859780 (23M) [text/plain]
Saving to: ‘superconductors.csv’


2022-02-02 12:09:19 (120 MB/s) - ‘superconductors.csv’ saved [23859780/23859780]



In [None]:
# Read the CSV from the working directory, which is where wget saved it.
# The absolute '/content/...' path only resolved when the notebook ran with /content as its cwd.
regressor_df = pd.read_csv('superconductors.csv')

In [None]:
# Sanity check on the loaded frame: (number of rows, number of columns).
regressor_df.shape

(21263, 82)

### Features/target extraction

In [None]:
# Column-wise split: the final column is the label, every column before it is a feature.
label_regressor = regressor_df.iloc[:, -1]
features_regressor = regressor_df.iloc[:, :-1]

### Training/Testing datasets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
(
    X_train_regressor,
    X_val_regressor,
    label_train_regressor,
    label_val_regressor,
) = train_test_split(
    features_regressor,
    label_regressor,
    test_size=0.2,
    random_state=42,
)

Defining function for printing error metrics

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def error_metrics(y_pred,y_val):
  """Print the MSE, RMSE and coefficient of determination of a set of predictions.

  Args:
    y_pred: Predicted target values.
    y_val: Ground-truth target values, aligned element-wise with ``y_pred``.

  Returns:
    None. The three metrics are written to stdout.
  """
  # scikit-learn metrics take (y_true, y_pred). MSE is symmetric in its two
  # arguments, but R^2 is not: it is normalised by the variance of the first
  # argument, so the ground truth has to be passed first.
  mse = mean_squared_error(y_val, y_pred)
  print('MSE: ',mse)
  print('RMSE: ',np.sqrt(mse))
  print('Coefficient of determination: ',r2_score(y_val, y_pred))

## Biodegradation dataset (classification task)

Source: https://archive.ics.uci.edu/ml/datasets/QSAR+biodegradation


The dataset contains 41 numerical features (molecular attributes) of 1055 chemicals. The label corresponds to their experimental class (ready biodegradable "RB" or not ready biodegradable "NRB").

In [None]:
!wget 'https://raw.githubusercontent.com/abcom-mltutorials/automl/main/biodeg.csv'

--2022-02-02 12:09:21--  https://raw.githubusercontent.com/abcom-mltutorials/automl/main/biodeg.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 155987 (152K) [text/plain]
Saving to: ‘biodeg.csv’


2022-02-02 12:09:21 (4.83 MB/s) - ‘biodeg.csv’ saved [155987/155987]



In [None]:
# Read the CSV from the working directory, which is where wget saved it.
# The absolute '/content/...' path only resolved when the notebook ran with /content as its cwd.
# The file is ';'-separated and has no header row, so columns get integer labels.
classifier_df = pd.read_csv('biodeg.csv', delimiter=';', header=None)

In [None]:
# Sanity check on the loaded frame: (number of rows, number of columns).
classifier_df.shape

(1055, 42)

In [None]:
# Give the last column (integer label 41, since the CSV was read without a header) a readable name.
classifier_df = classifier_df.rename(columns={41: 'label'})

In [None]:
# Cast every column label to str so the labels share a single type
# (the frame was read with header=None, which yields integer labels).
# NOTE(review): presumably needed because downstream estimators reject mixed int/str column names — confirm.
classifier_df.columns = classifier_df.columns.astype(str)

### Features/target extraction

In [None]:
# Column-wise split: the final column is the class label, every column before it is a feature.
label_classifier = classifier_df.iloc[:, -1]
features_classifier = classifier_df.iloc[:, :-1]

### Balancing dataset

In [None]:
!pip install imblearn



In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample with SMOTE (fixed seed) so that both classes end up equally represented.
# NOTE(review): the resampling is applied to the full dataset, and the train/validation
# split happens only afterwards. Synthetic rows derived from validation points can
# therefore end up in the training set (data leakage), which may inflate the reported
# scores. Consider splitting first and resampling only the training portion.
sm = SMOTE(random_state=42)
X_classifier, y_classifier = sm.fit_resample(features_classifier, label_classifier)

In [None]:
# Verify the class balance after resampling: number of rows per label value.
y_classifier.value_counts()

RB     699
NRB    699
Name: label, dtype: int64

In [None]:
# Encode the string class labels as integers: 'NRB' -> 0, 'RB' -> 1.
# Series.replace matches whole values, so 'RB' does not touch 'NRB'.
y_classifier = y_classifier.replace({'NRB': 0, 'RB': 1})

### Training/Testing datasets

In [None]:
# Hold out 20% of the (resampled) rows for validation; the fixed random_state keeps the split repeatable.
(
    X_train_classifier,
    X_val_classifier,
    label_train_classifier,
    label_val_classifier,
) = train_test_split(
    X_classifier,
    y_classifier,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import label_binarize

# Classifier

In [None]:
from autosklearn.classification import AutoSklearnClassifier

In [None]:
import time
# Start a wall-clock timer; the elapsed time is reported by the cell that follows the classifier fit.
tic = time.perf_counter()

In [None]:
# Search budget: 600 s for the whole run and at most 30 s per candidate pipeline.
# n_jobs=-1 presumably requests all available cores (joblib-style convention) — confirm.
model_auto_class = AutoSklearnClassifier(
    time_left_for_this_task=10 * 60,
    per_run_time_limit=30,
    n_jobs=-1,
)
# Keep fit() as the cell's last expression so the fitted estimator's repr is displayed.
model_auto_class.fit(X_train_classifier, label_train_classifier)


AutoSklearnClassifier(n_jobs=-1, per_run_time_limit=30,
                      time_left_for_this_task=600)

In [None]:
# Stop the timer and report the wall-clock seconds elapsed since `tic` was captured.
toc = time.perf_counter()
print (f"Elapsed time {toc - tic:0.4f} seconds")

Elapsed time 606.3505 seconds


In [None]:
# Summarise the search: optimisation metric, best validation score, and counts of
# successful, crashed and limit-exceeding runs.
print(model_auto_class.sprint_statistics())

auto-sklearn results:
  Dataset name: fa958b64-8420-11ec-8195-0242ac1c0002
  Metric: accuracy
  Best validation score: 0.899729
  Number of target algorithm runs: 81
  Number of successful target algorithm runs: 59
  Number of crashed target algorithm runs: 16
  Number of target algorithms that exceeded the time limit: 3
  Number of target algorithms that exceeded the memory limit: 3



In [None]:
# Score the fitted classifier on the held-out validation split:
# per-class precision/recall/F1 plus overall accuracy.
y_pred_class = model_auto_class.predict(X_val_classifier)
print(classification_report(y_true=label_val_classifier, y_pred=y_pred_class))

              precision    recall  f1-score   support

           0       0.91      0.87      0.89       143
           1       0.87      0.91      0.89       137

    accuracy                           0.89       280
   macro avg       0.89      0.89      0.89       280
weighted avg       0.89      0.89      0.89       280



# Regressor

In [None]:
from autosklearn.regression import AutoSklearnRegressor

In [None]:
import time
# Restart the wall-clock timer; the elapsed time is reported by the cell that follows the regressor fit.
tic = time.perf_counter()

In [None]:
# Same search budget as the classifier: 600 s for the whole run, at most 30 s per candidate pipeline.
# n_jobs=-1 presumably requests all available cores (joblib-style convention) — confirm.
model_auto_reg = AutoSklearnRegressor(
    time_left_for_this_task=10 * 60,
    per_run_time_limit=30,
    n_jobs=-1,
)
# Keep fit() as the cell's last expression so the fitted estimator's repr is displayed.
model_auto_reg.fit(X_train_regressor, label_train_regressor)

AutoSklearnRegressor(n_jobs=-1, per_run_time_limit=30,
                     time_left_for_this_task=600)

In [None]:
# Stop the timer and report the wall-clock seconds elapsed since `tic` was captured.
toc = time.perf_counter()
print (f"Elapsed time {toc - tic:0.4f} seconds")

Elapsed time 603.7752 seconds


In [None]:
# Summarise the search: optimisation metric, best validation score, and counts of
# successful, crashed and limit-exceeding runs.
print(model_auto_reg.sprint_statistics())

auto-sklearn results:
  Dataset name: 646225b0-8422-11ec-8195-0242ac1c0002
  Metric: r2
  Best validation score: 0.909665
  Number of target algorithm runs: 80
  Number of successful target algorithm runs: 18
  Number of crashed target algorithm runs: 33
  Number of target algorithms that exceeded the time limit: 5
  Number of target algorithms that exceeded the memory limit: 24



In [None]:
# Display the pipelines selected for the final ensemble; each entry is paired with a
# leading number that appears to be its ensemble weight — confirm against the auto-sklearn docs.
model_auto_reg.show_models()

"[(0.260000, SimpleRegressionPipeline({'data_preprocessor:__choice__': 'feature_type', 'feature_preprocessor:__choice__': 'extra_trees_preproc_for_regression', 'regressor:__choice__': 'gradient_boosting', 'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'robust_scaler', 'feature_preprocessor:extra_trees_preproc_for_regression:bootstrap': 'False', 'feature_preprocessor:extra_trees_preproc_for_regression:criterion': 'friedman_mse', 'feature_preprocessor:extra_trees_preproc_for_regression:max_depth': 'None', 'feature_preprocessor:extra_trees_preproc_for_regression:max_features': 0.8407991816968586, 'feature_preprocessor:extra_trees_preproc_for_regression:max_leaf_nodes'

In [None]:
# Predict the target for the held-out validation rows.
y_pred_reg = model_auto_reg.predict(X_val_regressor)

In [None]:
# Print MSE, RMSE and R^2 for the validation predictions (arguments: predictions first, ground truth second).
error_metrics(y_pred_reg,label_val_regressor)

MSE:  89.62974521966439
RMSE:  9.467298728764419
Coefficient of determination:  0.9151071664787114
