
# Classification

The following example shows how to fit a simple classification model with
*auto-sklearn*.


In [1]:
from pprint import pprint
import sklearn.metrics
import autosklearn.classification
import pandas as pd
from sklearn.model_selection import train_test_split
import autosklearn

## Data Loading



In [2]:

# Dataset under study. The Titanic variant lives at
# '../data/titanic_dirty_data.csv' and uses 'Survived' as its target column;
# switching datasets means changing the path and the target name below.
file_path = '../data/airbnb.csv'

df = pd.read_csv(file_path)

# Rows without a label cannot be used for supervised training, so they are
# dropped before the target is separated from the features.
df = df.dropna(subset=['Rating'])
X = df.drop(columns=['Rating'])
y = df['Rating']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show which class labels occur in each partition.
print(set(y_test))
print(set(y_train))


{'Y', 'N'}
{'Y', 'N'}


## Build and fit a classifier



In [6]:
# Restrict the search to random forests, with a 60-second total budget and at
# most 10 seconds per evaluated candidate.
# NOTE(review): tmp_folder is a fixed path -- confirm that re-running this cell
# works when the folder is left over from a previous run.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=60,
    per_run_time_limit=10,
    include={'classifier': ["random_forest"]},
    tmp_folder="tmp/autosklearn_classification_example_tmp5",
)
automl.fit(X_train, y_train)

# Inspect the first entry recorded in the run history and print the
# configuration it was evaluated with. This is the first recorded run, which
# is not necessarily the model that ends up in the final ensemble.
runhistory = automl.automl_.runhistory_
run_key = list(runhistory.data.keys())[0]
run_value = runhistory.data[run_key]
config = runhistory.ids_config[run_key.config_id]
print(config)

  if is_sparse(X[column]):
Fitting to the training data:   0%|[32m          [0m| 0/60 [00:00<?, ?it/s, The total time budget for this task is 0:01:00]

RANDOM FOREST INTI


Fitting to the training data:  25%|[32m██▌       [0m| 15/60 [00:15<00:45,  1.00s/it, The total time budget for this task is 0:01:00]



Fitting to the training data:  28%|[32m██▊       [0m| 17/60 [00:17<00:43,  1.00s/it, The total time budget for this task is 0:01:00]



Fitting to the training data:  47%|[32m████▋     [0m| 28/60 [00:28<00:32,  1.00s/it, The total time budget for this task is 0:01:00]



Fitting to the training data: 100%|[32m██████████[0m| 60/60 [00:50<00:00,  1.20it/s, The total time budget for this task is 0:01:00]


Configuration(values={
  'balancing:strategy': 'none',
  'classifier:__choice__': 'random_forest',
  'classifier:random_forest:bootstrap': 'True',
  'classifier:random_forest:criterion': 'gini',
  'classifier:random_forest:max_depth': 'None',
  'classifier:random_forest:max_features': 0.5,
  'classifier:random_forest:max_leaf_nodes': 'None',
  'classifier:random_forest:min_impurity_decrease': 0.0,
  'classifier:random_forest:min_samples_leaf': 1,
  'classifier:random_forest:min_samples_split': 2,
  'classifier:random_forest:min_weight_fraction_leaf': 0.0,
  'data_preprocessor:__choice__': 'feature_type',
  'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'mean',
  'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'standardize',
  'data_preprocessor:feature_type:text_transformer:text_encoding:__choice__': 'tfidf_encoding',
  'data_preprocessor:feature_type:text_transformer:text_encoding:tfidf_encoding:analyzer': 'char',
  'data_prepr

## View the models found by auto-sklearn



In [7]:
# Tabulate the models auto-sklearn reports on its leaderboard (plain-text print).
leaderboard = automl.leaderboard()
print(leaderboard)

          rank  ensemble_weight           type      cost  duration
model_id                                                          
5            1              1.0  random_forest  0.326314  4.787004


## Print the final ensemble constructed by auto-sklearn



In [8]:
# Pretty-print the per-model details returned by show_models().
ensemble_models = automl.show_models()
pprint(ensemble_models, indent=4)

{   5: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f15da7d3fa0>,
           'cost': 0.3263140711720536,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f15da7c69a0>,
           'ensemble_weight': 1.0,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f15da7d3eb0>,
           'model_id': 5,
           'rank': 1,
           'sklearn_classifier': RandomForestClassifier(max_features=1, min_samples_leaf=2, min_samples_split=20,
                       n_estimators=512, n_jobs=1, random_state=1,
                       warm_start=True)}}


## Get the Score of the final ensemble



In [9]:
# Score the fitted ensemble on the held-out split.
predictions = automl.predict(X_test)

# Sanity check: compare the set of predicted labels with the set of true labels.
print(set(predictions))
print(set(y_test))

# 'Y' is treated as the positive class for precision and recall.
accuracy = sklearn.metrics.accuracy_score(y_test, predictions)
print("Accuracy score:", accuracy)

precision = sklearn.metrics.precision_score(y_test, predictions, pos_label='Y')
print("Precision", precision)

recall = sklearn.metrics.recall_score(y_test, predictions, pos_label='Y')
print("Recall", recall)


  array.dtypes.apply(is_sparse).any()):


{'Y', 'N'}
{'Y', 'N'}
Accuracy score: 0.6595561301443654
Precision 0.6895544192841491
Recall 0.9027733503347147


In [10]:

from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt

# Feature-importance baseline using a plain scikit-learn random forest.
# NOTE: this cell rebinds df, X, y and the train/test split variables, so the
# values produced by the data-loading cell are no longer available afterwards.

# Presumably the pre-cleaned export of the Airbnb data (the Titanic
# counterpart is '../data/clean_titanic_data_rf.csv' with target 'Survived').
file_path = '../data/clean_airbnb_data_rf.csv'

df = pd.read_csv(file_path)

# Drop unlabeled rows, then separate the target from the features.
df = df.dropna(subset=['Rating'])
y = df['Rating']
X = df.drop('Rating', axis=1)

# Stratify on the label so both partitions keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Use the real column labels rather than placeholder "feature i" strings so
# that importances can be traced back to the actual input columns.
feature_names = list(X.columns)
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

# One importance value per input column.
importances = forest.feature_importances_
print(len(importances))

# Column positions ordered from most to least important.
sorted_indices = np.argsort(importances)[::-1]
print(sorted_indices)


  array.dtypes.apply(is_sparse).any()):


138
[  6   5   7   4   3   2   1   0  33   8  35  29  12  15  13  31  30  22
  24  32  16  34  10  37  19  36   9  17  25  20  23  27  11  21  26  28
  18  84 135  14  72  87 137 102  74  64 126 133 130  73 122 134  43 118
 136 109 131 121  90  54 132 127 117 128 120  94  55  59 129  93  78 125
 107 108 110  57  97  86  58 123  40 113  98 124 100 116  68  75  92 106
  51  82  99  91  88  77 114  38  47 111  85 112  81  62  53  39 119 115
  48  66  49 103  45 101 105  63  61  95  52  96  69  41  70 104  44  56
  80  50  79  89  42  46  60  71  83  76  67  65]
