## TPOT (Tree-based Pipeline Optimization Tool) demos

In [1]:
!pip3 install tpot

Defaulting to user installation because normal site-packages is not writeable


### Classification

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

x = load_iris().data
y = load_iris().target
labels = load_iris().target_names

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [3]:
from tpot import TPOTClassifier

clf = TPOTClassifier(population_size=10, generations=10, verbosity=2)
clf.fit(x_train, y_train)

                                                                             
Generation 1 - Current best internal CV score: 0.975
                                                                             
Generation 2 - Current best internal CV score: 0.975
                                                                             
Generation 3 - Current best internal CV score: 0.975
                                                                             
Generation 4 - Current best internal CV score: 0.975
                                                                             
Generation 5 - Current best internal CV score: 0.9833333333333332
                                                                             
Generation 6 - Current best internal CV score: 0.9833333333333332
                                                                             
Generation 7 - Current best internal CV score: 0.9833333333333332
                                            

In [4]:
clf.fitted_pipeline_

In [5]:
clf.export('./tpot_pipeline.py')

In [6]:
with open('./tpot_pipeline.py') as f:
  for line in f:
    print(line, end='')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.9833333333333332
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=BernoulliNB(alpha=1.0, fit_prior=False)),
    MLPClassifier(alpha=0.01, learning_rate_init=0.001)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)


In [7]:
print('Training accuracy:', round(clf.score(x_train, y_train), 3))
print('Test accuracy:', round(clf.score(x_test, y_test), 3))

Training accuracy: 0.975
Test accuracy: 1.0


In [8]:
predicted = clf.predict(x_test)

In [9]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predicted, target_names=labels))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



### Regression

In [10]:
import pandas as pd

df = pd.read_csv('https://github.com/PacktPublishing/Automated-Machine-Learning-with-AutoKeras/raw/main/boston.csv')
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,6.48,22.0


In [11]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=42)

x_train = train.drop(['MEDV'], axis=1).to_numpy()
y_train = train.MEDV.to_numpy()
x_test = test.drop(['MEDV'], axis=1).to_numpy()
y_test = test.MEDV.to_numpy()

In [12]:
from tpot import TPOTRegressor

reg = TPOTRegressor(population_size=10, generations=10, verbosity=2)
reg.fit(x_train, y_train)

                                                                             
Generation 1 - Current best internal CV score: -17.93500521788378
                                                                             
Generation 2 - Current best internal CV score: -17.50305121774162
                                                                             
Generation 3 - Current best internal CV score: -17.360576016533017
                                                                             
Generation 4 - Current best internal CV score: -17.360576016533017
                                                                             
Generation 5 - Current best internal CV score: -17.14261583376961
                                                                             
Generation 6 - Current best internal CV score: -14.846700199504227
                                                                             
Generation 7 - Current best internal CV score: -14.8467

In [13]:
reg.export('./tpot_pipeline.py')

In [14]:
with open('./tpot_pipeline.py') as f:
  for line in f:
    print(line, end='')

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -14.752330143876128
exported_pipeline = ExtraTreesRegressor(bootstrap=True, max_features=0.9000000000000001, min_samples_leaf=3, min_samples_split=7, n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)


In [15]:
predicted = reg.predict(x_test)

In [16]:
from sklearn.metrics import r2_score, mean_absolute_error

print('Prection R2:', r2_score(y_test, predicted).round(3))
print('Prection MAE:', mean_absolute_error(y_test, predicted).round(3))

Prection R2: 0.818
Prection MAE: 2.071
