## TPOT (Tree-based Pipeline Optimization Tool) demos

In [1]:
!pip3 install tpot

Collecting tpot
  Downloading TPOT-0.11.7-py3-none-any.whl (87 kB)
[?25l[K     |███▊                            | 10 kB 22.2 MB/s eta 0:00:01[K     |███████▌                        | 20 kB 11.4 MB/s eta 0:00:01[K     |███████████▎                    | 30 kB 8.6 MB/s eta 0:00:01[K     |███████████████                 | 40 kB 7.1 MB/s eta 0:00:01[K     |██████████████████▉             | 51 kB 5.0 MB/s eta 0:00:01[K     |██████████████████████▋         | 61 kB 5.1 MB/s eta 0:00:01[K     |██████████████████████████▎     | 71 kB 5.4 MB/s eta 0:00:01[K     |██████████████████████████████  | 81 kB 6.0 MB/s eta 0:00:01[K     |████████████████████████████████| 87 kB 3.1 MB/s 
Collecting deap>=1.2
  Downloading deap-1.3.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)
[?25l[K     |██                              | 10 kB 23.0 MB/s eta 0:00:01[K     |████                            | 20 kB 31.5 MB/s eta 0:00:01[K 

### Classification

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

x = load_iris().data
y = load_iris().target
labels = load_iris().target_names

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [3]:
from tpot import TPOTClassifier

clf = TPOTClassifier(
    population_size=5, generations=5, verbosity=2)
clf.fit(x_train, y_train)

Optimization Progress:   0%|          | 0/30 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.975

Generation 2 - Current best internal CV score: 0.975

Generation 3 - Current best internal CV score: 0.975

Generation 4 - Current best internal CV score: 0.975

Generation 5 - Current best internal CV score: 0.975

Best pipeline: MLPClassifier(input_matrix, alpha=0.1, learning_rate_init=0.001)


TPOTClassifier(generations=5, population_size=5, verbosity=2)

In [4]:
clf.export('tpot_pipeline.py')

In [5]:
with open('tpot_pipeline.py') as f:
  for line in f:
    print(line, end='')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.975
exported_pipeline = MLPClassifier(alpha=0.1, learning_rate_init=0.001)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)


In [6]:
print('Training accuracy:', clf.score(x_train, y_train).round(3))
print('Test accuracy:', clf.score(x_test, y_test).round(3))

Training accuracy: 0.975
Test accuracy: 1.0


In [7]:
predicted = clf.predict(x_test)

In [8]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predicted, target_names=labels))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



### Regression

In [9]:
import pandas as pd

df = pd.read_csv('https://github.com/PacktPublishing/Automated-Machine-Learning-with-AutoKeras/raw/main/boston.csv')
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,6.48,22.0


In [10]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=42)

x_train = train.drop(['MEDV'], axis=1).to_numpy()
y_train = train.MEDV.to_numpy()
x_test = test.drop(['MEDV'], axis=1).to_numpy()
y_test = test.MEDV.to_numpy()

In [11]:
from tpot import TPOTRegressor

reg = TPOTRegressor(
    population_size=15, generations=15, verbosity=3)
reg.fit(x_train, y_train)

30 operators have been imported by TPOT.


Optimization Progress:   0%|          | 0/240 [00:00<?, ?pipeline/s]


Generation 1 - Current Pareto front scores:

-1	-18.732487579021246	AdaBoostRegressor(input_matrix, AdaBoostRegressor__learning_rate=0.1, AdaBoostRegressor__loss=square, AdaBoostRegressor__n_estimators=100)
_pre_test decorator: _random_mutation_operator: num_test=0 Unsupported set of arguments: The combination of penalty='l2' and loss='epsilon_insensitive' are not supported when dual=False, Parameters: penalty='l2', loss='epsilon_insensitive', dual=False.

Generation 2 - Current Pareto front scores:

-1	-18.732487579021246	AdaBoostRegressor(input_matrix, AdaBoostRegressor__learning_rate=0.1, AdaBoostRegressor__loss=square, AdaBoostRegressor__n_estimators=100)

-2	-18.263279658964027	AdaBoostRegressor(MinMaxScaler(input_matrix), AdaBoostRegressor__learning_rate=0.1, AdaBoostRegressor__loss=square, AdaBoostRegressor__n_estimators=100)

-3	-13.278772123355363	DecisionTreeRegressor(LassoLarsCV(PolynomialFeatures(input_matrix, PolynomialFeatures__degree=2, PolynomialFeatures__include_bias=

TPOTRegressor(generations=15, population_size=15, verbosity=3)

In [12]:
reg.export('tpot_pipeline.py')

In [13]:
with open('tpot_pipeline.py') as f:
  for line in f:
    print(line, end='')

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -11.61652080039325
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=1.0, min_samples_leaf=5, min_samples_split=4, n_estimators=100)),
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.1, tol=0.001)),

In [15]:
predicted = reg.predict(x_test)

In [16]:
from sklearn.metrics import r2_score, mean_absolute_error

print('Prection R2:', r2_score(y_test, predicted).round(3))
print('Prection MAE:', mean_absolute_error(y_test, predicted).round(3))

Prection R2: 0.86
Prection MAE: 1.948
