# [TPOT - Python Automated Machine Learning Tool](https://epistasislab.github.io/tpot/)
<img src="https://raw.githubusercontent.com/EpistasisLab/tpot/master/images/tpot-ml-pipeline.png" width="1000"></img>

The acronym "TPOT" stands for "Tree-based Pipeline Optimization Tool". TPOT was first introduced in the paper

[Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science. Proceedings of GECCO 2016, pages 485-492.](https://dl.acm.org/doi/10.1145/2908812.2908918) - [which I have here](./2016Olsonetal_EvaluationTPOT.pdf)

TPOT uses [Genetic Programming](https://en.wikipedia.org/wiki/Genetic_programming) with a customizable objective function to identify an optimal pipeline of operations for:
- feature engineering
- feature selection
- feature preprocessing
- regression / classification modeling

It works with scikit-learn, and is very customizable. At the end of its search, TPOT can even generate a template Python script to perform the best pipeline of operations.

In [None]:
import math
import datetime as dat
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error

from tpot import TPOTRegressor

## Generate some fake data with a known regressive structure

In [None]:
''' simulate some data '''
# Draw every random number from a locally seeded generator so the simulated
# data set is identical on each run; the global NumPy random state is not touched.
dataSeed = 42
rng = np.random.RandomState(dataSeed)

# create the features & response
p = 10    # number of features
n = 1000  # number of observations
X = rng.normal(loc=0, scale=1, size=(n, p))
# replace the last column with a very low variance feature
X[:, -1] = 0.42 + rng.normal(loc=0, scale=0.0001, size=(n))

# true coefficients: the first five features carry signal, the rest are zero
B = [7, -6, 5, -4, 3] + [0]*(p-5)
b = 0  # intercept
# response = intercept + row-wise sum of B*X + gaussian noise with sd 5
y = b + np.sum(B*X, axis=1) + rng.normal(loc=0, scale=5, size=(n,))

# assemble a frame with the target as the first column, followed by X0..X{p-1}
data = pd.DataFrame(data=X, columns=['X%d'%i for i in range(p)])
features = data.columns.values.tolist()
data['target'] = y
data = data[['target'] + features]

# talk
display(data.head())

## Initialize & Fit the TPOT Regressor

In [None]:
''' define the allowed pipeline structure '''
# define the type of operations allowed
templt = 'Selector-Transformer-Regressor'

# parameter grids reused by more than one operator below
lrnRat = [1e-3, 1e-2, 1e-1, 0.5, 1.]
maxDep = [5, 10, None]
tol = [1e-3, 1e-2, 1e-1]
estim = [100]
maxFeat = np.arange(0.05, 1.01, 0.10)
minSplt = range(10, 21)
minLeaf = range(10, 21)

# Each section maps an operator's import path to its parameter grid;
# an empty dict means the operator is listed with no parameters to vary.

# feature selectors
selectorOps = {
    'sklearn.feature_selection.SelectFwe': {
        'alpha': np.arange(0, 0.05, 0.001),
        'score_func': {'sklearn.feature_selection.f_regression': None},
    },
    'sklearn.feature_selection.SelectPercentile': {
        'percentile': range(1, 100),
        'score_func': {'sklearn.feature_selection.f_regression': None},
    },
    'sklearn.feature_selection.VarianceThreshold': {
        'threshold': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
    },
    'sklearn.feature_selection.SelectFromModel': {
        'threshold': np.arange(0, 1.01, 0.05),
        'estimator': {
            'sklearn.ensemble.ExtraTreesRegressor': {
                'n_estimators': estim,
                'max_features': maxFeat,
            },
        },
    },
}

# preprocessors
transformerOps = {
    'sklearn.preprocessing.Binarizer': {
        'threshold': np.arange(0.0, 1.01, 0.05),
    },
    'sklearn.decomposition.FastICA': {
        'tol': np.arange(0.0, 1.01, 0.05),
    },
    'sklearn.preprocessing.MaxAbsScaler': {},
    'sklearn.preprocessing.MinMaxScaler': {},
    'sklearn.preprocessing.Normalizer': {
        'norm': ['l1', 'l2', 'max'],
    },
    'sklearn.decomposition.PCA': {
        'svd_solver': ['randomized'],
        'iterated_power': range(1, 11),
    },
    'sklearn.preprocessing.PolynomialFeatures': {
        'degree': [2],
        'include_bias': [False],
        'interaction_only': [False],
    },
    'sklearn.preprocessing.RobustScaler': {},
    'sklearn.preprocessing.StandardScaler': {},
    'tpot.builtins.ZeroCount': {},
}

# regressors
regressorOps = {
    'sklearn.linear_model.ElasticNetCV': {
        'l1_ratio': np.arange(0.0, 1.01, 0.05),
        'tol': tol,
    },
    'sklearn.ensemble.GradientBoostingRegressor': {
        'n_estimators': estim,
        'loss': ["ls", "lad", "huber", "quantile"],
        'learning_rate': lrnRat,
        'max_depth': maxDep,
        'min_samples_split': minSplt,
        'min_samples_leaf': minLeaf,
        'max_features': maxFeat,
        'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99],
    },
    'sklearn.ensemble.AdaBoostRegressor': {
        'n_estimators': [100],
        'learning_rate': lrnRat,
        'loss': ["linear", "square", "exponential"],
    },
    'sklearn.linear_model.LassoLarsCV': {
        'normalize': [True, False],
    },
    'sklearn.ensemble.RandomForestRegressor': {
        'n_estimators': estim,
        'max_features': maxFeat,
        'min_samples_split': minSplt,
        'min_samples_leaf': minLeaf,
        'bootstrap': [True, False],
    },
    'sklearn.linear_model.RidgeCV': {},
}

# define the config of allowable pipeline steps
# (merged so the keys run selectors, then preprocessors, then regressors)
config = {**selectorOps, **transformerOps, **regressorOps}

In [None]:
''' run TPOT '''
# set genetic algorithm parameters
gens = 10    # number of generations
early = 5    # number of generations with no improvement in objective function to terminate early
pops = 20    # size of population
mutat = 0.05 # mutation rate
xover = 0.90 # crossover rate
objective = 'neg_mean_squared_error' # objective function that TPOT will attempt to maximize
randSeed = 42 # prng seed

# generate for cross-validation
nSplits = 10
trainPerc = 0.7
# seed the splitter too, otherwise the folds change between runs even though TPOT itself is seeded
cvs = ShuffleSplit(nSplits, train_size=trainPerc, random_state=randSeed)

# init
# `early` is handed over as early_stop; it was previously defined but never passed,
# so the search always ran for the full number of generations
tReg = TPOTRegressor(generations=gens, population_size=pops, mutation_rate=mutat, crossover_rate=xover, scoring=objective,
                     template=templt, config_dict=config, cv=cvs, early_stop=early, use_dask=True, verbosity=3,
                     n_jobs=-1, random_state=randSeed)

# fit the regressor, timing the wall-clock duration of the search
startTime = dat.datetime.now()
tReg.fit(data[features].values, data.target.values)
stopTime = dat.datetime.now()
elapsTime = stopTime - startTime

# talk
print('Elapsed time = %s'%elapsTime)

## Attributes from Fit TPOT
- TPOT will return the set of best models that score similarly but trade off on parameter complexity - this is called the [Pareto Front](https://en.wikipedia.org/wiki/Pareto_efficiency).
- You can also access the best pipeline, which is an [sklearn pipeline object](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html), which we all know how to use.

In [None]:
# can access *all* pipelines fit
# NOTE(review): the original note said this count equals gens*pops; TPOT's docs describe
# population_size + generations*offspring_size evaluations, and this is a dict (one entry
# per distinct key) - confirm before relying on the exact number
evaluated = tReg.evaluated_individuals_
print(f'{len(evaluated)} Pipelines Fit')

# one line per evaluated pipeline: its internal CV score, then its key
for pipeKey, pipeInfo in evaluated.items():
    cvScore = pipeInfo['internal_cv_score']
    print(f'{cvScore:0.3f} - {pipeKey}')

In [None]:
# the pipelines on the Pareto Front live in a dict; list its keys
print('Pareto Front')
for frontKey in tReg.pareto_front_fitted_pipelines_:
    print(frontKey)

# get the "best" pipeline
print('Best Pipeline')
bestPipe = tReg.fitted_pipeline_
print(bestPipe)

# RMSE of the best pipeline's predictions for the rows of `data` (same frame used above)
yTrue = data.target.values
yPred = bestPipe.predict(data[features].values)
rmse = math.sqrt(mean_squared_error(yTrue, yPred))
print('Best Pipeline RMSE = %0.3f'%rmse)

# last element of the last step, i.e. the estimator object at the end of the pipeline
bestEstim = bestPipe.steps[-1][-1]

In [None]:
# let's look at feature weights from the best pipeline's final estimator
# The regressors in the search space are a mix of linear models (which expose `coef_`)
# and tree ensembles (which expose `feature_importances_`), so probe for either
# instead of assuming `coef_` exists.
if hasattr(bestEstim, 'coef_'):
    weightCol, weights = 'Coefficients', np.ravel(bestEstim.coef_)
elif hasattr(bestEstim, 'feature_importances_'):
    weightCol, weights = 'Importances', np.ravel(bestEstim.feature_importances_)
else:
    weightCol, weights = None, None

if weights is None:
    print('%s exposes neither coef_ nor feature_importances_' % type(bestEstim).__name__)
else:
    # The steps ahead of the estimator may drop or create columns, so the weights can only
    # be labelled with the original feature names when the counts agree; otherwise fall
    # back to positional labels rather than letting the DataFrame constructor raise.
    # NOTE(review): equal counts do not prove the columns are the original features - confirm
    if len(weights) == len(features):
        rowNames = features
    else:
        rowNames = ['input_%d' % i for i in range(len(weights))]

    coefs = pd.DataFrame(data=weights, index=rowNames, columns=[weightCol])
    # order rows by magnitude, largest first, using a temporary helper column
    coefs['abs'] = coefs[weightCol].abs()
    coefs = coefs.sort_values(by=['abs'], ascending=False).drop(columns=['abs'])
    display(coefs)

## Export Best Pipeline as Python Code
This can be used as a template for starting more comprehensive modeling work.

In [None]:
# export the resultant model to a Python file
pyFile = './TPOT_Demo.py'
tReg.export(pyFile)

# read the exported file back and show its contents
print('Best TPOT Pipeline')
with open(pyFile, 'rt') as f:
    exported = f.read()
print(exported)