In [1]:
# Set Working Directory
import os
os.chdir('..')

In [2]:
# Load Requirements
import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from tpot import TPOTClassifier
from sklearn.metrics import recall_score, precision_score

In [3]:
# Load Data
wrangled_data = pd.read_csv('output/wrangled_data_ii.csv', dtype='str', na_values='NaN')

In [4]:
def prep_columns_classification(wrangled_data_ii, feature_set=None):
    """
    Return X_train, X_dev, X_test, y_train, y_dev, y_test: predictors (X) and label (y) of train, dev, and test sets (0.8, 0.1, 0.1 split)

    param dataframe wrangled_data_ii: dataframe of wrangled dataframe (after rename/reduce columns)
    param list feature_set: list of features to include
    """

    prepped_classification_data = wrangled_data_ii.copy()

    # Specify features to include in model
    if feature_set:
        pass
    else:
        feature_set = wrangled_data_ii.drop(columns=['NAME', 'LEAID', 'exist_five_years']).columns
    X = wrangled_data_ii[feature_set].copy()

    # Identify column types
    identifying_columns = ['NAME', 'LEAID']
    prediction_columns = ['exist_five_years']
    categorical_columns = ['lowest_grade', 'highest_grade', 'charter_status']
    boolean_columns = ['bureau_indian_education']
    numerical_columns = []

    # identify numerical columns
    for column in X.columns:
        if column in identifying_columns or column in categorical_columns or column in boolean_columns or column in prediction_columns:
            pass
        elif len(X[column].unique()) > 100:
            numerical_columns.append(column)
        else:
            categorical_columns.append(column)

    X[numerical_columns] = X[numerical_columns].astype(float)

    # one hot encode categorical variables
    X = pd.get_dummies(X, prefix_sep='_', columns=categorical_columns, drop_first=True)

    # Split into train and test sets
    y = wrangled_data_ii[prediction_columns].apply(lambda x: x=='False')
    y = y.values.ravel()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)
    X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=21)

    return X_train, y_train, X_dev, y_dev, X_test, y_test

In [5]:
features_15 = ['total_local_revenue', 
               'total_state_revenue', 
               'total_federal_revenue',
               'teachers_total', 
               'charter_status', 
               'white_students', 
               'total_schools',
               'total_students',
               'lowest_grade', 
               'highest_grade', 
               'state_name', 
               'total_expenditure', 
               'administrators_school', 
               'metro_micro', 
               'white_male_students']

X_train, y_train, X_dev, y_dev, X_test, y_test = prep_columns_classification(wrangled_data, features_15)

In [6]:
def test_model(clf_pipeline, X_train, y_train, X_dev, y_dev):
    """
    Return sklearn pipeline object clf_pipeline: trained sklearn pipeline
    Return recall_train, recall_dev, precision_train, precision_dev: recall and precision of training and development sets
    
    param sklearn pipeline object clf_pipeline: untrained sklearn pipeline
    param np.array X_train, y_train, X_dev, y_dev: feature (X) and labels (y) of training and development sets
    """
    
    warnings.filterwarnings('ignore', category=DeprecationWarning)
    
    # Fit pipeline using training data
    clf_pipeline.fit(X_train, y_train) 

    # Get predictions for training and development sets
    train_predictions = clf_pipeline.predict(X_train)
    dev_predictions = clf_pipeline.predict(X_dev)
    
    # Calculate recall and precision of training and development sets
    recall_train = recall_score(y_train, train_predictions)
    recall_dev = recall_score(y_dev, dev_predictions)
    precision_train = precision_score(y_train, train_predictions)
    precision_dev = precision_score(y_dev, dev_predictions)

    return clf_pipeline, recall_train, recall_dev, precision_train, precision_dev

In [7]:
automl_recall_pipeline = TPOTClassifier(verbosity=2, scoring='recall', periodic_checkpoint_folder='output/automl_progress/', early_stop=10, n_jobs=-1)
automl_recall_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(automl_recall_pipeline, X_train, y_train, X_dev, y_dev)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Imputing missing values in feature set


Optimization Progress:   2%|▏         | 206/10100 [40:12<43:43:13, 15.91s/pipeline]

Generation 1 - Current best internal CV score: 0.9977207792207793


Optimization Progress:   3%|▎         | 307/10100 [58:02<27:11:05,  9.99s/pipeline]

Generation 2 - Current best internal CV score: 0.9977207792207793


Optimization Progress:   4%|▍         | 408/10100 [1:16:50<22:38:39,  8.41s/pipeline]

Generation 3 - Current best internal CV score: 0.9977207792207793


Optimization Progress:   5%|▌         | 510/10100 [1:40:12<52:21:14, 19.65s/pipeline]

Generation 4 - Current best internal CV score: 0.9977207792207793


Optimization Progress:   6%|▌         | 610/10100 [1:55:21<22:04:50,  8.38s/pipeline]

Generation 5 - Current best internal CV score: 0.9977207792207793


Optimization Progress:   7%|▋         | 710/10100 [2:10:54<21:25:50,  8.22s/pipeline]

Generation 6 - Current best internal CV score: 0.9977207792207793


Optimization Progress:   8%|▊         | 811/10100 [2:27:53<16:52:59,  6.54s/pipeline]

Generation 7 - Current best internal CV score: 0.9988571428571429


Optimization Progress:   9%|▉         | 912/10100 [2:51:57<30:42:43, 12.03s/pipeline]

Generation 8 - Current best internal CV score: 1.0


Optimization Progress:  10%|█         | 1013/10100 [3:13:22<18:17:50,  7.25s/pipeline]

Generation 9 - Current best internal CV score: 1.0


Optimization Progress:  11%|█         | 1113/10100 [3:25:31<17:42:44,  7.10s/pipeline]

Generation 10 - Current best internal CV score: 1.0


Optimization Progress:  12%|█▏        | 1215/10100 [3:48:56<17:15:16,  6.99s/pipeline]

Generation 11 - Current best internal CV score: 1.0


Optimization Progress:  13%|█▎        | 1317/10100 [4:13:52<17:13:32,  7.06s/pipeline]

Generation 12 - Current best internal CV score: 1.0


Optimization Progress:  14%|█▍        | 1421/10100 [4:48:25<25:36:31, 10.62s/pipeline]

Generation 13 - Current best internal CV score: 1.0


Optimization Progress:  15%|█▌        | 1523/10100 [5:10:00<39:59:37, 16.79s/pipeline]

Generation 14 - Current best internal CV score: 1.0


Optimization Progress:  16%|█▌        | 1624/10100 [5:39:25<32:26:40, 13.78s/pipeline]

Generation 15 - Current best internal CV score: 1.0


Optimization Progress:  17%|█▋        | 1726/10100 [6:00:00<19:38:45,  8.45s/pipeline]

Generation 16 - Current best internal CV score: 1.0


Optimization Progress:  18%|█▊        | 1826/10100 [6:12:21<14:45:26,  6.42s/pipeline]

Generation 17 - Current best internal CV score: 1.0


Optimization Progress:  19%|█▉        | 1928/10100 [6:33:41<22:35:32,  9.95s/pipeline]

Generation 18 - Current best internal CV score: 1.0


Optimization Progress:  20%|██        | 2028/10100 [6:43:17<14:01:13,  6.25s/pipeline]

Generation 19 - Current best internal CV score: 1.0


Optimization Progress:  21%|██        | 2128/10100 [6:55:35<11:08:14,  5.03s/pipeline]

Generation 20 - Current best internal CV score: 1.0


Optimization Progress:  22%|██▏       | 2228/10100 [7:05:24<19:33:27,  8.94s/pipeline]

Generation 21 - Current best internal CV score: 1.0


Optimization Progress:  23%|██▎       | 2329/10100 [7:16:47<17:31:20,  8.12s/pipeline]

Generation 22 - Current best internal CV score: 1.0


Optimization Progress:  24%|██▍       | 2431/10100 [7:35:12<20:11:01,  9.47s/pipeline]

Generation 23 - Current best internal CV score: 1.0


Optimization Progress:  25%|██▌       | 2531/10100 [7:46:15<7:58:31,  3.79s/pipeline] 

Generation 24 - Current best internal CV score: 1.0


Optimization Progress:  26%|██▌       | 2632/10100 [8:02:04<12:12:37,  5.89s/pipeline]

Generation 25 - Current best internal CV score: 1.0




Generation 26 - Current best internal CV score: 1.0

The optimized pipeline was not improved after evaluating 10 more generations. Will end the optimization process.

TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: GaussianNB(Nystroem(input_matrix, gamma=0.6000000000000001, kernel=rbf, n_components=3))
Imputing missing values in feature set
Imputing missing values in feature set
Recall on training data:       1.0
Recall on development data:    1.0
Precision on training data:    0.05937711577522004
Precision on development data: 0.056879739978331526


In [8]:
automl_recall_pipeline.export('output/pipelines/automl_recall_pipeline.py')

True

In [7]:
automl_f1_weighted_pipeline = TPOTClassifier(verbosity=2, scoring='f1_weighted', periodic_checkpoint_folder='output/automl_progress/', early_stop=10, n_jobs=-1)
automl_f1_weighted_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(automl_f1_weighted_pipeline, X_train, y_train, X_dev, y_dev)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Imputing missing values in feature set


Optimization Progress:   2%|▏         | 200/10100 [23:38<21:11:17,  7.70s/pipeline]

Generation 1 - Current best internal CV score: 0.9552090195865057


Optimization Progress:   3%|▎         | 300/10100 [40:06<23:55:43,  8.79s/pipeline]

Generation 2 - Current best internal CV score: 0.9555754422706657


Optimization Progress:   4%|▍         | 402/10100 [1:05:54<22:05:11,  8.20s/pipeline]

Generation 3 - Current best internal CV score: 0.9555754422706657


Optimization Progress:   5%|▍         | 504/10100 [1:32:49<29:05:51, 10.92s/pipeline]

Generation 4 - Current best internal CV score: 0.955797611246876


Optimization Progress:   6%|▌         | 604/10100 [1:48:47<29:21:40, 11.13s/pipeline]

Generation 5 - Current best internal CV score: 0.9561272929159065


Optimization Progress:   7%|▋         | 706/10100 [2:14:29<26:17:59, 10.08s/pipeline]

Generation 6 - Current best internal CV score: 0.9561272929159065


Optimization Progress:   8%|▊         | 809/10100 [2:43:20<39:08:09, 15.16s/pipeline]

Generation 7 - Current best internal CV score: 0.9564888315579939


Optimization Progress:   9%|▉         | 912/10100 [3:15:43<44:02:45, 17.26s/pipeline]

Generation 8 - Current best internal CV score: 0.9564888315579939


Optimization Progress:  10%|█         | 1015/10100 [3:50:25<43:31:16, 17.25s/pipeline]

Generation 9 - Current best internal CV score: 0.9567441938335473


Optimization Progress:  11%|█         | 1120/10100 [4:31:07<29:22:21, 11.78s/pipeline]

Generation 10 - Current best internal CV score: 0.9568318001240316


Optimization Progress:  12%|█▏        | 1225/10100 [5:06:57<35:28:42, 14.39s/pipeline]

Generation 11 - Current best internal CV score: 0.9568318001240316


Optimization Progress:  13%|█▎        | 1328/10100 [5:40:32<49:36:41, 20.36s/pipeline]

Generation 12 - Current best internal CV score: 0.9568318001240316


Optimization Progress:  14%|█▍        | 1429/10100 [6:11:48<31:57:38, 13.27s/pipeline]

Generation 13 - Current best internal CV score: 0.9568318001240316


Optimization Progress:  15%|█▌        | 1533/10100 [6:47:56<58:59:24, 24.79s/pipeline]

Generation 14 - Current best internal CV score: 0.9571419283969183


Optimization Progress:  16%|█▌        | 1639/10100 [7:33:44<67:37:47, 28.78s/pipeline]

Generation 15 - Current best internal CV score: 0.9571419283969183


Optimization Progress:  17%|█▋        | 1742/10100 [8:14:24<48:38:29, 20.95s/pipeline]

Generation 16 - Current best internal CV score: 0.9574759081895741


Optimization Progress:  18%|█▊        | 1842/10100 [8:32:38<18:33:09,  8.09s/pipeline]

Generation 17 - Current best internal CV score: 0.9574759081895741


Optimization Progress:  19%|█▉        | 1944/10100 [9:01:38<36:31:42, 16.12s/pipeline]

Generation 18 - Current best internal CV score: 0.9574759081895741


Optimization Progress:  20%|██        | 2045/10100 [9:26:44<28:41:50, 12.83s/pipeline]

Generation 19 - Current best internal CV score: 0.9574759081895741


Optimization Progress:  21%|██▏       | 2147/10100 [9:50:16<41:58:07, 19.00s/pipeline]

Generation 20 - Current best internal CV score: 0.9574759081895741


Optimization Progress:  22%|██▏       | 2247/10100 [10:15:36<21:20:15,  9.78s/pipeline]

Generation 21 - Current best internal CV score: 0.9574759081895741


Optimization Progress:  23%|██▎       | 2348/10100 [10:45:33<28:59:35, 13.46s/pipeline]

Generation 22 - Current best internal CV score: 0.9574759081895741


Optimization Progress:  24%|██▍       | 2453/10100 [11:27:04<62:54:29, 29.62s/pipeline]

Generation 23 - Current best internal CV score: 0.9574759081895741


Optimization Progress:  25%|██▌       | 2554/10100 [11:56:48<24:13:20, 11.56s/pipeline]

Generation 24 - Current best internal CV score: 0.9574759081895741


Optimization Progress:  26%|██▋       | 2658/10100 [12:32:43<39:43:52, 19.22s/pipeline]

Generation 25 - Current best internal CV score: 0.9574759081895741


Optimization Progress:  27%|██▋       | 2760/10100 [13:10:26<28:28:04, 13.96s/pipeline]

Generation 26 - Current best internal CV score: 0.9574759081895741


Optimization Progress:  28%|██▊       | 2861/10100 [13:45:51<58:32:54, 29.12s/pipeline]

Generation 27 - Current best internal CV score: 0.9576386406262041


Optimization Progress:  29%|██▉       | 2962/10100 [14:21:35<36:10:52, 18.25s/pipeline]

Generation 28 - Current best internal CV score: 0.9576386406262041


Optimization Progress:  30%|███       | 3064/10100 [14:56:04<31:50:35, 16.29s/pipeline]

Generation 29 - Current best internal CV score: 0.9576386406262041


Optimization Progress:  31%|███▏      | 3165/10100 [15:26:30<26:17:46, 13.65s/pipeline]

Generation 30 - Current best internal CV score: 0.9576386406262041


Optimization Progress:  32%|███▏      | 3269/10100 [16:00:38<34:37:42, 18.25s/pipeline]

Generation 31 - Current best internal CV score: 0.9576386406262041


Optimization Progress:  33%|███▎      | 3370/10100 [16:27:10<29:41:21, 15.88s/pipeline]

Generation 32 - Current best internal CV score: 0.9576386406262041


Optimization Progress:  34%|███▍      | 3472/10100 [17:06:57<37:34:39, 20.41s/pipeline]

Generation 33 - Current best internal CV score: 0.9576386406262041


Optimization Progress:  35%|███▌      | 3577/10100 [17:46:45<39:44:55, 21.94s/pipeline]

Generation 34 - Current best internal CV score: 0.9576386406262041


Optimization Progress:  36%|███▋      | 3682/10100 [18:27:01<34:37:11, 19.42s/pipeline]

Generation 35 - Current best internal CV score: 0.9576386406262041


Optimization Progress:  37%|███▋      | 3784/10100 [19:04:54<35:57:50, 20.50s/pipeline]

Generation 36 - Current best internal CV score: 0.9576790979890856


Optimization Progress:  38%|███▊      | 3884/10100 [19:33:02<25:39:01, 14.86s/pipeline]

Generation 37 - Current best internal CV score: 0.9576790979890856


Optimization Progress:  39%|███▉      | 3984/10100 [20:06:12<30:20:54, 17.86s/pipeline]

Generation 38 - Current best internal CV score: 0.9576790979890856


Optimization Progress:  40%|████      | 4085/10100 [20:41:34<42:12:11, 25.26s/pipeline]

Generation 39 - Current best internal CV score: 0.9576790979890856


Optimization Progress:  41%|████▏     | 4191/10100 [21:32:15<47:05:38, 28.69s/pipeline]

Generation 40 - Current best internal CV score: 0.9576790979890856


Optimization Progress:  43%|████▎     | 4295/10100 [22:13:06<27:18:19, 16.93s/pipeline]

Generation 41 - Current best internal CV score: 0.9578456717783361


Optimization Progress:  44%|████▎     | 4397/10100 [22:51:12<33:57:50, 21.44s/pipeline]

Generation 42 - Current best internal CV score: 0.9578456717783361


Optimization Progress:  45%|████▍     | 4500/10100 [23:29:16<33:13:02, 21.35s/pipeline]

Generation 43 - Current best internal CV score: 0.9578456717783361


Optimization Progress:  46%|████▌     | 4603/10100 [24:03:57<22:19:35, 14.62s/pipeline]

Generation 44 - Current best internal CV score: 0.9582025515802549


Optimization Progress:  47%|████▋     | 4703/10100 [24:42:20<31:31:51, 21.03s/pipeline]

Generation 45 - Current best internal CV score: 0.9582025515802549


Optimization Progress:  48%|████▊     | 4805/10100 [25:16:14<20:43:03, 14.09s/pipeline]

Generation 46 - Current best internal CV score: 0.9582025515802549


Optimization Progress:  49%|████▊     | 4907/10100 [25:50:06<25:28:55, 17.67s/pipeline]

Generation 47 - Current best internal CV score: 0.9582025515802549


Optimization Progress:  50%|████▉     | 5010/10100 [26:20:54<26:07:23, 18.48s/pipeline]

Generation 48 - Current best internal CV score: 0.9582025515802549


Optimization Progress:  51%|█████     | 5112/10100 [26:59:53<28:40:12, 20.69s/pipeline]

Generation 49 - Current best internal CV score: 0.9582025515802549


Optimization Progress:  52%|█████▏    | 5214/10100 [27:39:31<27:55:54, 20.58s/pipeline]

Generation 50 - Current best internal CV score: 0.9582025515802549


Optimization Progress:  53%|█████▎    | 5316/10100 [28:14:00<19:07:45, 14.39s/pipeline]

Generation 51 - Current best internal CV score: 0.9582025515802549


Optimization Progress:  54%|█████▎    | 5420/10100 [28:52:22<24:27:00, 18.81s/pipeline]

Generation 52 - Current best internal CV score: 0.9582025515802549


Optimization Progress:  55%|█████▍    | 5522/10100 [29:24:23<20:10:40, 15.87s/pipeline]

Generation 53 - Current best internal CV score: 0.9582025515802549




Generation 54 - Current best internal CV score: 0.9582025515802549

The optimized pipeline was not improved after evaluating 10 more generations. Will end the optimization process.

TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: ExtraTreesClassifier(CombineDFs(RandomForestClassifier(MaxAbsScaler(input_matrix), bootstrap=False, criterion=entropy, max_features=0.25, min_samples_leaf=5, min_samples_split=14, n_estimators=100), input_matrix), bootstrap=True, criterion=gini, max_features=0.6000000000000001, min_samples_leaf=1, min_samples_split=5, n_estimators=100)
Imputing missing values in feature set
Imputing missing values in feature set
Recall on training data:       0.7434435575826682
Recall on development data:    0.4
Precision on training data:    0.9644970414201184
Precision on development data: 0.84


In [8]:
automl_f1_weighted_pipeline.export('output/pipelines/automl_f1_weighted_pipeline.py')

True

In [9]:
automl_roc_auc_pipeline = TPOTClassifier(verbosity=2, scoring='roc_auc', periodic_checkpoint_folder='output/automl_progress/', early_stop=10, n_jobs=-1)
automl_roc_auc_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(automl_roc_auc_pipeline, X_train, y_train, X_dev, y_dev)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Imputing missing values in feature set


Optimization Progress:   2%|▏         | 205/10100 [45:59<20:40:15,  7.52s/pipeline]

Generation 1 - Current best internal CV score: 0.9266751050317081


Optimization Progress:   3%|▎         | 306/10100 [1:07:01<25:33:45,  9.40s/pipeline]

Generation 2 - Current best internal CV score: 0.9266751050317081


Optimization Progress:   4%|▍         | 407/10100 [1:30:37<31:54:17, 11.85s/pipeline]

Generation 3 - Current best internal CV score: 0.9310347796320269


Optimization Progress:   5%|▌         | 508/10100 [1:59:01<28:25:59, 10.67s/pipeline]

Generation 4 - Current best internal CV score: 0.9310347796320269


Optimization Progress:   6%|▌         | 609/10100 [2:26:01<35:35:14, 13.50s/pipeline]

Generation 5 - Current best internal CV score: 0.9321063670945824


Optimization Progress:   7%|▋         | 711/10100 [2:51:55<26:23:18, 10.12s/pipeline]

Generation 6 - Current best internal CV score: 0.9322631867952127


Optimization Progress:   8%|▊         | 811/10100 [3:10:21<28:10:07, 10.92s/pipeline]

Generation 7 - Current best internal CV score: 0.9322631867952127


Optimization Progress:   9%|▉         | 912/10100 [3:28:58<20:25:20,  8.00s/pipeline]

Generation 8 - Current best internal CV score: 0.9322631867952127


Optimization Progress:  10%|█         | 1016/10100 [4:04:02<38:00:04, 15.06s/pipeline]

Generation 9 - Current best internal CV score: 0.9322631867952127


Optimization Progress:  11%|█         | 1116/10100 [4:20:39<19:17:19,  7.73s/pipeline]

Generation 10 - Current best internal CV score: 0.9322631867952127


Optimization Progress:  12%|█▏        | 1216/10100 [4:37:36<22:42:54,  9.20s/pipeline]

Generation 11 - Current best internal CV score: 0.9322631867952127


Optimization Progress:  13%|█▎        | 1316/10100 [4:56:04<27:40:24, 11.34s/pipeline]

Generation 12 - Current best internal CV score: 0.9326543452049929


Optimization Progress:  14%|█▍        | 1419/10100 [5:25:17<27:38:17, 11.46s/pipeline]

Generation 13 - Current best internal CV score: 0.933276593234042


Optimization Progress:  15%|█▌        | 1520/10100 [5:47:30<33:47:24, 14.18s/pipeline]

Generation 14 - Current best internal CV score: 0.933276593234042


Optimization Progress:  16%|█▌        | 1622/10100 [6:14:23<56:06:32, 23.83s/pipeline]

Generation 15 - Current best internal CV score: 0.9336586890080053


Optimization Progress:  17%|█▋        | 1722/10100 [6:30:13<22:23:10,  9.62s/pipeline]

Generation 16 - Current best internal CV score: 0.9336586890080053


Optimization Progress:  18%|█▊        | 1824/10100 [6:55:49<43:07:34, 18.76s/pipeline]

Generation 17 - Current best internal CV score: 0.9336586890080053


Optimization Progress:  19%|█▉        | 1927/10100 [7:28:38<29:10:29, 12.85s/pipeline]

Generation 18 - Current best internal CV score: 0.9336586890080053


Optimization Progress:  20%|██        | 2029/10100 [7:53:09<20:14:39,  9.03s/pipeline]

Generation 19 - Current best internal CV score: 0.9341006657070888


Optimization Progress:  21%|██        | 2130/10100 [8:15:01<20:36:49,  9.31s/pipeline]

Generation 20 - Current best internal CV score: 0.9341643857222304


Optimization Progress:  22%|██▏       | 2230/10100 [8:32:03<31:45:43, 14.53s/pipeline]

Generation 21 - Current best internal CV score: 0.9341643857222304


Optimization Progress:  23%|██▎       | 2330/10100 [8:46:32<16:07:22,  7.47s/pipeline]

Generation 22 - Current best internal CV score: 0.9341643857222304


Optimization Progress:  24%|██▍       | 2431/10100 [9:07:19<17:08:46,  8.05s/pipeline]

Generation 23 - Current best internal CV score: 0.9341643857222304


Optimization Progress:  25%|██▌       | 2531/10100 [9:26:17<20:44:31,  9.87s/pipeline]

Generation 24 - Current best internal CV score: 0.9348076424295388


Optimization Progress:  26%|██▌       | 2634/10100 [9:50:58<20:15:30,  9.77s/pipeline]

Generation 25 - Current best internal CV score: 0.9348076424295388


Optimization Progress:  27%|██▋       | 2737/10100 [10:23:52<27:25:05, 13.41s/pipeline]

Generation 26 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  28%|██▊       | 2838/10100 [10:44:42<18:41:18,  9.26s/pipeline]

Generation 27 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  29%|██▉       | 2942/10100 [11:16:17<28:13:34, 14.20s/pipeline]

Generation 28 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  30%|███       | 3044/10100 [11:45:12<30:27:28, 15.54s/pipeline]

Generation 29 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  31%|███       | 3146/10100 [12:11:34<19:35:27, 10.14s/pipeline]

Generation 30 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  32%|███▏      | 3246/10100 [12:32:21<24:19:13, 12.77s/pipeline]

Generation 31 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  33%|███▎      | 3348/10100 [12:57:12<20:58:15, 11.18s/pipeline]

Generation 32 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  34%|███▍      | 3449/10100 [13:20:10<17:17:49,  9.36s/pipeline]

Generation 33 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  35%|███▌      | 3551/10100 [13:44:02<19:43:50, 10.85s/pipeline]

Generation 34 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  36%|███▌      | 3654/10100 [14:15:35<27:50:26, 15.55s/pipeline]

Generation 35 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  37%|███▋      | 3754/10100 [14:32:38<12:37:45,  7.16s/pipeline]

Generation 36 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  38%|███▊      | 3857/10100 [15:00:48<19:30:21, 11.25s/pipeline]

Generation 37 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  39%|███▉      | 3959/10100 [15:26:39<18:06:29, 10.62s/pipeline]

Generation 38 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  40%|████      | 4060/10100 [15:48:14<12:23:02,  7.38s/pipeline]

Generation 39 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  41%|████      | 4160/10100 [16:06:02<9:50:31,  5.96s/pipeline] 

Generation 40 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  42%|████▏     | 4261/10100 [16:24:19<14:00:19,  8.64s/pipeline]

Generation 41 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  43%|████▎     | 4363/10100 [16:53:51<19:38:00, 12.32s/pipeline]

Generation 42 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  44%|████▍     | 4466/10100 [17:23:19<20:41:02, 13.22s/pipeline]

Generation 43 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  45%|████▌     | 4567/10100 [17:43:15<20:43:06, 13.48s/pipeline]

Generation 44 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  46%|████▌     | 4669/10100 [18:08:26<14:48:36,  9.82s/pipeline]

Generation 45 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  47%|████▋     | 4770/10100 [18:27:48<12:34:45,  8.50s/pipeline]

Generation 46 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  48%|████▊     | 4871/10100 [18:48:04<11:49:54,  8.15s/pipeline]

Generation 47 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  49%|████▉     | 4973/10100 [19:07:39<24:07:08, 16.94s/pipeline]

Generation 48 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  50%|█████     | 5075/10100 [19:31:17<18:31:22, 13.27s/pipeline]

Generation 49 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  51%|█████▏    | 5177/10100 [19:51:17<9:11:47,  6.72s/pipeline] 

Generation 50 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  52%|█████▏    | 5278/10100 [20:07:13<11:45:23,  8.78s/pipeline]

Generation 51 - Current best internal CV score: 0.9357530306145815


Optimization Progress:  53%|█████▎    | 5379/10100 [20:22:20<13:51:15, 10.56s/pipeline]

Generation 52 - Current best internal CV score: 0.9361822434492459


Optimization Progress:  54%|█████▍    | 5483/10100 [20:47:35<14:30:12, 11.31s/pipeline]

Generation 53 - Current best internal CV score: 0.9361822434492459


Optimization Progress:  55%|█████▌    | 5583/10100 [20:57:52<6:20:46,  5.06s/pipeline]

Generation 54 - Current best internal CV score: 0.9361822434492459


Optimization Progress:  56%|█████▋    | 5683/10100 [21:07:52<7:04:53,  5.77s/pipeline]

Generation 55 - Current best internal CV score: 0.9361822434492459


Optimization Progress:  57%|█████▋    | 5783/10100 [21:18:28<6:43:00,  5.60s/pipeline]

Generation 56 - Current best internal CV score: 0.9361822434492459


Optimization Progress:  58%|█████▊    | 5883/10100 [21:28:13<7:31:45,  6.43s/pipeline]

Generation 57 - Current best internal CV score: 0.9361822434492459


Optimization Progress:  59%|█████▉    | 5984/10100 [21:42:29<4:11:25,  3.67s/pipeline]

Generation 58 - Current best internal CV score: 0.9361822434492459


Optimization Progress:  60%|██████    | 6085/10100 [21:56:20<10:16:14,  9.21s/pipeline]

Generation 59 - Current best internal CV score: 0.9361822434492459


Optimization Progress:  61%|██████    | 6185/10100 [22:05:48<6:10:51,  5.68s/pipeline]

Generation 60 - Current best internal CV score: 0.9361822434492459


Optimization Progress:  62%|██████▏   | 6286/10100 [22:20:39<13:15:40, 12.52s/pipeline]

Generation 61 - Current best internal CV score: 0.9361822434492459




Generation 62 - Current best internal CV score: 0.9361822434492459

The optimized pipeline was not improved after evaluating 10 more generations. Will end the optimization process.

TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: ExtraTreesClassifier(RobustScaler(LogisticRegression(GaussianNB(GaussianNB(BernoulliNB(input_matrix, alpha=0.1, fit_prior=True))), C=0.0001, dual=True, penalty=l2)), bootstrap=False, criterion=entropy, max_features=0.15000000000000002, min_samples_leaf=3, min_samples_split=4, n_estimators=100)
Imputing missing values in feature set
Imputing missing values in feature set
Recall on training data:       0.5461801596351197
Recall on development data:    0.37142857142857144
Precision on training data:    0.9429133858267716
Precision on development data: 0.8863636363636364


In [10]:
automl_roc_auc_pipeline.export('output/pipelines/automl_roc_auc_pipeline.py')

True