In [0]:
!pip install tpot

In [0]:
from xgboost import XGBClassifier
from tpot import TPOTClassifier, TPOTRegressor
from deap.gp import Primitive
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
import time

In [0]:
from google.colab import files


In [0]:
uploaded = files.upload()

Saving Pima_Diabetes_Data.csv to Pima_Diabetes_Data.csv


In [0]:
import io

In [0]:
# Read in uploaded CSV file into Pandas DataFrame-
pima_data = pd.read_csv(io.BytesIO(uploaded['Pima_Diabetes_Data.csv']))

In [0]:
# Dimension/shape of data-
pima_data.shape

(768, 9)

In [0]:
# Check for missing values-
print("\nMissing values in dataset? {0}\n".format(pima_data.isnull().values.any()))
print("# of missing values in dataset = {0}\n".format(pima_data.isnull().sum().sum()))


Missing values in dataset? False

# of missing values in dataset = 0



In [0]:
# Get data types for the attributes/features/columns in dataset-
pima_data.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [0]:
# Get distribution of target variable 'Outcome'-
pima_data['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

We can see that the dataset is skewed towards target label of '0'. This will have to be taken into account later towards machine learning model optimizations and tuning

In [0]:
# Split dataset into features (X) and label (y)-
X = pima_data.drop('Outcome', axis = 1)
y = pima_data['Outcome']

In [0]:
# Convert all numeric values to float-
X = X.values.astype("float")

# Convert 'X' from numpy array to pandas DataFrame-
X = pd.DataFrame(X, columns=pima_data.columns.tolist()[:-1])

In [0]:
# Normalize/Scale dataset-
# mm_scaler = MinMaxScaler()
rb_scaler = RobustScaler()

# X_scaled = mm_scaler.fit_transform(X)
X_scaled = rb_scaler.fit_transform(X)

# Convert 'X_scaled' from numpy array to pandas DataFrame-
X_scaled = pd.DataFrame(X_scaled, columns=X.columns.tolist())

In [0]:
# Divide attributes & labels into training & testing sets-
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.30, stratify = y)

print("\nDimensions of training and testing sets are:")
print("X_train = {0}, y_train = {1}, X_test = {2} and y_test = {3}\n\n".format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))


Dimensions of training and testing sets are:
X_train = (537, 8), y_train = (537,), X_test = (231, 8) and y_test = (231,)




In [0]:
# Use a default XGBoost classifier to get model metrics to optimize further (if possible):

# Initialize a base/default XGBoost classifier-
xgb_clf = xgb.XGBClassifier()

# Train classifier on training data-
xgb_clf.fit(X_train, y_train)

# Make predictions using trained model-
y_pred_base = xgb_clf.predict(X_test)

In [0]:
# Get model metrics of default model-
acc_base = accuracy_score(y_test, y_pred_base)
precision_base = precision_score(y_test, y_pred_base)
recall_base = recall_score(y_test, y_pred_base)

print("\nModel metrics for default XGBoost classifier are:")
print("accuracy = {0:.4f}, precision = {1:.4f} & recall = {2:.4f}\n".format(acc_base, precision_base, recall_base))


Model metrics for default XGBoost classifier are:
accuracy = 0.7532, precision = 0.6667 & recall = 0.5926



Now use hyper-parameter optimizations using 'TPOT'.

"TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming."

Refer-
https://epistasislab.github.io/tpot/

In [0]:
# Define the hyperparamters-
params = {
	'max_depth': np.arange(1,200,1),
	'learning_rate': np.arange(0.0001,0.1,0.0001),
	'n_estimators': np.arange(1,200,1),
	# 'nthread':[6],
	# 'gamma':np.arange(0.00001,0.1,0.00001),
	'subsample':np.arange(0.1,2,0.1),
	'reg_lambda': np.arange(0.1,200,1),
	'reg_alpha': np.arange(1,200,1),
	'min_child_weight': np.arange(1,200,1),
	'gamma': np.arange(0.1, 2, 0.1),
	'colsample_bytree': np.arange(0.1,2,0.1),
	'colsample_bylevel': np.arange(0.1,2,0.1)
	}

In [0]:
# Execute TPOT for hyperparameter optimization-
tpot_classifier = TPOTClassifier(
	generations = 20, population_size = 500,
	offspring_size = 250, verbosity = 2,
	early_stop = 8, 
	config_dict = {'xgboost.XGBClassifier': params},
	cv = 10, scoring = 'accuracy')

In [0]:
tic = time.time()

# Train TPOT on training data-
tpot_classifier.fit(X_train, y_train)

toc = time.time()
print("\nTime take by tpot to find 'best' hyperparameter = {0:.4f} seconds\n".format(toc - tic))

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=5500.0, style=ProgressStyle(d…

Generation 1 - Current best internal CV score: 0.7356394129979035
Generation 2 - Current best internal CV score: 0.7356394129979035
Generation 3 - Current best internal CV score: 0.7542627533193571
Generation 4 - Current best internal CV score: 0.7542627533193571
Generation 5 - Current best internal CV score: 0.7542627533193571
Generation 6 - Current best internal CV score: 0.7542627533193571
Generation 7 - Current best internal CV score: 0.7542627533193571

The optimized pipeline was not improved after evaluating 8 more generations. Will end the optimization process.

TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(CombineDFs(input_matrix, input_matrix), colsample_bylevel=1.0, colsample_bytree=0.9, gamma=0.4, learning_rate=0.04290000000000001, max_depth=22, min_child_weight=32, n_estimators=197, reg_alpha=13, reg_lambda=101.1, subsample=1.0)

Time take by tpot to find 'best' hyperparameter = 550.8192 seconds



In [0]:
# Extract best parameters found by TPOT-

# Python 3 dict to hold found 'best' hyper-parameters-
args = {}

for arg in tpot_classifier._optimized_pipeline:
  if type(arg) != Primitive:
    try:
      if arg.value.split('__')[1].split('=')[0] in ['max_depth', 'n_estimators', 'nthread','min_child_weigh']:
        args[arg.value.split('__')[1].split('=')[0]] = int(arg.value.split('__')[1].split('=')[1])
      else:
        args[arg.value.split('__')[1].split('=')[0]] = float(arg.value.split('__')[1].split('=')[1])

    except:
      pass


In [0]:
params = args

In [0]:
params

{'colsample_bylevel': 1.0,
 'colsample_bytree': 0.9,
 'gamma': 0.4,
 'learning_rate': 0.04290000000000001,
 'max_depth': 22,
 'min_child_weight': 32.0,
 'n_estimators': 197,
 'reg_alpha': 13.0,
 'reg_lambda': 101.1,
 'subsample': 1.0}

In [0]:
xgb_clf_hp = xgb.XGBClassifier(**params)

In [0]:
xgb_clf_hp.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,
              colsample_bynode=1, colsample_bytree=0.9, gamma=0.4,
              learning_rate=0.04290000000000001, max_delta_step=0, max_depth=22,
              min_child_weight=32.0, missing=None, n_estimators=197, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=13.0, reg_lambda=101.1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1.0, verbosity=1)

In [0]:
y_pred = xgb_clf_hp.predict(X_test)

In [0]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

In [0]:
accuracy, precision, recall

(0.7532467532467533, 0.7222222222222222, 0.48148148148148145)