## Import Dataset

In [88]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
cdc_diabetes_health_indicators = fetch_ucirepo(id=891) 
  
# data (as pandas dataframes) 
X = cdc_diabetes_health_indicators.data.features 
y = cdc_diabetes_health_indicators.data.targets 

## Import Dependencies

In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import time
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical


## CDC Diabetes Health Indicators Dataset Summary

- **UCI ID**: 891
- **Name**: CDC Diabetes Health Indicators
- **Repository URL**: [https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators](https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators)
- **Data URL**: [https://archive.ics.uci.edu/static/public/891/data.csv](https://archive.ics.uci.edu/static/public/891/data.csv)
- **Abstract**: This dataset contains healthcare statistics and lifestyle survey information about people, along with their diagnosis of diabetes. It includes demographics, lab test results, and survey responses, focusing on the classification of diabetes status.
- **Area**: Health and Medicine
- **Tasks**: Classification
- **Characteristics**: Tabular, Multivariate
- **Number of Instances**: 253,680
- **Number of Features**: 21
- **Feature Types**: Categorical, Integer
- **Demographics Included**: Sex, Age, Education Level, Income
- **Target Variable**: `Diabetes_binary`
- **Index Column**: `ID`
- **Missing Values**: No missing values reported
- **Year of Dataset Creation**: 2017
- **Last Updated**: Fri Nov 03 2023
- **Dataset DOI**: 10.24432/C53919
- **Creators**: Not listed
- **Introductory Paper**:
  - **Title**: Incidence of End-Stage Renal Disease Attributed to Diabetes Among Persons with Diagnosed Diabetes — United States and Puerto Rico, 2000–2014
  - **Authors**: Nilka Rios Burrows, MPH; Israel Hora, PhD; Linda S. Geiss, MA; Edward W. Gregg, PhD; Ann Albright, PhD
  - **Published In**: Morbidity and Mortality Weekly Report
  - **Year**: 2017
  - **URL**: [https://www.cdc.gov/mmwr/volumes/66/wr/mm6643a2.htm](https://www.cdc.gov/mmwr/volumes/66/wr/mm6643a2.htm)
- **Additional Information**:
  - **Summary**: Dataset link [https://www.cdc.gov/brfss/annual_data/annual_2014.html](https://www.cdc.gov/brfss/annual_data/annual_2014.html)
  - **Purpose**: To understand the relationship between lifestyle and diabetes in the US
  - **Funded By**: The CDC
  - **Instances Represent**: Each row represents a person participating in the study.
  - **Recommended Data Splits**: Cross-validation or a fixed train-test split
  - **Sensitive Data**: Gender, Income, Education level
  - **Preprocessing Description**: Bucketing of age
  - **Variable Info**: Diabetes diagnosis, demographics (race, sex), personal information (income, education), health history (drinking, smoking, mental health, physical health)
- **External URL**: [https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset](https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset)



In [90]:
pd.set_option('display.max_colwidth', None)

In [91]:
# variable information 
cdc_diabetes_health_indicators.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,ID,ID,Integer,,Patient ID,,no
1,Diabetes_binary,Target,Binary,,0 = no diabetes 1 = prediabetes or diabetes,,no
2,HighBP,Feature,Binary,,0 = no high BP 1 = high BP,,no
3,HighChol,Feature,Binary,,0 = no high cholesterol 1 = high cholesterol,,no
4,CholCheck,Feature,Binary,,0 = no cholesterol check in 5 years 1 = yes cholesterol check in 5 years,,no
5,BMI,Feature,Integer,,Body Mass Index,,no
6,Smoker,Feature,Binary,,Have you smoked at least 100 cigarettes in your entire life? [Note: 5 packs = 100 cigarettes] 0 = no 1 = yes,,no
7,Stroke,Feature,Binary,,(Ever told) you had a stroke. 0 = no 1 = yes,,no
8,HeartDiseaseorAttack,Feature,Binary,,coronary heart disease (CHD) or myocardial infarction (MI) 0 = no 1 = yes,,no
9,PhysActivity,Feature,Binary,,physical activity in past 30 days - not including job 0 = no 1 = yes,,no


## Initial Exploratory Analysis of Data

In [92]:
#Setting max display of columns to 30 per dataframe
pd.set_option('display.max_columns', 30)

In [93]:
#Shows first five rows of dataset
X.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1,1,1,40,1,0,0,0,0,1,0,1,0,5,18,15,1,0,9,4,3
1,0,0,0,25,1,0,0,1,0,0,0,0,1,3,0,0,0,0,7,6,1
2,1,1,1,28,0,0,0,0,1,0,0,1,1,5,30,30,1,0,9,4,8
3,1,0,1,27,0,0,0,1,1,1,0,1,0,2,0,0,0,0,11,3,6
4,1,1,1,24,0,0,0,1,1,1,0,1,0,2,3,0,0,0,11,5,4


In [94]:
#Shows descriptive analytics of the dataset
X.describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.429001,0.424121,0.96267,28.382364,0.443169,0.040571,0.094186,0.756544,0.634256,0.81142,0.056197,0.951053,0.084177,2.511392,3.184772,4.242081,0.168224,0.440342,8.032119,5.050434,6.053875
std,0.494934,0.49421,0.189571,6.608694,0.496761,0.197294,0.292087,0.429169,0.481639,0.391175,0.230302,0.215759,0.277654,1.068477,7.412847,8.717951,0.374066,0.496429,3.05422,0.985774,2.071148
min,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,5.0
50%,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,7.0
75%,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,2.0,3.0,0.0,1.0,10.0,6.0,8.0
max,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0


In [95]:
#Total rows and columns in dataset
print("Total amount of rows in dataset:",X.shape[0])
print("Total amount of columns in dataset:", X.shape[1])

Total amount of rows in dataset: 253680
Total amount of columns in dataset: 21


In [96]:
#Frequency counts for each column 
for column in X.columns:
    print(f"Frequency counts for column {column}:")
    print(X[column].value_counts().sort_index())
    print("\n")

Frequency counts for column HighBP:
HighBP
0    144851
1    108829
Name: count, dtype: int64


Frequency counts for column HighChol:
HighChol
0    146089
1    107591
Name: count, dtype: int64


Frequency counts for column CholCheck:
CholCheck
0      9470
1    244210
Name: count, dtype: int64


Frequency counts for column BMI:
BMI
12      6
13     21
14     41
15    132
16    348
     ... 
91      1
92     32
95     12
96      1
98      7
Name: count, Length: 84, dtype: int64


Frequency counts for column Smoker:
Smoker
0    141257
1    112423
Name: count, dtype: int64


Frequency counts for column Stroke:
Stroke
0    243388
1     10292
Name: count, dtype: int64


Frequency counts for column HeartDiseaseorAttack:
HeartDiseaseorAttack
0    229787
1     23893
Name: count, dtype: int64


Frequency counts for column PhysActivity:
PhysActivity
0     61760
1    191920
Name: count, dtype: int64


Frequency counts for column Fruits:
Fruits
0     92782
1    160898
Name: count, dtype: int64


Fre

In [99]:
# Iterating over each column in the target DataFrame 'y' to print frequency counts and percentages
for column in y.columns:
    print(f"Frequency counts for column {column}:")
    counts = y[column].value_counts().sort_index()
    print(counts)
    
    print(f"\nPercentage of each value in column {column}:")
    percentages = y[column].value_counts(normalize=True).sort_index() * 100
    print(percentages.to_string(float_format='%.2f%%'))  # Formatting percentages to two decimal places
    
    print("\n" * 2)

Frequency counts for column Diabetes_binary:
Diabetes_binary
0    218334
1     35346
Name: count, dtype: int64

Percentage of each value in column Diabetes_binary:
Diabetes_binary
0   86.07%
1   13.93%





In [100]:
# Assuming X is your DataFrame of features and Y is a Series or DataFrame of the target variable
# First, ensure Y is a Series with the appropriate name for clarity
if isinstance(y, pd.DataFrame):
    y = y.squeeze()  # Converts DataFrame with a single column to a Series
y.name = 'Target'

# Combine X and Y into a single DataFrame
combined_df = pd.concat([X, y], axis=1)

# Calculate the correlation matrix
corr_matrix = combined_df.corr()

# Extract and sort correlations with the target variable (absolute values for sorting)
target_corr = corr_matrix['Target'].drop('Target', axis=0).abs().sort_values(ascending=False)

# Print the sorted correlations with the target variable
print(target_corr)

GenHlth                 0.293569
HighBP                  0.263129
DiffWalk                0.218344
BMI                     0.216843
HighChol                0.200276
Age                     0.177442
HeartDiseaseorAttack    0.177282
PhysHlth                0.171337
Income                  0.163919
Education               0.124456
PhysActivity            0.118133
Stroke                  0.105816
MentHlth                0.069315
CholCheck               0.064761
Smoker                  0.060789
HvyAlcoholConsump       0.057056
Veggies                 0.056584
Fruits                  0.040779
NoDocbcCost             0.031433
Sex                     0.031430
AnyHealthcare           0.016255
Name: Target, dtype: float64


## Preprocessing the Data

In [102]:
#Removing AnyHealthcare, NoDocbcCost, Fruits, Veggies
columns_to_remove = ['AnyHealthcare', 'NoDocbcCost', 'Fruits', 'Veggies']

#Remove specified columns
X_modified = X.drop(columns=columns_to_remove)


# Verify the columns have been removed by printing the remaining columns
print(X_modified.columns)

Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'HvyAlcoholConsump', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')


In [104]:
#Normalize data
scaler = StandardScaler().fit(X_modified)
X_standard = scaler.transform(X_modified)

#Convert normalized numpy array to dataframe
X_df = pd.DataFrame(X_standard)

In [105]:
#Checking to ensure normalization
X_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1.153688,1.165254,0.196922,1.757936,1.120927,-0.205637,-0.322458,-1.762814,-0.244014,2.329121,1.998592,1.233999,2.223615,-0.887021,0.3169,-1.065595,-1.474487
1,-0.866785,-0.858182,-5.078164,-0.511806,1.120927,-0.205637,-0.322458,0.567275,-0.244014,0.457294,-0.42963,-0.486592,-0.449718,-0.887021,-0.337933,0.963272,-2.440138
2,1.153688,1.165254,0.196922,-0.057858,-0.892119,-0.205637,-0.322458,-1.762814,-0.244014,2.329121,3.617407,2.95459,2.223615,-0.887021,0.3169,-1.065595,0.939638
3,1.153688,-0.858182,0.196922,-0.209174,-0.892119,-0.205637,-0.322458,0.567275,-0.244014,-0.478619,-0.42963,-0.486592,-0.449718,-0.887021,0.971733,-2.080028,-0.026012
4,1.153688,1.165254,0.196922,-0.663122,-0.892119,-0.205637,-0.322458,0.567275,-0.244014,-0.478619,-0.024926,-0.486592,-0.449718,-0.887021,0.971733,-0.051162,-0.991662


In [106]:
#Taking a random sample of the dataset 100K rows to reduce training/testing time of the model
#Random seed for reproducibility, then making sure that we take a random sample of X and the matching target Y
sampled_df_indices = X_df.sample(n=100000, random_state=19).index
sampled_X_df = X_df.loc[sampled_df_indices]
sampled_Y= y.loc[sampled_df_indices]

# Verify the shapes to confirm they are as expected
(sampled_X_df.shape, sampled_Y.shape)


((100000, 17), (100000,))

In [107]:
#Resetting the indices of both sampled_X_df and sampled_Y to start from 0 to 99999
sampled_X_df.reset_index(drop=True, inplace=True)
sampled_Y.reset_index(drop=True, inplace=True)

#Verify the reset indices by displaying the head of the DataFrames
(sampled_X_df.head(), sampled_Y.head())


(         0         1         2         3         4         5         6   \
 0 -0.866785  1.165254 -5.078164 -0.360490  1.120927 -0.205637 -0.322458   
 1 -0.866785 -0.858182  0.196922 -0.360490  1.120927 -0.205637 -0.322458   
 2 -0.866785 -0.858182 -5.078164 -0.209174 -0.892119 -0.205637 -0.322458   
 3 -0.866785 -0.858182  0.196922 -0.360490  1.120927 -0.205637 -0.322458   
 4  1.153688  1.165254  0.196922 -0.663122 -0.892119 -0.205637 -0.322458   
 
          7         8         9         10        11        12        13  \
 0  0.567275 -0.244014  0.457294 -0.024926  0.086938 -0.449718  1.127369   
 1  0.567275  4.098123 -1.414532  0.919382 -0.486592 -0.449718 -0.887021   
 2  0.567275 -0.244014 -1.414532 -0.429630 -0.486592 -0.449718  1.127369   
 3  0.567275 -0.244014 -1.414532 -0.429630 -0.486592 -0.449718  1.127369   
 4 -1.762814 -0.244014  0.457294  0.379778 -0.486592 -0.449718  1.127369   
 
          14        15        16  
 0 -0.665349  0.963272 -0.991662  
 1 -0.992766  

## Deal with the imbalanced class issue

In [109]:
# Separate indices for Y=1
indices_y_1 = y[y == 1].index

# Sampling 35346 indices for Y=0 without replacement
indices_y_0 = y[y == 0].sample(n=35346, random_state=19).index

# Concatenate indices from Y=1 and sampled Y=0
balanced_indices = indices_y_1.union(indices_y_0)

# Selecting the rows from X_df and y
balanced_X_df = X_df.loc[balanced_indices]
balanced_Y = y.loc[balanced_indices]

# Output the shapes to verify the operation
balanced_X_df.shape, balanced_Y.shape

((70692, 17), (70692,))

## Run baseline for logistic regression, random forest, lightgbm, neural network

In [110]:
# Start timing
start_time = time.time()

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(balanced_X_df, balanced_Y, test_size=0.2, random_state=42)

# Initialize a dictionary to store model accuracies
model_accuracies = {}

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_predictions)
model_accuracies['Logistic Regression'] = lr_accuracy
print("Logistic Regression Accuracy:", lr_accuracy)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
model_accuracies['Random Forest'] = rf_accuracy
print("Random Forest Accuracy:", rf_accuracy)

# LightGBM
lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
lgb_model.fit(X_train, y_train)
lgb_predictions = lgb_model.predict(X_test)
lgb_accuracy = accuracy_score(y_test, lgb_predictions)
model_accuracies['LightGBM'] = lgb_accuracy
print("LightGBM Accuracy:", lgb_accuracy)

# Simple Neural Network with Keras
nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax')
])
nn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train, to_categorical(y_train), epochs=10, batch_size=32, verbose=1)
nn_predictions = nn_model.predict(X_test)
nn_predictions = np.argmax(nn_predictions, axis=1)
nn_accuracy = accuracy_score(y_test, nn_predictions)
model_accuracies['Neural Network'] = nn_accuracy
print("Neural Network Accuracy:", nn_accuracy)

# Find the best model based on accuracy
best_model_name = max(model_accuracies, key=model_accuracies.get)
print(f"The best model is: {best_model_name} with an accuracy of: {model_accuracies[best_model_name]:.4f}")

# Print the classification report for the best model
predictions = {
    'Logistic Regression': lr_predictions,
    'Random Forest': rf_predictions,
    'LightGBM': lgb_predictions,
    'Neural Network': nn_predictions
}
print(f"Classification report for {best_model_name}:")
print(classification_report(y_test, predictions[best_model_name]))

# End timing and print elapsed time in minutes
end_time = time.time()
elapsed_time_minutes = (end_time - start_time) / 60
print(f"Total execution time: {elapsed_time_minutes:.2f} minutes")



Logistic Regression Accuracy: 0.7540137209137846
Random Forest Accuracy: 0.7358370464672184
[LightGBM] [Info] Number of positive: 28175, number of negative: 28378
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001660 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 203
[LightGBM] [Info] Number of data points in the train set: 56553, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498205 -> initscore=-0.007179
[LightGBM] [Info] Start training from score -0.007179
LightGBM Accuracy: 0.7581158497772119
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Accuracy: 0.7592474715326402
The best model is: Neural Network with an accuracy of: 0.7592
Classification report for Neural Network:
              precision    recall  f1-score   support


## Add in Grid Search and Hyperparameter Tuning 

In [113]:
#Splitting into train and test
X_train, X_test, y_train, y_test = train_test_split(balanced_X_df, balanced_Y, test_size=0.2, random_state=42)

# Start timing
start_time = time.time()

# Logistic Regression with GridSearchCV
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}
lr_grid_search = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42), lr_param_grid, cv=3, scoring='accuracy', verbose=1)
lr_grid_search.fit(X_train, y_train)

# LightGBM with GridSearchCV
lgb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, -1],
    'num_leaves': [31, 50],
}
lgb_grid_search = GridSearchCV(lgb.LGBMClassifier(random_state=42), lgb_param_grid, cv=3, scoring='accuracy', verbose=1)
lgb_grid_search.fit(X_train, y_train)

# Random Forest with GridSearchCV
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=3, scoring='accuracy', verbose=1)
rf_grid_search.fit(X_train, y_train)

# Print the best parameters and accuracies
models = {'Logistic Regression': lr_grid_search, 'LightGBM': lgb_grid_search, 'Random Forest': rf_grid_search}
for name, model in models.items():
    print(f"{name} Best Parameters:", model.best_params_)
    predictions = model.predict(X_test)
    print(f"{name} Accuracy:", accuracy_score(y_test, predictions))
    print(f"{name} Classification Report:")
    print(classification_report(y_test, predictions))

# End timing and print elapsed time in minutes
end_time = time.time()
elapsed_time_minutes = (end_time - start_time) / 60
print(f"Total execution time: {elapsed_time_minutes:.2f} minutes")


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[LightGBM] [Info] Number of positive: 18783, number of negative: 18919
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 203
[LightGBM] [Info] Number of data points in the train set: 37702, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498196 -> initscore=-0.007215
[LightGBM] [Info] Start training from score -0.007215
[LightGBM] [Info] Number of positive: 18783, number of negative: 18919
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total B

In [117]:
import tensorflow as tf
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units_input', 
                                 min_value=32, 
                                 max_value=512, 
                                 step=32), 
                    activation='relu', 
                    input_shape=(X_train.shape[1],)))
    model.add(Dropout(rate=hp.Float('dropout_input',
                                    min_value=0.0,
                                    max_value=0.5,
                                    default=0.25,
                                    step=0.05)))
    
    for i in range(hp.Int('n_layers', 1, 3)):
        model.add(Dense(units=hp.Int(f'units_layer_{i}', 
                                     min_value=32, 
                                     max_value=512, 
                                     step=32), 
                        activation='relu'))
        model.add(Dropout(rate=hp.Float(f'dropout_layer_{i}', 
                                        min_value=0.0, 
                                        max_value=0.5, 
                                        default=0.25, 
                                        step=0.05)))
    
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=Adam(hp.Float('learning_rate', 
                                          min_value=1e-4, 
                                          max_value=1e-2, 
                                          sampling='LOG')), 
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    return model

# Create a tuner. The objective is set to 'val_accuracy' to maximize validation accuracy.
tuner = kt.Hyperband(build_model,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt')

# Early stopping callback to avoid unnecessary training time
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Search for the best hyperparameters
tuner.search(X_train, y_train, epochs=50, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the model with the best hyperparameters and train it on the data
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs=50, validation_split=0.2)


Trial 30 Complete [00h 00m 34s]
val_accuracy: 0.7497126460075378

Best val_accuracy So Far: 0.7520112991333008
Total elapsed time: 00h 10m 29s
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
