In [174]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import pickle

In [175]:
df = pd.read_csv('insurance.csv')
df.head(2)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523


In [176]:
# Divide into X and Y variables

X = df.drop('charges',axis = 1)
Y = df[['charges']]

In [177]:
# Train and Test Split

X_train_full, X_test, Y_train_full, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X_train_full, Y_train_full, test_size=0.2, random_state = 42)


In [178]:
# Feature Engg

label_sex = LabelEncoder()
label_smoke = LabelEncoder()
onehot_region = OneHotEncoder(sparse_output=False)

X_train['sex'] = label_sex.fit_transform(X_train['sex'])
X_val['sex'] = label_sex.transform(X_val['sex'])

X_train['smoker'] = label_smoke.fit_transform(X_train['smoker'])
X_val['smoker'] = label_smoke.transform(X_val['smoker'])


X_train_trans = onehot_region.fit_transform(X_train[['region']])
X_val_trans = onehot_region.transform(X_val[['region']])
col_names = onehot_region.get_feature_names_out()
encoded_df_test = pd.DataFrame(X_train_trans,columns = col_names )
encoded_df_val = pd.DataFrame(X_val_trans,columns = col_names )



In [179]:
# Drop the region column

X_train = X_train.drop('region',axis = 1)
X_val = X_val.drop('region',axis = 1)

In [180]:
# Concat the one hot encoded columns

X_train = pd.concat([encoded_df_test],axis = 1)
X_val = pd.concat([encoded_df_val],axis = 1)

In [181]:
# Scaling

scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_val = scalar.transform(X_val)

In [182]:
# Save in Pickle file

with open('label_encoder_sex.pkl','wb') as file:
    pickle.dump(label_sex,file)

with open('label_encoder_smoke.pkl','wb') as file:
    pickle.dump(label_smoke,file)    

with open('onehot_region.pkl','wb') as file:
    pickle.dump(onehot_region,file)  

with open('scalar.pkl','wb') as file:
    pickle.dump(scalar,file)         

In [183]:
X_val.shape

(214, 4)

In [184]:
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
import keras_tuner as kt

In [185]:
# Hyperparameter tuning. model building and prediction

def build_model(hp):

    model = Sequential()

    # Adding the first hidden layer   
    model.add(Dense(units=hp.Int('units1', min_value=10, max_value=100, step=10),
                    activation='relu', input_dim=X_train.shape[1]))
                    
    # Adding consecutive hidden layers and deciding the number of neurons              
    for i in range(hp.Int('num_layers', 1, 10)):  
        model.add(Dense(units=hp.Int(f'units_{i+2}', min_value=10, max_value=100, step=10),
                        activation='relu')) 

    # Defining the output layer
    model.add(Dense(1))

    # Selecting the optimizer
    optimizer = hp.Choice('optimizer', values=['adam', 'sgd', 'rmsprop'])

    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_absolute_error'])
    
    return model

# Setup Keras Tuner with Hyperband algorithm
tuner = kt.Hyperband(
build_model,
objective='val_loss',
max_epochs=10,  
factor=3,  
directory='my_dir',  
project_name='hyperparameter_tuning')

# Run hyperparameter search
tuner.search(X_train, Y_train, epochs=10, validation_data=(X_val, Y_val))


# Get the best hyperparameters found by the tuner
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Print the best hyperparameters
print(f"Number of layers: {best_hps['num_layers']}")
print(f"Neurons in first hidden layer: {best_hps['units1']}")
for i in range(best_hps['num_layers']):
    print(f"Neurons in layer {i+2}: {best_hps[f'units_{i+2}']}")
print(f"Optimizer: {best_hps['optimizer']}")

# Build the best model based on found hyperparameters
best_model = tuner.hypermodel.build(best_hps)

# Adding early_stopping
early_stopping = EarlyStopping(monitor='val_loss',  # Monitor the validation loss
                               patience=3,  # Number of epochs to wait for improvement
                               restore_best_weights=True)

# Train the best model
best_model.fit(X_train, Y_train, epochs = 100, validation_data=(X_val, Y_val), callbacks=early_stopping)

# 16. Evaluate the best model
loss, accuracy = best_model.evaluate(X_val, Y_val)
print(f"Final model accuracy on validation data: {accuracy}")


Reloading Tuner from my_dir/hyperparameter_tuning/tuner0.json
Number of layers: 4
Neurons in first hidden layer: 10
Neurons in layer 2: 40
Neurons in layer 3: 20
Neurons in layer 4: 40
Neurons in layer 5: 20
Optimizer: adam
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 313202400.0000 - mean_absolute_error: 13194.2129 - val_loss: 355160192.0000 - val_mean_absolute_error: 13986.2930
Epoch 2/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 333113376.0000 - mean_absolute_error: 13450.8936 - val_loss: 354896480.0000 - val_mean_absolute_error: 13976.8584
Epoch 3/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 311869472.0000 - mean_absolute_error: 13158.6641 - val_loss: 353393696.0000 - val_mean_absolute_error: 13922.8789
Epoch 4/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 296533696.0000 - mean_absolute_error: 12837.2705 - val_loss: 346361216.0000 - val_mean_absolute_error: 13666.9648
Epoch 5/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 284542592.0000 - mean_absolute_error: 12662.6455 - val_loss: 321723232.0000 - val_mean_abso