In [14]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import pickle

In [15]:
# Read the DataFrame
df = pd.read_csv('insurance.csv')
df.head(2)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523


In [16]:
# Drop Duplicates
df = df.drop_duplicates()

In [17]:
# Assigning the X and Y variables
X = df.drop('charges',axis = 1)
Y = df[['charges']]

In [18]:
# Test and Train Split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [19]:
# Feature Engineering
label_sex = LabelEncoder()
label_smoke = LabelEncoder()
onehot_region = OneHotEncoder(sparse_output=False)

X_train['sex'] = label_sex.fit_transform(X_train['sex']) # Label Encoding
X_test['sex'] = label_sex.transform(X_test['sex'])

X_train['smoker'] = label_smoke.fit_transform(X_train['smoker'])  # Label Encoding
X_test['smoker'] = label_smoke.transform(X_test['smoker'])


X_train_trans = onehot_region.fit_transform(X_train[['region']])  # One Hot Encoding
X_test_trans = onehot_region.transform(X_test[['region']])
col_names = onehot_region.get_feature_names_out()

# Convert Dataframe and Reset index
encoded_df_train = pd.DataFrame(X_train_trans, columns=col_names).reset_index(drop=True)
encoded_df_test = pd.DataFrame(X_test_trans, columns=col_names).reset_index(drop=True)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [20]:
# Removing the Region column
X_train = X_train.drop('region',axis = 1)
X_test = X_test.drop('region',axis = 1)

In [21]:
# Concat the one hot encoded columns
X_train = pd.concat([X_train, encoded_df_train], axis=1)
X_test = pd.concat([X_test, encoded_df_test], axis=1)

In [22]:
# Scaling
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

In [23]:
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
import keras_tuner as kt

In [24]:
# Hyperparameter tuning, model building, training

def build_model(hp):

    model = Sequential()

    # Adding the first hidden layer   
    model.add(Dense(units=hp.Int('units1', min_value=10, max_value=100, step=10),
                    activation='relu', input_dim=X_train.shape[1]))
                    
    # Adding consecutive hidden layers and deciding the number of neurons              
    for i in range(hp.Int('num_layers', 1, 10)):  
        model.add(Dense(units=hp.Int(f'units_{i+2}', min_value=10, max_value=100, step=10),
                        activation='relu')) 

    # Defining the output layer
    model.add(Dense(1))

    # Selecting the optimizer
    optimizer = hp.Choice('optimizer', values=['adam', 'sgd', 'rmsprop'])

    model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['mae'])
    
    return model

# Setup Keras Tuner with Hyperband algorithm
tuner = kt.Hyperband(
build_model,
objective='val_loss',
max_epochs=10,  
factor=3,  
directory='my_dir',  
project_name='hyperparameter_tuning')

# Run hyperparameter search
tuner.search(X_train, Y_train, epochs=10, validation_data=(X_test, Y_test))

# Get the best hyperparameters found by the tuner
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the best model based on found hyperparameters
best_model = tuner.hypermodel.build(best_hps)


# Print the best hyperparameters
print(f"Number of layers: {best_hps['num_layers']}")
print(f"Neurons in first hidden layer: {best_hps['units1']}")
for i in range(best_hps['num_layers']):
    print(f"Neurons in layer {i+2}: {best_hps[f'units_{i+2}']}")
print(f"Optimizer: {best_hps['optimizer']}")


# Adding early_stopping
early_stopping = EarlyStopping(monitor='val_loss',  # Monitor the validation loss
                               patience=5,  # Number of epochs to wait for improvement
                               restore_best_weights=True)

# Train the best model
best_model.fit(X_train, Y_train, epochs = 100, validation_data=(X_test, Y_test), callbacks=early_stopping)

Reloading Tuner from my_dir/hyperparameter_tuning/tuner0.json
Number of layers: 4
Neurons in first hidden layer: 10
Neurons in layer 2: 40
Neurons in layer 3: 20
Neurons in layer 4: 40
Neurons in layer 5: 20
Optimizer: adam
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 13448.4121 - mae: 13448.4121 - val_loss: 14271.2891 - val_mae: 14271.2891
Epoch 2/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 12722.8535 - mae: 12722.8535 - val_loss: 14262.1670 - val_mae: 14262.1670
Epoch 3/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 12973.1016 - mae: 12973.1016 - val_loss: 14168.6924 - val_mae: 14168.6924
Epoch 4/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 12962.4902 - mae: 12962.4902 - val_loss: 13558.1035 - val_mae: 13558.1035
Epoch 5/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 12173.0801 - mae: 12173.0801 - val_loss: 11464.0488 - val_mae: 11464.0488
Epoch 6/100
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 9598.6045 - mae: 9598.6045 - val_loss: 9124.1426 - val_mae: 9124.1426
Epoch 7/100
[

<keras.src.callbacks.history.History at 0x71874f782430>

In [25]:
# Evaluate the best model
test_loss, test_mae = best_model.evaluate(X_test, Y_test)
print(f"MAE: {test_mae}")

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1878.6288 - mae: 1878.6288 
MAE: 1900.786865234375


In [26]:
# Saving the model in h5 format
best_model.save('model.h5')

