In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Read the DataFrame
df = pd.read_csv('insurance.csv')
df.head(2)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523


In [3]:
# Divide into X and Y variables
X = df.drop('charges',axis = 1)
Y = df[['charges']]

In [4]:
# Train and Test Split
X_train_full, X_test, Y_train_full, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X_train_full, Y_train_full, test_size=0.2, random_state = 42)


In [5]:
# Feature Engineering
label_sex = LabelEncoder()
label_smoke = LabelEncoder()
onehot_region = OneHotEncoder(sparse_output=False)

X_train['sex'] = label_sex.fit_transform(X_train['sex']) # Label Encoding
X_val['sex'] = label_sex.transform(X_val['sex'])

X_train['smoker'] = label_smoke.fit_transform(X_train['smoker'])  # Label Encoding
X_val['smoker'] = label_smoke.transform(X_val['smoker'])


X_train_trans = onehot_region.fit_transform(X_train[['region']])  # One Hot Encoding
X_val_trans = onehot_region.transform(X_val[['region']])
col_names = onehot_region.get_feature_names_out()

# Convert Dataframe and Reset index
encoded_df_test = pd.DataFrame(X_train_trans, columns=col_names).reset_index(drop=True)
encoded_df_val = pd.DataFrame(X_val_trans, columns=col_names).reset_index(drop=True)
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)

In [6]:
# Drop the region column
X_train = X_train.drop('region',axis = 1)
X_val = X_val.drop('region',axis = 1)

In [7]:
# Concat the one hot encoded columns
X_train = pd.concat([X_train, encoded_df_test], axis=1)
X_val = pd.concat([X_val, encoded_df_val], axis=1)

In [8]:
# Scaling
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_val = scalar.transform(X_val)

In [9]:
# Save in Pickle file
with open('label_encoder_sex.pkl','wb') as file:
    pickle.dump(label_sex,file)

with open('label_encoder_smoke.pkl','wb') as file:
    pickle.dump(label_smoke,file)    

with open('onehot_region.pkl','wb') as file:
    pickle.dump(onehot_region,file)  

with open('scalar.pkl','wb') as file:
    pickle.dump(scalar,file)         

In [10]:
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
import keras_tuner as kt

2024-11-17 16:34:19.829943: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-17 16:34:19.835968: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-17 16:34:19.850905: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731861259.876778    3429 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731861259.888484    3429 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-17 16:34:19.921096: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [None]:
# Hyperparameter tuning, model building, training

def build_model(hp):

    model = Sequential()

    # Adding the first hidden layer   
    model.add(Dense(units=hp.Int('units1', min_value=10, max_value=100, step=10),
                    activation='relu', input_dim=X_train.shape[1]))
                    
    # Adding consecutive hidden layers and deciding the number of neurons              
    for i in range(hp.Int('num_layers', 1, 8)):  
        model.add(Dense(units=hp.Int(f'units_{i+2}', min_value=10, max_value=100, step=10),
                        activation='relu')) 

    # Defining the output layer
    model.add(Dense(1))

    # Selecting the optimizer
    optimizer = hp.Choice('optimizer', values=['adam', 'sgd', 'rmsprop'])

    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_absolute_error'])
    
    return model

# Setup Keras Tuner with Hyperband algorithm
tuner = kt.Hyperband(
build_model,
objective='val_loss',
max_epochs=10,  
factor=3,  
directory='my_dir',  
project_name='hyperparameter_tuning')

# Run hyperparameter search
tuner.search(X_train, Y_train, epochs=10, validation_data=(X_val, Y_val))


# Get the best hyperparameters found by the tuner
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Print the best hyperparameters
print(f"Number of layers: {best_hps['num_layers']}")
print(f"Neurons in first hidden layer: {best_hps['units1']}")
for i in range(best_hps['num_layers']):
    print(f"Neurons in layer {i+2}: {best_hps[f'units_{i+2}']}")
print(f"Optimizer: {best_hps['optimizer']}")

# Build the best model based on found hyperparameters
best_model = tuner.hypermodel.build(best_hps)

# Adding early_stopping
early_stopping = EarlyStopping(monitor='val_loss',  # Monitor the validation loss
                               patience=5,  # Number of epochs to wait for improvement
                               restore_best_weights=True)

# Train the best model
best_model.fit(X_train, Y_train, epochs = 100, validation_data=(X_val, Y_val), callbacks=early_stopping)

# Evaluate the best model
loss, accuracy = best_model.evaluate(X_val, Y_val)
print(f"Final model accuracy on validation data: {loss}")


Reloading Tuner from my_dir/hyperparameter_tuning/tuner0.json
Number of layers: 4
Neurons in first hidden layer: 10
Neurons in layer 2: 40
Neurons in layer 3: 20
Neurons in layer 4: 40
Neurons in layer 5: 20
Optimizer: adam


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-11-17 16:34:31.377589: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - loss: 291419552.0000 - mean_absolute_error: 12841.6699 - val_loss: 355187328.0000 - val_mean_absolute_error: 13987.3428
Epoch 2/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 283122912.0000 - mean_absolute_error: 12635.4219 - val_loss: 355034624.0000 - val_mean_absolute_error: 13982.4346
Epoch 3/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 346318304.0000 - mean_absolute_error: 13982.0586 - val_loss: 353937056.0000 - val_mean_absolute_error: 13948.0068
Epoch 4/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 322727328.0000 - mean_absolute_error: 13386.5010 - val_loss: 347648416.0000 - val_mean_absolute_error: 13753.6689
Epoch 5/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 327258944.0000 - mean_absolute_error: 13440.0400 - val_loss: 321332224.0000 - 

In [16]:
loss = best_model.evaluate(X_val, Y_val)
print(f"Final model loss (MSE) on validation data: {loss}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 25962466.0000 - mean_absolute_error: 2900.3040
Final model loss (MSE) on validation data: [31066402.0, 3222.407958984375]


In [12]:
X_test.to_csv('X_test',index=False, header=True)

In [13]:
Y_test.to_csv('Y_test',index=False, header=True)

In [14]:
# Saving the model in h5 format
best_model.save('model.h5')



In [15]:
print(f"Final model accuracy on validation data: {loss}")

Final model accuracy on validation data: 31066402.0
