# Medical Insurance Cost Prediction

This notebook demonstrates building, tuning, and comparing a Scikit-learn Random Forest model and a TensorFlow Neural Network model for predicting medical insurance costs.

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import keras_tuner as kt
import matplotlib.pyplot as plt

## 1. Load and Explore Data

In [24]:
# Load the dataset
df = pd.read_csv('/content/insurance.csv')

# Display the first 5 rows
print("First 5 rows of the dataset:")
display(df.head())

# Print concise information about the DataFrame
print("\nDataFrame Info:")
display(df.info())

# Display descriptive statistics
print("\nDescriptive Statistics:")
display(df.describe())

# Check for missing values
print("\nMissing values per column:")
display(df.isnull().sum())

First 5 rows of the dataset:


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


None


Descriptive Statistics:


Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801



Missing values per column:


Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


## 2. Preprocess Data

In [25]:
# Define categorical and numerical features
categorical_features = ['sex', 'smoker', 'region']
numerical_features = ['age', 'bmi', 'children']

# Create preprocessing pipelines for numerical and categorical features
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a column transformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough' # Keep other columns (like 'charges')
)

# Apply preprocessing to the entire dataset
df_preprocessed = preprocessor.fit_transform(df.drop('charges', axis=1))

# Get the names of the processed columns
# This requires getting the feature names from the one-hot encoder
cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_features = numerical_features + list(cat_feature_names)

# Convert the preprocessed data back to a DataFrame (optional, but good for inspection)
X = pd.DataFrame(df_preprocessed, columns=all_features)
y = df['charges'] # Target variable

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data preprocessing complete. Shapes of training and testing sets:")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

print("\nProcessed features (X_train head):")
display(X_train.head())
print("\nTarget variable (y_train head):")
display(y_train.head())

Data preprocessing complete. Shapes of training and testing sets:
X_train shape: (1070, 11)
X_test shape: (268, 11)
y_train shape: (1070,)
y_test shape: (268,)

Processed features (X_train head):


Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
560,0.483668,-1.757474,0.751079,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1285,0.554869,-1.040599,-0.908614,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1142,0.910875,-0.952015,-0.908614,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
969,-0.01474,0.599846,3.240619,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
486,1.053277,-1.508126,1.580926,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0



Target variable (y_train head):


Unnamed: 0,charges
560,9193.8385
1285,8534.6718
1142,27117.99378
969,8596.8278
486,12475.3513


## 3. Build and Train Untuned Models

### 3.1 Scikit-learn (Random Forest Regressor)

In [26]:
# Instantiate a RandomForestRegressor object
rf_model = RandomForestRegressor(random_state=42)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

print("Untuned RandomForestRegressor model training complete.")

Untuned RandomForestRegressor model training complete.


### 3.2 TensorFlow (Neural Network)

In [27]:
# Instantiate a Sequential model
keras_model = Sequential()

# Add Dense layers
# The input layer size should match the number of features in X_train
keras_model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
keras_model.add(Dense(64, activation='relu'))
keras_model.add(Dense(32, activation='relu'))

# Add the output layer
keras_model.add(Dense(1))

# Compile the model
keras_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model
# Use validation_split to monitor performance during training
history_keras_untuned = keras_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)

print("Untuned Keras Sequential model training complete.")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Untuned Keras Sequential model training complete.


## 4. Evaluate Untuned Models

In [28]:
# Make predictions with the untuned scikit-learn model
y_pred_rf_untuned = rf_model.predict(X_test)

# Evaluate and print metrics for untuned scikit-learn model
mae_rf_untuned = mean_absolute_error(y_test, y_pred_rf_untuned)
mse_rf_untuned = mean_squared_error(y_test, y_pred_rf_untuned)
r2_rf_untuned = r2_score(y_test, y_pred_rf_untuned)

print("--- Scikit-learn Model Evaluation (Untuned) ---")
print(f"Mean Absolute Error (MAE): {mae_rf_untuned:.4f}")
print(f"Mean Squared Error (MSE): {mse_rf_untuned:.4f}")
print(f"R-squared (R2): {r2_rf_untuned:.4f}")
print("-" * 45)

# Make predictions with the untuned TensorFlow model
y_pred_keras_untuned = keras_model.predict(X_test).flatten()

# Evaluate and print metrics for untuned TensorFlow model
mae_keras_untuned = mean_absolute_error(y_test, y_pred_keras_untuned)
mse_keras_untuned = mean_squared_error(y_test, y_pred_keras_untuned)
r2_keras_untuned = r2_score(y_test, y_pred_keras_untuned)

print("--- TensorFlow Model Evaluation (Untuned) ---")
print(f"Mean Absolute Error (MAE): {mae_keras_untuned:.4f}")
print(f"Mean Squared Error (MSE): {mse_keras_untuned:.4f}")
print(f"R-squared (R2): {r2_keras_untuned:.4f}")
print("-" * 45)

--- Scikit-learn Model Evaluation (Untuned) ---
Mean Absolute Error (MAE): 2540.9935
Mean Squared Error (MSE): 21051201.5989
R-squared (R2): 0.8644
---------------------------------------------
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
--- TensorFlow Model Evaluation (Untuned) ---
Mean Absolute Error (MAE): 2843.6694
Mean Squared Error (MSE): 20599460.8591
R-squared (R2): 0.8673
---------------------------------------------


## 5. Hyperparameter Tuning for Both Models

### 5.1 Scikit-learn (Random Forest Regressor) Tuning

In [29]:
# Define the hyperparameter grid for RandomForestRegressor
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate a RandomForestRegressor model
rf_model_tune = RandomForestRegressor(random_state=42)

# Instantiate a GridSearchCV object
grid_search_rf = GridSearchCV(
    estimator=rf_model_tune,
    param_grid=param_grid_rf,
    scoring='neg_mean_absolute_error', # Use negative MAE for GridSearchCV (maximizes score)
    cv=5, # 5-fold cross-validation
    verbose=2,
    n_jobs=-1 # Use all available cores
)

print("Starting GridSearchCV for RandomForestRegressor...")
# Fit the GridSearchCV object to the training data
grid_search_rf.fit(X_train, y_train)

# Print the best hyperparameters and best score
print("\nBest hyperparameters found for RandomForestRegressor:")
print(grid_search_rf.best_params_)
print("Best cross-validation negative MAE found for RandomForestRegressor:")
print(grid_search_rf.best_score_)

# Retrieve the best hyperparameters
best_params_rf = grid_search_rf.best_params_
print("\nHyperparameter tuning for RandomForestRegressor complete.")

Starting GridSearchCV for RandomForestRegressor...
Fitting 5 folds for each of 108 candidates, totalling 540 fits

Best hyperparameters found for RandomForestRegressor:
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation negative MAE found for RandomForestRegressor:
-2623.432438288127

Hyperparameter tuning for RandomForestRegressor complete.


### 5.2 TensorFlow (Neural Network) Tuning with KerasTuner

In [30]:
# Define the model building function for KerasTuner
def build_tunable_model(hp):
    """
    Builds and compiles a Keras Sequential model with tunable hyperparameters.

    Args:
        hp: KerasTuner HyperParameters object.

    Returns:
        tensorflow.keras.models.Sequential: The compiled Keras model.
    """
    model = Sequential()

    # Tune the number of layers and units
    hp_num_layers = hp.Int('num_layers', min_value=1, max_value=3, step=1)
    # Define units for each potential layer
    hp_units = [hp.Int(f'units_{i}', min_value=32, max_value=128, step=32) for i in range(3)] # Max 3 layers

    # Add input layer and first hidden layer
    model.add(Dense(hp_units[0], activation=hp.Choice('activation', values=['relu', 'tanh']), input_shape=(X_train.shape[1],)))

    # Add additional hidden layers based on tuned number of layers
    for i in range(1, hp_num_layers):
        model.add(Dense(hp_units[i], activation=hp.Choice('activation', values=['relu', 'tanh'])))

    # Add output layer
    model.add(Dense(1))

    # Tune the optimizer
    hp_optimizer = hp.Choice('optimizer', values=['adam', 'rmsprop'])

    # Tune the loss function
    hp_loss = hp.Choice('loss', values=['mse', 'mae'])

    model.compile(optimizer=hp_optimizer, loss=hp_loss, metrics=['mae'])

    return model

# Instantiate the tuner (Hyperband)
tuner = kt.Hyperband(
    build_tunable_model,
    objective='val_mae', # Objective is to minimize validation MAE
    max_epochs=50, # Maximum number of epochs to train a model
    factor=3, # Factor by which the number of epochs is reduced and the number of models is increased
    directory='keras_tuner_dir', # Directory to store tuning results
    project_name='insurance_cost_prediction_tuned' # Name of the project
)

print("Starting KerasTuner search for TensorFlow model...")
# Run the hyperparameter search
# Use a portion of the training data for validation during tuning
tuner.search(X_train, y_train, epochs=50, validation_split=0.2, verbose=1)

# Get the optimal hyperparameters
best_hps_keras = tuner.get_best_hyperparameters(num_trials=1)[0]

print("\nBest hyperparameters found for TensorFlow model:")
print(best_hps_keras.values)

print("\nHyperparameter tuning for TensorFlow model complete.")

Trial 90 Complete [00h 00m 13s]
val_mae: 4916.18115234375

Best val_mae So Far: 2973.486083984375
Total elapsed time: 00h 06m 40s

Best hyperparameters found for TensorFlow model:
{'num_layers': 3, 'units_0': 128, 'units_1': 64, 'units_2': 32, 'activation': 'relu', 'optimizer': 'adam', 'loss': 'mae', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0047'}

Hyperparameter tuning for TensorFlow model complete.


## 6. Train Models with Best Hyperparameters

### 6.1 Tuned Scikit-learn (Random Forest Regressor)

In [31]:
# Instantiate a new RandomForestRegressor model with the best hyperparameters
tuned_rf_model = RandomForestRegressor(random_state=42, **best_params_rf)

# Train the tuned model on the entire training data
tuned_rf_model.fit(X_train, y_train)

print("Tuned RandomForestRegressor model training complete.")

Tuned RandomForestRegressor model training complete.


### 6.2 Tuned TensorFlow (Neural Network)

In [32]:
# Build and compile the Keras model with the best hyperparameters found by KerasTuner
tuned_keras_model = build_tunable_model(best_hps_keras)

# Train the tuned Keras model on the entire training data
# The number of epochs is obtained from the best hyperparameters found by the tuner
history_tuned_keras = tuned_keras_model.fit(X_train, y_train, epochs=best_hps_keras.get('tuner/epochs'),
                                             batch_size=32, # Use a fixed batch size, or tune this as well
                                             validation_split=0.2, # Still use validation split to monitor training
                                             verbose=0) # Set verbose to 1 to see training progress

print("Tuned Keras model training complete.")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Tuned Keras model training complete.


## 7. Evaluate Tuned Models

In [33]:
# Make predictions with the tuned scikit-learn model
y_pred_tuned_rf = tuned_rf_model.predict(X_test)

# Evaluate and print metrics for the tuned scikit-learn model
mae_tuned_rf = mean_absolute_error(y_test, y_pred_tuned_rf)
mse_tuned_rf = mean_squared_error(y_test, y_pred_tuned_rf)
r2_tuned_rf = r2_score(y_test, y_pred_tuned_rf)

print("--- Tuned Scikit-learn Model Evaluation ---")
print(f"Mean Absolute Error (MAE): {mae_tuned_rf:.4f}")
print(f"Mean Squared Error (MSE): {mse_tuned_rf:.4f}")
print(f"R-squared (R2): {r2_tuned_rf:.4f}")
print("-" * 40)

# Make predictions with the tuned TensorFlow model
y_pred_tuned_keras = tuned_keras_model.predict(X_test).flatten()

# Evaluate and print metrics for the tuned TensorFlow model
mae_tuned_keras = mean_absolute_error(y_test, y_pred_tuned_keras)
mse_tuned_keras = mean_squared_error(y_test, y_pred_tuned_keras)
r2_tuned_keras = r2_score(y_test, y_pred_tuned_keras)

print("--- Tuned TensorFlow Model Evaluation ---")
print(f"Mean Absolute Error (MAE): {mae_tuned_keras:.4f}")
print(f"Mean Squared Error (MSE): {mse_tuned_keras:.4f}")
print(f"R-squared (R2): {r2_tuned_keras:.4f}")
print("-" * 40)

--- Tuned Scikit-learn Model Evaluation ---
Mean Absolute Error (MAE): 2435.4770
Mean Squared Error (MSE): 19098675.6105
R-squared (R2): 0.8770
----------------------------------------
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
--- Tuned TensorFlow Model Evaluation ---
Mean Absolute Error (MAE): 2739.7066
Mean Squared Error (MSE): 34097455.9727
R-squared (R2): 0.7804
----------------------------------------


## 8. Compare Tuned Model Performance

In [34]:
print("--- Tuned Model Performance Comparison ---")
print(f"Metric        | Tuned Scikit-learn (RF) | Tuned TensorFlow (Keras)")
print("-" * 60)
print(f"MAE           | {mae_tuned_rf:<24.4f} | {mae_tuned_keras:<25.4f}")
print(f"MSE           | {mse_tuned_rf:<24.4f} | {mse_tuned_keras:<25.4f}")
print(f"R-squared     | {r2_tuned_rf:<24.4f} | {r2_tuned_keras:<25.4f}")
print("-" * 60)

print("\n--- Comparison with Untuned Models ---")
print(f"Model                     | MAE (Untuned) | MAE (Tuned) | Improvement (MAE)")
print("-" * 60)
print(f"Scikit-learn (Random Forest) | {mae_rf_untuned:<13.4f} | {mae_tuned_rf:<11.4f} | {mae_rf_untuned - mae_tuned_rf:<17.4f}")
print(f"TensorFlow (Neural Network) | {mae_keras_untuned:<13.4f} | {mae_tuned_keras:<11.4f} | {mae_keras_untuned - mae_tuned_keras:<17.4f}")
print("-" * 60)

print("\n--- Overall Best Performing Tuned Model ---")
if mae_tuned_rf < mae_tuned_keras:
    print("The tuned Scikit-learn Random Forest model performed better with a lower MAE.")
elif mae_tuned_keras < mae_tuned_rf:
    print("The tuned TensorFlow Neural Network model performed better with a lower MAE.")
else:
    print("The tuned models have similar MAE performance.")

--- Tuned Model Performance Comparison ---
Metric        | Tuned Scikit-learn (RF) | Tuned TensorFlow (Keras)
------------------------------------------------------------
MAE           | 2435.4770                | 2739.7066                
MSE           | 19098675.6105            | 34097455.9727            
R-squared     | 0.8770                   | 0.7804                   
------------------------------------------------------------

--- Comparison with Untuned Models ---
Model                     | MAE (Untuned) | MAE (Tuned) | Improvement (MAE)
------------------------------------------------------------
Scikit-learn (Random Forest) | 2540.9935     | 2435.4770   | 105.5165         
TensorFlow (Neural Network) | 2843.6694     | 2739.7066   | 103.9628         
------------------------------------------------------------

--- Overall Best Performing Tuned Model ---
The tuned Scikit-learn Random Forest model performed better with a lower MAE.
