In [6]:
#Importing...
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import plot_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import Ridge 
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor


In [7]:
#Load data
data = pd.read_csv('cleaned_data.csv')

data.head()

Unnamed: 0.1,Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,0,19,27.9,0,16884.924,0,1,0,0,1
1,1,18,33.77,1,1725.5523,1,0,0,1,0
2,2,28,33.0,3,4449.462,1,0,0,1,0
3,3,33,22.705,0,21984.47061,1,0,1,0,0
4,4,32,28.88,0,3866.8552,1,0,1,0,0


In [8]:
#Features and target variables

X = data.drop(columns=['charges'])
y = data['charges']

In [9]:
#Split data

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [10]:
#Models

linerarRegressor = LinearRegression()
linerarRegressor.fit(X_train, y_train)
predictions_lr = linerarRegressor.predict(X_test)


randomForestModel = RandomForestRegressor(n_estimators=100, random_state=42)
randomForestModel.fit(X_train, y_train)
predictions_rf = randomForestModel.predict(X_test)


In [11]:
#Model Evaluation
mse_lr = mean_squared_error(y_test, y_pred=predictions_lr)

r2_lr= r2_score(y_test, y_pred=predictions_lr)

print(f"\nMean Squared Error for Linear Regression : {mse_lr}\n"
      f"\nR-squared Score Linear Regression: {r2_lr}"
      )

mse_rf = mean_squared_error(y_test, y_pred=predictions_rf)

r2_rf= r2_score(y_test, y_pred=predictions_rf)

print(f"\nMean Squared Error of Random Forest: {mse_rf}\n"
      f"\nR-squared Score of Random Forest: {r2_rf}"
      )


Mean Squared Error for Linear Regression : 35564984.47965559

R-squared Score Linear Regression: 0.8064554513330181

Mean Squared Error of Random Forest: 20478073.23579169

R-squared Score of Random Forest: 0.8885583812286515


In [12]:

#Standardization
scaler_std = StandardScaler()
X_train_std = scaler_std.fit_transform(X_train)
X_test_std = scaler_std.fit_transform(X_test)

#Normalization
scaler_minmax = MinMaxScaler()
X_train_minmax = scaler_minmax.fit_transform(X_train)
X_test_minmax = scaler_minmax.fit_transform(X_test)


In [13]:

ridge = Ridge(alpha=1.0)
ridge.fit(X_train_std, y_train)

predictions_ridge = ridge.predict(X_test_std)

mse_ridge = mean_squared_error(y_test, predictions_ridge)
r2_ridge = r2_score(y_test, predictions_ridge)

print("\nRidge Regression Mean Squared Error:\n", mse_ridge)
print("\nRidge Regression R2 score:\n", r2_ridge)


Ridge Regression Mean Squared Error:
 38366284.09655684

Ridge Regression R2 score:
 0.7912107864479742


Ridge Regression:
Ridge Regression is a type of regularized linear regression that can help in handling multicollinearity and prevent overfitting by adding a regularization term to the cost function

In [14]:

#Set up the pipeline with scaling and the Ridge model
steps = [
    ('scaler', StandardScaler()),
    ('model', Ridge()) 
]

pipeline = Pipeline(steps)

#Hyperparameters grid to search over
param_grid = {
    'model__alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]  # Values of alpha to test
}

# Perform GridSearchCV
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

#Best hyperparameters and best score
print("\nBest Hyperparameters:\n", grid.best_params_)
print("\nBest Cross-Validation MSE:\n", -grid.best_score_)

#Evaluate on test set using best model
best_model = grid.best_estimator_
test_predictions = best_model.predict(X_test)

test_mse = mean_squared_error(y_test, test_predictions)
test_r2 = r2_score(y_test, test_predictions)

print("\nTest Set MSE with Best Model:\n", test_mse)
print("\nTest Set R2 with Best Model:\n", test_r2)


Best Hyperparameters:
 {'model__alpha': 10.0}

Best Cross-Validation MSE:
 37629448.590384945

Test Set MSE with Best Model:
 35915547.839589596

Test Set R2 with Best Model:
 0.8045476864831158


XGBoost: Gradient Boosting Regression (e.g., XGBoost, LightGBM):
Model: Builds trees sequentially, focusing on areas where previous trees performed poorly. We will aso tune the hyperparameters to optimize model


In [15]:
xgb = XGBRegressor()

param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5]
}
grid = GridSearchCV(xgb, param_grid, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

#Best parameters
print("\nBest Hyperparameters:\n", grid.best_params_)
print("\nBest Cross-Validation MSE:\n", -grid.best_score_)

#Model Evaluation
best_model = grid.best_estimator_
test_predictions = best_model.predict(X_test)

test_mse_xgb = mean_squared_error(y_test, test_predictions)
test_r2_xgb = r2_score(y_test, test_predictions)

print("\nTest Set MSE with Best Model:\n", test_mse_xgb)
print("\nTest Set R2 with Best Model: \n", test_r2_xgb)



Best Hyperparameters:
 {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}

Best Cross-Validation MSE:
 21658101.807436105

Test Set MSE with Best Model:
 19114153.728462704

Test Set R2 with Best Model: 
 0.895980827472515


In [16]:
Model = tf.keras.Sequential([
    tf.keras.layers.Dense(100),
    tf.keras.layers.Dense(10),
    tf.keras.layers.Dense(1)
]
)
Model.compile(
    loss = tf.keras.losses.mae,
    optimizer = tf.keras.optimizers.legacy.Adam(lr = 0.01),
    metrics = ['mae']
)
Model.fit(X_train,y_train,epochs= 200)

Epoch 1/200
 1/34 [..............................] - ETA: 5s - loss: 13809.3906 - mae: 13809.3906

  super().__init__(name, **kwargs)


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 7

<keras.src.callbacks.History at 0x2c7f0fc40>

In [17]:

plot_model(Model,show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [18]:
ev = Model.evaluate(X_test,y_test)



In [19]:
tf.random.set_seed(42)
model_1 = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation = 'relu'),
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(1, activation = 'relu'),
])

model_1.compile(loss=tf.keras.losses.mae,
                optimizer = tf.keras.optimizers.Adam(lr = 0.001), 
                metrics = ['mae'])

model_1.fit(X_train, y_train, batch_size = 32, epochs = 300)



Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.src.callbacks.History at 0x2d1ad9070>

In [20]:
ev_1 = model_1.evaluate(X_test, y_test)
ev_1



[3752.9453125, 3752.9453125]

Lets add another layer:

In [21]:
tf.random.set_seed(42)
model_2 = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation = 'relu'),
    tf.keras.layers.Dense(64, activation = 'relu'),
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(1, activation = 'relu')
])

model_2.compile(loss=tf.keras.losses.mae,
                optimizer = tf.keras.optimizers.Adam(lr = 0.001),
                metrics = ['mae'])

model_2.fit(X_train, y_train, batch_size = 32, epochs = 500)



Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.src.callbacks.History at 0x2d2495970>

In [22]:
ev_2 = model_2.evaluate(X_test, y_test)



In [23]:
model_3 = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation = 'relu'),
    tf.keras.layers.Dense(128, activation = 'relu'),
    tf.keras.layers.Dense(64, activation = 'relu'),
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(1)
])

model_3.compile(loss = tf.keras.losses.mae,
               optimizer = tf.keras.optimizers.Adam(lr=0.01),
               metrics = ['mae'])

model_3.fit(X_train, y_train, batch_size = 32, epochs = 1000)



Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.src.callbacks.History at 0x2d1451a30>