In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [2]:
# Load the cleaned dataset from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../data/cleaned_data.csv')
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest,bmi_category_normal,bmi_category_overweight,bmi_category_underweight,smoker_bmi
0,19,0,27.9,0,1,16884.924,0,0,1,0,1,0,27.9
1,18,1,33.77,1,0,1725.5523,0,1,0,0,0,0,0.0
2,28,1,33.0,3,0,4449.462,0,1,0,0,0,0,0.0
3,33,1,22.705,0,0,21984.47061,1,0,0,1,0,0,0.0
4,32,1,28.88,0,0,3866.8552,1,0,0,0,1,0,0.0


In [3]:
# Separate the regression target ('charges') from the remaining feature columns.
y = df['charges']
x = df.drop(columns=['charges'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shape of every split as a quick sanity check.
tuple(split.shape for split in (xTrain, xTest, yTrain, yTest))

((1069, 12), (268, 12), (1069,), (268,))

In [4]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both splits.
scaler = StandardScaler().fit(xTrain)

xTrainScaled = scaler.transform(xTrain)
xTestScaled = scaler.transform(xTest)

In [5]:
def train_and_evaluate_model(model, xTrain,xTest,yTrain,yTest, modelName=None):
    """Fit ``model`` on the training split, score it on the test split, print the metrics.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place and the same object is returned.
    xTrain, yTrain
        Training features and targets, passed to ``model.fit``.
    xTest, yTest
        Held-out features passed to ``model.predict`` and the true targets
        the predictions are compared against.
    modelName : str, optional
        Label printed on the "Model:" line. When omitted, the estimator's
        class name is used. Pass a label when the class name alone is
        ambiguous, e.g. a ``LinearRegression`` fitted on polynomial features.

    Returns
    -------
    tuple
        ``(model, rmse, mae, r2)`` - the fitted estimator followed by the
        root-mean-squared error, mean absolute error and R² on the test split.
    """
    model.fit(xTrain, yTrain)
    yPred = model.predict(xTest)

    # RMSE is the square root of the MSE, so it is in the same units as the target.
    rmse = np.sqrt(mean_squared_error(yTest, yPred))
    mae = mean_absolute_error(yTest, yPred)
    r2 = r2_score(yTest, yPred)

    # Fall back to the estimator's class name when no explicit label is given.
    label = model.__class__.__name__ if modelName is None else modelName

    # Print metrics
    print(f"Model: {label}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R²: {r2:.2f}\n")

    return model, rmse, mae, r2

In [6]:
# Linear Regression: plain least-squares baseline fitted on the scaled features.
linModel, linRmse, linMae, linR2 = train_and_evaluate_model(
    model=LinearRegression(),
    xTrain=xTrainScaled,
    xTest=xTestScaled,
    yTrain=yTrain,
    yTest=yTest,
)


Model: LinearRegression
RMSE: 4560.55
MAE: 2813.07
R²: 0.89



In [7]:
# Ridge Regression: L2-regularized linear model (alpha=1.0) on the scaled features.
ridgeModel, ridgeRmse, ridgeMae, ridgeR2 = train_and_evaluate_model(
    model=Ridge(alpha=1.0),
    xTrain=xTrainScaled,
    xTest=xTestScaled,
    yTrain=yTrain,
    yTest=yTest,
)

Model: Ridge
RMSE: 4553.15
MAE: 2810.58
R²: 0.89



In [8]:
# Lasso Regression: L1-regularized linear model (alpha=0.1) on the scaled features.
lassoModel, lassoRmse, lassoMae, lassoR2 = train_and_evaluate_model(
    model=Lasso(alpha=0.1),
    xTrain=xTrainScaled,
    xTest=xTestScaled,
    yTrain=yTrain,
    yTest=yTest,
)

Model: Lasso
RMSE: 4560.45
MAE: 2813.02
R²: 0.89



In [None]:
#Polynomial Regression (degree 2)
# include_bias=False: LinearRegression already fits its own intercept, so the
# constant column PolynomialFeatures would otherwise prepend is redundant.
poly = PolynomialFeatures(degree=2, include_bias=False)
xtrainPoly = poly.fit_transform(xTrainScaled)
xtestPoly  = poly.transform(xTestScaled)

# The printed model name is "LinearRegression": the estimator is an ordinary
# linear model, only fitted on the degree-2 expanded features.
polyModel, polyRmse, polyMae,polyR2 = train_and_evaluate_model(
    LinearRegression(),
    xtrainPoly,xtestPoly,
    yTrain,yTest
)

Model: LinearRegression
RMSE: 4193.11
MAE: 2406.38
R²: 0.90



In [10]:
# K-Nearest Neighbors regressor (k=5), fitted on the scaled features.
knnModel, knnRmse, knnMae, knnR2 = train_and_evaluate_model(
    model=KNeighborsRegressor(n_neighbors=5),
    xTrain=xTrainScaled,
    xTest=xTestScaled,
    yTrain=yTrain,
    yTest=yTest,
)

Model: KNeighborsRegressor
RMSE: 5126.23
MAE: 3170.60
R²: 0.86



In [17]:
# Random Forest: 100 trees with no depth limit, fitted on the unscaled feature splits.
rfModel, rfRmse, rfMae, rfR2 = train_and_evaluate_model(
    model=RandomForestRegressor(
        n_estimators=100,
        max_depth=None,
        random_state=42,
    ),
    xTrain=xTrain,
    xTest=xTest,
    yTrain=yTrain,
    yTest=yTest,
)

Model: RandomForestRegressor
RMSE: 4708.63
MAE: 2660.73
R²: 0.88



In [15]:
# XGBoost: 100 boosting rounds of depth-3 trees, fitted on the unscaled feature splits.
xgbModel, xgbRmse, xgbMae, xgbR2 = train_and_evaluate_model(
    model=xgb.XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
    ),
    xTrain=xTrain,
    xTest=xTest,
    yTrain=yTrain,
    yTest=yTest,
)

Model: XGBRegressor
RMSE: 4278.53
MAE: 2470.45
R²: 0.90

