In [65]:
#Importing...
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import plot_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import Ridge 
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor


ModuleNotFoundError: No module named 'tensorflow'

In [56]:
#Load data
data = pd.read_csv('cleaned_data.csv')

data.head()

Unnamed: 0.1,Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,0,19,27.9,0,16884.924,0,1,0,0,1
1,1,18,33.77,1,1725.5523,1,0,0,1,0
2,2,28,33.0,3,4449.462,1,0,0,1,0
3,3,33,22.705,0,21984.47061,1,0,1,0,0
4,4,32,28.88,0,3866.8552,1,0,1,0,0


In [7]:
#Features and target variables

X = data.drop(columns=['charges'])
y = data['charges']

In [8]:
#Split data

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [15]:
#Models

linerarRegressor = LinearRegression()
linerarRegressor.fit(X_train, y_train)
predictions_lr = linerarRegressor.predict(X_test)


randomForestModel = RandomForestRegressor(n_estimators=100, random_state=42)
randomForestModel.fit(X_train, y_train)
predictions_rf = randomForestModel.predict(X_test)


In [21]:
#Model Evaluation
mse_lr = mean_squared_error(y_test, y_pred=predictions_lr)

r2_lr= r2_score(y_test, y_pred=predictions_lr)

print(f"\nMean Squared Error for Linear Regression : {mse_lr}\n"
      f"\nR-squared Score Linear Regression: {r2_lr}"
      )

mse_rf = mean_squared_error(y_test, y_pred=predictions_rf)

r2_rf= r2_score(y_test, y_pred=predictions_rf)

print(f"\nMean Squared Error of Random Forest: {mse_rf}\n"
      f"\nR-squared Score of Random Forest: {r2_rf}"
      )


Mean Squared Error for Linear Regression : 35564984.47965559

R-squared Score Linear Regression: 0.8064554513330181

Mean Squared Error of Random Forest: 20478073.23579169

R-squared Score of Random Forest: 0.8885583812286515


In [54]:

#Standardization
scaler_std = StandardScaler()
X_train_std = scaler_std.fit_transform(X_train)
X_test_std = scaler_std.fit_transform(X_test)

#Normalization
scaler_minmax = MinMaxScaler()
X_train_minmax = scaler_minmax.fit_transform(X_train)
X_test_minmax = scaler_minmax.fit_transform(X_test)


In [64]:

ridge = Ridge(alpha=1.0)
ridge.fit(X_train_std, y_train)

predictions_ridge = ridge.predict(X_test_std)

mse_ridge = mean_squared_error(y_test, predictions_ridge)
r2_ridge = r2_score(y_test, predictions_ridge)

print("\nRidge Regression Mean Squared Error:\n", mse_ridge)
print("\nRidge Regression R2 score:\n", r2_ridge)


Ridge Regression Mean Squared Error:
 38366284.09655684

Ridge Regression R2 score:
 0.7912107864479742


Ridge Regression:
Ridge Regression is a type of regularized linear regression that can help in handling multicollinearity and prevent overfitting by adding a regularization term to the cost function

In [63]:

#Set up the pipeline with scaling and the Ridge model
steps = [
    ('scaler', StandardScaler()),
    ('model', Ridge()) 
]

pipeline = Pipeline(steps)

#Hyperparameters grid to search over
param_grid = {
    'model__alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]  # Values of alpha to test
}

# Perform GridSearchCV
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

#Best hyperparameters and best score
print("\nBest Hyperparameters:\n", grid.best_params_)
print("\nBest Cross-Validation MSE:\n", -grid.best_score_)

#Evaluate on test set using best model
best_model = grid.best_estimator_
test_predictions = best_model.predict(X_test)

test_mse = mean_squared_error(y_test, test_predictions)
test_r2 = r2_score(y_test, test_predictions)

print("\nTest Set MSE with Best Model:\n", test_mse)
print("\nTest Set R2 with Best Model:\n", test_r2)


Best Hyperparameters:
 {'model__alpha': 10.0}

Best Cross-Validation MSE:
 37629448.590384945

Test Set MSE with Best Model:
 35915547.839589596

Test Set R2 with Best Model:
 0.8045476864831158


XGBoost: Gradient Boosting Regression (e.g., XGBoost, LightGBM):
Model: Builds trees sequentially, focusing on areas where previous trees performed poorly. We will aso tune the hyperparameters to optimize model


In [61]:
xgb = XGBRegressor()

param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5]
}
grid = GridSearchCV(xgb, param_grid, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

#Best parameters
print("\nBest Hyperparameters:\n", grid.best_params_)
print("\nBest Cross-Validation MSE:\n", -grid.best_score_)

#Model Evaluation
best_model = grid.best_estimator_
test_predictions = best_model.predict(X_test)

test_mse_xgb = mean_squared_error(y_test, test_predictions)
test_r2_xgb = r2_score(y_test, test_predictions)

print("\nTest Set MSE with Best Model:\n", test_mse_xgb)
print("\nTest Set R2 with Best Model: \n", test_r2_xgb)



Best Hyperparameters:
 {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}

Best Cross-Validation MSE:
 21658101.807436105

Test Set MSE with Best Model:
 19114153.728462704

Test Set R2 with Best Model: 
 0.895980827472515


In [None]:
Model = tf.keras.Sequential([
    tf.keras.layers.Dense(100),
    tf.keras.layers.Dense(10),
    tf.keras.layers.Dense(1)
]
)
Model.compile(
    loss = tf.keras.losses.mae,
    optimizer = tf.keras.optimizers.Adam(lr = 0.01),
    metrics = ['mae']
)
Model.fit(X_train,y_train,epochs= 200)

In [None]:

plot_model(Model,show_shapes=True)

In [None]:
ev = Model.evaluate(X_test,y_test)