In [4]:
import pandas as pd
import numpy as np

In [5]:
df=pd.read_csv('/content/insurance.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [7]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Identify categorical and numerical columns
categorical_features = ['sex', 'smoker', 'region']
numerical_features = ['age', 'bmi', 'children', 'charges']

# Create a column transformer to apply OneHotEncoder to the categorical features
# 'passthrough' keeps the numerical columns as they are
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough'
)

# Fit and transform the data
encoded_data = preprocessor.fit_transform(df)

# Create a new DataFrame with the encoded data
# Get the feature names after encoding
encoded_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_feature_names = list(encoded_feature_names) + numerical_features

df_encoded = pd.DataFrame(encoded_data, columns=all_feature_names)

# Display the first few rows of the new DataFrame
display(df_encoded.head())

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,age,bmi,children,charges
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,19.0,27.9,0.0,16884.924
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,18.0,33.77,1.0,1725.5523
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,28.0,33.0,3.0,4449.462
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,33.0,22.705,0.0,21984.47061
4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,32.0,28.88,0.0,3866.8552


In [9]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
X = df_encoded.drop('charges', axis=1)
y = df_encoded['charges']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

Training data shape: (1070, 11)
Testing data shape: (268, 11)


In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Initialize and train the Linear Regression model
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred_lr = linear_regressor.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_lr)
r2 = r2_score(y_test, y_pred_lr)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

Mean Absolute Error (MAE): 4181.194473753645
R-squared (R2): 0.7835929767120723


In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the Random Forest Regressor model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_regressor.predict(X_test)

# Evaluate the model
mse =  mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

Mean Absolute Error (MAE): 4181.194473753645
R-squared (R2): 0.8642665871830159


In [12]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Initialize and train the Gradient Boosting Regressor model
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbr.fit(X_train, y_train)

# Make predictions on the test set
y_pred_gbr = gbr.predict(X_test)

# Evaluate the model
mae_gbr = mean_absolute_error(y_test, y_pred_gbr)
r2_gbr = r2_score(y_test, y_pred_gbr)

print(f"Mean Absolute Error (MAE) for Gradient Boosting: {mae_gbr}")
print(f"R-squared (R2) for Gradient Boosting: {r2_gbr}")

Mean Absolute Error (MAE) for Gradient Boosting: 2402.0240220053965
R-squared (R2) for Gradient Boosting: 0.8794636885872643


In [13]:
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Scaling the data for SVR
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the SVR model
# Using a common kernel like 'rbf'
svr = SVR(kernel='rbf')
svr.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_svr = svr.predict(X_test_scaled)

# Evaluate the model
mae_svr = mean_absolute_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

print(f"Mean Absolute Error (MAE) for SVR: {mae_svr}")
print(f"R-squared (R2) for SVR: {r2_svr}")

Mean Absolute Error (MAE) for SVR: 8611.415551847522
R-squared (R2) for SVR: -0.06894401159271046


In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Define the parameter grid for Gradient Boosting
param_grid_gbr = {
    'n_estimators': [80, 150, 300],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [2, 4, 5]
}

# Initialize GridSearchCV with the Gradient Boosting Regressor
grid_search_gbr = GridSearchCV(estimator=GradientBoostingRegressor(random_state=42),
                               param_grid=param_grid_gbr,
                               scoring='neg_mean_absolute_error', # Using negative MAE for scoring
                               cv=5, # 5-fold cross-validation
                               n_jobs=-1) # Use all available cores

# Fit Grid Search to the training data
grid_search_gbr.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters for Gradient Boosting:", grid_search_gbr.best_params_)
print("Best cross-validation score (negative MAE):", grid_search_gbr.best_score_)

# You can access the best model like this:
best_gbr_model = grid_search_gbr.best_estimator_

Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}
Best cross-validation score (negative MAE): -2590.5955906036625


In [21]:
import joblib

# Save the best Gradient Boosting model
joblib.dump(best_gbr_model, 'insurance_cost_model_v1.joblib')

print("Model saved successfully as 'insurance_cost_model_v1.joblib'")

Model saved successfully as 'insurance_cost_model_v1.joblib'
