In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import iqr


import os

# Read the dataset.
# The CSV location can be overridden with the INSURANCE_DATA_PATH environment
# variable so the notebook also runs on other machines; when the variable is
# unset, the original local path is used.
filepath = os.environ.get(
    "INSURANCE_DATA_PATH",
    r'C:\Users\Hp\Downloads\insurance_dataset.csv',
)
data = pd.read_csv(filepath)

# Work on a random 10% sample of the rows (frac=0.1). The fixed random_state
# makes the same rows come back on every run.
SAMPLE_FRACTION = 0.1
data = data.sample(frac=SAMPLE_FRACTION, random_state=0)


# Display the first few rows of the sampled DataFrame
data.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
157105,56,male,22.63,3,yes,southwest,Heart disease,,Rarely,White collar,Premium,23476.784896
374554,23,male,43.38,0,yes,southeast,,Diabetes,Occasionally,Unemployed,Basic,14483.094719
688694,39,female,29.92,4,yes,northwest,High blood pressure,Heart disease,Rarely,Unemployed,Premium,21685.459729
265381,32,female,21.34,0,yes,northeast,Heart disease,Diabetes,Frequently,Blue collar,Standard,21856.056052
955415,64,male,19.3,1,no,northwest,High blood pressure,Diabetes,Never,Blue collar,Premium,15412.60274


In [2]:
# Summary statistics for the numeric columns
print(data.describe())

# Column dtypes and non-null counts. DataFrame.info() writes its report
# straight to stdout and returns None, so it is called on its own: wrapping it
# in print() would add a stray "None" line after the report.
data.info()

# Total number of missing cells across the whole DataFrame
print(data.isna().sum().sum())

                 age            bmi       children        charges
count  100000.000000  100000.000000  100000.000000  100000.000000
mean       41.451650      34.007784       2.497610   16721.772665
std        13.867106       9.219441       1.711112    4424.470617
min        18.000000      18.000000       0.000000    4203.740016
25%        29.000000      26.040000       1.000000   13584.720247
50%        41.000000      33.980000       2.000000   16606.923423
75%        53.000000      42.000000       4.000000   19776.041119
max        65.000000      50.000000       5.000000   32404.269435
<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 157105 to 871744
Data columns (total 12 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   age                     100000 non-null  int64  
 1   gender                  100000 non-null  object 
 2   bmi                     100000 non-null  float64
 3   children            

In [3]:
# Names of every column stored with the object (string) dtype
categorical_columns = data.select_dtypes(include=['object']).columns

# For each categorical column, print a heading followed by its distinct values
for col in categorical_columns:
    distinct_values = data[col].unique()
    print(f"Unique values in {col}:", distinct_values, sep="\n")

Unique values in gender:
['male' 'female']
Unique values in smoker:
['yes' 'no']
Unique values in region:
['southwest' 'southeast' 'northwest' 'northeast']
Unique values in medical_history:
['Heart disease' nan 'High blood pressure' 'Diabetes']
Unique values in family_medical_history:
[nan 'Diabetes' 'Heart disease' 'High blood pressure']
Unique values in exercise_frequency:
['Rarely' 'Occasionally' 'Frequently' 'Never']
Unique values in occupation:
['White collar' 'Unemployed' 'Blue collar' 'Student']
Unique values in coverage_level:
['Premium' 'Basic' 'Standard']


In [4]:
# Drop, in place, every row that contains at least one missing value.
# NOTE(review): the only NaN values shown by the unique-value listing are in
# medical_history and family_medical_history; they may mean "no condition"
# rather than "unknown" -- confirm before discarding these rows.
data.dropna(axis=0, how='any', inplace=True)

In [5]:
# Show the frame after the incomplete rows were dropped (rendered as a truncated preview)
data

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
688694,39,female,29.92,4,yes,northwest,High blood pressure,Heart disease,Rarely,Unemployed,Premium,21685.459729
265381,32,female,21.34,0,yes,northeast,Heart disease,Diabetes,Frequently,Blue collar,Standard,21856.056052
955415,64,male,19.30,1,no,northwest,High blood pressure,Diabetes,Never,Blue collar,Premium,15412.602740
4280,50,male,43.04,5,no,southeast,Heart disease,Heart disease,Never,Student,Standard,20472.321397
514674,46,male,48.70,2,yes,southwest,Diabetes,Diabetes,Rarely,Student,Basic,16661.841940
...,...,...,...,...,...,...,...,...,...,...,...,...
958767,27,female,40.14,2,yes,southwest,Heart disease,Diabetes,Occasionally,Blue collar,Basic,19275.246160
987666,61,female,38.48,4,yes,northeast,Diabetes,Heart disease,Occasionally,Blue collar,Premium,26684.870073
554477,39,male,39.55,3,no,northwest,Heart disease,High blood pressure,Occasionally,Student,Standard,16647.970185
838426,40,female,26.13,5,yes,northeast,Heart disease,High blood pressure,Rarely,Unemployed,Basic,17719.195876


In [6]:
# Remove BMI and charges outliers with the 1.5 * IQR rule.

# Make sure both columns are numeric; values that cannot be parsed become NaN.
data['bmi'] = pd.to_numeric(data['bmi'], errors='coerce')
data['charges'] = pd.to_numeric(data['charges'], errors='coerce')

# Interquartile range of each column, ignoring NaN entries
bmi_iqr = iqr(data['bmi'], nan_policy='omit')
charges_iqr = iqr(data['charges'], nan_policy='omit')

# First and third quartiles of each column
bmi_quantiles = data['bmi'].quantile([0.25, 0.75])
charges_quantiles = data['charges'].quantile([0.25, 0.75])

# [lower, upper] cutoffs: Q1 - 1.5 * IQR and Q3 + 1.5 * IQR
bmi_cutoff = [bmi_quantiles.iloc[0] - 1.5 * bmi_iqr, bmi_quantiles.iloc[1] + 1.5 * bmi_iqr]
charges_cutoff = [charges_quantiles.iloc[0] - 1.5 * charges_iqr, charges_quantiles.iloc[1] + 1.5 * charges_iqr]

# Keep only rows inside both pairs of cutoffs (bounds inclusive; NaN rows are
# dropped because they fail the comparison). Both cutoffs were computed before
# any filtering, so one combined mask selects the same rows as filtering on
# BMI and then on charges.
within_bmi = data['bmi'].between(bmi_cutoff[0], bmi_cutoff[1])
within_charges = data['charges'].between(charges_cutoff[0], charges_cutoff[1])

# .copy() detaches the result from the unfiltered frame, so later column
# assignments on `data` do not raise SettingWithCopyWarning.
data = data[within_bmi & within_charges].copy()

In [7]:
# Show the frame after the IQR outlier filter (rendered as a truncated preview)
data

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
688694,39,female,29.92,4,yes,northwest,High blood pressure,Heart disease,Rarely,Unemployed,Premium,21685.459729
265381,32,female,21.34,0,yes,northeast,Heart disease,Diabetes,Frequently,Blue collar,Standard,21856.056052
955415,64,male,19.30,1,no,northwest,High blood pressure,Diabetes,Never,Blue collar,Premium,15412.602740
4280,50,male,43.04,5,no,southeast,Heart disease,Heart disease,Never,Student,Standard,20472.321397
514674,46,male,48.70,2,yes,southwest,Diabetes,Diabetes,Rarely,Student,Basic,16661.841940
...,...,...,...,...,...,...,...,...,...,...,...,...
958767,27,female,40.14,2,yes,southwest,Heart disease,Diabetes,Occasionally,Blue collar,Basic,19275.246160
987666,61,female,38.48,4,yes,northeast,Diabetes,Heart disease,Occasionally,Blue collar,Premium,26684.870073
554477,39,male,39.55,3,no,northwest,Heart disease,High blood pressure,Occasionally,Student,Standard,16647.970185
838426,40,female,26.13,5,yes,northeast,Heart disease,High blood pressure,Rarely,Unemployed,Basic,17719.195876


In [8]:
from sklearn.preprocessing import LabelEncoder

# Replace every string (object dtype) column with integer codes.
#
# A separate encoder is fitted for each column and kept in `label_encoders`
# (column name -> fitted LabelEncoder). Refitting one shared encoder would
# overwrite the mapping of every column except the last, so the codes could
# not be decoded or applied to new rows at prediction time.
# NOTE(review): LabelEncoder numbers categories in sorted label order, which
# need not match any natural ordering of the values -- confirm this is
# acceptable for the models trained below.

# Work on an independent copy so the assignments below never write into a
# slice of an earlier frame (avoids SettingWithCopyWarning).
data = data.copy()

label_encoders = {}
label_encoder = LabelEncoder()  # stays defined even if there are no object columns
for column in data.select_dtypes(include=['object']).columns:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder
data

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
688694,39,0,29.92,4,1,1,2,1,3,2,1,21685.459729
265381,32,0,21.34,0,1,0,1,0,0,0,2,21856.056052
955415,64,1,19.30,1,0,1,2,0,1,0,1,15412.602740
4280,50,1,43.04,5,0,2,1,1,1,1,2,20472.321397
514674,46,1,48.70,2,1,3,0,0,3,1,0,16661.841940
...,...,...,...,...,...,...,...,...,...,...,...,...
958767,27,0,40.14,2,1,3,1,0,2,0,0,19275.246160
987666,61,0,38.48,4,1,0,0,1,2,0,1,26684.870073
554477,39,1,39.55,3,0,1,1,2,2,1,2,16647.970185
838426,40,0,26.13,5,1,0,1,2,3,2,0,17719.195876


In [9]:
from sklearn.model_selection import train_test_split

# Separate the regression target (charges) from the predictor columns
y = data['charges']
X = data.drop(columns=['charges'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import mlflow

# Step 4: Initialize MLflow experiment (every run below is recorded under it)
mlflow.set_experiment("insurance_cost_prediction")

# Define models to compare
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42, n_jobs=-1),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Store performance metrics for comparison: model name -> {"RMSE": ..., "MAE": ...}
model_metrics = {}

# Step 5: Train and evaluate each model, one MLflow run per model
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Train the model
        model.fit(X_train, y_train)

        # Predictions on the held-out set
        y_pred = model.predict(X_test)

        # Calculate metrics.
        # RMSE is computed as sqrt(MSE) rather than with squared=False: that
        # keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so
        # this form works on both old and new versions.
        rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae = mean_absolute_error(y_test, y_pred)

        # Log metrics and model parameters
        mlflow.log_param("model_type", model_name)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)

        # Save metrics for comparison
        model_metrics[model_name] = {"RMSE": rmse, "MAE": mae}

        # Log the fitted model as an artifact of this run
        mlflow.sklearn.log_model(model, model_name)

# Display metrics. The loop variable is `name`, not `model`, so the fitted
# estimator bound to `model` above is not replaced by a string.
print("Model Performance Comparison:")
for name, metrics in model_metrics.items():
    print(f"{name}: RMSE = {metrics['RMSE']:.2f}, MAE = {metrics['MAE']:.2f}")



2024/11/10 02:37:18 INFO mlflow.tracking.fluent: Experiment with name 'insurance_cost_prediction' does not exist. Creating a new experiment.


Model Performance Comparison:
Linear Regression: RMSE = 3153.25, MAE = 2540.87
Random Forest: RMSE = 509.89, MAE = 408.17
Gradient Boosting: RMSE = 577.04, MAE = 461.85


In [11]:
from xgboost import XGBRegressor
from sklearn.svm import SVR

# Add SVR and XGBoost to our model dictionary
# NOTE(review): SVR is fitted on the features exactly as encoded, with no
# scaling step; RBF-kernel SVR is generally sensitive to feature and target
# scale -- consider a scaling pipeline if its score matters.
additional_models = {
    "Support Vector Regression": SVR(kernel='rbf'),  # Default RBF kernel
    "XGBoost": XGBRegressor(objective='reg:squarederror', random_state=42)
}

# Train each new model in its own MLflow run and add its scores to model_metrics
for model_name, model in additional_models.items():
    with mlflow.start_run(run_name=model_name):
        # Train the model
        model.fit(X_train, y_train)

        # Predictions on the held-out set
        y_pred = model.predict(X_test)

        # Calculate metrics.
        # RMSE is computed as sqrt(MSE) rather than with squared=False: that
        # keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so
        # this form works on both old and new versions.
        rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae = mean_absolute_error(y_test, y_pred)

        # Log metrics and model parameters
        mlflow.log_param("model_type", model_name)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)

        # Save metrics for comparison
        model_metrics[model_name] = {"RMSE": rmse, "MAE": mae}

        # Log the model with input example (first five test rows)
        mlflow.sklearn.log_model(model, model_name, input_example=X_test[:5])

# Display updated metrics. The loop variable is `name`, not `model`, so the
# fitted estimator bound to `model` above is not replaced by a string.
print("Updated Model Performance Comparison:")
for name, metrics in model_metrics.items():
    print(f"{name}: RMSE = {metrics['RMSE']:.2f}, MAE = {metrics['MAE']:.2f}")




Updated Model Performance Comparison:
Linear Regression: RMSE = 3153.25, MAE = 2540.87
Random Forest: RMSE = 509.89, MAE = 408.17
Gradient Boosting: RMSE = 577.04, MAE = 461.85
Support Vector Regression: RMSE = 4237.46, MAE = 3445.46
XGBoost: RMSE = 374.48, MAE = 306.60


In [12]:
import joblib

# Fetch the fitted XGBoost regressor from the additional_models dictionary
xgboost_model = additional_models["XGBoost"]

# Serialize the model to disk. dump() returns the list of file names it wrote,
# which the notebook shows as this cell's output.
joblib.dump(xgboost_model, "xgboost_model.pkl")


['xgboost_model.pkl']