In [3]:
import pandas as pd

# Load your data file
df = pd.read_excel("../data/premiums.xlsx")  # Adjust path if needed

# Preview data
df.head()


Unnamed: 0,Age,Gender,Region,Marital_status,Number Of Dependants,BMI_Category,Smoking_Status,Employment_Status,Income_Level,Income_Lakhs,Medical History,Insurance_Plan,Annual_Premium_Amount
0,26,Male,Northwest,Unmarried,0,Normal,No Smoking,Salaried,<10L,6,Diabetes,Bronze,9053
1,29,Female,Southeast,Married,2,Obesity,Regular,Salaried,<10L,6,Diabetes,Bronze,16339
2,49,Female,Northeast,Married,2,Normal,No Smoking,Self-Employed,10L - 25L,20,High blood pressure,Silver,18164
3,30,Female,Southeast,Married,3,Normal,No Smoking,Salaried,> 40L,77,No Disease,Gold,20303
4,18,Male,Northeast,Unmarried,0,Overweight,Regular,Self-Employed,> 40L,99,High blood pressure,Silver,13365


In [4]:
# Check missing values in each column
df.isnull().sum() # type: ignore


Age                       0
Gender                    0
Region                    0
Marital_status            0
Number Of Dependants      0
BMI_Category              0
Smoking_Status           11
Employment_Status         2
Income_Level             13
Income_Lakhs              0
Medical History           0
Insurance_Plan            0
Annual_Premium_Amount     0
dtype: int64

In [None]:
# Fill missing categorical values with mode (most frequent value)
df["Smoking_Status"].fillna(df["Smoking_Status"].mode()[0], inplace=True)
df["Employment_Status"].fillna(df["Employment_Status"].mode()[0], inplace=True)
df["Income_Level"].fillna(df["Income_Level"].mode()[0], inplace=True)

# Confirm that there are no more missing values
df.isnull().sum()


In [6]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Label encode binary categories
label_cols = ["Gender", "Marital_status", "Smoking_Status", "Insurance_Plan"]
le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])

# Make sure 'Central' is present in Region (if not already in data, add it manually)
if 'Central' not in df['Region'].unique():
    df.loc[len(df)] = df.iloc[0]  # duplicate one row
    df.at[len(df)-1, 'Region'] = 'Central'  # set its region to Central

# One-Hot encode multi-category columns (including Central)
df = pd.get_dummies(df, columns=["Region", "BMI_Category", "Employment_Status", "Income_Level", "Medical History"], drop_first=True)

# ---- Disease Severity Mapping ----
severity_mapping = {
    'Medical History_No Disease': 0,
    'Medical History_Thyroid': 1,
    'Medical History_High blood pressure': 2,
    'Medical History_Diabetes & Thyroid': 3,
    'Medical History_Diabetes & High blood pressure': 4,
    'Medical History_Heart disease': 5,
    'Medical History_High blood pressure & Heart disease': 6,
    'Medical History_Diabetes & Heart disease': 7
}

# Function to calculate severity score
def get_severity(row):
    for col, score in severity_mapping.items():
        if col in row and row[col] == True:
            return score
    return 0

# Add new Medical_Severity feature
df['Medical_Severity'] = df.apply(get_severity, axis=1)

# Preview the updated DataFrame
df.head()



Unnamed: 0,Age,Gender,Marital_status,Number Of Dependants,Smoking_Status,Income_Lakhs,Insurance_Plan,Annual_Premium_Amount,Region_Northeast,Region_Northwest,...,Income_Level_> 40L,Medical History_Diabetes & Heart disease,Medical History_Diabetes & High blood pressure,Medical History_Diabetes & Thyroid,Medical History_Heart disease,Medical History_High blood pressure,Medical History_High blood pressure & Heart disease,Medical History_No Disease,Medical History_Thyroid,Medical_Severity
0,26,1,1,0,1,6,0,9053,False,True,...,False,False,False,False,False,False,False,False,False,0
1,29,0,0,2,4,6,0,16339,False,False,...,False,False,False,False,False,False,False,False,False,0
2,49,0,0,2,1,20,2,18164,True,False,...,False,False,False,False,False,True,False,False,False,2
3,30,0,0,3,1,77,1,20303,False,False,...,True,False,False,False,False,False,False,True,False,0
4,18,1,1,0,4,99,2,13365,True,False,...,True,False,False,False,False,True,False,False,False,2


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# 1. Split features and target
X = df.drop("Annual_Premium_Amount", axis=1)
y = df["Annual_Premium_Amount"]

# 2. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Train Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# 4. Make predictions
y_pred = lr.predict(X_test)

# 5. Evaluate
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")


MAE: 3292.51
RMSE: 4578.69
R² Score: 0.70


In [8]:
df.describe()

Unnamed: 0,Age,Gender,Marital_status,Number Of Dependants,Smoking_Status,Income_Lakhs,Insurance_Plan,Annual_Premium_Amount,Medical_Severity
count,50001.0,50001.0,50001.0,50001.0,50001.0,50001.0,50001.0,50001.0,50001.0
mean,34.593308,0.549609,0.51363,1.712046,2.218416,23.01786,0.910482,15767.98202,1.077318
std,15.000337,0.497538,0.499819,1.498253,1.376573,24.219074,0.874886,8419.809031,1.749257
min,18.0,0.0,0.0,-3.0,0.0,1.0,0.0,3501.0,0.0
25%,22.0,0.0,0.0,0.0,1.0,7.0,0.0,8608.0,0.0
50%,31.0,1.0,1.0,2.0,1.0,17.0,1.0,13929.0,0.0
75%,45.0,1.0,1.0,3.0,4.0,31.0,2.0,22275.0,2.0
max,356.0,1.0,1.0,5.0,5.0,930.0,2.0,43471.0,7.0


In [9]:
# Print intercept
print("Intercept (b0):", lr.intercept_)

# Print coefficients with feature names
coefficients = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": lr.coef_
})

coefficients.sort_values(by="Coefficient", ascending=False)


Intercept (b0): 5780.223437179751


Unnamed: 0,Feature,Coefficient
11,BMI_Category_Obesity,3694.03301
18,Income_Level_> 40L,2301.403602
12,BMI_Category_Overweight,1921.87744
6,Insurance_Plan,1855.584055
22,Medical History_Heart disease,1527.796028
15,Employment_Status_Self-Employed,1088.711472
4,Smoking_Status,682.206701
27,Medical_Severity,440.723896
14,Employment_Status_Salaried,331.637891
0,Age,257.630613


In [10]:
from sklearn.ensemble import RandomForestRegressor

# Train Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = rf.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print("🌳 Random Forest Results:")
print(f"MAE: {mae_rf:.2f}")
print(f"RMSE: {rmse_rf:.2f}")
print(f"R² Score: {r2_rf:.2f}")


🌳 Random Forest Results:
MAE: 806.17
RMSE: 1226.39
R² Score: 0.98


In [11]:
# Clean up feature names by removing invalid characters
X_train.columns = X_train.columns.str.replace(r"[<>[\]]", "", regex=True)
X_test.columns = X_test.columns.str.replace(r"[<>[\]]", "", regex=True)


In [12]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Initialize and train XGBoost Regressor
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)

# Predict on test data
y_pred_xgb = xgb.predict(X_test)

# Evaluate performance
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)

print("📦 XGBoost Results:")
print(f"MAE: {mae_xgb:.2f}")
print(f"RMSE: {rmse_xgb:.2f}")
print(f"R² Score: {r2_xgb:.2f}")


📦 XGBoost Results:
MAE: 766.10
RMSE: 1144.11
R² Score: 0.98


In [13]:
import joblib

# Save to model folder
joblib.dump(xgb, "../model/premium_predictor_xgb.pkl")
print("✅ Model saved to model/premium_predictor_xgb.pkl")


✅ Model saved to model/premium_predictor_xgb.pkl
