In [77]:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [78]:
df=pd.read_csv(r'insurance.csv')

In [79]:
df.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


In [80]:
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
84,75,86.2,1.73,0.62,True,Jaipur,retired,High
55,47,75.7,1.73,24.93,False,Delhi,unemployed,Low
71,38,54.1,1.81,20.25,False,Chandigarh,unemployed,Low
25,59,60.2,1.55,30.0,False,Mysore,government_job,Low


In [81]:
df['occupation'].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [82]:
df_feat = df.copy()

In [83]:
# Feature 1: BMI
df_feat["bmi"] = df_feat["weight"] / (df_feat["height"] ** 2)

In [84]:
def age_group(age):
    if age <25:
        return 'young'
    elif age < 45:
        return 'adult'
    elif age <60:
        return 'middle_aged'
    return 'senior'    

In [85]:
df_feat['age_group']=df_feat['age'].apply(age_group)

In [86]:
df_feat

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,senior
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,adult
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,adult
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,young
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,senior
...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,adult
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,adult
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,middle_aged
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,adult


In [87]:
# feature 3: lifestyle risk 
#smoker  bmi > 30 high
# smoker bmi > 27 mdeium  retun low



def lifestyle_risk(row):
    if row['smoker'] and row['bmi']>30:
        return 'high'
    elif row['smoker'] or  row['bmi']>27:
        return 'medium'
    else:
        return 'low'
    

In [88]:
df_feat['lifestyle_risk']=df_feat.apply(lifestyle_risk,axis=1)

In [89]:
df_feat

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,senior,medium
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,adult,medium
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,adult,low
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,young,high
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,senior,medium
...,...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,adult,low
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,adult,medium
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,middle_aged,low
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,adult,medium


In [90]:


tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]
     

In [91]:
# feature 4: city tier
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3

In [92]:
df_feat['city_tier']=df_feat['city'].apply(city_tier)

In [93]:
df_feat

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,senior,medium,2
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,adult,medium,1
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,adult,low,2
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,young,high,1
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,senior,medium,2
...,...,...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,adult,low,2
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,adult,medium,1
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,middle_aged,low,1
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,adult,medium,1


In [94]:
df_feat=df_feat.drop(columns=['age','weight','height','smoker','city'])
df_feat

Unnamed: 0,income_lpa,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
0,2.92000,retired,High,49.227482,senior,medium,2
1,34.28000,freelancer,Low,30.189017,adult,medium,1
2,36.64000,freelancer,Low,21.118382,adult,low,2
3,3.34000,student,Medium,45.535900,young,high,1
4,3.94000,retired,High,24.296875,senior,medium,2
...,...,...,...,...,...,...,...
95,19.64000,business_owner,Low,21.420747,adult,low,2
96,34.01000,private_job,Low,47.984483,adult,medium,1
97,44.86000,freelancer,Low,18.765432,middle_aged,low,1
98,28.30000,business_owner,Low,30.521676,adult,medium,1


In [95]:
#select feature and target 
X=df_feat[['income_lpa'	,'occupation','bmi','age_group','lifestyle_risk','city_tier']]
y=df_feat[['insurance_premium_category']]

In [96]:
X


Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier
0,2.92000,retired,49.227482,senior,medium,2
1,34.28000,freelancer,30.189017,adult,medium,1
2,36.64000,freelancer,21.118382,adult,low,2
3,3.34000,student,45.535900,young,high,1
4,3.94000,retired,24.296875,senior,medium,2
...,...,...,...,...,...,...
95,19.64000,business_owner,21.420747,adult,low,2
96,34.01000,private_job,47.984483,adult,medium,1
97,44.86000,freelancer,18.765432,middle_aged,low,1
98,28.30000,business_owner,30.521676,adult,medium,1


In [97]:
y

Unnamed: 0,insurance_premium_category
0,High
1,Low
2,Low
3,Medium
4,High
...,...
95,Low
96,Low
97,Low
98,Low


In [98]:


# Define categorical and numeric features
categorical_features = ["age_group", "lifestyle_risk", "occupation", "city_tier"]
numeric_features = ["bmi", "income_lpa"]
     

In [99]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)
  

In [100]:


# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])
    

In [101]:

# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)


  return fit_method(estimator, *args, **kwargs)


In [102]:


# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.9

In [103]:
X_test.sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier
10,32.78,business_owner,22.949982,adult,medium,1
80,50.0,unemployed,34.350461,middle_aged,medium,2
32,50.0,private_job,31.495845,middle_aged,medium,2
78,14.74,freelancer,27.932798,middle_aged,medium,2
82,12.96,unemployed,17.874812,adult,low,1


In [104]:


import pickle

# Save the trained pipeline using pickle
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)
