In [60]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [41]:
# Colab-specific helper: uploads file(s) from the local machine into the runtime.
# `upload` holds whatever files.upload() returns; it is not read again in the visible cells.
from google.colab import files
upload = files.upload()

Saving insurance.csv to insurance (1).csv


In [61]:
# Load the raw insurance dataset and preview its first rows.
# NOTE(review): the upload cell's recorded output reports the file being saved as
# "insurance (1).csv", so this path may be reading an older copy — confirm.
df = pd.read_csv("/content/insurance.csv")
# A bare last expression uses the notebook's rich table display instead of plain text.
df.head()

   age  weight  height  income_lpa  smoker     city  occupation  \
0   67   119.8    1.56        2.92   False   Jaipur     retired   
1   36   101.1    1.83       34.28   False  Chennai  freelancer   
2   39    56.8    1.64       36.64   False   Indore  freelancer   
3   22   109.4    1.55        3.34    True   Mumbai     student   
4   69    62.2    1.60        3.94    True   Indore     retired   

  insurance_premium_category  
0                       High  
1                        Low  
2                        Low  
3                     Medium  
4                       High  


In [62]:
# Work on a copy so engineered columns are added without modifying the raw frame `df`.
dfFeatures = df.copy()

In [63]:
# Feature 1: body-mass index = weight divided by height squared
heightSquared = df["height"].pow(2)
dfFeatures["bmi"] = df["weight"].div(heightSquared)
dfFeatures

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875
...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676


In [64]:
# Feature 2
def ageGrp(age):
    """Map a numeric age to a coarse age-bracket label.

    Brackets are closed on the left and open on the right:
    below 25 -> "Young", 25 to below 45 -> "adult",
    45 to below 65 -> "middle age". Any value that falls under none of
    those upper bounds (65 and above, or a NaN) is labelled "senior".
    """
    # Upper bounds are checked in ascending order, so the first match wins.
    brackets = ((25, "Young"), (45, "adult"), (65, "middle age"))
    for upperBound, label in brackets:
        if age < upperBound:
            return label
    return "senior"
# Label every row with the bracket that its "age" value falls into.
dfFeatures["age_group"] = dfFeatures["age"].map(ageGrp)
dfFeatures

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,senior
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,adult
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,adult
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,Young
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,senior
...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,adult
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,adult
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,middle age
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,adult


In [65]:
# Feature 3
def lifestyleRisk(row):
    """Classify one record's lifestyle risk from its smoking flag and BMI.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing
            ``'smoker'`` (truthy for smokers) and ``'bmi'`` (numeric).

    Returns:
        ``"high"`` for a smoker with BMI above 30, ``"medium"`` for a smoker
        with BMI above 27, and ``"low"`` in every other case.
    """
    # NOTE(review): both elevated tiers require `smoker`, so every non-smoker is
    # "low" regardless of BMI — confirm this is the intended rule.
    if row['smoker'] and row['bmi'] > 30:
        # Label carries no surrounding whitespace, matching "medium" and "low".
        return "high"
    elif row['smoker'] and row['bmi'] > 27:
        return "medium"
    else:
        return "low"
# Row-wise apply (axis=1): each row is handed to lifestyleRisk, which reads its
# 'smoker' and 'bmi' entries, so the "bmi" column must already exist.
dfFeatures["lifestyle_risk"] = dfFeatures.apply(lifestyleRisk, axis=1)
dfFeatures

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,senior,low
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,adult,low
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,adult,low
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,Young,high
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,senior,low
...,...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,adult,low
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,adult,low
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,middle age,low
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,adult,low


In [66]:
# City names that cityTier maps to tier 1.
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
# City names that cityTier maps to tier 2; a city found in neither list becomes tier 3.
# Matching is by exact string membership, so spelling and case must match the data.
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [67]:
# Feature 4: City Tier
def cityTier(city):
    """Return the tier number (1, 2 or 3) for a city name.

    Membership is tested against ``tier_1_cities`` first and then
    ``tier_2_cities``; a name found in neither list is tier 3.
    """
    if city in tier_1_cities:
        return 1
    return 2 if city in tier_2_cities else 3
# Translate each row's city name into its numeric tier.
dfFeatures["city_tier"] = dfFeatures["city"].map(cityTier)
dfFeatures

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,senior,low,2
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,adult,low,1
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,adult,low,2
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,Young,high,1
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,senior,low,2
...,...,...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,adult,low,2
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,adult,low,1
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,middle age,low,1
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,adult,low,1


In [71]:
# Drop the raw inputs that the engineered columns (bmi, age_group, lifestyle_risk,
# city_tier) were derived from.
# errors="ignore" keeps this cell re-runnable: because the result is assigned back to
# the same name, a second run would otherwise raise KeyError for the missing columns.
dfFeatures = dfFeatures.drop(
    columns=['age', 'weight', 'height', 'smoker', 'city'],
    errors="ignore",
)

In [75]:
# Target: the premium category label of each row.
Y = dfFeatures["insurance_premium_category"]
Y

Unnamed: 0,insurance_premium_category
0,High
1,Low
2,Low
3,Medium
4,High
...,...
95,Low
96,Low
97,Low
98,Low


In [76]:
# Feature matrix: the four engineered columns plus income_lpa and occupation.
featureColumns = ["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]
X = dfFeatures[featureColumns]
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,low,2,2.92000,retired
1,30.189017,adult,low,1,34.28000,freelancer
2,21.118382,adult,low,2,36.64000,freelancer
3,45.535900,Young,high,1,3.34000,student
4,24.296875,senior,low,2,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,adult,low,2,19.64000,business_owner
96,47.984483,adult,low,1,34.01000,private_job
97,18.765432,middle age,low,1,44.86000,freelancer
98,30.521676,adult,low,1,28.30000,business_owner


In [77]:
# Columns the preprocessor one-hot encodes. city_tier holds the integer codes 1/2/3
# produced by cityTier but is listed here with the string-valued columns.
categoricalFeaturs = ["age_group", "lifestyle_risk", "occupation", "city_tier"]
# Columns handed to the model unchanged ("passthrough" in the preprocessor).
numericalFeatures = ["bmi", "income_lpa"]

In [81]:
# One-hot encode the categorical columns and pass the numeric ones through untouched.
# handle_unknown="ignore" encodes a category that was absent from the training split
# as all zeros instead of raising at predict time — with a small dataset a rare
# occupation or age group can easily end up only in the test rows.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categoricalFeaturs),
        ("num", "passthrough", numericalFeatures),
    ]
)

In [82]:
# Import the standard-library module directly rather than through the sklearn package.
# NOTE(review): `random` is not used in the visible cells; kept so the name stays bound.
import random
# Chain preprocessing and the classifier so both are fitted and applied as one estimator.
# random_state fixes the forest's seed.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42)),
])

In [100]:
# Hold out 20% of the rows (test_size=0.2) for evaluation; random_state pins the split.
# NOTE(review): the split is not stratified on Y — confirm class balance in the test rows.
XTrain, XTest, YTrain, YTest = train_test_split(X, Y, test_size=0.2, random_state=12)
# Fit the preprocessor and the classifier on the training portion only.
pipeline.fit(XTrain, YTrain)

In [101]:
# Predict labels for the held-out rows and score them against the true labels.
yPrediction = pipeline.predict(XTest)
accuracy_score(YTest, yPrediction)

0.8

In [103]:
import pickle
# Serialize the whole fitted pipeline (preprocessor + classifier) into one file.
# The path is relative, so the file lands in the kernel's current working directory.
modelPath = "model.pkl"
with open(modelPath, "wb") as f:
  pickle.dump(pipeline, f)