Building  & Exporting the Model

In [72]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report , accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

In [73]:
df = pd.read_csv("data/people_data.csv")
df.head()

Unnamed: 0,name,age,height,weight,city,income_lpa,smoker,occupation
0,Rahul Sharma,32,175,72,Delhi,12,No,Engineer
1,Priya Verma,28,162,55,Mumbai,8,Yes,Doctor
2,Amit Singh,40,180,85,Bangalore,15,No,Manager
3,Neha Gupta,35,158,50,Chennai,10,No,Teacher
4,Vikas Patel,30,172,68,Pune,9,Yes,Designer


In [74]:
df_feat=df.copy()

In [75]:
df_feat["bmi"]=df_feat["weight"]/(df_feat["height"]**2)

In [76]:
def age_group(age):
    if age < 25:
        return "young"
    elif age <45:
        return "adult"
    elif age <60:
        return "middle_aged"
    return "senior"


In [77]:
df_feat["age_group"]=df_feat["age"].apply(age_group)

In [78]:
def lifestyle(row):
    if row["smoker"] and row["bmi"] > 30:
        return "high"
    if row["smoker"] and row["bmi"] >27:
        return " medium"
    else:
        return "low"

In [79]:
df_feat["lifestyle_risk"]=df_feat.apply(lifestyle,axis=1)

In [80]:
def premium_category(row):
    if row["income_lpa"] > 15 or row["smoker"] == "Yes":
        return "High"
    elif row["income_lpa"] >= 8:
        return "Medium"
    else:
        return "Low"

df_feat["insurance_premium_category"] = df_feat.apply(premium_category, axis=1)
df.head()

Unnamed: 0,name,age,height,weight,city,income_lpa,smoker,occupation
0,Rahul Sharma,32,175,72,Delhi,12,No,Engineer
1,Priya Verma,28,162,55,Mumbai,8,Yes,Doctor
2,Amit Singh,40,180,85,Bangalore,15,No,Manager
3,Neha Gupta,35,158,50,Chennai,10,No,Teacher
4,Vikas Patel,30,172,68,Pune,9,Yes,Designer


In [81]:
tier_1 = ["Delhi", "Mumbai", "Bangalore", "Kolkata", "Chennai", "Hyderabad", "Pune"]
tier_2 = ["Ahmedabad", "Jaipur", "Lucknow", "Chandigarh", "Surat", "Nagpur", "Indore"]
tier_3 = ["Patna", "Ranchi", "Bhubaneswar", "Mysore", "Coimbatore", "Varanasi", "Guwahati"]


def map_city_tier(city):
    if city in tier_1:
        return "city_tier_1"
    elif city in tier_2:
        return "city_tier_2"
    elif city in tier_3:
        return "city_tier_3"
    else:
        return "other"

df_feat["city_tier"] = df_feat["city"].apply(map_city_tier)

df_feat[["city", "city_tier"]].head(10)

Unnamed: 0,city,city_tier
0,Delhi,city_tier_1
1,Mumbai,city_tier_1
2,Bangalore,city_tier_1
3,Chennai,city_tier_1
4,Pune,city_tier_1
5,Hyderabad,city_tier_1
6,Kolkata,city_tier_1
7,Jaipur,city_tier_2
8,Ahmedabad,city_tier_2
9,Chandigarh,city_tier_2


In [82]:
df_feat.head()

Unnamed: 0,name,age,height,weight,city,income_lpa,smoker,occupation,bmi,age_group,lifestyle_risk,insurance_premium_category,city_tier
0,Rahul Sharma,32,175,72,Delhi,12,No,Engineer,0.002351,adult,low,Medium,city_tier_1
1,Priya Verma,28,162,55,Mumbai,8,Yes,Doctor,0.002096,adult,low,High,city_tier_1
2,Amit Singh,40,180,85,Bangalore,15,No,Manager,0.002623,adult,low,Medium,city_tier_1
3,Neha Gupta,35,158,50,Chennai,10,No,Teacher,0.002003,adult,low,Medium,city_tier_1
4,Vikas Patel,30,172,68,Pune,9,Yes,Designer,0.002299,adult,low,High,city_tier_1


In [83]:
x=df_feat[["bmi","age_group","lifestyle_risk","city_tier","income_lpa","occupation"]]
y=df_feat["insurance_premium_category"]

In [84]:
numerical_features = ['income_lpa', 'bmi']
categorical_features = ['occupation', 'age_group', 'lifestyle_risk', 'city_tier']

In [85]:
preprocessor=ColumnTransformer(
    transformers=[
        ("cat",OneHotEncoder(handle_unknown='ignore'),categorical_features),
        ("num","passthrough",numerical_features)
    ]
)

In [86]:
pipeline=Pipeline(steps=[
                  ("preprocessor",preprocessor),
                  ("classifier",RandomForestClassifier(random_state=42))
])

In [87]:
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2,random_state=1)
pipeline.fit(x_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [88]:
y_pred=pipeline.predict(x_test)
accuracy_score(y_test,y_pred)

0.8571428571428571

In [91]:
import pickle

#saving the trained pipeline using pickle
pickle_model_path="model.pkl"
with open(pickle_model_path,"wb") as f:
    pickle.dump(pipeline,f)