In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the pre-processed premium dataset and list the columns it provides
df = pd.read_excel(io='processed_premiums.xlsx')
print(df.columns)


Index(['age', 'gender', 'region', 'marital_status', 'number_of_dependants',
       'bmi_category', 'smoking_status', 'employment_status', 'income_level',
       'income_lakhs', 'medical_history', 'insurance_plan',
       'annual_premium_amount', 'medical_score', 'normalized_risk_score',
       'insurance_plan_numerical', 'employment_score', 'smoking_score'],
      dtype='object')


In [3]:
# Drop the text columns smoking_status, employment_status, medical_history and
# insurance_plan, plus normalized_risk_score (presumably superseded by the
# *_score / *_numerical columns that stay -- confirm).
# Note: `df` is overwritten, so running this cell a second time raises KeyError.
redundant_cols = [
    'smoking_status',
    'employment_status',
    'medical_history',
    'insurance_plan',
    'normalized_risk_score',
]
df = df.drop(columns=redundant_cols)
df.columns

Index(['age', 'gender', 'region', 'marital_status', 'number_of_dependants',
       'bmi_category', 'income_level', 'income_lakhs', 'annual_premium_amount',
       'medical_score', 'insurance_plan_numerical', 'employment_score',
       'smoking_score'],
      dtype='object')

In [4]:
# Dimensions after the drop, followed by the first record as a spot check
print(df.shape)
df.head(n=1)

(49808, 13)


Unnamed: 0,age,gender,region,marital_status,number_of_dependants,bmi_category,income_level,income_lakhs,annual_premium_amount,medical_score,insurance_plan_numerical,employment_score,smoking_score
0,26,Male,Northwest,Unmarried,0,Normal,<10L,6,9053,3,1,3,1


In [5]:
# One-hot encode the nominal columns. drop_first=True omits the first level of
# each variable, so that level becomes the implicit baseline category.
categorical_cols = ['region', 'marital_status', 'gender', 'bmi_category']
df_encoded = pd.get_dummies(df[categorical_cols], drop_first=True)
df_encoded.columns


Index(['region_Northwest', 'region_Southeast', 'region_Southwest',
       'marital_status_Unmarried', 'gender_Male', 'bmi_category_Obesity',
       'bmi_category_Overweight', 'bmi_category_Underweight'],
      dtype='object')

In [6]:
# Confirm the dummy frame has one row per record in df and see how many dummy columns were created
df_encoded.shape

(49808, 8)

In [7]:
# Remove the original categorical columns now that region, marital_status,
# gender and bmi_category live in df_encoded as dummies. income_level is
# dropped without being encoded (presumably income_lakhs covers it -- confirm).
# Note: `df` is overwritten, so running this cell a second time raises KeyError.
categorical_to_drop = ['marital_status', 'region', 'gender', 'income_level', 'bmi_category']
df = df.drop(columns=categorical_to_drop)
df.head(n=1)

Unnamed: 0,age,number_of_dependants,income_lakhs,annual_premium_amount,medical_score,insurance_plan_numerical,employment_score,smoking_score
0,26,0,6,9053,3,1,3,1


In [8]:
# Append the one-hot (dummy) columns to the numeric columns, side by side
df_with_encoded = pd.concat(objs=[df, df_encoded], axis='columns')
df_with_encoded.head(n=2)

Unnamed: 0,age,number_of_dependants,income_lakhs,annual_premium_amount,medical_score,insurance_plan_numerical,employment_score,smoking_score,region_Northwest,region_Southeast,region_Southwest,marital_status_Unmarried,gender_Male,bmi_category_Obesity,bmi_category_Overweight,bmi_category_Underweight
0,26,0,6,9053,3,1,3,1,True,False,False,True,True,False,False,False
1,29,2,6,16339,3,1,3,3,False,True,False,False,False,True,False,False


In [9]:
# Row count must be unchanged by the concat; column count = numeric columns + dummy columns
df_with_encoded.shape

(49808, 16)

In [10]:
# Min-max scale the numeric features and turn the True/False dummies into 1/0
from sklearn.preprocessing import MinMaxScaler

# Columns to scale. The order matters: the scaler is fitted on the columns in
# exactly this order, so any later transform() on raw data must pass them in
# the same order.
cols_to_scale = [
    'age', 'number_of_dependants', 'income_lakhs',
    'medical_score', 'employment_score',
    'smoking_score', 'insurance_plan_numerical'
]

# Extract target (left unscaled)
x_target = df_with_encoded['annual_premium_amount']

# Remaining features: everything except the target (drop returns a new frame)
x_remain = df_with_encoded.drop(columns='annual_premium_amount')

# Convert True/False dummy columns to 1/0 with a dtype cast. This replaces the
# deprecated DataFrame.applymap call, which ran a Python lambda on every cell
# and relied on `is True` / `is False` identity checks.
bool_cols = x_remain.select_dtypes(include='bool').columns
x_remain = x_remain.astype({col: 'int64' for col in bool_cols})

# Scale selected columns to the [0, 1] range.
# NOTE(review): the scaler is fitted on every row of the dataset; if a
# train/test split is made downstream, the hold-out rows have already
# influenced the min/max -- confirm this is acceptable.
scaler = MinMaxScaler()
x_remain[cols_to_scale] = scaler.fit_transform(x_remain[cols_to_scale])

# Check result
x_remain.head(2)



  x_remain = x_remain.applymap(lambda v: 1 if v is True else (0 if v is False else v))


Unnamed: 0,age,number_of_dependants,income_lakhs,medical_score,insurance_plan_numerical,employment_score,smoking_score,region_Northwest,region_Southeast,region_Southwest,marital_status_Unmarried,gender_Male,bmi_category_Obesity,bmi_category_Overweight,bmi_category_Underweight
0,0.148148,0.0,0.05102,0.428571,0.0,1.0,0.0,1,0,0,1,1,0,0,0
1,0.203704,0.4,0.05102,0.428571,0.0,1.0,1.0,0,1,0,0,0,1,0,0


In [11]:
# Features and target must have the same number of rows
print('X_remain shape;{}'.format(x_remain.shape))
print('x_target shape:{}'.format(x_target.shape))

X_remain shape;(49808, 15)
x_target shape:(49808,)


In [12]:
# Re-attach the unscaled target as the last column of the feature frame
df_final = pd.concat(objs=[x_remain, x_target], axis='columns')
df_final.head(n=4)

Unnamed: 0,age,number_of_dependants,income_lakhs,medical_score,insurance_plan_numerical,employment_score,smoking_score,region_Northwest,region_Southeast,region_Southwest,marital_status_Unmarried,gender_Male,bmi_category_Obesity,bmi_category_Overweight,bmi_category_Underweight,annual_premium_amount
0,0.148148,0.0,0.05102,0.428571,0.0,1.0,0.0,1,0,0,1,1,0,0,0,9053
1,0.203704,0.4,0.05102,0.428571,0.0,1.0,1.0,0,1,0,0,0,1,0,0,16339
2,0.574074,0.4,0.193878,0.285714,0.5,0.5,0.0,0,0,0,0,0,0,0,0,18164
3,0.222222,0.6,0.77551,0.0,1.0,1.0,0.0,0,1,0,0,0,0,0,0,20303


In [13]:
# Feature columns plus the target column; the row count should be unchanged
df_final.shape

(49808, 16)

In [14]:
# Persist the fitted MinMaxScaler so raw inputs can later be transformed
# with the same per-column min/max that were learned here
import pickle
with open('minmax_scaler.pkl', mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [15]:
# now check the VIF in the feature columns (X)
# VIF stands for Variance Inflation Factor
# It measures how much a column is linearly explained by the other columns
# If VIF > 10, it indicates high multicollinearity
# --> The column can be mostly explained by other columns and may be dropped

from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(df):
    """Compute the Variance Inflation Factor (VIF) of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix. Every column must be numeric or boolean; booleans are
        treated as 0.0 / 1.0. A column that cannot be cast to float makes
        ``astype`` raise.

    Returns
    -------
    pandas.DataFrame
        One row per input column, with the columns ``"Feature"`` (the column
        name) and ``"VIF"`` (the value returned by
        ``variance_inflation_factor`` for that column).

    Notes
    -----
    NOTE(review): no intercept column is added here, so the matrix handed to
    statsmodels is exactly the feature matrix -- confirm that is intended.
    """
    # Build the float matrix once. ``df.values`` on a frame that mixes bool and
    # numeric columns yields an object array, and the original re-materialised
    # it on every loop iteration.
    design = df.astype(float).to_numpy()
    return pd.DataFrame({
        "Feature": list(df.columns),
        "VIF": [variance_inflation_factor(design, i) for i in range(design.shape[1])],
    })


In [16]:
# Compute VIF on an all-float copy of the feature matrix: x_remain itself is
# left untouched, and any boolean dummy column is treated as 0.0 / 1.0.
# (Previously x_vif was created but x_remain was passed instead.)
x_vif = x_remain.astype(float)

vif_result = calculate_vif(x_vif)
print(vif_result)

                     Feature       VIF
0                        age  4.154079
1       number_of_dependants  4.478887
2               income_lakhs  2.469477
3              medical_score  2.314755
4   insurance_plan_numerical  3.431011
5           employment_score  2.714551
6              smoking_score  1.944540
7           region_Northwest  2.096832
8           region_Southeast  2.913030
9           region_Southwest  2.662131
10  marital_status_Unmarried  3.379148
11               gender_Male  2.396690
12      bmi_category_Obesity  1.352330
13   bmi_category_Overweight  1.548520
14  bmi_category_Underweight  1.301910


In [17]:
# Final table for model training: scaled numeric features, 0/1 dummy columns
# and the unscaled annual_premium_amount target as the last column
df_final.head(4)

Unnamed: 0,age,number_of_dependants,income_lakhs,medical_score,insurance_plan_numerical,employment_score,smoking_score,region_Northwest,region_Southeast,region_Southwest,marital_status_Unmarried,gender_Male,bmi_category_Obesity,bmi_category_Overweight,bmi_category_Underweight,annual_premium_amount
0,0.148148,0.0,0.05102,0.428571,0.0,1.0,0.0,1,0,0,1,1,0,0,0,9053
1,0.203704,0.4,0.05102,0.428571,0.0,1.0,1.0,0,1,0,0,0,1,0,0,16339
2,0.574074,0.4,0.193878,0.285714,0.5,0.5,0.0,0,0,0,0,0,0,0,0,18164
3,0.222222,0.6,0.77551,0.0,1.0,1.0,0.0,0,1,0,0,0,0,0,0,20303


In [18]:
# Last sanity check before writing to disk: dimensions and per-column null counts
print(df_final.shape)
df_final.isna().sum()

(49808, 16)


age                         0
number_of_dependants        0
income_lakhs                0
medical_score               0
insurance_plan_numerical    0
employment_score            0
smoking_score               0
region_Northwest            0
region_Southeast            0
region_Southwest            0
marital_status_Unmarried    0
gender_Male                 0
bmi_category_Obesity        0
bmi_category_Overweight     0
bmi_category_Underweight    0
annual_premium_amount       0
dtype: int64

In [19]:
# Write the training table to CSV, leaving out the row index
df_final.to_csv(path_or_buf='training_data.csv', index=False)