# Health Insurance Claims Prediction

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

Reading the dataset

In [2]:
# Location of the raw claims CSV. It defaults to the path used when this
# notebook was authored; set the HEALTH_INSURANCE_CSV environment variable to
# point at your own copy instead of editing this cell.
DATA_PATH = os.environ.get(
    'HEALTH_INSURANCE_CSV',
    '/Users/arunekambaram/Desktop/Health Insurance Claim Prediction/Health Insurance Data/1651277648862_healthinsurance.csv',
)

# Load the raw data and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,age,sex,weight,bmi,hereditary_diseases,no_of_dependents,smoker,city,bloodpressure,diabetes,regular_ex,job_title,claim
0,60.0,male,64,24.3,NoDisease,1,0,NewYork,72,0,0,Actor,13112.6
1,49.0,female,75,22.6,NoDisease,1,0,Boston,78,1,1,Engineer,9567.0
2,32.0,female,64,17.8,Epilepsy,2,1,Phildelphia,88,1,1,Academician,32734.2
3,61.0,female,53,36.4,NoDisease,1,1,Pittsburg,72,1,0,Chef,48517.6
4,19.0,female,50,20.6,NoDisease,0,0,Buffalo,82,1,0,HomeMakers,1731.7


In [3]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(15000, 13)

In [4]:
# Per-column dtype and non-null count; a non-null count below the row total
# marks a column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  14604 non-null  float64
 1   sex                  15000 non-null  object 
 2   weight               15000 non-null  int64  
 3   bmi                  14044 non-null  float64
 4   hereditary_diseases  15000 non-null  object 
 5   no_of_dependents     15000 non-null  int64  
 6   smoker               15000 non-null  int64  
 7   city                 15000 non-null  object 
 8   bloodpressure        15000 non-null  int64  
 9   diabetes             15000 non-null  int64  
 10  regular_ex           15000 non-null  int64  
 11  job_title            15000 non-null  object 
 12  claim                15000 non-null  float64
dtypes: float64(3), int64(6), object(4)
memory usage: 1.5+ MB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,weight,bmi,no_of_dependents,smoker,bloodpressure,diabetes,regular_ex,claim
count,14604.0,15000.0,14044.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,39.547521,64.9096,30.266413,1.129733,0.198133,68.650133,0.777,0.224133,13401.43762
std,14.015966,13.701935,6.12295,1.228469,0.398606,19.418515,0.416272,0.417024,12148.239619
min,18.0,34.0,16.0,0.0,0.0,0.0,0.0,0.0,1121.9
25%,27.0,54.0,25.7,0.0,0.0,64.0,1.0,0.0,4846.9
50%,40.0,63.0,29.4,1.0,0.0,71.0,1.0,0.0,9545.65
75%,52.0,76.0,34.4,2.0,0.0,80.0,1.0,0.0,16519.125
max,64.0,95.0,53.1,5.0,1.0,122.0,1.0,1.0,63770.4


In [6]:
# Same summary statistics, transposed so that each feature is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,14604.0,39.547521,14.015966,18.0,27.0,40.0,52.0,64.0
weight,15000.0,64.9096,13.701935,34.0,54.0,63.0,76.0,95.0
bmi,14044.0,30.266413,6.12295,16.0,25.7,29.4,34.4,53.1
no_of_dependents,15000.0,1.129733,1.228469,0.0,0.0,1.0,2.0,5.0
smoker,15000.0,0.198133,0.398606,0.0,0.0,0.0,0.0,1.0
bloodpressure,15000.0,68.650133,19.418515,0.0,64.0,71.0,80.0,122.0
diabetes,15000.0,0.777,0.416272,0.0,1.0,1.0,1.0,1.0
regular_ex,15000.0,0.224133,0.417024,0.0,0.0,0.0,0.0,1.0
claim,15000.0,13401.43762,12148.239619,1121.9,4846.9,9545.65,16519.125,63770.4


#### Data cleaning

In [7]:
# Number of missing values in each column.
df.isna().sum()

age                    396
sex                      0
weight                   0
bmi                    956
hereditary_diseases      0
no_of_dependents         0
smoker                   0
city                     0
bloodpressure            0
diabetes                 0
regular_ex               0
job_title                0
claim                    0
dtype: int64

In [8]:
from sklearn import preprocessing

# String-valued columns that must be converted to integer codes.
columns = ['hereditary_diseases', 'job_title', 'city', 'sex']

# One fitted LabelEncoder per column, keyed by column name, so the integer
# codes can later be mapped back to the original labels.
label_encoders = {}
for column_name in columns:
    encoder = preprocessing.LabelEncoder()
    # Replace the string column in place with its integer codes.
    df[column_name] = encoder.fit_transform(df[column_name])
    label_encoders[column_name] = encoder

# Explicit `le_<column>` names, matching the variables this cell has always
# exposed, so any later cell that refers to them keeps working.
le_hereditary_diseases = label_encoders['hereditary_diseases']
le_job_title = label_encoders['job_title']
le_city = label_encoders['city']
le_sex = label_encoders['sex']

# NOTE(review): the integer codes imply an ordering that nominal columns such
# as city and job_title presumably do not have - confirm this is acceptable
# for the models used later.
df.head()  # display the transformed result

Unnamed: 0,age,sex,weight,bmi,hereditary_diseases,no_of_dependents,smoker,city,bloodpressure,diabetes,regular_ex,job_title,claim
0,60.0,1,64,24.3,8,1,0,55,72,0,0,2,13112.6
1,49.0,0,75,22.6,8,1,0,5,78,1,1,16,9567.0
2,32.0,0,64,17.8,4,2,1,63,88,1,1,0,32734.2
3,61.0,0,53,36.4,8,1,1,64,72,1,0,10,48517.6
4,19.0,0,50,20.6,8,0,0,8,82,1,0,22,1731.7


### Checking the skewness of the data to decide how to impute the columns with null values

In [9]:
# Skewness of every column; this runs after label encoding, so the encoded
# categorical columns are included as well.
df.skew()

age                    0.005577
sex                    0.040546
weight                 0.286034
bmi                    0.375259
hereditary_diseases   -4.724370
no_of_dependents       0.928935
smoker                 1.514812
city                  -0.063712
bloodpressure         -1.963879
diabetes              -1.331037
regular_ex             1.323201
job_title             -0.198130
claim                  1.497829
dtype: float64

Some features are stored as strings, and string values cannot be combined with int/float values in numeric operations, which leads to many errors in subsequent code. We therefore perform label encoding, converting all object-type features to integer-type features.

Since both columns with missing values, age and bmi, have a skewness within the range -0.5 to +0.5, they are fairly symmetrical, so we fill their null values with the mean.

In [11]:
# Mean-impute the missing ages.
# NOTE: `updated_df` is bound to the same object as `df` (no copy is made), so
# the filled column is visible through both names.
updated_df = df
age_mean = updated_df['age'].mean()
updated_df['age'] = updated_df['age'].fillna(age_mean)

# Verify that `age` no longer has missing values.
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  15000 non-null  float64
 1   sex                  15000 non-null  int64  
 2   weight               15000 non-null  int64  
 3   bmi                  14044 non-null  float64
 4   hereditary_diseases  15000 non-null  int64  
 5   no_of_dependents     15000 non-null  int64  
 6   smoker               15000 non-null  int64  
 7   city                 15000 non-null  int64  
 8   bloodpressure        15000 non-null  int64  
 9   diabetes             15000 non-null  int64  
 10  regular_ex           15000 non-null  int64  
 11  job_title            15000 non-null  int64  
 12  claim                15000 non-null  float64
dtypes: float64(3), int64(10)
memory usage: 1.5 MB


In [12]:
# Mean-impute the missing BMI values.
# NOTE: `updated_df` is bound to the same object as `df` (no copy is made), so
# the filled column is visible through both names.
updated_df = df
bmi_mean = updated_df['bmi'].mean()
updated_df['bmi'] = updated_df['bmi'].fillna(bmi_mean)

# Verify that `bmi` no longer has missing values.
updated_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  15000 non-null  float64
 1   sex                  15000 non-null  int64  
 2   weight               15000 non-null  int64  
 3   bmi                  15000 non-null  float64
 4   hereditary_diseases  15000 non-null  int64  
 5   no_of_dependents     15000 non-null  int64  
 6   smoker               15000 non-null  int64  
 7   city                 15000 non-null  int64  
 8   bloodpressure        15000 non-null  int64  
 9   diabetes             15000 non-null  int64  
 10  regular_ex           15000 non-null  int64  
 11  job_title            15000 non-null  int64  
 12  claim                15000 non-null  float64
dtypes: float64(3), int64(10)
memory usage: 1.5 MB


In [13]:
# Count rows that are exact repeats of an earlier row. `df` is the same object
# as `updated_df` (assigned without a copy), so the imputed values are what
# get compared here.
df.duplicated().sum()

1096

In [14]:
# Inspect the rows flagged as duplicates.
df[df.duplicated()]

Unnamed: 0,age,sex,weight,bmi,hereditary_diseases,no_of_dependents,smoker,city,bloodpressure,diabetes,regular_ex,job_title,claim
605,46.0,0,68,30.266413,8,2,0,41,60,1,0,12,8825.1
608,27.0,0,82,30.266413,8,3,0,57,82,1,1,4,18804.8
898,48.0,0,67,33.100000,0,0,1,15,70,1,0,24,40974.2
919,26.0,1,56,23.700000,8,2,0,36,72,1,0,22,3484.3
970,48.0,0,70,28.900000,8,1,0,35,72,0,0,4,9249.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14966,46.0,1,46,22.300000,8,0,0,42,72,1,0,0,7147.1
14971,18.0,0,53,27.300000,8,3,1,5,85,1,0,33,18223.5
14987,47.0,1,94,47.500000,8,1,0,68,61,1,0,0,8083.9
14989,44.0,1,90,38.100000,8,1,0,26,76,1,0,16,7152.7


In [15]:
# Drop repeated rows, keeping the first occurrence of each. This rebinds
# `updated_df` to a new frame: `df` still contains the duplicates, and the
# surviving rows keep their original index labels.
updated_df = updated_df.drop_duplicates()

In [16]:
# Confirm that no duplicate rows remain.
updated_df.duplicated().sum()

0