# Importing Packages

In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Reading Dataset

In [2]:
df = pd.read_csv("medical_insurance_dataset.csv")

# Viewing First Few Rows From the Dataset

In [3]:
df.head()

Unnamed: 0,19,1,27.9,0,1.1,3,16884.924
0,18,2,33.77,1,0,4,1725.5523
1,28,2,33.0,3,0,4,4449.462
2,33,2,22.705,0,0,1,21984.47061
3,32,2,28.88,0,0,1,3866.8552
4,31,1,25.74,0,?,4,3756.6216


# Adding new Headers to Our Dataset

In [4]:
headers = [
    "age",
    "gender",
    "bmi",
    "no_of_children",
    "smoker",
    "region",
    "charges"
]

In [5]:
df.columns = headers

In [6]:
df.head()

Unnamed: 0,age,gender,bmi,no_of_children,smoker,region,charges
0,18,2,33.77,1,0,4,1725.5523
1,28,2,33.0,3,0,4,4449.462
2,33,2,22.705,0,0,1,21984.47061
3,32,2,28.88,0,0,1,3866.8552
4,31,1,25.74,0,?,4,3756.6216


# Getting the information about Our Dataset

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2771 entries, 0 to 2770
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             2771 non-null   object 
 1   gender          2771 non-null   int64  
 2   bmi             2771 non-null   float64
 3   no_of_children  2771 non-null   int64  
 4   smoker          2771 non-null   object 
 5   region          2771 non-null   int64  
 6   charges         2771 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 151.7+ KB


# Replacing '?' with 'NaN' in all Cells

In [10]:
df.replace('?' , np.nan , inplace = True)

In [12]:
df.head()

Unnamed: 0,age,gender,bmi,no_of_children,smoker,region,charges
0,18,2,33.77,1,0.0,4,1725.5523
1,28,2,33.0,3,0.0,4,4449.462
2,33,2,22.705,0,0.0,1,21984.47061
3,32,2,28.88,0,0.0,1,3866.8552
4,31,1,25.74,0,,4,3756.6216


# Calculate missing values in each Feature

In [13]:
missing_data = df.isnull()

In [14]:
missing_data.head()

Unnamed: 0,age,gender,bmi,no_of_children,smoker,region,charges
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False


# Get summary of missing values in each feature

In [15]:
for column in missing_data.columns.values.tolist():
    print(missing_data[column].value_counts())
    print(" ")

age
False    2767
True        4
Name: count, dtype: int64
 
gender
False    2771
Name: count, dtype: int64
 
bmi
False    2771
Name: count, dtype: int64
 
no_of_children
False    2771
Name: count, dtype: int64
 
smoker
False    2764
True        7
Name: count, dtype: int64
 
region
False    2771
Name: count, dtype: int64
 
charges
False    2771
Name: count, dtype: int64
 


# Data Wrangling

Replace missing values in 'age' with mean

In [20]:
ages_mean = df['age'].astype(float).mean(axis = 0)
print("Average of ages : " , ages_mean)

Average of ages :  39.11709432598482


In [21]:
df['age'].replace(np.nan , ages_mean , inplace = True)

Replace missing values in 'smoker' with the frequency

Getting the frequency of each value in 'smoker' Feature

In [22]:
df['smoker'].value_counts()

smoker
0    2201
1     563
Name: count, dtype: int64

Getting the most Frequency value in 'smoker' Feature

In [23]:
df['smoker'].value_counts().idxmax()

'0'

In [24]:
df['smoker'].replace(np.nan , df['smoker'].value_counts().idxmax() , inplace = True)

# Evaluating missing values again

In [25]:
missing_data = df.isnull()

In [26]:
for column in missing_data.columns.tolist():
    print(missing_data[column].value_counts())
    print(" ")

age
False    2771
Name: count, dtype: int64
 
gender
False    2771
Name: count, dtype: int64
 
bmi
False    2771
Name: count, dtype: int64
 
no_of_children
False    2771
Name: count, dtype: int64
 
smoker
False    2771
Name: count, dtype: int64
 
region
False    2771
Name: count, dtype: int64
 
charges
False    2771
Name: count, dtype: int64
 


# Correct Data Format

In [27]:
df.dtypes

age                object
gender              int64
bmi               float64
no_of_children      int64
smoker             object
region              int64
charges           float64
dtype: object

Convert 'age' dtype to 'int64'

In [29]:
df['age'].astype('int64')

0       18
1       28
2       33
3       32
4       31
        ..
2766    47
2767    21
2768    19
2769    23
2770    54
Name: age, Length: 2771, dtype: int64

In [30]:
df.head()

Unnamed: 0,age,gender,bmi,no_of_children,smoker,region,charges
0,18,2,33.77,1,0,4,1725.5523
1,28,2,33.0,3,0,4,4449.462
2,33,2,22.705,0,0,1,21984.47061
3,32,2,28.88,0,0,1,3866.8552
4,31,1,25.74,0,0,4,3756.6216


# Round 'charges' feature values to 2 decimal points

In [31]:
df['charges'] = np.round(df[['charges']] , 2)

In [32]:
df.head()

Unnamed: 0,age,gender,bmi,no_of_children,smoker,region,charges
0,18,2,33.77,1,0,4,1725.55
1,28,2,33.0,3,0,4,4449.46
2,33,2,22.705,0,0,1,21984.47
3,32,2,28.88,0,0,1,3866.86
4,31,1,25.74,0,0,4,3756.62


# Saving the cleaned dataset :

In [45]:
df.to_csv('medical_insurance_dataset_cleaned.csv')