In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [14]:
insurance_data = pd.read_csv("cleaned_dataset.csv")  # Replace with the actual file path


In [15]:
insurance_data.head()

Unnamed: 0.1,Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges,is_child,is_active,weight_status
0,0,55,female,36.41,0,yes,northeast,No history,No history,Never,Student,Basic,11896.83661,False,False,obese
1,1,64,female,20.12,2,no,northeast,High blood pressure,High blood pressure,Never,Blue collar,Basic,9563.655011,False,False,normal
2,2,40,female,44.93,2,yes,northeast,No history,Diabetes,Occasionally,Unemployed,Basic,14036.54413,False,True,obese
3,3,22,female,32.13,5,yes,northeast,Diabetes,No history,Never,Student,Basic,13669.57783,False,False,obese
4,4,64,female,29.31,2,no,northeast,High blood pressure,No history,Frequently,Unemployed,Basic,9414.800786,False,True,overweight


In [16]:
insurance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250343 entries, 0 to 250342
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Unnamed: 0              250343 non-null  int64  
 1   age                     250343 non-null  int64  
 2   gender                  250343 non-null  object 
 3   bmi                     250343 non-null  float64
 4   children                250343 non-null  int64  
 5   smoker                  250343 non-null  object 
 6   region                  250343 non-null  object 
 7   medical_history         250343 non-null  object 
 8   family_medical_history  250343 non-null  object 
 9   exercise_frequency      250343 non-null  object 
 10  occupation              250343 non-null  object 
 11  coverage_level          250343 non-null  object 
 12  charges                 250343 non-null  float64
 13  is_child                250343 non-null  bool   
 14  is_active           

In [17]:
insurance_data.columns

Index(['Unnamed: 0', 'age', 'gender', 'bmi', 'children', 'smoker', 'region',
       'medical_history', 'family_medical_history', 'exercise_frequency',
       'occupation', 'coverage_level', 'charges', 'is_child', 'is_active',
       'weight_status'],
      dtype='object')

In [19]:
# Check for missing values
print(insurance_data.isnull().sum())

Unnamed: 0                0
age                       0
gender                    0
bmi                       0
children                  0
smoker                    0
region                    0
medical_history           0
family_medical_history    0
exercise_frequency        0
occupation                0
coverage_level            0
charges                   0
is_child                  0
is_active                 0
weight_status             0
dtype: int64


In [25]:
categorical_columns = ['gender', 'smoker', 'region', 'children', 'medical_history', 'family_medical_history', 'exercise_frequency',
       'occupation', 'is_child','coverage_level','weight_status', 'is_active']

for col in categorical_columns:
    insurance_data[col] = insurance_data[col].astype('category')

In [26]:
# One-hot encoding for categorical columns
insurance_data_encoded = pd.get_dummies(insurance_data, columns = categorical_columns, drop_first = True).astype(int)

In [27]:
insurance_data_encoded.head()

Unnamed: 0.1,Unnamed: 0,age,bmi,charges,gender_male,smoker_yes,children_1,children_2,children_3,children_4,...,exercise_frequency_Occasionally,exercise_frequency_Rarely,occupation_Student,occupation_Unemployed,occupation_White collar,coverage_level_Premium,coverage_level_Standard,weight_status_obese,weight_status_overweight,is_active_True
0,0,55,36,11896,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,1,64,20,9563,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,40,44,14036,0,1,0,1,0,0,...,1,0,0,1,0,0,0,1,0,1
3,3,22,32,13669,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,4,64,29,9414,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,1


In [32]:
 # Define numerical features to standardize (all non-categorical features)
numerical_features = ["age", "bmi", "charges"]
scaler = StandardScaler()
insurance_data_encoded[numerical_features] = scaler.fit_transform(insurance_data_encoded[numerical_features])
    

In [34]:
insurance_data_encoded.head()

Unnamed: 0.1,Unnamed: 0,age,bmi,charges,gender_male,smoker_yes,children_1,children_2,children_3,children_4,...,exercise_frequency_Occasionally,exercise_frequency_Rarely,occupation_Student,occupation_Unemployed,occupation_White collar,coverage_level_Premium,coverage_level_Standard,weight_status_obese,weight_status_overweight,is_active_True
0,0,0.974011,0.270158,-1.211659,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,1,1.622392,-1.462953,-1.741081,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,-0.106624,1.136713,-0.726035,0,1,0,1,0,0,...,1,0,0,1,0,0,0,1,0,1
3,3,-1.403386,-0.16312,-0.809317,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,4,1.622392,-0.488078,-1.774893,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,1


# Splitting the Dataset in to training & testing

In [36]:
X = insurance_data_encoded.drop(['charges'], axis=1)
y = insurance_data_encoded['charges']

In [37]:
# Splitting the dataset for training and testing the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Let us check shape of the training & test set
print(X_train.shape, y_test.shape)

(200274, 27) (50069,)
