## insurance.csv
| Column    | Data Type | Description                                                      |
|-----------|-----------|------------------------------------------------------------------|
| `age`       | int       | Age of the primary beneficiary.                                  |
| `sex`       | object    | Gender of the insurance contractor (male or female).             |
| `bmi`       | float     | Body mass index, a key indicator of body fat based on height and weight. |
| `children`  | int       | Number of dependents covered by the insurance plan.              |
| `smoker`    | object    | Indicates whether the beneficiary smokes (yes or no).            |
| `region`    | object    | The beneficiary's residential area in the US, divided into four regions. |
| `charges`   | float     | Individual medical costs billed by health insurance.             |


In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw insurance records from disk and preview the first rows
insurance_data_path = 'insurance.csv'
insurance = pd.read_csv(insurance_data_path)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0.0,yes,southwest,16884.924
1,18.0,male,33.77,1.0,no,Southeast,1725.5523
2,28.0,male,33.0,3.0,no,southeast,$4449.462
3,33.0,male,22.705,0.0,no,northwest,$21984.47061
4,32.0,male,28.88,0.0,no,northwest,$3866.8552


In [3]:
# Dataset dimensions as (rows, columns)
insurance.shape

(1338, 7)

In [4]:
# Count the missing values in each column
insurance.isna().sum()

age         66
sex         66
bmi         66
children    66
smoker      66
region      66
charges     54
dtype: int64

In [5]:
# Keep only the rows that have a value in every column
insurance = insurance.dropna()

In [6]:
# Remove exact duplicate rows, then print how many duplicates remain
insurance = insurance.drop_duplicates()
print(insurance.duplicated().sum())

0


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
insurance.describe()

Unnamed: 0,age,bmi,children
count,1208.0,1208.0,1208.0
mean,35.35596,30.574971,0.942881
std,22.061241,6.117562,1.311809
min,-64.0,15.96,-4.0
25%,24.75,26.195,0.0
50%,38.0,30.23,1.0
75%,51.0,34.58,2.0
max,64.0,53.13,5.0


In [8]:
# Keep only rows with a positive age and a non-negative number of children
valid_rows = (insurance['age'] > 0) & (insurance['children'] >= 0)
insurance = insurance[valid_rows]

In [9]:
# Strip the dollar sign (and any comma) from 'charges', then convert the column
# from object data type to float data type.
# The pattern is a raw string: in a normal string literal '\$' is an invalid
# escape sequence, which Python reports with a warning.
insurance['charges'] = insurance['charges'].replace(r'[\$,]', '', regex=True).astype(float)

In [10]:
# Encode 'sex' as an integer: the male spellings become 1, the female spellings become 0
sex_codes = {
    **dict.fromkeys(['male', 'man', 'M'], 1),
    **dict.fromkeys(['female', 'woman', 'F'], 0),
}
insurance['sex'] = insurance['sex'].replace(sex_codes)

In [11]:
# Encode 'smoker' as an integer flag: 'yes' becomes 1, 'no' becomes 0
smoker_flags = {'yes': 1, 'no': 0}
insurance['smoker'] = insurance['smoker'].map(smoker_flags)

In [12]:
# Lower-case the region names so spelling variants collapse, then one-hot encode
insurance = pd.get_dummies(
    insurance.assign(region=insurance['region'].str.lower())
)

In [13]:
# Show the column dtypes and non-null counts of the frame after cleaning and encoding
insurance.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1108 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1108 non-null   float64
 1   sex               1108 non-null   int64  
 2   bmi               1108 non-null   float64
 3   children          1108 non-null   float64
 4   smoker            1108 non-null   int64  
 5   charges           1108 non-null   float64
 6   region_northeast  1108 non-null   uint8  
 7   region_northwest  1108 non-null   uint8  
 8   region_southeast  1108 non-null   uint8  
 9   region_southwest  1108 non-null   uint8  
dtypes: float64(4), int64(2), uint8(4)
memory usage: 64.9 KB


In [14]:
# Build Model

# Separate the predictors from the target column.
X = insurance.drop('charges', axis=1)
y = insurance['charges']

# Estimate out-of-sample R^2 with 5-fold cross-validation. The scaler sits
# inside the pipeline so it is re-fitted on each training fold only; scaling
# the full matrix before splitting would let held-out rows influence training.
cv_pipeline = make_pipeline(StandardScaler(), LinearRegression())
scores = cross_val_score(cv_pipeline, X, y, cv = 5, scoring = 'r2')
print(scores.mean())

# Fit the final scaler and model on every training row. `scaler` and `lr`
# are the objects used afterwards to transform and score new data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lr = LinearRegression()
lr.fit(X_scaled, y)

0.7440405079631597


LinearRegression()

In [15]:
# Predict the charges for the rows in validation_dataset.csv
validation_data_path = 'validation_dataset.csv'
validation = pd.read_csv(validation_data_path).dropna().drop_duplicates()

# Keep rows with a positive age and a non-negative number of children
plausible_rows = (validation['age'] > 0) & (validation['children'] >= 0)
validation = validation[plausible_rows].reset_index(drop=True)

# Encode sex and smoker as integers and lower-case the region names
validation = validation.assign(
    sex=validation['sex'].replace({'male':1, 'man':1, 'M':1, 'female':0, 'woman':0, 'F':0}),
    smoker=validation['smoker'].map({'yes':1, 'no':0}),
    region=validation['region'].str.lower(),
)
validation = pd.get_dummies(validation)

# Align to the training feature columns; a column missing here is filled with 0
validation = validation.reindex(columns=X.columns, fill_value=0)
validation_scaled = scaler.transform(validation)

val_pred = lr.predict(validation_scaled)
# Any negative prediction is replaced by 1000; other predictions are kept as-is
validation['charges'] = np.where(val_pred < 0, 1000, val_pred)
validation

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest,charges
0,18.0,0,24.09,1.0,0,0,0,1,0,1000.0
1,39.0,1,26.41,0.0,1,1,0,0,0,23304.467288
2,27.0,1,29.15,0.0,1,0,0,1,0,20831.590876
3,71.0,1,65.502135,13.0,1,0,0,1,0,34220.000469
4,28.0,1,38.06,0.0,0,0,0,1,0,1928.069366
5,70.0,0,72.958351,11.0,1,0,0,1,0,34839.15716
6,29.0,0,32.11,2.0,0,0,1,0,0,2331.739186
7,42.0,0,41.325,1.0,0,1,0,0,0,5739.459517
8,48.0,0,36.575,0.0,0,0,1,0,0,5913.078936
9,63.0,1,33.66,3.0,0,0,0,1,0,7890.854146
