# Medical cost insurance project

# Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

%matplotlib inline


# #Data Loading and Exploration

In [4]:
import pandas as pd

# Load the data
data = pd.read_csv("https://raw.githubusercontent.com/FlipRoboTechnologies/ML-Datasets/main/Medical%20Cost%20Insurance/medical_cost_insurance.csv")

# Explore the dataset
print("Data Info:")
print(data.info())
print("\nData Description:")
print(data.describe())
print("\nFirst few rows of the data:")
print(data.head())


Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None

Data Description:
               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750 

# Based on the output provided for Step 1, here are the observations and analysis:

# Data Info:
The dataset contains 1338 entries (rows) and 7 columns.
There are no missing values in any of the columns (all columns have 1338 non-null entries).
The data types of the columns include integers (int64), floating-point numbers (float64), and objects (object).

# Data Description:
The age, bmi, children, and charges columns have numerical data types.
The age column ranges from 18 to 64 years, with a mean age of approximately 39 years.
The bmi (Body Mass Index) column ranges from 15.96 to 53.13, with a mean BMI of approximately 30.66.
The children column represents the number of children covered by health insurance, ranging from 0 to 5, with an average of approximately 1.09 children per individual.
The charges column represents individual medical costs billed by health insurance, ranging from $1121.87 to $63770.43, with a mean charge of approximately $13270.42.
These statistics provide an overview of the distribution of numerical features in the dataset, including measures of central tendency and spread.

# First Few Rows of the Data:
The first few rows of the dataset show a sample of individual records, including information such as age, sex, BMI, number of children, smoker status, region, and charges.
This allows for a quick glimpse into the structure and content of the dataset, providing insight into the types of data present and their formatting.

# Data Preprocessing. 

In [7]:

# Handle missing data
data.dropna(inplace=True)  # Assuming there are no missing values in the provided data

# Convert categorical variables into numerical format using one-hot encoding
data_encoded = pd.get_dummies(data, columns=["sex", "smoker", "region"], drop_first=True)

# Split the data into features (X) and target (y)
X = data_encoded.drop("charges", axis=1)
y = data_encoded["charges"]


In [8]:
print(X.head()) 
print(y.head())  

   age     bmi  children  sex_male  smoker_yes  region_northwest  \
0   19  27.900         0         0           1                 0   
1   18  33.770         1         1           0                 0   
2   28  33.000         3         1           0                 0   
3   33  22.705         0         1           0                 1   
4   32  28.880         0         1           0                 1   

   region_southeast  region_southwest  
0                 0                 1  
1                 1                 0  
2                 1                 0  
3                 0                 0  
4                 0                 0  
0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64


In [9]:
print(X.dtypes)  
print(y.dtypes)  

age                   int64
bmi                 float64
children              int64
sex_male              uint8
smoker_yes            uint8
region_northwest      uint8
region_southeast      uint8
region_southwest      uint8
dtype: object
float64


In [10]:
print(X.shape)
print(y.shape)  


(1338, 8)
(1338,)


# Train-Test Split

In [11]:
# Step 3: Train-Test Split
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
print("Training set - X:", X_train.shape, "y:", y_train.shape)
print("Testing set - X:", X_test.shape, "y:", y_test.shape)


Training set - X: (1070, 8) y: (1070,)
Testing set - X: (268, 8) y: (268,)


# Feature Scaling

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the numerical features to be scaled
numeric_features = ["age", "bmi", "children"]

# Create a preprocessor pipeline for feature scaling
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Apply feature scaling to numerical features
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numeric_features)])

# Apply the preprocessor to training and testing sets
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)


In [18]:
print(type(X_train_scaled))
print(type(X_test_scaled))


<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [19]:
print(X.columns)


Index(['age', 'bmi', 'children', 'sex_male', 'smoker_yes', 'region_northwest',
       'region_southeast', 'region_southwest'],
      dtype='object')


# Model Selection and Training

In [21]:
from sklearn.ensemble import RandomForestRegressor

# Define the regression model (Random Forest Regressor)
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the scaled training data
model.fit(X_train_scaled, y_train)


# Model Evaluation

In [22]:
from sklearn.metrics import mean_squared_error

# Make predictions on the testing set
y_pred = model.predict(X_test_scaled)

# Evaluate the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 157761148.90860045


# Prediction.

In [23]:
# Assuming you have new data points for prediction
new_data = pd.DataFrame({
    "age": [30, 40],
    "sex_male": [1, 0],  # Assuming male and female
    "bmi": [25.5, 30.0],
    "children": [2, 1],
    "smoker_yes": [0, 1],  # Assuming non-smoker and smoker
    "region_northwest": [1, 0],  # Assuming different regions
    "region_southeast": [0, 1],
    "region_southwest": [0, 0]  # Assuming only two regions for simplicity
})

# Apply the preprocessor to the new data
new_data_scaled = preprocessor.transform(new_data)

# Make predictions on the new data points
predictions = model.predict(new_data_scaled)

print("Predictions for new data points:")
print(predictions)


Predictions for new data points:
[ 8456.589677  14468.4396015]


# 
Conclusion:

In this project, we aimed to predict medical insurance costs using a dataset containing information about individuals' age, BMI, number of children, gender, smoking status, and region. After performing data exploration, preprocessing, feature engineering, model selection, and evaluation, we successfully trained a Random Forest Regressor model to predict insurance charges.

The model achieved a Mean Squared Error of approximately 157,761,148.91 on the testing set, indicating the average squared difference between the predicted and actual insurance charges. While this performance may be acceptable for initial predictions, further refinement and tuning of the model could potentially improve its accuracy.