In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [2]:
# Load the dataset.
# The path is relative, so it is resolved against the kernel's working directory.
DATA_PATH = "insurance.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Display basic information: a preview of the first rows, then the per-column
# dtype / non-null report.
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() renders a stray "None" after the report.
df.info()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


None

In [4]:
# Statistical Summary
# Split the frame by dtype: int64/float64 columns are treated as numerical,
# object columns as categorical. Both views are reused by later cells.
numeric_dtypes = ['int64', 'float64']
numerical_features = df.select_dtypes(include=numeric_dtypes)
categorical_features = df.select_dtypes(include=['object'])

# describe() gives count/mean/std/min/quartiles/max; append a 'range' row
# holding max - min for each numerical column.
summary = numerical_features.describe()
spread = summary.loc['max'].sub(summary.loc['min'])
summary.loc['range'] = spread
display(summary)



Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801
range,46.0,37.17,5.0,62648.55411


In [5]:
# Show how often each category occurs, one table per categorical column.
for column_name in categorical_features.columns:
    heading = f"\nValue counts for {column_name}:"
    print(heading)
    category_counts = df[column_name].value_counts()
    display(category_counts)




Value counts for sex:


Unnamed: 0_level_0,count
sex,Unnamed: 1_level_1
male,676
female,662



Value counts for smoker:


Unnamed: 0_level_0,count
smoker,Unnamed: 1_level_1
no,1064
yes,274



Value counts for region:


Unnamed: 0_level_0,count
region,Unnamed: 1_level_1
southeast,364
southwest,325
northwest,325
northeast,324


In [6]:
# Min-Max Scaling
# Fit the scaler on the numerical columns, then write the scaled values into a
# copy of df so the original frame stays untouched.
numeric_cols = numerical_features.columns
scaler = MinMaxScaler()
scaler.fit(numerical_features)
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.transform(numerical_features)



In [7]:
# Show the first 10 and last 10 rows of the scaled numerical columns.
scaled_numeric = df_scaled[numerical_features.columns]
display(scaled_numeric.head(10))
display(scaled_numeric.tail(10))


Unnamed: 0,age,bmi,children,charges
0,0.021739,0.321227,0.0,0.251611
1,0.0,0.47915,0.2,0.009636
2,0.217391,0.458434,0.6,0.053115
3,0.326087,0.181464,0.0,0.33301
4,0.304348,0.347592,0.0,0.043816
5,0.282609,0.263115,0.0,0.042056
6,0.608696,0.470272,0.2,0.113629
7,0.413043,0.316922,0.6,0.09832
8,0.413043,0.37315,0.4,0.084352
9,0.913043,0.265806,0.0,0.443765


Unnamed: 0,age,bmi,children,charges
1328,0.108696,0.222357,0.4,0.339575
1329,0.73913,0.609093,0.4,0.146904
1330,0.847826,0.263115,0.4,0.18368
1331,0.108696,0.469196,0.0,0.154418
1332,0.73913,0.773204,0.6,0.164247
1333,0.695652,0.40382,0.6,0.151299
1334,0.0,0.429379,0.0,0.017305
1335,0.0,0.562012,0.0,0.008108
1336,0.065217,0.26473,0.0,0.014144
1337,0.934783,0.352704,0.0,0.447249


In [8]:
# Label Encoding
# Replace each categorical column in a copy of df with integer codes.
# A separate LabelEncoder is fitted per column and stored in `encoders`, so the
# codes of any column can be mapped back to their labels later with
# encoders[col].inverse_transform(...). Refitting one shared encoder would keep
# only the mapping of the last column processed.
encoders = {}
df_encoded = df.copy()
for col in categorical_features.columns:
    encoder = LabelEncoder()
    df_encoded[col] = encoder.fit_transform(df_encoded[col])
    encoders[col] = encoder


In [9]:
# Show the first 10 and last 10 rows of the label-encoded categorical columns.
encoded_categoricals = df_encoded[categorical_features.columns]
display(encoded_categoricals.head(10))
display(encoded_categoricals.tail(10))



Unnamed: 0,sex,smoker,region
0,0,1,3
1,1,0,2
2,1,0,2
3,1,0,1
4,1,0,1
5,0,0,2
6,0,0,2
7,0,0,1
8,1,0,0
9,0,0,1


Unnamed: 0,sex,smoker,region
1328,0,0,0
1329,1,0,3
1330,0,0,2
1331,0,0,3
1332,0,0,3
1333,1,0,1
1334,0,0,0
1335,0,0,2
1336,0,0,3
1337,0,1,1


In [10]:
# Classification Tasks
# Fit a logistic regression that predicts the encoded 'smoker' column from
# age, BMI, and number of children, then report accuracy and per-class metrics.
print("Classification Task: Predicting Smoker Status based on Age, BMI, and Children")
X_clf = df_encoded[['age', 'bmi', 'children']]
y_clf = df_encoded['smoker']
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)
clf_model = LogisticRegression(max_iter=1000)
clf_model.fit(X_train, y_train)
y_pred = clf_model.predict(X_test)
print(f"Classification Accuracy: {accuracy_score(y_test, y_pred)}")
# If the model never predicts a class, that class's precision is undefined.
# zero_division=0 reports it as 0.0 explicitly instead of emitting an
# UndefinedMetricWarning for every averaged metric.
print(classification_report(y_test, y_pred, zero_division=0))


Classification Task: Predicting Smoker Status based on Age, BMI, and Children
Classification Accuracy: 0.7985074626865671
              precision    recall  f1-score   support

           0       0.80      1.00      0.89       214
           1       0.00      0.00      0.00        54

    accuracy                           0.80       268
   macro avg       0.40      0.50      0.44       268
weighted avg       0.64      0.80      0.71       268



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
# Regression Tasks
# Fit an ordinary least-squares model that predicts 'charges' from age, BMI,
# and the encoded smoker flag, and report the mean squared error on a 20% hold-out.
print("Regression Task 1: Predicting Insurance Charges based on Age, BMI, and Smoker Status")
reg_feature_names = ['age', 'bmi', 'smoker']
X_reg = df_encoded[reg_feature_names]
y_reg = df_encoded['charges']
X_train, X_test, y_train, y_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
reg_model = LinearRegression().fit(X_train, y_train)
y_pred = reg_model.predict(X_test)
linear_mse = mean_squared_error(y_test, y_pred)
print(f"Regression MSE: {linear_mse}")



Regression Task 1: Predicting Insurance Charges based on Age, BMI, and Smoker Status
Regression MSE: 34512843.8802279


In [13]:
# Regression Task with Random Forest
# Predict 'charges' from age, BMI, and the encoded smoker flag.
print("\nRegression Task with Random Forest: Predicting Insurance Charges")
# Build the regression split explicitly instead of relying on whatever
# X_train / y_train an earlier cell left behind: those names are also assigned
# by the classification cell, so out-of-order execution would silently train
# this regressor on the wrong data. Same features, test_size, and seed as the
# linear-regression cell, so both models are scored on the same hold-out rows.
X_reg = df_encoded[['age', 'bmi', 'smoker']]
y_reg = df_encoded['charges']
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
rf_reg_model = RandomForestRegressor(n_estimators=100, random_state=42) # You can adjust n_estimators
rf_reg_model.fit(X_train_reg, y_train_reg)
y_pred_rf_reg = rf_reg_model.predict(X_test_reg)
print(f"Random Forest Regression MSE: {mean_squared_error(y_test_reg, y_pred_rf_reg)}")


# Classification Task with Random Forest
# Predict the encoded 'smoker' column from age, BMI, and number of children.
print("\nClassification Task with Random Forest: Predicting Smoker Status")
# The classification split uses its own *_clf names so it does not clash with
# the regression split variables.
clf_feature_names = ['age', 'bmi', 'children']
X_clf = df_encoded[clf_feature_names]
y_clf = df_encoded['smoker']
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)
# n_estimators can be tuned; the fixed random_state keeps the forest reproducible.
rf_clf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_clf, y_train_clf)
y_pred_rf_clf = rf_clf_model.predict(X_test_clf)
rf_accuracy = accuracy_score(y_test_clf, y_pred_rf_clf)
print(f"Random Forest Classification Accuracy: {rf_accuracy}")
rf_report = classification_report(y_test_clf, y_pred_rf_clf)
print(rf_report)


Regression Task with Random Forest: Predicting Insurance Charges
Random Forest Regression MSE: 25668135.254829608

Classification Task with Random Forest: Predicting Smoker Status
Random Forest Classification Accuracy: 0.7611940298507462
              precision    recall  f1-score   support

           0       0.81      0.92      0.86       214
           1       0.29      0.13      0.18        54

    accuracy                           0.76       268
   macro avg       0.55      0.53      0.52       268
weighted avg       0.70      0.76      0.72       268



In [None]:
# Data Visualizations
# Histogram (30 bins) of insurance charges with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='charges', bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Insurance Charges')
plt.show()



In [None]:
# Box plot comparing the charges distribution for each value of 'smoker'.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='smoker', y='charges', ax=ax)
ax.set_title('Charges by Smoking Status')
plt.show()



In [None]:
# Annotated heatmap of the pairwise correlations between the numerical columns.
fig, ax = plt.subplots(figsize=(10, 6))
corr_matrix = numerical_features.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()



In [None]:
# Scatter plot of BMI against charges, with points colored by 'smoker'.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=df, x='bmi', y='charges', hue='smoker', ax=ax)
ax.set_title('BMI vs Charges with Smoking Status')
plt.show()



In [None]:
# Bar chart of the number of rows per region.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='region', ax=ax)
ax.set_title('Count of Patients by Region')
plt.show()



In [None]:
# Business Insights
# Summary bullets printed for the reader. The second bullet reflects the
# recorded classification results: the logistic regression predicted no
# smokers on the test set (recall 0.00 for the smoker class), so it must not
# be presented as a way to identify smokers.
print("\nBusiness Insights:")
print("- Higher BMI and being a smoker significantly increase insurance charges.")
print("- Age, BMI, and number of children alone do not reliably identify smokers: logistic regression predicted no smokers on the test set.")
print("- The correlation heatmap highlights key relationships that insurers can use to adjust premiums.")