In [None]:
# 1. Import necessary dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# ML tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

: 

In [None]:
# 2. Load the dataset
# The CSV path is relative to the notebook's working directory.
csv_path = "insurance.csv"
df = pd.read_csv(csv_path)
df.head()  # last expression of the cell, so the preview is rendered as output

: 

In [None]:
# 3. Basic info: schema, summary statistics, and missing-value counts
print("Dataset Info:\n")
df.info()  # writes column names, dtypes and non-null counts straight to stdout

# include='all' summarises every column, not only the numeric ones
summary_stats = df.describe(include='all')
print("\nDescriptive Statistics:\n")
print(summary_stats)

# Number of missing entries in each column
missing_per_column = df.isna().sum()
print("\nMissing Values:")
print(missing_per_column)

: 

In [1]:
# 4. Distribution of the target variable ('charges')
fig, ax = plt.subplots(figsize=(8, 5))
# 30-bin histogram with a kernel density estimate drawn on top
sns.histplot(df['charges'], bins=30, kde=True, color='orange', ax=ax)
ax.set_title('Distribution of Insurance Charges')
ax.set_xlabel('Charges')
ax.set_ylabel('Frequency')
plt.show()

NameError: name 'plt' is not defined

In [None]:
# 5. Data preprocessing
# Integer-encode the categorical columns of `df` in place.
# Each column gets its own LabelEncoder, stored in `encoders`, so the
# code -> label mapping (`encoders[col].classes_`) and `inverse_transform`
# stay available for every column instead of only the last one fitted.
# NOTE(review): integer codes give 'region' an arbitrary numeric order that
# a linear model will treat as meaningful; one-hot encoding may be a better
# fit — confirm with the author before changing the feature set.
categorical_columns = ['sex', 'smoker', 'region']
encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
# After the loop `le` refers to the encoder fitted on 'region'.

In [None]:
# 6. Train/test split
# 'charges' is the regression target; every other column is a feature.
y = df['charges']
X = df.drop(columns='charges')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,     # hold out 20% of the rows for evaluation
    random_state=42,   # fixed seed so the split is reproducible
)

In [None]:
# 7. Feature scaling
# The scaler is fitted on the training features only; the same fitted
# scaler is then applied to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# 8. Helper for model evaluation
def evaluate_model(y_true, y_pred):
    """Print MAE, MSE, RMSE and R² for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values to compare against ``y_true``.

    Returns:
        None. The four metrics are written to stdout, one per line.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    report = [
        ("MAE:", mae),
        ("MSE:", mse),
        ("RMSE:", np.sqrt(mse)),  # RMSE is the square root of the MSE
        ("R² Score:", r2_score(y_true, y_pred)),
    ]
    for label, value in report:
        print(label, value)

In [None]:
## 9. Model building
# Both models are fitted on the standardized training features and scored
# on the standardized test features with evaluate_model.

# Linear Regression
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
print("\n--- Linear Regression ---")
evaluate_model(y_test, y_pred_lr)

# Random Forest Regressor (random_state fixed so results are reproducible)
rf = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)
print("\n--- Random Forest Regressor ---")
evaluate_model(y_test, y_pred_rf)

In [2]:
# 10. Actual vs predicted charges for both models
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_rf, color='green', label='Random Forest', alpha=0.6)
ax.scatter(y_test, y_pred_lr, color='blue', label='Linear Regression', alpha=0.4)

# Dashed red diagonal where predicted == actual, spanning the range of the
# full target column `y` (not only the test subset).
charge_min, charge_max = y.min(), y.max()
ax.plot([charge_min, charge_max], [charge_min, charge_max], 'r--')

ax.set_xlabel('Actual Charges')
ax.set_ylabel('Predicted Charges')
ax.set_title('Actual vs Predicted Charges')
ax.legend()
plt.show()

NameError: name 'plt' is not defined

In [3]:
# Exploratory visualizations
# Apply the seaborn "whitegrid" theme to the figures drawn after this cell.
sns.set_theme(style="whitegrid")

NameError: name 'sns' is not defined

In [4]:
# Charges vs age, with points coloured by the 'smoker' column
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df,
    x='age',
    y='charges',
    hue='smoker',
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Charges vs Age (Smoker vs Non-Smoker)')
ax.set_xlabel('Age')
ax.set_ylabel('Charges')
plt.show()

NameError: name 'plt' is not defined

In [5]:
# Boxplot of charges grouped by smoking status
# NOTE(review): the axis label assumes 'smoker' is already integer-coded
# with 0 = no and 1 = yes — confirm against the encoder's class order.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='smoker', y='charges', ax=ax)
ax.set_title('Charges by Smoking Status')
ax.set_xlabel('Smoker (0=No, 1=Yes)')
ax.set_ylabel('Charges')
plt.show()

NameError: name 'plt' is not defined

In [6]:
# Charges vs BMI, with semi-transparent points coloured by the 'smoker' column
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df,
    x='bmi',
    y='charges',
    hue='smoker',
    alpha=0.6,
    ax=ax,
)
ax.set_title('Charges vs BMI (Colored by Smoker)')
ax.set_xlabel('BMI')
ax.set_ylabel('Charges')
plt.show()

NameError: name 'plt' is not defined

In [7]:
# Boxplot of charges grouped by region
# When the preprocessing cell has run, 'region' holds integer codes, so the
# x-axis ticks are those codes rather than region names.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='region', y='charges', ax=ax)
ax.set_title('Charges by Region')
ax.set_xlabel('Region')
ax.set_ylabel('Charges')
plt.show()

NameError: name 'plt' is not defined

In [None]:
# Correlation heatmap across the columns of df
fig, ax = plt.subplots(figsize=(8, 6))
corr_matrix = df.corr()
# annot=True writes each coefficient in its cell, formatted to two decimals
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()