In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt


#### Import the CSV Data as Pandas DataFrame

In [6]:
# Load the raw dataset into a DataFrame; the path is relative to the
# directory the notebook kernel is started from.
df = pd.read_csv('data/medical_insurance.csv')

#### 1. Data Preprocessing

In [None]:


def preprocess_data(df):
    """Return a copy of ``df`` with the log target and interaction features added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``charges``, ``age``, ``bmi`` and ``smoker``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with three extra columns:
        ``charges_log`` (natural log of ``charges``),
        ``age_bmi`` (``age * bmi``) and
        ``smoker_bmi`` (``bmi`` where ``smoker == 'yes'``, otherwise 0).

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If any ``charges`` value is zero or negative, because its logarithm
        would be ``-inf`` or ``NaN``.
    """
    required_columns = ('charges', 'age', 'bmi', 'smoker')
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(f"preprocess_data: missing required column(s): {missing}")

    # Fail early with a clear message instead of letting -inf/NaN targets
    # propagate into model fitting. NaN charges compare False here and are
    # passed through unchanged, as before.
    if (df['charges'] <= 0).any():
        raise ValueError(
            "preprocess_data: 'charges' must be strictly positive to be log-transformed"
        )

    # Create a copy to avoid modifying original data
    df_processed = df.copy()

    # Log transform the target variable (charges) due to right skew
    df_processed['charges_log'] = np.log(df_processed['charges'])

    # Create interaction terms
    df_processed['age_bmi'] = df_processed['age'] * df_processed['bmi']
    is_smoker = (df_processed['smoker'] == 'yes').astype(int)
    df_processed['smoker_bmi'] = is_smoker * df_processed['bmi']

    return df_processed



####  2. Feature Engineering

In [None]:


def create_feature_pipeline():
    """Build the (unfitted) column-wise preprocessing step.

    Returns
    -------
    ColumnTransformer
        Applies ``StandardScaler`` to the numeric columns under the name
        ``'num'`` and ``OneHotEncoder(drop='first', sparse_output=False)`` to
        the categorical columns under the name ``'cat'``.
    """
    # Columns that get standardized (includes the two interaction terms).
    scaled_columns = ['age', 'bmi', 'children', 'age_bmi', 'smoker_bmi']
    # Columns that get one-hot encoded; drop='first' removes one level per
    # column so the encoded block is not collinear with the intercept.
    encoded_columns = ['sex', 'smoker', 'region']

    return ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), scaled_columns),
            ('cat', OneHotEncoder(drop='first', sparse_output=False), encoded_columns),
        ]
    )



#### 3. Model Building and Evaluation

In [None]:

def build_and_evaluate_model(df_processed, test_size=0.2, random_state=42):
    """Fit a linear regression on ``charges_log`` and score it on a hold-out split.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Frame containing ``charges``, ``charges_log`` and every column that
        ``create_feature_pipeline`` selects.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(model, metrics, (y_test, y_pred_test))`` where ``model`` is the
        fitted ``Pipeline``, ``metrics`` is a dict of R2 / RMSE / MAE for the
        train and test splits, and the last element holds the hold-out targets
        and predictions. All metrics and predictions are on the log-charges
        scale, because the target is ``charges_log``.
    """
    # Prepare features and target; both target columns are dropped from X so
    # the raw charges cannot leak into the features.
    X = df_processed.drop(['charges', 'charges_log'], axis=1)
    y = df_processed['charges_log']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Create pipeline: preprocessing is fitted on the training split only
    preprocessor = create_feature_pipeline()
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

    # Fit the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Calculate metrics (log scale)
    metrics = {
        'R2 Score (Train)': r2_score(y_train, y_pred_train),
        'R2 Score (Test)': r2_score(y_test, y_pred_test),
        'RMSE (Train)': np.sqrt(mean_squared_error(y_train, y_pred_train)),
        'RMSE (Test)': np.sqrt(mean_squared_error(y_test, y_pred_test)),
        'MAE (Train)': mean_absolute_error(y_train, y_pred_train),
        'MAE (Test)': mean_absolute_error(y_test, y_pred_test)
    }

    return model, metrics, (y_test, y_pred_test)



### Main execution

In [None]:

# 1. Preprocess the data
df_processed = preprocess_data(df)

# 2. Build and evaluate the model
model, metrics, (y_test, y_pred_test) = build_and_evaluate_model(df_processed)

# Print metrics
print("\nModel Performance Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

# Calculate and print feature importances.
# Read the output column names from the fitted preprocessor instead of typing
# them by hand, so each coefficient is guaranteed to line up with its column
# even if the feature lists or the categories in the data change.
# ColumnTransformer prefixes names with '<transformer>__' (e.g. 'cat__sex_male');
# strip that prefix to keep the short labels.
transformed_names = model.named_steps['preprocessor'].get_feature_names_out()
feature_names = [name.split('__', 1)[-1] for name in transformed_names]

coefficients = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': model.named_steps['regressor'].coef_
})
coefficients = coefficients.sort_values('Coefficient', key=abs, ascending=False)

print("\nFeature Importances (Standardized Coefficients):")
print(coefficients)




Model Performance Metrics:
R2 Score (Train): 0.7739
R2 Score (Test): 0.8171
RMSE (Train): 0.4333
RMSE (Test): 0.4055
MAE (Train): 0.2637
MAE (Test): 0.2531

Feature Importances (Standardized Coefficients):
            Feature  Coefficient
4        smoker_bmi     0.591400
0               age     0.547714
8  region_southeast    -0.146986
9  region_southwest    -0.135277
6        smoker_yes     0.125476
2          children     0.113663
5          sex_male    -0.089268
3           age_bmi    -0.075814
7  region_northwest    -0.065620
1               bmi     0.054776


Model Performance:
- Training R² Score: 0.7739 (77.39% variance explained)

- Test R² Score: 0.8171 (81.71% variance explained)

- Training RMSE: 0.4333

- Test RMSE: 0.4055

- Training MAE: 0.2637

- Test MAE: 0.2531

Key Observations:
1. The model performs well, with similar performance on training and test sets

2. High R² indicates good predictive power for log(charges), the transformed target the model was fitted on

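3. Low RMSE and MAE suggest accurate predictions on the log scale; because the target is log(charges), a test RMSE of 0.4055 corresponds to a typical multiplicative error of roughly e^0.4055 ≈ 1.5× in actual charges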

Top Feature Importances (by absolute coefficient value):
1. Smoker BMI (0.5914): Strongest predictor

2. Age (0.5477): Second most important feature

3. Region (Southeast, -0.1470, and Southwest, -0.1353, have negative impacts relative to the baseline region dropped by the one-hot encoder)

4. Smoker status (0.1255)

5. Number of children (0.1137)

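The model suggests that smoking status, BMI, age, and region influence insurance charges, with interaction terms like smoker_bmi providing key insights. Note that only the numeric features are standardized; the coefficients of the one-hot encoded features (sex, smoker, region) are on a 0/1 scale, so their magnitudes are not directly comparable with the numeric ones, and no significance test was run on any coefficient.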