# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
import statsmodels.api as sm

# Load Data
# Read the raw records; the path is resolved relative to the working directory.
df = pd.read_csv('insurance.csv')

# ETL Process
## Basic Data Cleaning
# Drop every row that contains a missing value (rebinding rather than
# mutating in place keeps the step free of hidden side effects).
df = df.dropna()

## Convert categorical variables into dummy/indicator variables
# drop_first=True keeps k-1 indicator columns for a categorical with k levels.
# dtype=int is requested explicitly: pandas >= 2.0 emits boolean dummies by
# default, and `DataFrame.describe()` leaves boolean columns out of its
# numeric summary. Integer 0/1 indicators behave the same on every pandas
# version and stay visible in the descriptive statistics below.
df = pd.get_dummies(df, drop_first=True, dtype=int)

# EDA - Exploratory Data Analysis
## Descriptive Statistics
# Summary statistics of the cleaned, dummy-encoded frame.
summary_stats = df.describe()
print(summary_stats)

## Correlation Matrix
# Pairwise correlations of all columns, drawn as an annotated heatmap on an
# explicitly created 10x6 axes.
correlations = df.corr()
_, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

## Distribution of Target Variable (Insurance Cost)
# Histogram of `charges` with a kernel-density overlay.
hist_ax = sns.histplot(data=df, x='charges', kde=True)
hist_ax.set_title('Distribution of Insurance Charges')
plt.show()

## Pairplot to see relationships
# Scatter/histogram grid over every column of the frame.
pair_grid = sns.pairplot(df)
plt.show()

# Feature Engineering
## Add a quadratic age term and a BMI-by-age interaction term
df['age_squared'] = df['age'].pow(2)
df['bmi_age_interaction'] = df['bmi'].mul(df['age'])

# Feature Selection
## Score every predictor against the target with SelectKBest / f_regression
y = df['charges']
X = df.drop(columns=['charges'])

# k='all' retains every column, so the selector is used here only for the
# per-feature scores it computes during fit.
selector = SelectKBest(score_func=f_regression, k='all').fit(X, y)

## Displaying feature scores
# One row per feature, highest score first.
feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': selector.scores_})
    .sort_values(by='Score', ascending=False)
)
print(feature_scores)

# Train-Test Split
## Hold out 20% of the rows; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Feature Scaling
## Fit the scaler on the training rows only, then apply it to both partitions
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Model Building and Evaluation
def _print_metrics(header, y_true, y_pred):
    """Print `header`, then the R^2 score and RMSE of `y_pred` against `y_true`."""
    print(header)
    print(f'R^2 Score: {r2_score(y_true, y_pred)}')
    print(f'RMSE: {np.sqrt(mean_squared_error(y_true, y_pred))}')


## Linear Regression Model
# Ordinary least squares on the scaled training features.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
_print_metrics('Linear Regression Results:', y_test, y_pred_lr)

# Ridge Regression
# Ridge with the estimator's default settings.
ridge = Ridge().fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)
_print_metrics('Ridge Regression Results:', y_test, y_pred_ridge)

# Lasso Regression
# Lasso with the estimator's default settings.
lasso = Lasso().fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
_print_metrics('Lasso Regression Results:', y_test, y_pred_lasso)

# Advanced Model Evaluation with Statsmodels
# After StandardScaler the training matrix is a bare NumPy array, so the OLS
# summary would label the coefficients x1, x2, ... Re-attach the feature names
# from X, and reuse y_train's row index so endog and exog carry matching
# labels, before adding the intercept column.
X_train_named = pd.DataFrame(X_train, columns=X.columns, index=y_train.index)
X_train_sm = sm.add_constant(X_train_named)  # Adding a constant for the intercept
ols_model = sm.OLS(y_train, X_train_sm).fit()

## Model Summary
# Coefficient table now lists each predictor by its column name.
print(ols_model.summary())
