# Insurance Charges Prediction
## Regression Analysis

## 1. Problem Statement
Predict insurance charges based on demographic and health factors to help insurance companies determine appropriate premium pricing.

## 2. Data Loading & Exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the insurance dataset from a CSV file located relative to the
# current working directory.
df = pd.read_csv('insurance_pre.csv')
# Report the (rows, columns) shape, then preview the first rows.
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Basic info: prints the column names, non-null counts and dtypes,
# which shows which columns still need encoding before modelling.
df.info()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max).
# NOTE: by default pandas summarises only the numeric columns when the
# frame mixes numeric and non-numeric data.
df.describe()

## 3. Data Preprocessing

In [None]:
# Count the missing entries in every column
df.isna().sum()

In [None]:
# Binary-encode the two categorical columns in place.
# NOTE: Series.map turns any value missing from the mapping into NaN, so
# this cell should be run exactly once on the freshly loaded frame.
binary_codes = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'no': 0, 'yes': 1},
}
for column, codes in binary_codes.items():
    df[column] = df[column].map(codes)
df.head()

In [None]:
# Separate the prediction target from the predictor columns
y = df['charges']
X = df.drop(columns=['charges'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split
# only and the same fitted transform is then applied to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 4. Model Building & Evaluation

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its scores.

    Prints the R2 score on the training split, the R2 score on the test
    split and the RMSE on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        The same ``model`` object that was passed in, after ``fit`` has
        been called on it.
    """
    model.fit(X_train, y_train)

    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    train_r2 = r2_score(y_train, train_predictions)
    print(f"Train R2 Score: {train_r2:.4f}")

    test_r2 = r2_score(y_test, test_predictions)
    print(f"Test R2 Score: {test_r2:.4f}")

    # RMSE is reported for the held-out test split only.
    test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
    print(f"RMSE: {test_rmse:.2f}")

    return model

### 4.1 Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the standardised features
print("Linear Regression:")
lr = evaluate_model(
    model=LinearRegression(),
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

### 4.2 Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with a fixed seed so results are reproducible
print("Random Forest:")
rf = evaluate_model(
    model=RandomForestRegressor(random_state=42),
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

### 4.3 XGBoost Regressor

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees with a fixed seed so results are reproducible
print("XGBoost:")
xgb = evaluate_model(
    model=XGBRegressor(random_state=42),
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)

### 4.4 Model Comparison

In [None]:
# Build the comparison table from the fitted models rather than from
# hand-copied numbers, so it always matches what the models actually
# produce on the current data and library versions.
fitted_models = {
    'Linear Regression': lr,
    'Random Forest': rf,
    'XGBoost': xgb,
}

results = {'Model': [], 'Train R2': [], 'Test R2': [], 'RMSE': []}
for model_name, fitted in fitted_models.items():
    train_pred = fitted.predict(X_train_scaled)
    test_pred = fitted.predict(X_test_scaled)
    # RMSE is measured on the held-out test split, as in evaluate_model.
    test_rmse = float(np.sqrt(mean_squared_error(y_test, test_pred)))

    results['Model'].append(model_name)
    results['Train R2'].append(round(r2_score(y_train, train_pred), 2))
    results['Test R2'].append(round(r2_score(y_test, test_pred), 2))
    results['RMSE'].append(round(test_rmse, 2))

pd.DataFrame(results)

## 5. Final Model Selection

**Chosen Model: XGBoost Regressor**  
- Highest test R2 score (0.89)
- Lowest RMSE (3924.18)
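- Smaller gap between train and test R2 than Random Forest (0.96 vs. 0.89, against 0.97 vs. 0.87), i.e. a better balance between bias and variance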

In [None]:
# Horizontal bar chart of the importance XGBoost assigned to each feature
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=xgb.feature_importances_, y=X.columns, ax=ax)
ax.set_title('XGBoost Feature Importance')
plt.show()

## 6. Saving the Model

In [None]:
import joblib

# Persist the fitted model together with the scaler it was trained with,
# so new inputs can be standardised the same way before prediction.
artifacts = (
    (xgb, 'xgb_model.pkl'),
    (scaler, 'scaler.pkl'),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)

print("Model and scaler saved successfully!")