# In [None]:
# =================================================================================
# Interactive Insurance Premium Prediction Script for Jupyter Notebook
#
# This script has been updated to include a terminal-based interactive quote
# generator. It performs data analysis, model training, and then prompts the
# user for input to generate a quote using the trained Random Forest model.
# =================================================================================

# 1. Setup and Imports
# ---------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Apply the "ggplot" style sheet to all Matplotlib figures created below
plt.style.use("ggplot")


# 2. Data Loading and Preprocessing
# ---------------------------------------------------------------------------------
def load_and_preprocess_data():
    """
    Loads, cleans, and returns the preprocessed DataFrame.

    Reads "insurance.csv" from the current working directory, then:
      1. trims and lower-cases the categorical columns (sex, smoker, region),
      2. coerces age, bmi, children and charges to numbers,
      3. drops rows in which any of those columns is missing or could not be
         parsed (a warning with the number of dropped rows is printed),
      4. casts age and children to int,
      5. removes duplicate rows and resets the index.

    Returns:
        The cleaned pandas DataFrame, or None when "insurance.csv" cannot be
        found (an error message is printed in that case).
    """
    try:
        df = pd.read_csv("insurance.csv")
    except FileNotFoundError:
        print("Error: 'insurance.csv' not found. Please make sure the file "
              "is in the same directory as this script.")
        return None

    categorical_cols = ["sex", "smoker", "region"]
    numeric_cols = ["age", "bmi", "children", "charges"]

    # Work on a copy so the frame returned by read_csv is left untouched.
    df_clean = df.copy()

    # Standardize categorical columns. astype(str) would turn a missing cell
    # into the literal text "nan" (a bogus category), so mask those cells back
    # to missing after normalizing.
    for col in categorical_cols:
        normalized = df_clean[col].astype(str).str.strip().str.lower()
        df_clean[col] = normalized.mask(df_clean[col].isna())

    # Ensure numeric types; anything unparseable becomes NaN.
    for col in numeric_cols:
        df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce")

    # NaN cannot be cast to int, so incomplete rows must be removed before
    # the integer casts below.
    rows_before = len(df_clean)
    df_clean = df_clean.dropna(subset=categorical_cols + numeric_cols)
    rows_dropped = rows_before - len(df_clean)
    if rows_dropped:
        print(f"Warning: dropped {rows_dropped} row(s) with missing or "
              "invalid values.")

    df_clean = df_clean.astype({"age": int, "children": int})

    # Deduplicate last so rows that differ only in case/whitespace collapse.
    df_clean = df_clean.drop_duplicates().reset_index(drop=True)

    return df_clean

df_clean = load_and_preprocess_data()
# The loader reports a missing CSV by returning None; abort here so that no
# later step runs against a None DataFrame.
if df_clean is None:
    raise FileNotFoundError("insurance.csv file not found.")


# 3. Model Training
# ---------------------------------------------------------------------------------
def train_model(df_clean, *, test_size=0.2, random_state=42):
    """
    Trains and returns the Random Forest model pipeline, along with
    the performance metrics.

    Args:
        df_clean: Cleaned DataFrame containing the target column "charges"
            plus the feature columns age, bmi, children, sex, smoker, region.
        test_size: Share of rows held out for evaluation; forwarded to
            train_test_split. Defaults to 0.2.
        random_state: Seed used for both the train/test split and the
            RandomForestRegressor. Defaults to 42.

    Returns:
        An 8-tuple, in this order:
            rf_pipeline          - fitted Pipeline (preprocessing + forest)
            y_test               - true charges of the held-out rows
            y_pred               - pipeline predictions for the held-out rows
            preprocessor         - the ColumnTransformer instance used as the
                                   pipeline's "pre" step
            numeric_features     - list of numeric column names
            categorical_features - list of categorical column names
            rmse                 - root mean squared error on the held-out rows
            r2                   - R-squared on the held-out rows
    """
    # Prepare features: everything except the target is a candidate input.
    X = df_clean.drop(columns=["charges"])
    y = df_clean["charges"]

    numeric_features = ["age", "bmi", "children"]
    categorical_features = ["sex", "smoker", "region"]

    # Create preprocessor: scale the numeric columns, one-hot encode the
    # categorical ones (first level of each dropped).
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_features),
            (
                "cat",
                OneHotEncoder(drop="first", sparse_output=False),
                categorical_features,
            ),
        ]
    )

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Create and train Random Forest model
    rf_pipeline = Pipeline(
        [
            ("pre", preprocessor),
            (
                "model",
                RandomForestRegressor(random_state=random_state, n_jobs=-1)
            ),
        ]
    )

    rf_pipeline.fit(X_train, y_train)

    # Make predictions and evaluate the model on the held-out rows
    y_pred = rf_pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    return (
        rf_pipeline,
        y_test,
        y_pred,
        preprocessor,
        numeric_features,
        categorical_features,
        rmse,
        r2,
    )

# Train once at module level and keep every returned artefact as a global so
# the diagnostics and the estimator below can use them.
(rf_pipeline, y_test, y_pred, preprocessor,
 numeric_features, categorical_features, rmse, r2) = train_model(df_clean)


# 4. Exploratory Data Analysis & Model Diagnostics
# ---------------------------------------------------------------------------------
# Section banner for the plots that follow
print("=========================================================")
print("  Exploratory Data Analysis & Model Diagnostics")
print("=========================================================")

# Correlation heatmap
# Only numeric columns are kept (select_dtypes), so the categorical columns
# do not appear in the correlation matrix.
print("\nCorrelation Heatmap")
fig_corr, ax_corr = plt.subplots(figsize=(6, 5))
corr = df_clean.select_dtypes(include=[np.number]).corr()
sns.heatmap(
    corr, annot=True, fmt=".2f", cmap="viridis", cbar=True, ax=ax_corr
)
plt.show()

# Charges by smoker
# hue is set to the same column as x with the legend turned off, so each box
# gets its own palette colour without a redundant legend.
print("\nCharges by Smoker Status")
fig_box, ax_box = plt.subplots(figsize=(7, 4))
sns.boxplot(data=df_clean, x="smoker", y="charges", hue="smoker",
            palette="rocket", legend=False, ax=ax_box)
plt.show()

# Charges distribution (density-normalised histogram with a KDE overlay)
print("\nDistribution of Charges")
fig_hist, ax_hist = plt.subplots(figsize=(7, 4))
sns.histplot(
    df_clean["charges"], kde=True, stat="density", color="skyblue", ax=ax_hist
)
ax_hist.set_xlabel("Charges ($)")
plt.show()

# Charges by region
# Median charges per region, ordered from highest to lowest
print("\nMedian Charges by Region")
region_med = df_clean.groupby("region")["charges"].median().sort_values(
    ascending=False
)
fig_bar, ax_bar = plt.subplots(figsize=(7, 4))
sns.barplot(x=region_med.index, y=region_med.values, hue=region_med.index,
            palette="crest", legend=False, ax=ax_bar)
ax_bar.set_xlabel("Region")
ax_bar.set_ylabel("Median Charges ($)")
ax_bar.tick_params(axis='x', rotation=45)
plt.show()

# BMI vs Charges scatter plot built with Plotly Express, coloured by smoker
# status ("yes" -> red, "no" -> green).
# NOTE(review): the figure title promises hover details; whether hovering
# works depends on the front end that renders the figure - confirm.
print("\nBMI vs Charges by Smoker Status")
fig_px = px.scatter(
    df_clean, x="bmi", y="charges", color="smoker",
    title="Hover over points for more details",
    color_discrete_map={"yes": "red", "no": "green"},
)
fig_px.show()

# Model Diagnostics
print("\nModel Diagnostics")

# Actual vs Predicted Charges
# The dashed red diagonal spans the combined min/max of actual and predicted
# values; points on it are perfect predictions.
print("\nActual vs Predicted Charges")
fig_diag, ax_diag = plt.subplots(figsize=(7, 4))
ax_diag.scatter(y_test, y_pred, alpha=0.6, s=40, color="#1f77b4")
minv = min(y_test.min(), y_pred.min())
maxv = max(y_test.max(), y_pred.max())
ax_diag.plot(
    [minv, maxv], [minv, maxv],
    linestyle="--", color="red", linewidth=2
)
ax_diag.set_xlabel("Actual Charges ($)")
ax_diag.set_ylabel("Predicted Charges ($)")
plt.show()

# Feature Importances
# Feature names are rebuilt as: numeric names first, then the one-hot column
# names from the fitted "cat" encoder - the same order in which the
# transformers are listed in the pipeline's ColumnTransformer.
print("\nFeature Importances")
rf = rf_pipeline.named_steps["model"]
num_names = numeric_features
cat_transformer = rf_pipeline.named_steps["pre"].named_transformers_["cat"]
cat_names = list(cat_transformer.get_feature_names_out(categorical_features))
feat_names = num_names + cat_names

importances = rf.feature_importances_
# argsort is ascending; reversing yields indices from most to least important
sorted_idx = np.argsort(importances)[::-1]

fig_feat, ax_feat = plt.subplots(figsize=(7, 4))
sns.barplot(
    x=importances[sorted_idx],
    y=[feat_names[i] for i in sorted_idx],
    hue=[feat_names[i] for i in sorted_idx],
    palette="plasma",
    legend=False,
    ax=ax_feat,
)
ax_feat.set_xlabel("Importance")
ax_feat.set_ylabel("Feature")
plt.show()

# 5. Premium Estimator (Interactive)
# ---------------------------------------------------------------------------------
print("\n\n" + "="*57)
print("             Insurance Premium Estimator")
print("="*57)


def _ask_number(prompt, cast, label):
    """Prompt once and return a finite, non-negative number.

    `cast` is the converter applied to the typed text (int or float).
    Raises ValueError with a user-readable message when the text is not a
    number or the value is negative, NaN or infinite.
    """
    raw = input(prompt).strip()
    try:
        value = cast(raw)
    except ValueError:
        raise ValueError(f"{label} must be a number (got {raw!r}).") from None
    # The chained comparison is False for NaN, so NaN is rejected here too.
    if not (0 <= value < float("inf")):
        raise ValueError(
            f"{label} must be a finite, non-negative number (got {raw!r})."
        )
    return value


def _ask_choice(prompt, label, allowed, aliases=None):
    """Prompt once and return a normalized answer that is in `allowed`.

    The answer is lower-cased and stripped; `aliases` maps shorthand answers
    (e.g. "y") to their canonical form. Raises ValueError with a
    user-readable message when the answer is not an allowed value.
    """
    answer = input(prompt).lower().strip()
    answer = (aliases or {}).get(answer, answer)
    if answer not in allowed:
        raise ValueError(
            f"{label} must be one of: {', '.join(allowed)} (got {answer!r})."
        )
    return answer


# The model can only score category values its one-hot encoder saw during
# fitting, so validate answers against the fitted encoder's categories
# instead of letting predict() fail later.
_cat_encoder = rf_pipeline.named_steps["pre"].named_transformers_["cat"]
_allowed_values = {
    name: [str(level) for level in levels]
    for name, levels in zip(categorical_features, _cat_encoder.categories_)
}

# Ask the user if they want a quote
wants_quote = input(
    "Would you like a general estimation of what your health insurance "
    "quote would look like? (yes/no): "
).lower().strip()

if wants_quote in ["yes", "y"]:
    # Get interactive user inputs. Only the prompting is inside the try so
    # that errors raised by the model are never reported as input mistakes.
    try:
        age = _ask_number("Enter your age: ", int, "Age")
        sex = _ask_choice(
            "Enter your sex (male/female): ",
            "Sex",
            _allowed_values["sex"],
            aliases={"m": "male", "f": "female"},
        )
        bmi = _ask_number("Enter your BMI (e.g., 25.0): ", float, "BMI")
        children = _ask_number(
            "Enter the number of children: ", int, "Number of children"
        )
        smoker = _ask_choice(
            "Are you a smoker? (yes/no): ",
            "Smoker",
            _allowed_values["smoker"],
            aliases={"y": "yes", "n": "no"},
        )
        region = _ask_choice(
            "Enter your region, after which wait 5 seconds and then scroll down for your quote (northeast/northwest/southeast/southwest): ",
            "Region",
            _allowed_values["region"],
        )
    except ValueError as err:
        print(f"\nInvalid input. {err}")
    else:
        # Place the "thank you" message here, after all inputs are collected
        print("Thank you, your quote is below, please scroll down!")

        # Generate prediction from user inputs (one-row frame whose column
        # names match the training features)
        input_df = pd.DataFrame(
            [
                {
                    "age": age,
                    "sex": sex,
                    "bmi": bmi,
                    "children": children,
                    "smoker": smoker,
                    "region": region,
                }
            ]
        )
        prediction = rf_pipeline.predict(input_df)[0]

        # Display the result
        print("\nYour Estimated Premium is...")
        print(f"-> ${prediction:,.2f}")
        print("\n--- Profile Summary ---")
        print(f"Age: {age} years")
        print(f"Gender: {sex.title()}")
        print(f"BMI: {bmi:.2f}")
        print(f"Children: {children}")
        print(f"Smoker: {smoker.title()}")
        print(f"Region: {region.title()}")
        print("\n--- Model Performance ---")
        print(f"Root Mean Squared Error (RMSE): ${rmse:,.2f}")
        print(f"R-squared (R²): {r2:.3f}")
else:
    print("Thanks for using the script!")