In [None]:
# =============================
# Insurance Premium Prediction Notebook
# Simplified version with Random Forest only
# =============================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# -----------------------------
# Load and clean data
# -----------------------------
def clean_insurance_data(raw):
    """
    Return a cleaned copy of the raw insurance table.

    Steps, in order:
      1. Strip and lower-case the categorical columns (sex, smoker, region),
         leaving missing entries missing.
      2. Coerce age, bmi, children and charges to numbers; values that
         cannot be parsed become NaN.
      3. Drop every row that is missing any of those seven columns.
      4. Cast age and children to integers.
      5. Remove exact duplicate rows and renumber the index from 0.

    Parameters:
        raw: DataFrame holding at least the seven columns named above.

    Returns:
        A new DataFrame; the input is not modified.
    """
    categorical_cols = ["sex", "smoker", "region"]
    numeric_cols = ["age", "bmi", "children", "charges"]

    cleaned = raw.copy()

    for col in categorical_cols:
        values = cleaned[col]
        normalized = values.astype(str).str.strip().str.lower()
        # astype(str) alone would turn a missing value into the literal
        # category "nan"; mask it back to missing so the row can be dropped.
        cleaned[col] = normalized.mask(values.isna())

    for col in numeric_cols:
        cleaned[col] = pd.to_numeric(cleaned[col], errors="coerce")

    # Incomplete rows must go before the integer cast: NaN cannot be
    # converted to int and would raise.
    cleaned = cleaned.dropna(subset=categorical_cols + numeric_cols)
    cleaned = cleaned.astype({"age": int, "children": int})

    # De-duplicate after normalisation so rows that differ only by case or
    # surrounding whitespace collapse into one.
    return cleaned.drop_duplicates().reset_index(drop=True)


try:
    df = pd.read_csv("insurance.csv")
except FileNotFoundError:
    print("Error: 'insurance.csv' not found. Please make sure the file is in the same directory.")
    df = None  # Sentinel: everything below is skipped when the data is unavailable

if df is not None:
    df_clean = clean_insurance_data(df)

    print(f"Data loaded: {df_clean.shape[0]} rows, {df_clean.shape[1]} columns")

    # -----------------------------
    # Model Training
    # -----------------------------

    # Column groups routed to the scaler and the one-hot encoder respectively
    numeric_features = ["age", "bmi", "children"]
    categorical_features = ["sex", "smoker", "region"]

    # Target is the charges column; every remaining column is a predictor
    y = df_clean["charges"]
    X = df_clean.drop(columns=["charges"])

    # Hold out 20% of the rows for evaluation (fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Preprocessing: standardize numeric columns, one-hot encode categoricals
    # (first level of each categorical dropped)
    numeric_step = ("num", StandardScaler(), numeric_features)
    categorical_step = (
        "cat",
        OneHotEncoder(drop="first", sparse_output=False),
        categorical_features,
    )
    preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

    # Preprocessing and the Random Forest regressor chained in one pipeline
    forest = RandomForestRegressor(random_state=42, n_jobs=-1)
    rf_pipeline = Pipeline([("pre", preprocessor), ("model", forest)])

    print("Training Random Forest model...")
    rf_pipeline.fit(X_train, y_train)

    # Score the held-out rows
    y_pred = rf_pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    for report_line in (
        "Random Forest Performance:",
        f"RMSE: {rmse:.2f}",
        f"R²: {r2:.3f}",
    ):
        print(report_line)

    # -----------------------------
    # Interactive Premium Estimation Function
    # -----------------------------
    def interactive_premium_estimator():
        """
        Walk the user through six prompts and print a premium estimate.

        Each answer is read from standard input and re-asked until it is
        valid: age (integer, 18-100), sex (M or F), BMI (number, 15-50),
        number of children (integer, 0 or more), smoker (yes or no) and
        region (menu choice 1-4). The answers are assembled into a one-row
        DataFrame and passed to the fitted ``rf_pipeline``.

        Returns the predicted premium for that row.
        """
        banner = "=" * 60
        print(banner)
        print("Hi! I am an Insurance Premium Estimator designed by Younus.")
        print("I am going to ask a few questions which will allow me to generate a premium.")
        print(banner)
        print()

        # Question 1 - Age
        while True:
            try:
                age = int(input("Question 1 of 6: What is your age? "))
            except ValueError:
                print("Please enter a valid number for age.")
                continue
            if 18 <= age <= 100:
                break
            print("Please enter an age between 18 and 100.")

        # Question 2 - Sex (single-letter code mapped to the stored label)
        sex_by_code = {'M': 'male', 'F': 'female'}
        while True:
            sex_code = input("Question 2 of 6: Please tell me if you are Male or Female. Write M for Male and F for Female: ").strip().upper()
            if sex_code in sex_by_code:
                sex = sex_by_code[sex_code]
                break
            print("Please enter M for Male or F for Female.")

        # Question 3 - BMI
        while True:
            try:
                bmi = float(input("Question 3 of 6: What is your BMI in numbers? "))
            except ValueError:
                print("Please enter a valid number for BMI.")
                continue
            if 15 <= bmi <= 50:
                break
            print("Please enter a BMI between 15 and 50.")

        # Question 4 - Children
        while True:
            try:
                children = int(input("Question 4 of 6: How many children do you have in number format? "))
            except ValueError:
                print("Please enter a valid number for children.")
                continue
            if children >= 0:
                break
            print("Please enter 0 or a positive number.")

        # Question 5 - Smoker
        while True:
            smoker = input("Question 5 of 6: Are you a smoker? Type 'yes' or 'no': ").strip().lower()
            if smoker in ('yes', 'no'):
                break
            print("Please enter yes or no.")

        # Question 6 - Region (menu number mapped to the stored label)
        region_by_choice = {
            1: 'northeast',
            2: 'northwest',
            3: 'southeast',
            4: 'southwest',
        }
        print("Question 6 of 6: What region are you in?")
        print("Write 1 for northeast, 2 for northwest, 3 for southeast, 4 for southwest")
        while True:
            try:
                choice = int(input("Enter your choice (1-4): "))
            except ValueError:
                print("Please enter a valid number (1-4).")
                continue
            if choice in region_by_choice:
                region = region_by_choice[choice]
                break
            print("Please enter 1, 2, 3, or 4.")

        # Generate prediction from a single-row frame
        answers = {
            "age": age,
            "sex": sex,
            "bmi": bmi,
            "children": children,
            "smoker": smoker,
            "region": region,
        }
        input_df = pd.DataFrame([answers])
        prediction = rf_pipeline.predict(input_df)[0]

        print("\n" + banner)
        print("PREMIUM CALCULATION COMPLETE")
        print(banner)
        print(f"Your estimated insurance premium is: ${prediction:,.2f}")
        print()
        print("Summary of your profile:")
        for summary_line in (
            f"• Age: {age} years",
            f"• Gender: {sex.title()}",
            f"• BMI: {bmi}",
            f"• Children: {children}",
            f"• Smoker: {smoker.title()}",
            f"• Region: {region.title()}",
        ):
            print(summary_line)
        print(banner)

        return prediction

    def estimate_premium(age, sex, bmi, children, smoker, region):
        """
        Estimate a premium from explicit arguments (no prompting).

        The three text arguments are converted to str, stripped and
        lower-cased; age and children are converted with int() and bmi with
        float(). The values are placed in a one-row DataFrame and passed to
        the fitted ``rf_pipeline``.

        Returns the pipeline's prediction for that single row.
        """
        # Normalize the text fields first, in one pass
        sex, smoker, region = (
            str(value).strip().lower() for value in (sex, smoker, region)
        )

        record = {
            "age": int(age),
            "sex": sex,
            "bmi": float(bmi),
            "children": int(children),
            "smoker": smoker,
            "region": region,
        }

        return rf_pipeline.predict(pd.DataFrame([record]))[0]

    # Run the interactive estimator as soon as the model is trained: it
    # prompts on standard input for the six answers and prints the estimate.
    # The returned prediction is not stored here.
    interactive_premium_estimator()

Data loaded: 1337 rows, 7 columns
Training Random Forest model...
Random Forest Performance:
RMSE: 4729.97
R²: 0.878
Hi! I am an Insurance Premium Estimator designed by Younus.
I am going to ask a few questions which will allow me to generate a premium.

Question 6 of 6: What region are you in?
Write 1 for northeast, 2 for northwest, 3 for southeast, 4 for southwest

PREMIUM CALCULATION COMPLETE
Your estimated insurance premium is: $15,930.63

Summary of your profile:
• Age: 25 years
• Gender: Male
• BMI: 22.0
• Children: 5
• Smoker: Yes
• Region: Northeast
