In [None]:
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# # Load Dataset
# df = pd.read_csv("/content/modified_insurance_v2.csv")
# df.head()

In [None]:
# 🔹 Noise removal, part 1: incomplete rows and exact duplicate rows.
# Both calls mutate `df` in place; the arguments spelled out here are the pandas defaults.
df.dropna(axis=0, how="any", inplace=True)      # discard rows containing any missing value
df.drop_duplicates(keep="first", inplace=True)  # keep the first row of each duplicate group

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,income
0,19,female,27.9,0,yes,48173
1,18,male,33.77,1,no,24022
2,28,male,33.0,3,no,1139365
3,33,male,22.705,0,no,718290
4,32,male,28.88,0,no,500122


In [None]:
# Outlier filter: keep only rows whose numeric columns ALL have |z-score| < 3.
numeric_part = df.select_dtypes(include=[np.number])
abs_z_scores = np.abs(stats.zscore(numeric_part))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
# .copy() detaches the result from the original frame so later column
# assignments do not operate on a slice.
df = df[rows_within_3_sigma].copy()

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,income
0,19,female,27.9,0,yes,48173
1,18,male,33.77,1,no,24022
2,28,male,33.0,3,no,1139365
3,33,male,22.705,0,no,718290
4,32,male,28.88,0,no,500122


In [None]:
# Function to generate output column
def term_life_random(row, rng=None):
    """Generate a noisy synthetic ``has_insurance`` label for one customer row.

    A rule-based score is accumulated from the row's attributes, a random
    perturbation of -1, 0 or +1 is added, and the label is 1 when the final
    score is at least 4.

    Scoring rules:
        income   > 750000 -> +2, else > 350000 -> +1
        age      28..47 -> +2, else 22..27 or 48..55 -> +1
        smoker   == "yes" -> +1
        children >= 2 -> +2, else == 1 -> +1
        bmi      < 17 or > 34 -> +1

    Args:
        row: Mapping-like record (e.g. a DataFrame row or a dict) providing
            "income", "age", "smoker", "children" and "bmi".
        rng: Optional object exposing ``choice(seq)`` (for example
            ``random.Random(seed)``). Defaults to the module-level ``random``
            generator, which preserves the original behaviour; pass a seeded
            generator to make the noise term reproducible.

    Returns:
        int: 1 if the perturbed score is >= 4, otherwise 0.
    """
    if rng is None:
        rng = random  # module-level generator, as in the original implementation

    income = row["income"]
    age = row["age"]
    children = row["children"]
    bmi = row["bmi"]

    score = 0

    if income > 750000:
        score += 2
    elif income > 350000:
        score += 1

    if 28 <= age <= 47:
        score += 2
    elif 22 <= age < 28 or 47 < age <= 55:
        score += 1

    if row["smoker"] == "yes":
        score += 1

    if children >= 2:
        score += 2
    elif children == 1:
        score += 1

    if bmi < 17 or bmi > 34:
        score += 1

    # Label noise: shift the score by one step in either direction (or not at all).
    score += rng.choice([-1, 0, 1])
    return 1 if score >= 4 else 0

# term_life_random adds a random noise term drawn from the module-level `random`
# generator. Seed it here so that re-running this cell reproduces the same labels;
# without a seed the generated column differs from run to run.
random.seed(42)

df = df.copy()  # Prevent modification of a slice
df["has_insurance"] = df.apply(term_life_random, axis=1)  # one synthetic label per row

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,income,has_insurance
0,19,female,27.9,0,yes,48173,0
1,18,male,33.77,1,no,24022,0
2,28,male,33.0,3,no,1139365,1
3,33,male,22.705,0,no,718290,0
4,32,male,28.88,0,no,500122,1


In [None]:
# Encode categorical variables as integer codes.
# Plain column assignment (df[col] = ...) replaces the column together with its dtype.
# `df.loc[:, col] = ...` instead writes into the existing object-dtype column on
# pandas >= 2.0, which would leave the codes stored as Python objects, not integers.
label_encoders = {}
for col in ["sex", "smoker"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # keep the fitted encoder for each column

# Feature Engineering
df["income_per_child"] = df["income"] / (df["children"] + 1)  # +1 keeps the divisor non-zero when children == 0
df["bmi_age_ratio"] = df["bmi"] / df["age"]
df["log_income"] = np.log(df["income"] + 1)  # Log transform income; +1 keeps log defined at income == 0

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,income,has_insurance,income_per_child,bmi_age_ratio,log_income
0,19,0,27.9,0,1,48173,0,48173.0,1.468421,10.782575
1,18,1,33.77,1,0,24022,0,12011.0,1.876111,10.086767
2,28,1,33.0,3,0,1139365,1,284841.25,1.178571,13.945983
3,33,1,22.705,0,0,718290,0,718290.0,0.68803,13.48463
4,32,1,28.88,0,0,500122,1,500122.0,0.9025,13.122609


In [None]:
# # prompt: save current df to a new csv file

# df.to_csv('modified_insurance_v3.csv', index=False)


In [None]:
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# Load Dataset: read the v3 CSV into `df`.
V3_CSV_PATH = "/content/modified_insurance_v3.csv"
df = pd.read_csv(V3_CSV_PATH)

In [None]:
# Preview the first rows of `df` as a quick sanity check of the columns.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,income,has_insurance,income_per_child,bmi_age_ratio,log_income
0,19,0,27.9,0,1,48173,0,48173.0,1.468421,10.782575
1,18,1,33.77,1,0,24022,0,12011.0,1.876111,10.086767
2,28,1,33.0,3,0,1139365,1,284841.25,1.178571,13.945983
3,33,1,22.705,0,0,718290,0,718290.0,0.68803,13.48463
4,32,1,28.88,0,0,500122,0,500122.0,0.9025,13.122609


# Current working version

In [None]:
import pickle
import random

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Train one RandomForest classifier per insurance plan and pickle each fitted model.
# The whole pipeline runs exactly once: load -> clean -> encode -> per-plan
# split / oversample / fit / evaluate / save.

# Load Dataset
# Relative path: the fitted models are written relative to the working directory too.
# NOTE(review): other cells use "/content/..." paths, so this presumably assumes the
# working directory is /content — confirm.
df = pd.read_csv("modified_insurance_with_targets.csv")

# 🔹 Remove Noise (Handle Missing Values, Outliers, and Duplicates)
df = df.dropna().drop_duplicates()

# Remove Outliers using Z-Score: keep rows whose numeric columns all have |z| < 3.
# NOTE(review): every numeric column is z-scored, which would include any numeric
# target/flag columns; rows of a rare positive class could be dropped as "outliers" — confirm.
# .copy() so the column assignments below do not operate on a slice of the original frame.
df = df[(np.abs(stats.zscore(df.select_dtypes(include=[np.number]))) < 3).all(axis=1)].copy()

# 🔹 Encode Categorical Variables
label_encoders = {}
for col in ["sex", "smoker"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # keep the fitted encoder for each column

# 🔹 Define Insurance Plans and Corresponding Features
# Key = target column in the dataset, value = feature columns used for that plan's model.
insurance_plans = {
    "term_insurance": ["age", "sex", "smoker", "income", "has_insurance", "log_income"],
    "savings_investment": ["age", "income", "log_income", "children", "income_per_child"],
    "child_plans": ["age", "income", "children", "income_per_child"],
    "pension_plans": ["age", "income", "log_income", "has_insurance"],
    "money_back_plans": ["age", "income", "log_income", "has_insurance", "bmi_age_ratio"],
    "ulips": ["age", "income", "log_income", "has_insurance", "smoker"],
    "protection_plans": ["age", "income", "log_income", "has_insurance", "smoker"],
    "group_insurance": ["age", "income", "has_insurance", "children", "income_per_child"]
}

# 🔹 Train a Model for Each Insurance Plan
results = {}  # plan name -> accuracy on the held-out split
for plan, features in insurance_plans.items():
    print(f"\n🚀 Training model for {plan}...")

    # Define Features & Target
    X = df[features]
    y = df[plan]  # Use actual target labels from dataset

    # Split FIRST, so the held-out rows are real observations the model never saw.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Handle Class Imbalance using SMOTE on the TRAINING fold only.
    # Oversampling before the split lets synthetic points interpolated from test rows
    # leak into the training set and inflates the reported scores.
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

    # Train RandomForestClassifier
    rf_model = RandomForestClassifier(n_estimators=300, max_depth=12, random_state=42, class_weight="balanced")
    rf_model.fit(X_train_balanced, y_train_balanced)

    # Make predictions on the untouched (non-resampled) test fold
    y_pred = rf_model.predict(X_test)

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"🔥 Accuracy for {plan}:", accuracy)
    print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

    # Store results
    results[plan] = accuracy

    # Save the trained model
    model_filename = f"{plan}_rf_model.pkl"
    with open(model_filename, "wb") as file:
        pickle.dump(rf_model, file)
    print(f"✅ Model saved as {model_filename}")


🚀 Training model for term_insurance...
🔥 Accuracy for term_insurance: 0.9937694704049844

📊 Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99       161
           1       0.99      1.00      0.99       160

    accuracy                           0.99       321
   macro avg       0.99      0.99      0.99       321
weighted avg       0.99      0.99      0.99       321


🚀 Training model for savings_investment...
🔥 Accuracy for savings_investment: 1.0

📊 Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       132
           1       1.00      1.00      1.00       132

    accuracy                           1.00       264
   macro avg       1.00      1.00      1.00       264
weighted avg       1.00      1.00      1.00       264


🚀 Training model for child_plans...
🔥 Accuracy for child_plans: 1.0

📊 Classification Report:
               precision

In [None]:
# Plan name -> feature columns its model is given at prediction time.
insurance_plans = {
    "term_insurance": ["age", "sex", "smoker", "income", "has_insurance", "log_income"],
    "savings_investment": ["age", "income", "log_income", "children", "income_per_child"],
    "child_plans": ["age", "income", "children", "income_per_child"],
    "pension_plans": ["age", "income", "log_income", "has_insurance"],
    "money_back_plans": ["age", "income", "log_income", "has_insurance", "bmi_age_ratio"],
    "ulips": ["age", "income", "log_income", "has_insurance", "smoker"],
    "protection_plans": ["age", "income", "log_income", "has_insurance", "smoker"],
    "group_insurance": ["age", "income", "has_insurance", "children", "income_per_child"]
}

# Sample new user data (Modify this based on actual user input)
# Only the raw attributes are entered by hand. The engineered features are derived
# from them below, so the record cannot contradict itself (e.g. a child count of 0
# next to an income_per_child computed for 2 children).
# NOTE(review): age=1 looks like a placeholder value — confirm before trusting the output.
new_user = {
    "age": 1,
    "sex": 0,  # Assuming encoded as 1 for Male, 0 for Female
    "bmi": 22,
    "children": 0,
    "smoker": 1,
    "income": 750000,
    "has_insurance": 1,
}

# Engineered features, using the same formulas as the feature-engineering step.
new_user["log_income"] = np.log(new_user["income"] + 1)  # Apply the same log transformation
new_user["income_per_child"] = new_user["income"] / (new_user["children"] + 1)
new_user["bmi_age_ratio"] = new_user["bmi"] / new_user["age"]

# Convert to DataFrame (a single-row frame)
new_user_df = pd.DataFrame([new_user])

# Load models and make predictions
recommendations = {}
for plan, features in insurance_plans.items():
    model_filename = f"/content/{plan}_rf_model.pkl"

    # Load the saved model.
    # SECURITY: pickle.load can execute arbitrary code from the file. Only load
    # model files that you produced yourself / obtained from a trusted source.
    with open(model_filename, "rb") as file:
        model = pickle.load(file)

    # Extract only the relevant features
    user_features = new_user_df[features]

    # Predict
    prediction = model.predict(user_features)[0]  # 0 = No, 1 = Yes
    recommendations[plan] = "Recommended" if prediction == 1 else "Not Recommended"

# Display results
for plan, result in recommendations.items():
    print(f"{plan}: {result}")


term_insurance: Not Recommended
savings_investment: Recommended
child_plans: Recommended
pension_plans: Not Recommended
money_back_plans: Not Recommended
ulips: Not Recommended
protection_plans: Not Recommended
group_insurance: Recommended
