# Insurance Claim Prediction - Complete Analysis

## Project Overview
**Objective**: Build a machine learning model that predicts whether a building will have an insurance claim during the insurance period

**Dataset**:
- Training: 5012 observations with 12 descriptive attributes + target variable (Claim)
- Test: 2147 observations for final model validation

## Project Tasks (Per PDF Requirements):
1. **Analyze and visualize data** ✓
2. **Clean data if necessary** ✓
3. **Select most discriminant features if necessary** ✓ (Using Mutual Information analysis and RFE)
4. **Encode data and generate prediction models using supervised learning** ✓
5. **Evaluate model performance and interpret results** ✓

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy import stats

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_selection import RFE, mutual_info_classif

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import (
    classification_report, confusion_matrix, f1_score, 
    roc_auc_score, roc_curve, auc, accuracy_score, 
    precision_score, recall_score
)

from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

import warnings
# Silence library warnings so the cell outputs stay readable.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure that does not override them.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

print("✓ All libraries loaded successfully")

## TASK 1: ANALYZE AND VISUALIZE DATA
### 1.1 Load Data and Basic Exploration

In [None]:
# Read the two CSV files; the paths are relative to the working directory.
train_raw, test_raw = (
    pd.read_csv(csv_name) for csv_name in ("train_Insurance.csv", "test_Insurance.csv")
)

print("=" * 80, "DATASET OVERVIEW", "=" * 80, sep="\n")
print(f"Training set shape: {train_raw.shape} (Expected: 5012 rows)")
print(f"Test set shape: {test_raw.shape} (Expected: 2147 rows)")
print(f"\nFeatures ({len(train_raw.columns)}): {list(train_raw.columns)}")

# Each section prints a separator, a heading, then one view of the training data.
# DataFrame.info() writes its own output, so it is called without print().
_overview_sections = (
    ("First 10 rows of training data:", lambda: print(train_raw.head(10))),
    ("Data types:", lambda: print(train_raw.dtypes)),
    ("Statistical summary:", lambda: print(train_raw.describe())),
    ("Dataset information:", train_raw.info),
)
for _heading, _show in _overview_sections:
    print(f"\n{'-'*80}")
    print(_heading)
    _show()

### 1.2 Missing Values Analysis

In [None]:
print("=" * 80, "MISSING VALUES ANALYSIS", "=" * 80, sep="\n")

# Per-column NaN counts and the same counts as a percentage of all training rows.
missing_train = train_raw.isna().sum()
missing_train_pct = missing_train / len(train_raw) * 100

missing_df = pd.DataFrame({
    'Column': missing_train.index,
    'Missing_Count': missing_train.values,
    'Missing_Percentage': missing_train_pct.values,
})
# Keep only the columns that actually have gaps, worst first.
_has_gaps = missing_df['Missing_Count'] > 0
missing_df = missing_df.loc[_has_gaps].sort_values('Missing_Count', ascending=False)

if missing_df.empty:
    print("\n✓ No missing values found in training data")
else:
    print("\nMissing values detected:")
    print(missing_df.to_string(index=False))

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # (value column, bar colour, x-axis label, panel title) for the two panels.
    _panels = (
        ('Missing_Count', 'coral', 'Missing Count', 'Missing Values Count by Column'),
        ('Missing_Percentage', 'steelblue', 'Missing Percentage (%)',
         'Missing Values Percentage by Column'),
    )
    for ax, (value_col, bar_color, x_label, panel_title) in zip(axes, _panels):
        ax.barh(missing_df['Column'], missing_df[value_col], color=bar_color)
        ax.set_xlabel(x_label, fontweight='bold')
        ax.set_title(panel_title, fontweight='bold', fontsize=12)
        ax.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.show()

### 1.3 Target Variable (Claim) Analysis

In [None]:
print("=" * 80, "TARGET VARIABLE ANALYSIS: Claim", "=" * 80, sep="\n")

# Absolute and relative frequency of every Claim value.
claim_counts = train_raw['Claim'].value_counts()
claim_pct = train_raw['Claim'].value_counts(normalize=True) * 100

print("\nClaim Distribution:")
for label, n_samples in claim_counts.items():
    print(f"  {label}: {n_samples} samples ({claim_pct[label]:.2f}%)")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
pie_ax, bar_ax, pct_ax = axes
claim_palette = ['#66b3ff', '#ff9999']

# Panel 1: pie chart. The two-element explode tuple assumes exactly two classes.
train_raw['Claim'].value_counts().plot.pie(
    ax=pie_ax,
    autopct='%1.1f%%',
    ylabel='',
    colors=claim_palette,
    startangle=90,
    explode=(0.05, 0.05),
)
pie_ax.set_title('Target Distribution (Pie Chart)', fontweight='bold', fontsize=12)

# Panel 2: counts as vertical bars, each annotated with its value.
claim_counts.plot(kind='bar', ax=bar_ax, color=claim_palette, width=0.6)
bar_ax.set_title('Target Distribution (Bar Chart)', fontweight='bold', fontsize=12)
bar_ax.set_ylabel('Count')
bar_ax.set_xlabel('Claim Status')
bar_ax.set_xticklabels(bar_ax.get_xticklabels(), rotation=0)
for x_pos, count in enumerate(claim_counts.values):
    bar_ax.text(x_pos, count + 50, str(count), ha='center', va='bottom', fontweight='bold')

# Panel 3: percentages as horizontal bars, each annotated with its value.
claim_pct.plot(kind='barh', ax=pct_ax, color=claim_palette)
pct_ax.set_title('Target Distribution (Percentage)', fontweight='bold', fontsize=12)
pct_ax.set_xlabel('Percentage (%)')
pct_ax.set_ylabel('Claim Status')
for y_pos, share in enumerate(claim_pct.values):
    pct_ax.text(share + 1, y_pos, f"{share:.2f}%", ha='left', va='center', fontweight='bold')

plt.tight_layout()
plt.show()

### 1.4 Categorical Features Distribution

In [None]:
# Text-typed columns, leaving out the target and the row identifier.
_not_a_feature = ("Claim", "Customer Id")
cat_cols = [
    col
    for col in train_raw.select_dtypes(include=["object", "category"]).columns
    if col not in _not_a_feature
]

print(f"Categorical features ({len(cat_cols)}): {cat_cols}")

# Grid layout: three panels per row, as many rows as the feature count needs.
n_cols = 3
n_rows = math.ceil(len(cat_cols) / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows))
# plt.subplots returns a bare Axes for a 1x1 grid; normalise to a flat sequence.
axes = axes.flatten() if isinstance(axes, np.ndarray) else [axes]

for ax, col in zip(axes, cat_cols):
    # Bars are ordered from the most to the least frequent level.
    value_counts = train_raw[col].value_counts()
    sns.countplot(data=train_raw, x=col, ax=ax, palette='Set2', order=value_counts.index)
    ax.set_title(f"Distribution of {col}", fontweight='bold')
    ax.tick_params(axis='x', rotation=45)

    # Write the count on top of every bar.
    for container in ax.containers:
        ax.bar_label(container)

# Remove the grid slots that have no feature to show.
for spare_ax in axes[len(cat_cols):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

### 1.5 Categorical Features vs Target (Claim)

In [None]:
print("=" * 80, "CATEGORICAL FEATURES vs TARGET ANALYSIS", "=" * 80, sep="\n")

# n_rows, n_cols and cat_cols are expected to exist already (set by an earlier cell).
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows))
axes = axes.flatten() if isinstance(axes, np.ndarray) else [axes]

claim_palette = ['#66b3ff', '#ff9999']
for ax, col in zip(axes, cat_cols):
    # Row-normalised crosstab: share of each Claim value within every level of `col`.
    ct = pd.crosstab(train_raw[col], train_raw['Claim'], normalize='index')
    ct.plot(kind='bar', ax=ax, stacked=False, color=claim_palette)
    ax.set_title(f"{col} vs Claim", fontweight='bold')
    ax.set_ylabel('Proportion')
    ax.set_xlabel(col)
    ax.legend(title='Claim', loc='best')
    ax.tick_params(axis='x', rotation=45)

# Remove the grid slots that have no feature to show.
for spare_ax in axes[len(cat_cols):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

### 1.6 Numerical Features Distribution

In [None]:
# int64/float64 columns, leaving out the target and the row identifier.
_not_a_feature = ("Claim", "Customer Id")
num_cols = [
    c
    for c in train_raw.select_dtypes(include=["int64", "float64"]).columns
    if c not in _not_a_feature
]

print(f"Numerical features ({len(num_cols)}): {num_cols}")

# Grid layout: three panels per row, as many rows as the feature count needs.
n_cols_plot = 3
n_rows_plot = math.ceil(len(num_cols) / n_cols_plot)

fig, axes = plt.subplots(n_rows_plot, n_cols_plot, figsize=(18, 5 * n_rows_plot))
axes = axes.flatten() if isinstance(axes, np.ndarray) else [axes]

for ax, col in zip(axes, num_cols):
    # Histogram with a KDE overlay; missing values are left out of the plot.
    observed = train_raw[col].dropna()
    sns.histplot(observed, kde=True, ax=ax, color='steelblue', bins=30)
    ax.set_title(f"Distribution of {col}", fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Remove the grid slots that have no feature to show.
for spare_ax in axes[len(num_cols):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

### 1.7 Numerical Features: Box Plots (Outlier Detection)

In [None]:
# One box plot per numerical feature, on the same grid geometry as the histograms.
fig, axes = plt.subplots(n_rows_plot, n_cols_plot, figsize=(18, 5 * n_rows_plot))
axes = axes.flatten() if isinstance(axes, np.ndarray) else [axes]

for ax, col in zip(axes, num_cols):
    sns.boxplot(x=train_raw[col].dropna(), ax=ax, color='orange')
    ax.set_title(f"Box Plot: {col}", fontweight='bold')
    ax.set_xlabel(col)

for spare_ax in axes[len(num_cols):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

# Text report: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] count as outliers.
# NaN compares False on both sides, so missing values are never counted.
print("\nOutlier Analysis (IQR method):")
print("-" * 60)
for col in num_cols:
    column_values = train_raw[col]
    Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    outliers = ((column_values < lower) | (column_values > upper)).sum()
    outlier_pct = outliers / len(train_raw) * 100
    print(f"{col}:")
    print(f"  Outliers: {outliers} ({outlier_pct:.2f}%)")
    print(f"  Expected range: [{lower:.2f}, {upper:.2f}]")

### 1.8 Numerical Features vs Target

In [None]:
# Overlaid histograms: one semi-transparent layer per Claim value, per feature.
fig, axes = plt.subplots(n_rows_plot, n_cols_plot, figsize=(18, 5 * n_rows_plot))
axes = axes.flatten() if isinstance(axes, np.ndarray) else [axes]

for ax, col in zip(axes, num_cols):
    for claim_val in train_raw['Claim'].unique():
        # Values of `col` for the rows with this Claim value, missing entries removed.
        data = train_raw.loc[train_raw['Claim'] == claim_val, col].dropna()
        ax.hist(data, alpha=0.6, label=claim_val, bins=30)
    ax.set_title(f"{col} by Claim Status", fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.legend(title='Claim')

# Remove the grid slots that have no feature to show.
for spare_ax in axes[len(num_cols):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

## TASK 2: CLEAN DATA IF NECESSARY
### 2.1 Remove Customer ID (Not Predictive)

In [None]:
print("=" * 80, "DATA CLEANING PIPELINE", "=" * 80, sep="\n")
print("\nStep 1: Remove Customer ID\n" + "-"*60)

# Drop the identifier column from each frame, but only where it is present.
if 'Customer Id' in train_raw.columns:
    train_raw = train_raw.drop(columns=['Customer Id'])
    print("✓ Customer Id removed from training data")

if 'Customer Id' in test_raw.columns:
    test_raw = test_raw.drop(columns=['Customer Id'])
    print("✓ Customer Id removed from test data")

remaining_columns = list(train_raw.columns)
print(f"\nRemaining columns ({len(remaining_columns)}): {remaining_columns}")

### 2.2 Clean NumberOfWindows

In [None]:
print("\nStep 2: Clean NumberOfWindows\n" + "-"*60)

print(f"Unique values before cleaning: {sorted(train_raw['NumberOfWindows'].unique())}")

# Textual placeholders and the integer each one is replaced with.
# '>=10' is collapsed to 10, so larger window counts are no longer distinguishable.
window_label_map = {'without': 0, '>=10': 10}

# The same mapping and integer cast are applied to train and test.
train_raw['NumberOfWindows'] = train_raw['NumberOfWindows'].replace(window_label_map).astype(int)
test_raw['NumberOfWindows'] = test_raw['NumberOfWindows'].replace(window_label_map).astype(int)

print(f"Unique values after cleaning: {sorted(train_raw['NumberOfWindows'].unique())}")
print("✓ NumberOfWindows cleaned successfully")

### 2.3 Remove Duplicates and Conflicting Records

In [None]:
print("\nStep 3: Remove Duplicates and Conflicts\n" + "-"*60)

# Every column except the target; rows that agree on all of these share a feature profile.
features = [c for c in train_raw.columns if c != "Claim"]

n_exact = train_raw.duplicated().sum()
n_same_features = train_raw.duplicated(subset=features).sum()

print(f"Exact duplicates: {n_exact}")
print(f"Records with same features (may have different target): {n_same_features}")

if n_same_features > 0:
    # All rows whose feature profile occurs more than once.
    dups = train_raw[train_raw.duplicated(subset=features, keep=False)]
    # dropna=False is required: missing values have not been imputed at this step
    # and duplicated() treats NaN as equal to NaN. With the groupby default
    # (dropna=True), every profile containing a NaN would be skipped and its
    # conflicting labels would go undetected.
    conflicts = dups.groupby(features, dropna=False)['Claim'].nunique()
    n_conflicts = (conflicts > 1).sum()
    print(f"Conflicting records (same features, different targets): {n_conflicts}")

    if n_conflicts > 0:
        # Anti-join: drop every row whose profile is labelled inconsistently.
        # The merge indicator marks rows without a match as 'left_only'.
        conflicting_groups = conflicts[conflicts > 1].reset_index()
        before = len(train_raw)
        train_raw = train_raw.merge(conflicting_groups[features], on=features, how='left', indicator=True)
        train_raw = train_raw[train_raw['_merge'] == 'left_only'].drop(columns=['_merge'])
        after = len(train_raw)
        print(f"✓ Removed {before - after} conflicting records")

# Exact duplicates (features and target identical) are collapsed to one row.
before_dup = len(train_raw)
train_raw = train_raw.drop_duplicates()
train_raw = train_raw.reset_index(drop=True)
after_dup = len(train_raw)

if before_dup > after_dup:
    print(f"✓ Removed {before_dup - after_dup} duplicate records")
else:
    print("✓ No duplicates found")

print(f"\nTrain shape after cleaning: {train_raw.shape}")

### 2.4 Handle Missing Values

In [None]:
print("\nStep 4: Handle Missing Values\n" + "-"*60)

print("Missing values before imputation:")
na_counts = train_raw.isna().sum()
missing_before = na_counts[na_counts > 0]
if len(missing_before) > 0:
    print(missing_before)
else:
    print("None (except Geo_Code, handled separately)")

# Both imputers are always fitted on the training data, even when the training
# column has no gaps. The test-data step calls .transform() on them, which would
# raise NotFittedError if fitting only happened when train had missing values.
mf_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
mf_imputer.fit(train_raw[["Garden"]])
if train_raw["Garden"].isna().sum() > 0:
    train_raw[["Garden"]] = mf_imputer.transform(train_raw[["Garden"]])
    print("\n✓ Garden: Imputed with most frequent value")

median_imputer = SimpleImputer(strategy="median")
median_imputer.fit(train_raw[["Building Dimension"]])
if train_raw["Building Dimension"].isna().sum() > 0:
    train_raw[["Building Dimension"]] = median_imputer.transform(train_raw[["Building Dimension"]])
    print("✓ Building Dimension: Imputed with median")

print(f"\nMissing values after imputation:")
na_counts_after = train_raw.isna().sum()
remaining = na_counts_after[na_counts_after > 0]
if len(remaining) > 0:
    print(remaining)
else:
    print("None (except Geo_Code)")

### 2.5 Fill Missing Geo_Code

In [None]:
print("\nStep 5: Fill Missing Geo_Code\n" + "-"*60)

print(f"Missing Geo_Code before filling: {train_raw['Geo_Code'].isna().sum()}")


def _most_common_code(codes):
    """Return the first mode of *codes*, or its first value if there is no mode."""
    modes = codes.mode()
    return modes[0] if len(modes) > 0 else codes.iloc[0]


# Lookup table: most frequent known Geo_Code per (Settlement, Residential) pair.
geo_group_keys = ["Settlement", "Residential"]
rows_with_geo = train_raw[train_raw["Geo_Code"].notna()]
mode_geo_train = (
    rows_with_geo
    .groupby(geo_group_keys)["Geo_Code"]
    .agg(_most_common_code)
    .reset_index(name="Geo_Code_mode")
)

print("\nGeo_Code modes by Settlement + Residential:")
print(mode_geo_train)

# Attach the group mode to every row, use it only where Geo_Code is missing,
# then remove the helper column again.
train_raw = train_raw.merge(mode_geo_train, on=geo_group_keys, how="left")
train_raw["Geo_Code"] = train_raw["Geo_Code"].fillna(train_raw["Geo_Code_mode"])
train_raw = train_raw.drop(columns=["Geo_Code_mode"])

print(f"\n✓ Missing Geo_Code after filling: {train_raw['Geo_Code'].isna().sum()}")

### 2.6 Clean Geo_Code (Remove Alphanumeric)

In [None]:
print("\nStep 6: Clean Geo_Code (Remove Alphanumeric)\n" + "-"*60)

# True where the code, read as text, consists of digits only.
# A still-missing Geo_Code becomes the text 'nan' and is therefore flagged too.
mask_numeric = train_raw["Geo_Code"].astype(str).str.isnumeric()
n_alphanumeric = (~mask_numeric).sum()
print(f"Numeric Geo_Code: {mask_numeric.sum()}")
print(f"Alphanumeric Geo_Code: {n_alphanumeric}")

has_alphanumeric = n_alphanumeric > 0
if has_alphanumeric:
    # Keep only the rows with an all-digit code.
    train_raw = train_raw[mask_numeric].copy()

# Either way, the surviving codes are cast to integers.
train_raw["Geo_Code"] = train_raw["Geo_Code"].astype(int)

if has_alphanumeric:
    train_raw = train_raw.reset_index(drop=True)
    print(f"\n✓ Removed alphanumeric Geo_Code entries")

print(f"Train shape after Geo_Code cleaning: {train_raw.shape}")

### 2.7 Handle Outliers in Building Dimension

In [None]:
print("\nStep 7: Handle Outliers in Building Dimension\n" + "-"*60)

# IQR fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as outliers.
Q1, Q3 = train_raw['Building Dimension'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print(f"Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")
print(f"Valid range: [{lower:.2f}, {upper:.2f}]")

dimension_before = train_raw['Building Dimension']
outliers_before = ((dimension_before < lower) | (dimension_before > upper)).sum()
print(f"Outliers detected: {outliers_before}")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
before_ax, after_ax = axes

# Left panel must be drawn before the column is modified.
sns.boxplot(x=train_raw['Building Dimension'], ax=before_ax, color='orange')
before_ax.set_title('Building Dimension - Before Outlier Treatment', fontweight='bold')

# Clipping pulls outliers onto the nearest fence instead of dropping the rows.
train_raw['Building Dimension'] = train_raw['Building Dimension'].clip(lower, upper)

sns.boxplot(x=train_raw['Building Dimension'], ax=after_ax, color='skyblue')
after_ax.set_title('Building Dimension - After Outlier Treatment (Clipping)', fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\n✓ Outliers clipped to range [{lower:.2f}, {upper:.2f}]")

### 2.8 Scale Numerical Features

In [None]:
print("\nStep 8: Scale Numerical Features\n" + "-"*60)

cols_to_scale = ['Building Dimension', 'NumberOfWindows']
scaler = RobustScaler()

# 2x2 grid: one row per scaled column, left = before scaling, right = after.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
before_axes, after_axes = axes[:, 0], axes[:, 1]

for ax, col in zip(before_axes, cols_to_scale):
    sns.histplot(train_raw[col], kde=True, ax=ax, color='coral', bins=30)
    ax.set_title(f"{col} - Before Scaling", fontweight='bold')

# The scaler is fitted on the training data here and kept for later reuse.
# NOTE(review): it is fitted on the whole training frame, before any train/validation split.
train_raw[cols_to_scale] = scaler.fit_transform(train_raw[cols_to_scale])

for ax, col in zip(after_axes, cols_to_scale):
    sns.histplot(train_raw[col], kde=True, ax=ax, color='steelblue', bins=30)
    ax.set_title(f"{col} - After RobustScaler", fontweight='bold')

plt.tight_layout()
plt.show()

print("✓ RobustScaler fitted and applied to training data")

### 2.9 Encode Categorical Variables

In [None]:
print("\nStep 9: Encode Categorical Variables\n" + "-"*60)

# Work on a copy so train_raw keeps its pre-encoding values.
train_transformed = train_raw.copy()

# Two-level columns and the integer assigned to each level.
# A value missing from a mapping becomes NaN, and the int32 cast then fails.
binary_level_maps = {
    "Building_Painted": {'N': 1, 'V': 0},
    "Building_Fenced": {'N': 1, 'V': 0},
    "Garden": {'V': 1, 'O': 0},
}
for binary_col, level_map in binary_level_maps.items():
    train_transformed[binary_col] = train_transformed[binary_col].map(level_map).astype('int32')

print("✓ Binary encoding: Building_Painted, Building_Fenced, Garden")

# drop_first=True removes one indicator column per variable.
train_transformed = pd.get_dummies(
    train_transformed,
    columns=["Settlement", "Building_Type"],
    drop_first=True,
    dtype='int32',
)

print("✓ One-hot encoding: Settlement, Building_Type")

# The fitted encoder is kept so the same label-to-integer mapping can be reused.
le_claim = LabelEncoder()
train_transformed["Claim"] = le_claim.fit_transform(train_transformed["Claim"])

print(f"✓ Target encoding: Claim (non=0, oui=1)")

# Move the target to the last column.
cols = train_transformed.columns.drop("Claim").tolist() + ["Claim"]
train_transformed = train_transformed[cols]

print(f"\nShape after encoding: {train_transformed.shape}")
print(f"Columns: {list(train_transformed.columns)}")

### 2.10 Initial Correlation Analysis

In [None]:
print("\nStep 10: Correlation Analysis\n" + "-"*60)

# Pairwise correlations, plus a one-column view holding each column's
# correlation with the target, sorted from most positive to most negative.
df_corr = train_transformed.corr(numeric_only=True)
corr_with_claim = df_corr[["Claim"]].sort_values(by="Claim", ascending=False)

print("\nCorrelation with Target (Claim):")
print(corr_with_claim)

fig, axes = plt.subplots(1, 2, figsize=(16, max(8, len(corr_with_claim) * 0.35)))

sns.heatmap(corr_with_claim, annot=True, fmt='.3f', vmin=-1, vmax=1, 
            cmap='coolwarm', center=0, ax=axes[0], cbar_kws={'label': 'Correlation'})
axes[0].set_title('Feature Correlation with Target (Claim)', fontsize=14, fontweight='bold')

sns.heatmap(df_corr, annot=False, vmin=-1, vmax=1, cmap='coolwarm', 
            center=0, ax=axes[1], cbar_kws={'label': 'Correlation'})
axes[1].set_title('Full Correlation Matrix', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print("\n" + "="*80)
print("CORRELATION INTERPRETATION")
print("="*80)

# Feature rows only: the target's own row is removed once and reused below.
feature_corr = corr_with_claim.drop('Claim')

# Select the column by label. The previous `.max()[0]` was positional indexing on
# a Series labelled 'Claim', which pandas 2.x deprecates.
max_corr = feature_corr['Claim'].abs().max()
print(f"\nMaximum absolute correlation with target: {max_corr:.4f}")

if max_corr < 0.3:
    print("\n⚠ WARNING: VERY LOW CORRELATION DETECTED!")
    print("\nThis suggests:")
    print("  1. Weak linear relationships between features and target")
    print("  2. Non-linear relationships may be present (good for tree-based models)")
    print("  3. Feature interactions may be more important than individual features")
    print("  4. Complex patterns that require ensemble methods")
    print("\nRECOMMENDATION:")
    print("  ✓ Prioritize tree-based models: Random Forest, Gradient Boosting, Extra Trees")
    print("  ✓ Consider feature interactions and polynomial features")
    print("  ✓ Ensemble methods will likely perform best")
    print("  ✗ Linear models may struggle with this dataset")
elif max_corr < 0.5:
    print("\n✓ Moderate correlation detected")
    print("  - Both linear and non-linear models may work well")
else:
    print("\n✓ Strong correlation detected")
    print("  - Linear models should perform well")

# Always defined, so later cells can read it without checking whether it exists.
cols_to_drop = []

weak_features = feature_corr[feature_corr['Claim'].abs() < 0.05]
if len(weak_features) > 0:
    print(f"\nFeatures with very weak correlation (|r| < 0.05): {len(weak_features)}")
    for feat in weak_features.index:
        print(f"  - {feat}: {weak_features.loc[feat, 'Claim']:.4f}")
    
    # NOTE(review): the dropped columns are a fixed list, not derived from
    # weak_features; confirm the list still matches the report printed above.
    cols_to_drop = ['Building_Painted', 'Geo_Code', 'YearOfObservation']
    cols_to_drop = [c for c in cols_to_drop if c in train_transformed.columns]
    
    if len(cols_to_drop) > 0:
        train_transformed = train_transformed.drop(columns=cols_to_drop)
        print(f"\n✓ Dropped very weakly correlated features: {cols_to_drop}")
    
    train_transformed = train_transformed.reset_index(drop=True)

print(f"\nFinal feature count: {train_transformed.shape[1] - 1} (excluding target)")
print(f"Final shape: {train_transformed.shape}")

### 2.11 Apply Same Transformations to Test Data

In [None]:
print("="*80)
print("APPLYING TRANSFORMATIONS TO TEST DATA")
print("="*80)

# Impute with the imputers fitted on the training data; nothing is refitted on test.
if test_raw["Garden"].isna().sum() > 0:
    test_raw[["Garden"]] = mf_imputer.transform(test_raw[["Garden"]])
if test_raw["Building Dimension"].isna().sum() > 0:
    test_raw[["Building Dimension"]] = median_imputer.transform(test_raw[["Building Dimension"]])

# Fill missing Geo_Code from the per-(Settlement, Residential) modes learned on train.
test_raw = test_raw.merge(mode_geo_train, on=["Settlement", "Residential"], how="left")
test_raw["Geo_Code"] = test_raw["Geo_Code"].fillna(test_raw["Geo_Code_mode"])
test_raw = test_raw.drop(columns=["Geo_Code_mode"])

# Keep only rows whose Geo_Code is all digits, as was done for train.
# NOTE(review): this removes test observations, so the evaluated test set can be
# smaller than the file that was loaded.
mask_numeric_test = test_raw["Geo_Code"].astype(str).str.isnumeric()
test_raw = test_raw[mask_numeric_test].copy()
test_raw["Geo_Code"] = test_raw["Geo_Code"].astype(int)

# Train had Building Dimension clipped to its IQR fences before scaling, so test
# gets the same bounds. This relies on `lower`/`upper` still holding the values
# computed in the Building Dimension outlier step.
test_raw['Building Dimension'] = test_raw['Building Dimension'].clip(lower, upper)

test_raw[cols_to_scale] = scaler.transform(test_raw[cols_to_scale])

test_transformed = test_raw.copy()
test_transformed["Building_Painted"] = test_transformed["Building_Painted"].map({'N': 1, 'V': 0}).astype('int32')
test_transformed["Building_Fenced"] = test_transformed["Building_Fenced"].map({'N': 1, 'V': 0}).astype('int32')
test_transformed["Garden"] = test_transformed["Garden"].map({'V': 1, 'O': 0}).astype('int32')

# Build an indicator for every level seen in test (no drop_first), then align to
# the training layout below. Dropping the first level of test independently
# could remove a different baseline level than the one removed for train.
test_transformed = pd.get_dummies(test_transformed, columns=["Settlement", "Building_Type"], dtype='int32')
test_transformed["Claim"] = le_claim.transform(test_transformed["Claim"])

# Force exactly the training columns, in the training order:
#   - indicators that exist only in train are added as all-zero columns;
#   - columns absent from train (the baseline level, levels unseen in train,
#     and any features dropped from train) are discarded.
test_transformed = test_transformed.reindex(columns=train_transformed.columns, fill_value=0)

test_transformed = test_transformed.reset_index(drop=True)

print(f"\n✓ Test data transformed")
print(f"Test shape: {test_transformed.shape}")
print(f"Columns match train: {list(train_transformed.columns) == list(test_transformed.columns)}")

## TASK 3: SELECT MOST DISCRIMINANT FEATURES IF NECESSARY
### 3.1 Mutual Information Analysis

In [None]:
print("=" * 80, "FEATURE SELECTION ANALYSIS", "=" * 80, sep="\n")
print("\nMethod 1: Mutual Information\n" + "-"*60)

# Feature matrix and target taken from the fully transformed training frame.
y_all = train_transformed['Claim']
X_all = train_transformed.drop(columns=['Claim'])

# random_state is fixed so the scores are the same on every run.
mi_scores = mutual_info_classif(X_all, y_all, random_state=42)
mi_df = pd.DataFrame({'Feature': X_all.columns, 'MI_Score': mi_scores})
mi_df = mi_df.sort_values('MI_Score', ascending=False)

print("\nMutual Information Scores (higher = more informative):")
print(mi_df.to_string(index=False))

# Horizontal bars; the y axis is inverted so the top-scoring feature is at the top.
plt.figure(figsize=(10, max(6, len(mi_df) * 0.4)))
plt.barh(mi_df['Feature'], mi_df['MI_Score'], color='teal')
plt.xlabel('Mutual Information Score', fontweight='bold')
plt.title('Feature Importance: Mutual Information with Target', fontweight='bold', fontsize=14)
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print(f"\nTop 5 most informative features:")
top_five = mi_df.head(5)
for feature_name, score in zip(top_five['Feature'], top_five['MI_Score']):
    print(f"  {feature_name}: {score:.4f}")

### 3.2 Recursive Feature Elimination (RFE)

In [None]:
print("\nMethod 2: Recursive Feature Elimination (RFE)\n" + "-"*60)

# RFE refits the estimator repeatedly, removing one feature per round (step=1),
# until n_features_to_select remain: half of the features, but at least five.
rf_estimator = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10, n_jobs=-1)
n_features_to_select = max(5, len(X_all.columns) // 2)

print(f"Using Random Forest as estimator")
print(f"Selecting top {n_features_to_select} features out of {len(X_all.columns)}...\n")

rfe = RFE(estimator=rf_estimator, n_features_to_select=n_features_to_select, step=1)
rfe.fit(X_all, y_all)

# One row per feature: whether it was kept, and its elimination rank.
rfe_df = pd.DataFrame({
    'Feature': X_all.columns,
    'Selected': rfe.support_,
    'Ranking': rfe.ranking_,
})
rfe_df = rfe_df.sort_values('Ranking')

print("RFE Results:")
print(rfe_df.to_string(index=False))

selected_features_rfe = rfe_df.loc[rfe_df['Selected'], 'Feature'].tolist()
print(f"\n✓ RFE selected {len(selected_features_rfe)} features:")
for feat in selected_features_rfe:
    print(f"  - {feat}")

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ranking_ax, summary_ax = axes

# Left panel: ranking per feature; green = kept, red = eliminated.
colors = rfe_df['Selected'].map({True: 'green', False: 'red'}).tolist()
ranking_ax.barh(rfe_df['Feature'], rfe_df['Ranking'], color=colors, alpha=0.7)
ranking_ax.set_xlabel('Ranking (1 = best)', fontweight='bold')
ranking_ax.set_title('RFE Feature Ranking', fontweight='bold', fontsize=12)
ranking_ax.invert_yaxis()
# The line sits at 1.5, between rank 1 and rank 2.
ranking_ax.axvline(x=1.5, color='blue', linestyle='--', linewidth=2, label='Selection threshold')
ranking_ax.legend()

# Right panel: share of kept vs eliminated features.
selected_count = rfe_df['Selected'].sum()
rejected_count = (~rfe_df['Selected']).sum()
summary_ax.pie(
    [selected_count, rejected_count],
    labels=['Selected', 'Rejected'],
    autopct='%1.1f%%',
    colors=['green', 'red'],
    startangle=90,
    explode=(0.05, 0),
)
summary_ax.set_title(
    f'Feature Selection Summary\n({selected_count} selected, {rejected_count} rejected)',
    fontweight='bold',
    fontsize=12,
)

plt.tight_layout()
plt.show()

print(f"\n{'='*80}")
print("FEATURE SELECTION DECISION")
print("="*80)
print(f"\nOriginal features: {len(X_all.columns)}")
print(f"RFE recommended: {len(selected_features_rfe)}")
print(f"\nNote: We will test models with BOTH:")
print(f"  1. Full feature set ({len(X_all.columns)} features)")
print(f"  2. RFE-selected features ({len(selected_features_rfe)} features)")
print(f"\nFinal decision will be based on validation performance.")

## 4. Data Splitting

In [None]:
print("="*80)
print("DATA SPLITTING STRATEGY")
print("="*80)

X_train_full = train_transformed.drop('Claim', axis=1)
y_train_full = train_transformed['Claim']

# 80/20 split of the training data; stratify keeps the class proportions of
# y_train_full in both parts, and random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full,
    test_size=0.2,
    random_state=42,
    stratify=y_train_full
)

# The separately loaded test file is kept apart as the hold-out set.
X_test = test_transformed.drop('Claim', axis=1)
y_test = test_transformed['Claim']

print(f"\nSplit Configuration:")
print(f"  Train set:      {X_train.shape[0]:5d} samples ({X_train.shape[1]} features)")
print(f"  Validation set: {X_val.shape[0]:5d} samples ({X_val.shape[1]} features)")
print(f"  Test set:       {X_test.shape[0]:5d} samples ({X_test.shape[1]} features)")

print(f"\nTarget distribution:")
print(f"  Train:      {dict(y_train.value_counts())}")
print(f"  Validation: {dict(y_val.value_counts())}")
print(f"  Test:       {dict(y_test.value_counts())}")

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
sets = [('Train', y_train), ('Validation', y_val), ('Test', y_test)]

for ax, (name, y_data) in zip(axes, sets):
    # value_counts() orders by frequency, so pairing it with fixed labels would
    # swap the bars whenever class 1 is the larger class. Reindexing pins the
    # order to encoded class 0 then 1 (assumes 0 = 'No Claim', 1 = 'Claim') and
    # shows a zero-height bar for a class that is absent from a set.
    counts = y_data.value_counts().reindex([0, 1], fill_value=0)
    ax.bar(['No Claim', 'Claim'], counts.values, color=['#66b3ff', '#ff9999'], width=0.6)
    ax.set_title(f'{name} Set Distribution', fontweight='bold')
    ax.set_ylabel('Count')
    for i, v in enumerate(counts.values):
        ax.text(i, v + 10, str(v), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Class Imbalance Analysis: Resampling Strategy Decision

In [None]:
print("=" * 80, "CLASS IMBALANCE ANALYSIS & RESAMPLING STRATEGY", "=" * 80, sep="\n")

# Sizes of the smallest and largest class in the training split.
_class_sizes = y_train.value_counts()
minority_class = _class_sizes.min()
majority_class = _class_sizes.max()
imbalance_ratio = majority_class / minority_class
minority_percentage = minority_class / len(y_train) * 100

print("\nClass Distribution in Training Set:")
print(f"  Majority class: {majority_class:4d} samples ({100 - minority_percentage:.2f}%)")
print(f"  Minority class: {minority_class:4d} samples ({minority_percentage:.2f}%)")
print(f"  Imbalance Ratio: {imbalance_ratio:.2f}:1")

print(f"\n{'-'*80}")
print("SEVERITY ASSESSMENT")
print("-"*80)

# (exclusive upper bound, severity, colour, recommendation, resample?) per tier.
# The first tier whose bound exceeds the ratio applies; the for-else branch
# covers ratios of 9 and above.
_severity_tiers = (
    (1.5, "BALANCED", "green", "No special handling needed", False),
    (3, "MILD IMBALANCE", "yellow", "Use class_weight='balanced' parameter", False),
    (9, "MODERATE IMBALANCE", "orange",
     "Test multiple strategies (class_weight + resampling)", True),
)
for _bound, severity, color_name, recommendation, use_resampling in _severity_tiers:
    if imbalance_ratio < _bound:
        break
else:
    severity = "SEVERE IMBALANCE"
    color_name = "red"
    recommendation = "Resampling strongly recommended"
    use_resampling = True

print(f"Severity Level: {severity}")
print(f"Recommendation: {recommendation}")

print(f"\n{'-'*80}")
print("SAMPLE SIZE ANALYSIS")
print("-"*80)

print(f"\nTotal training samples: {len(X_train)}")
print(f"Features: {X_train.shape[1]}")
print(f"Samples per feature: {len(X_train) / X_train.shape[1]:.1f}")
print(f"Minority class samples: {minority_class}")

# (exclusive upper bound on minority size, assessment, SMOTE feasible?) per tier;
# the for-else branch covers 200 samples and above.
_size_tiers = (
    (50, "CRITICAL: Very few minority samples", False),
    (100, "WARNING: Limited minority samples", False),
    (200, "CAUTION: Moderate minority samples", True),
)
for _bound, size_assessment, smote_feasible in _size_tiers:
    if minority_class < _bound:
        break
else:
    size_assessment = "GOOD: Sufficient samples for resampling"
    smote_feasible = True

print(f"\nSample Size Assessment: {size_assessment}")
print(f"SMOTE Feasibility: {'✓ Feasible' if smote_feasible else '✗ Not recommended'}")

print(f"\n{'='*80}")
print("FINAL RESAMPLING STRATEGY")
print("="*80)

# Too few minority samples overrides the severity-based decision.
if not smote_feasible:
    use_resampling = False

final_strategy = "TEST MULTIPLE APPROACHES" if use_resampling else "class_weight='balanced' ONLY"
print(f"\nStrategy: {final_strategy}")

if not smote_feasible:
    print("\nReason: Insufficient minority samples for reliable synthetic generation")
    print("Risk: SMOTE would likely cause overfitting with so few samples")
elif not use_resampling:
    print("\nReason: Imbalance is mild enough for class weighting alone")
else:
    print("\nWill compare the following approaches:")
    print("  1. Baseline (no handling)")
    print("  2. class_weight='balanced'")
    print("  3. SMOTE (oversampling minority)")
    print("  4. SMOTETomek (hybrid: SMOTE + noise removal)")
    print("\nBest approach will be selected based on validation performance.")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: size of the larger and smaller class, annotated with count and share.
# The axis labels assume 'No Claim' is the larger class.
class_sizes = [majority_class, minority_class]
class_shares = [100 - minority_percentage, minority_percentage]
axes[0].bar(['Majority\n(No Claim)', 'Minority\n(Claim)'], class_sizes,
            color=['#66b3ff', '#ff9999'], width=0.6)
axes[0].set_title('Class Distribution in Training Set', fontweight='bold', fontsize=12)
axes[0].set_ylabel('Number of Samples')
for i, (size, share) in enumerate(zip(class_sizes, class_shares)):
    axes[0].text(i, size + 10, f"{size}\n({share:.1f}%)",
                 ha='center', va='bottom', fontweight='bold')

# Right panel: one bar per severity tier. Bar heights are the tier upper bounds;
# the last value (15) is only a display height for the open-ended "Severe" tier.
categories = ['Balanced\n(<1.5:1)', 'Mild\n(1.5-3:1)', 'Moderate\n(3-9:1)', 'Severe\n(>9:1)']
thresholds = [1.5, 3, 9, 15]
colors_scale = ['green', 'yellow', 'orange', 'red']

# Pick the highlighted tier with the same strict "<" rule as the printed
# severity level (ratio < 1.5, < 3, < 9, otherwise severe). The previous "<="
# comparison put a ratio of exactly 1.5, 3 or 9 one tier too low on the chart.
current_pos = next(
    (i for i, bound in enumerate(thresholds[:-1]) if imbalance_ratio < bound),
    len(categories) - 1,
)

for i, (cat, col) in enumerate(zip(categories, colors_scale)):
    # The active tier is drawn opaque with a thick outline; the others are faded.
    alpha = 0.8 if i == current_pos else 0.3
    edge_width = 3 if i == current_pos else 0
    axes[1].bar(i, thresholds[i], color=col, alpha=alpha, edgecolor='black', linewidth=edge_width)

# Dashed reference line at the measured ratio.
axes[1].axhline(y=imbalance_ratio, color='blue', linestyle='--', linewidth=2,
                label=f'Your Data: {imbalance_ratio:.2f}:1')
axes[1].set_xticks(range(len(categories)))
axes[1].set_xticklabels(categories)
axes[1].set_ylabel('Imbalance Ratio')
axes[1].set_title('Imbalance Severity Scale', fontweight='bold', fontsize=12)
axes[1].legend(loc='upper left')
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()