# Insurance Claim Prediction - Clean Pipeline
Complete ML pipeline with proper train/test split handling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score, roc_curve, auc
from sklearn.pipeline import Pipeline
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings emitted by the libraries used
# in later cells will not be shown either — confirm that is intended.
warnings.filterwarnings('ignore')

# NOTE(review): math is not referenced in the cells visible here — confirm it is needed.
import math

print("Libraries loaded successfully")

## 1. Load Raw Data

In [None]:
# Read the raw train and test splits from CSV files in the working directory.
train_raw, test_raw = (
    pd.read_csv(csv_name)
    for csv_name in ("train_Insurance.csv", "test_Insurance.csv")
)

# Quick overview: dimensions of both splits, then the train schema and its
# per-column count of missing values.
print(f"Train shape: {train_raw.shape}")
print(f"Test shape: {test_raw.shape}")

train_column_names = list(train_raw.columns)
train_missing_counts = train_raw.isna().sum()
print(f"\nTrain columns: {train_column_names}")
print(f"\nTrain missing values:\n{train_missing_counts}")

## 2. Data Cleaning Pipeline (Train Only for Fitting)

### Step 1: Remove Customer ID from both datasets

In [None]:
# Remove the 'Customer Id' column from both splits, modifying each frame in place.
for frame in (train_raw, test_raw):
    frame.drop(columns=['Customer Id'], inplace=True)

remaining_columns = list(train_raw.columns)
print(f"Columns after removing Customer ID: {remaining_columns}")

### Step 2: Clean NumberOfWindows (Train and Test)

In [None]:
# Text labels found in NumberOfWindows and the integer each one is replaced with.
window_label_map = {'without': 0, '>=10': 10}

# Apply the same recoding to both splits, then cast the column to int.
for frame in (train_raw, test_raw):
    recoded = frame['NumberOfWindows'].replace(window_label_map)
    frame['NumberOfWindows'] = recoded.astype(int)

print("NumberOfWindows cleaned in both datasets")

### Step 3: Remove Duplicates and Conflicts (Train Only)

In [None]:
# Remove contradictory and redundant training rows.
#   1. "Conflicting" rows share every feature value but disagree on Claim; all rows
#      of such a group are removed because the label cannot be trusted.
#   2. Remaining exact duplicates (features and Claim identical) are collapsed.

# Feature columns are everything except the target; computed once and reused below.
features = [c for c in train_raw.columns if c != "Claim"]

n_exact = train_raw.duplicated().sum()
n_same_features = train_raw.duplicated(subset=features).sum()

print(f"Exact duplicates: {n_exact}")
print(f"Duplicates with same features: {n_same_features}")

if n_same_features > 0:
    # keep=False marks every member of a duplicate group, not just the repeats.
    dups = train_raw[train_raw.duplicated(subset=features, keep=False)]

    # dropna=False is required here: missing values have not been imputed yet at
    # this step, and groupby's default (dropna=True) discards every group whose key
    # contains NaN. With the default, conflicting duplicates that have a missing
    # feature value would never be detected and would stay in the training data.
    conflicts = dups.groupby(features, dropna=False)['Claim'].nunique()
    n_conflicts = (conflicts > 1).sum()
    print(f"Conflicting records: {n_conflicts}")

    if n_conflicts > 0:
        conflicting_groups = conflicts[conflicts > 1].reset_index()
        before = len(train_raw)
        # Anti-join: rows whose feature values match a conflicting group are tagged
        # 'both' by the indicator; only 'left_only' rows are kept.
        train_raw = train_raw.merge(conflicting_groups[features], on=features, how='left', indicator=True)
        train_raw = train_raw[train_raw['_merge'] == 'left_only'].drop(columns=['_merge'])
        after = len(train_raw)
        print(f"Removed {before - after} conflicting records")

train_raw.drop_duplicates(inplace=True)
print(f"Train shape after cleaning: {train_raw.shape}")

### Step 4: Handle Missing Values in Train (fit imputers here)

In [None]:
print("Missing values in train:")
print(train_raw.isna().sum())

# Both imputers are fitted on the training split; the fitted objects are kept in
# mf_imputer / median_imputer so the test split can be transformed with them later.

# Garden: fill gaps with the most frequent value.
mf_imputer = SimpleImputer(strategy="most_frequent").fit(train_raw[["Garden"]])
train_raw[["Garden"]] = mf_imputer.transform(train_raw[["Garden"]])

# Building Dimension: fill gaps with the median.
median_imputer = SimpleImputer(strategy="median").fit(train_raw[["Building Dimension"]])
train_raw[["Building Dimension"]] = median_imputer.transform(train_raw[["Building Dimension"]])

print("\nImputers fitted on train data")
remaining_missing = train_raw.isna().sum()
print(f"Missing values after imputation:\n{remaining_missing}")

### Step 5: Fill Missing Geo_Code using Train Mode

In [None]:
def _most_common_geo(codes):
    """Return the first mode of *codes*, falling back to its first element."""
    modes = codes.mode()
    return modes.iloc[0] if len(modes) > 0 else codes.iloc[0]


# Lookup table: one Geo_Code per (Settlement, Residential) pair, computed only from
# training rows where Geo_Code is present.
known_geo = train_raw.dropna(subset=["Geo_Code"])
geo_by_group = known_geo.groupby(["Settlement", "Residential"])["Geo_Code"].agg(_most_common_geo)
mode_geo_train = geo_by_group.reset_index().rename(columns={"Geo_Code": "Geo_Code_mode"})

print("Train Geo_Code modes by Settlement+Residential:")
print(mode_geo_train)

# Attach the group value to every row, use it where Geo_Code is missing, and
# remove the helper column again (pop drops it from the frame).
train_raw = train_raw.merge(mode_geo_train, on=["Settlement", "Residential"], how="left")
train_raw["Geo_Code"] = train_raw["Geo_Code"].fillna(train_raw.pop("Geo_Code_mode"))

print(f"\nTrain missing values after Geo_Code fill: {train_raw['Geo_Code'].isna().sum()}")

### Step 6: Clean Geo_Code (remove alphanumeric, convert to int)

In [None]:
# Flag rows whose Geo_Code, rendered as text, consists only of numeric characters.
geo_as_text = train_raw["Geo_Code"].astype(str)
mask_numeric = geo_as_text.str.isnumeric()

n_numeric = mask_numeric.sum()
n_alphanumeric = (~mask_numeric).sum()
print(f"Train numeric Geo_Code: {n_numeric}")
print(f"Train alphanumeric Geo_Code: {n_alphanumeric}")

# Rows that fail the check are removed entirely (not just blanked); the surviving
# codes are then cast to int.
train_raw = train_raw.loc[mask_numeric].copy()
train_raw["Geo_Code"] = train_raw["Geo_Code"].astype(int)

print(f"Train shape after Geo_Code cleaning: {train_raw.shape}")

### Step 7: Handle Outliers in Building Dimension (Train)

In [None]:
# Tukey-style fences for Building Dimension: 1.5 * IQR beyond the quartiles,
# computed from the training split.
dimension = train_raw['Building Dimension']
Q1 = dimension.quantile(0.25)
Q3 = dimension.quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print(f"Building Dimension - Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")
print(f"Bounds: [{lower}, {upper}]")

# Count the values outside the fences before they are capped.
outliers_before = dimension.lt(lower) | dimension.gt(upper)
print(f"Outliers found: {outliers_before.sum()}")

# Cap (rather than remove) out-of-range values so no rows are lost.
train_raw['Building Dimension'] = dimension.clip(lower, upper)
print("Outliers clipped")

### Step 8: Fit RobustScaler on Train Data

In [None]:
# Numeric columns that are rescaled; the same list is reused for the test split.
cols_to_scale = ['Building Dimension', 'NumberOfWindows']

# Fit the scaler on the training split, then overwrite the train columns with the
# scaled values. The fitted scaler is kept for transforming the test split.
scaler = RobustScaler().fit(train_raw[cols_to_scale])
train_raw[cols_to_scale] = scaler.transform(train_raw[cols_to_scale])

print("RobustScaler fitted and applied to train data")
print(f"Scaled columns: {cols_to_scale}")

### Step 9: Encode Categorical Variables (Train)

In [None]:
# Work on a copy so train_raw keeps its original categorical columns.
train_transformed = train_raw.copy()

# Two-valued columns are mapped straight to 0/1 integers.
binary_codes = {
    "Building_Painted": {'N': 1, 'V': 0},
    "Building_Fenced": {'N': 1, 'V': 0},
    "Garden": {'V': 1, 'O': 0},
}
for column, codes in binary_codes.items():
    train_transformed[column] = train_transformed[column].map(codes).astype('int32')

# Multi-valued columns are one-hot encoded; drop_first removes one level per column.
train_transformed = pd.get_dummies(
    train_transformed,
    columns=["Settlement", "Building_Type"],
    drop_first=True,
    dtype='int32',
)

# Encode the target; the fitted encoder is reused for the test target later.
le_claim = LabelEncoder()
train_transformed["Claim"] = le_claim.fit_transform(train_transformed["Claim"])

# Reorder so that the target is the last column.
cols = train_transformed.columns.drop("Claim").tolist() + ["Claim"]
train_transformed = train_transformed[cols]

print("Train data encoded")
print(f"Train shape: {train_transformed.shape}")
print(f"Train columns: {list(train_transformed.columns)}")

### Step 10: Drop Low Correlation Features

In [None]:
# Correlation of every numeric column with the target, strongest positive first.
df_corr = train_transformed.corr(numeric_only=True)
corr_with_claim = df_corr.loc[:, ["Claim"]].sort_values("Claim", ascending=False)

print("Correlation with Claim:")
print(corr_with_claim)

# Hand-picked list of columns to discard; it is applied to the test split as well.
# NOTE(review): the list is hard-coded rather than derived from corr_with_claim —
# re-check it if the data changes.
cols_to_drop = [
    'Building_Painted',
    'Geo_Code',
    'YearOfObservation',
    'Building_Type_Non-combustible',
]

train_transformed = train_transformed.drop(columns=cols_to_drop).reset_index(drop=True)

print(f"\nFinal train shape: {train_transformed.shape}")
print(f"Final train columns: {list(train_transformed.columns)}")

## 3. Apply Same Transformations to Test Data

### Step 1: Fill Missing Values Using Train-Fitted Imputers

In [None]:
print("Test missing values before imputation:")
print(test_raw.isna().sum())

# Only transform() is called here: the fill values come from the imputers that
# were fitted on the training split.
for column, fitted_imputer in (("Garden", mf_imputer), ("Building Dimension", median_imputer)):
    test_raw[[column]] = fitted_imputer.transform(test_raw[[column]])

print("\nTest data imputed using train-fitted imputers")

### Step 2: Fill Missing Geo_Code Using Train Mode

In [None]:
# Look up the train-derived Geo_Code for each test row's (Settlement, Residential)
# pair, use it where the test Geo_Code is missing, and remove the helper column
# again (pop drops it from the frame).
test_raw = test_raw.merge(mode_geo_train, on=["Settlement", "Residential"], how="left")
test_raw["Geo_Code"] = test_raw["Geo_Code"].fillna(test_raw.pop("Geo_Code_mode"))

n_still_missing = test_raw['Geo_Code'].isna().sum()
print(f"Test missing Geo_Code after fill: {n_still_missing}")

### Step 3: Clean Geo_Code (same as train)

In [None]:
# Flag test rows whose Geo_Code, rendered as text, consists only of numeric characters.
test_geo_as_text = test_raw["Geo_Code"].astype(str)
mask_numeric_test = test_geo_as_text.str.isnumeric()

n_numeric_test = mask_numeric_test.sum()
n_alphanumeric_test = (~mask_numeric_test).sum()
print(f"Test numeric Geo_Code: {n_numeric_test}")
print(f"Test alphanumeric Geo_Code: {n_alphanumeric_test}")

# Rows that fail the check are removed from the test split; the rest are cast to int.
# NOTE(review): this shrinks the evaluation set based on Geo_Code, which is listed
# in cols_to_drop and so is not a model input — confirm dropping rows is intended.
test_raw = test_raw.loc[mask_numeric_test].copy()
test_raw["Geo_Code"] = test_raw["Geo_Code"].astype(int)

print(f"Test shape after Geo_Code cleaning: {test_raw.shape}")

### Step 4: Apply Train Scaler (TRANSFORM only, not fit)

In [None]:
# Mirror the train-side outlier handling before scaling. Train 'Building Dimension'
# was clipped to the IQR bounds (lower, upper) before the scaler was fitted, but the
# test split was never clipped, so the two splits were preprocessed differently and
# test could hold scaled values outside the range the scaler was fitted on. The
# bounds are the ones computed on train; nothing is estimated from test.
test_outliers = (test_raw['Building Dimension'] < lower) | (test_raw['Building Dimension'] > upper)
test_raw['Building Dimension'] = test_raw['Building Dimension'].clip(lower, upper)
print(f"Test outliers clipped to train bounds: {test_outliers.sum()}")

# transform() only: centering and scaling statistics come from the train-fitted scaler.
test_raw[cols_to_scale] = scaler.transform(test_raw[cols_to_scale])

print("Test data scaled using train-fitted scaler")

### Step 5: Encode Categorical Variables (Test) - Same as Train

In [None]:
# Work on a copy so test_raw keeps its original categorical columns.
test_transformed = test_raw.copy()

# Two-valued columns use the same value-to-integer codes as the train encoding.
test_transformed["Building_Painted"] = test_transformed["Building_Painted"].map({'N': 1, 'V': 0}).astype('int32')
test_transformed["Building_Fenced"] = test_transformed["Building_Fenced"].map({'N': 1, 'V': 0}).astype('int32')
test_transformed["Garden"] = test_transformed["Garden"].map({'V': 1, 'O': 0}).astype('int32')

# Pin each one-hot column to the levels observed in train before calling get_dummies.
# Encoding the raw test values independently derives the levels — and the level
# removed by drop_first — from whatever happens to occur in test, so a level that is
# absent from (or new in) test would produce a different set of dummy columns than
# the model was trained on. With a fixed category list get_dummies emits one column
# per train level, in the same order, and drops the same baseline level as train.
one_hot_cols = ["Settlement", "Building_Type"]
for col in one_hot_cols:
    # train_raw still holds the un-encoded columns; sorting matches the level order
    # get_dummies infers for the train split.
    train_levels = sorted(train_raw[col].dropna().unique())
    n_unseen = (test_transformed[col].notna() & ~test_transformed[col].isin(train_levels)).sum()
    if n_unseen:
        # Values outside the category list become NaN and therefore get 0 in every
        # dummy column, i.e. they are encoded like the dropped baseline level.
        print(f"Warning: {n_unseen} test rows have a {col} value not seen in train; "
              "all of their dummy columns are 0")
    test_transformed[col] = pd.Categorical(test_transformed[col], categories=train_levels)

test_transformed = pd.get_dummies(test_transformed, columns=one_hot_cols, drop_first=True, dtype='int32')

# Encode the target with the encoder fitted on the train target.
test_transformed["Claim"] = le_claim.transform(test_transformed["Claim"])

# Reorder so that the target is the last column, as in train.
cols = [c for c in test_transformed.columns if c != "Claim"] + ["Claim"]
test_transformed = test_transformed[cols]

print("Test data encoded")
print(f"Test shape before dropping columns: {test_transformed.shape}")
print(f"Test columns: {list(test_transformed.columns)}")

### Step 6: Drop Same Columns as Train

In [None]:
# Discard the same columns that were removed from train and renumber the rows.
test_transformed = test_transformed.drop(columns=cols_to_drop).reset_index(drop=True)

final_test_columns = list(test_transformed.columns)
print(f"Final test shape: {test_transformed.shape}")
print(f"Final test columns: {final_test_columns}")

# Both splits must expose identical columns in identical order before modelling.
final_train_columns = list(train_transformed.columns)
assert final_train_columns == final_test_columns, "Column mismatch between train and test!"
print("\nColumn structure verified: MATCH")

## 4. Prepare Features and Target

In [None]:
# Split each prepared frame into a feature matrix (all columns but Claim) and the
# Claim target vector.
X_train, y_train = train_transformed.drop(columns='Claim'), train_transformed['Claim']
X_test, y_test = test_transformed.drop(columns='Claim'), test_transformed['Claim']

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

# Class counts per split, to show how balanced the target is.
train_class_counts = y_train.value_counts()
test_class_counts = y_test.value_counts()
print(f"\nTrain target distribution:")
print(train_class_counts)
print(f"\nTest target distribution:")
print(test_class_counts)

## 5. Train Logistic Regression with Hyperparameter Tuning

In [None]:
# Base estimator: class-weighted logistic regression with a fixed random seed.
log_reg = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=500,
    random_state=42,
)

# Single-step pipeline; the step name 'model' is the prefix used in the grid keys.
pipeline = Pipeline([('model', log_reg)])

# Search space: penalty type crossed with regularisation strength C.
param_grid = {
    'model__penalty': ['l1', 'l2'],
    'model__C': [0.01, 0.1, 1, 10, 100],
}

# Exhaustive search over the grid with 5 stratified folds, scored by F1, using all cores.
cv_folds = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=cv_folds,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_train, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best F1 score (CV): {grid_search.best_score_:.4f}")

## 6. Evaluate on Test Data

In [None]:
# Take the best estimator found by the grid search and score the held-out split.
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
# Column 1 of predict_proba is used as the score for the ROC-based metrics.
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

banner = "=" * 60
print("\n" + banner)
print("TEST SET EVALUATION")
print(banner)

report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(matrix)

# Headline metrics: F1 on the hard predictions, ROC-AUC on the probability scores.
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"\nF1 Score: {f1:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")

## 7. ROC Curve Visualization

In [None]:
# ROC points from the test scores, and the area under that curve.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc_calc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))

# Model curve plus the diagonal that a random classifier would produce.
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc_calc:.4f})')
ax.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--', label='Random Classifier')

# A little headroom above 1.0 on the y-axis keeps the top of the curve visible.
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Logistic Regression')
ax.legend(loc="lower right")
ax.grid(alpha=0.3)
plt.show()

## 8. Predictions Summary

In [None]:
# One row per test example: true label, predicted label, probability score, and
# whether the prediction matched the label.
actual_labels = y_test.values
is_correct = actual_labels == y_pred
predictions_df = pd.DataFrame({
    'Actual': actual_labels,
    'Predicted': y_pred,
    'Probability_Claim': y_pred_proba,
    'Correct': is_correct,
})

correct_flags = predictions_df['Correct']
print("\nPrediction Summary:")
print(f"Total predictions: {len(predictions_df)}")
print(f"Correct predictions: {correct_flags.sum()}")
print(f"Incorrect predictions: {(~correct_flags).sum()}")
# The mean of the boolean column is the fraction of correct predictions.
print(f"Accuracy: {correct_flags.mean():.4f}")

print("\nFirst 10 predictions:")
print(predictions_df.head(10))