<a href="https://colab.research.google.com/github/asritha7125/WarrantyML/blob/7/Phase_7_claim_approval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Prompt for a file upload through Colab's browser widget. The training cell
# reads the dataset from CSV_PATH ("warranty_final.csv"), so that file must be
# present in the working directory.
# NOTE(review): the recorded output shows Colab saving a re-upload as
# "warranty_final (2).csv"; presumably an older copy already existed, in which
# case CSV_PATH would still read the older file - confirm before training.
from google.colab import files
uploaded = files.upload()

Saving warranty_final.csv to warranty_final (2).csv


In [None]:
# ============================
# PHASE 9: CLAIM APPROVAL PREDICTION (END-TO-END)
# ============================

# 1) INSTALLS (Colab-safe; no effect if already installed)
!pip -q install scikit-learn pandas numpy joblib

# 2) IMPORTS
import pandas as pd
import numpy as np
import json
import joblib
from datetime import datetime
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_auc_score, average_precision_score, f1_score,
                             confusion_matrix, classification_report, brier_score_loss)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier

# 3) CONFIG
CSV_PATH = "warranty_final.csv"     # <-- make sure this file is uploaded
PIPELINE_PATH = "claim_pipeline.pkl"
FEATURE_SPEC_PATH = "claim_feature_spec.json"

# Allowed regions for this project (STRICT)
ALLOWED_REGIONS = ["central", "north", "south", "east", "west"]

# If your historical data has city names, map them to the 5 allowed regions here.
# Extend the mapping as needed for your dataset.
# Keys must be lower-case (lookups are done on lower-cased values) and unique:
# a repeated key in a dict literal is silently overwritten by the last entry.
CITY_TO_REGION = {
    # North
    "delhi": "north", "gurgaon": "north", "noida": "north", "chandigarh": "north",
    "lucknow": "north", "kanpur": "north", "jaipur": "north", "amritsar": "north",
    # South
    "chennai": "south", "bangalore": "south", "bengaluru": "south", "hyderabad": "south",
    "kochi": "south", "coimbatore": "south", "vizag": "south",
    # West
    "mumbai": "west", "thane": "west", "pune": "west", "ahmedabad": "west",
    "surat": "west", "vadodara": "west", "rajkot": "west",
    # East
    "kolkata": "east", "bhubaneswar": "east", "patna": "east", "guwahati": "east",
    "ranchi": "east", "durgapur": "east", "siliguri": "east",
    # Central (fallback for MP/CG or unknown)
    # "indore" used to be listed under West as well; the Central entry was the
    # one that took effect, so it is the only one kept.
    "bhopal": "central", "indore": "central", "raipur": "central",
    "nagpur": "central", "gwalior": "central",
}

# 4) UTILS

def to_lower(s):
    """Return ``s`` as a trimmed, lower-cased string.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged so callers can still detect them.
    """
    return s if pd.isna(s) else str(s).strip().lower()

def normalize_region(raw):
    """
    Collapse an arbitrary region value into one of ALLOWED_REGIONS.

    Resolution order:
      1. missing value            -> 'central'
      2. whole value (trimmed, lower-cased) is an allowed region -> that region
      3. whole value is a known city in CITY_TO_REGION -> its region
      4. otherwise the value is split on whitespace (underscores count as
         spaces) and the tokens are checked right-to-left with the same two
         lookups, e.g. "<state> - <zone>" resolves through its last token
      5. nothing matched          -> 'central'
    """
    if pd.isna(raw):
        return "central"

    def _resolve(token):
        # An allowed region wins over a city name; None means "no match".
        if token in ALLOWED_REGIONS:
            return token
        return CITY_TO_REGION.get(token)

    cleaned = to_lower(raw)
    # Whole string first, then its tokens from last to first.
    candidates = [cleaned, *reversed(cleaned.replace("_", " ").split())]
    for candidate in candidates:
        region = _resolve(candidate)
        if region is not None:
            return region

    return "central"

def parse_date_safe(s):
    """Parse a single date value, returning ``pd.NaT`` when it cannot be parsed.

    The explicit formats below are tried in order; the first one that matches
    wins. Day-first formats are listed before any inference so that values
    such as ``05/01/2023`` are read as 5 January. If none match, pandas is
    asked to infer the format, with unparseable input coerced to ``NaT``.

    Parameters
    ----------
    s : scalar
        Raw cell value (typically a string); missing values are accepted.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
    """
    if pd.isna(s):
        return pd.NaT

    known_formats = ("%Y-%m-%d", "%d-%m-%Y", "%d/%m/%Y", "%Y/%m/%d", "%d %b %Y", "%d %B %Y")
    for fmt in known_formats:
        try:
            return pd.to_datetime(s, format=fmt)
        except (ValueError, TypeError, OverflowError):
            # Value does not fit this format; try the next one.
            continue

    # Last resort: let pandas infer. The deprecated `infer_datetime_format`
    # argument is intentionally not passed: where pandas rejects it, the old
    # catch-all handler would have turned every fallback parse into NaT.
    try:
        return pd.to_datetime(s, errors="coerce")
    except (ValueError, TypeError, OverflowError):
        return pd.NaT

def ensure_severity(series):
    """Return ``series`` with every value normalized to a canonical severity.

    The result contains only 'Low', 'Medium', 'High' or 'Critical'. Matching
    is case-insensitive and ignores surrounding whitespace; 'med' and
    'moderate' are treated as 'Medium'. Missing, empty and unrecognized
    values all become 'Medium'.
    """
    canonical = {
        "low": "Low",
        "medium": "Medium",
        "med": "Medium",
        "moderate": "Medium",
        "high": "High",
        "critical": "Critical",
    }
    allowed = ["Low", "Medium", "High", "Critical"]

    # Stringify first so missing values show up as the text "nan"/"None".
    text = series.astype(str).str.strip()
    text = text.replace({"nan": "", "None": "", "": "Medium"})

    # Anything outside the lookup table maps to NaN and is then defaulted.
    normalized = text.str.lower().map(canonical).fillna("Medium")

    # Final guard: keep only the canonical labels.
    is_allowed = normalized.isin(allowed)
    return normalized.where(is_allowed, "Medium")

def binarize_target(status):
    """
    Turn a Claim_Status series into a 0/1 integer series.

    A value counts as 1 only when, after trimming whitespace and lower-casing,
    it equals 'approved'; every other value becomes 0.
    """
    normalized = status.astype(str).str.strip().str.lower()
    is_approved = normalized.eq("approved")
    return is_approved.astype(int)

# 5) LOAD DATA
df_raw = pd.read_csv(CSV_PATH)

# 6) STANDARDIZE COLUMN NAMES (case-insensitive)
# Index the CSV header by lower-cased name so later lookups ignore case.
# When two headers differ only by case, the right-most one is kept.
lower_map = {}
for original_name in df_raw.columns:
    lower_map[original_name.lower()] = original_name

def pick(colname, alternatives):
    """Return the CSV's actual column name for a logical field, or None.

    Parameters
    ----------
    colname : str
        Canonical field name. It is tried first (previously it was ignored,
        so it only matched when callers repeated it in ``alternatives``).
    alternatives : iterable of str
        Other accepted spellings, tried in order after ``colname``.

    Matching is case-insensitive and goes through the module-level
    ``lower_map`` (lower-cased header -> original header).
    """
    for candidate in (colname, *alternatives):
        key = str(candidate).lower()
        if key in lower_map:
            return lower_map[key]
    return None

# Resolve each logical field to the column actually present in the CSV
# (None when no accepted spelling is found).
col_Product_Type   = pick("Product_Type",   ["Product_Type","product","producttype","product_type"])
col_Region         = pick("Region",         ["Region","location","zone"])
col_Purchase_Date  = pick("Purchase_Date",  ["Purchase_Date","purchase","buy_date","purchasedate"])
col_Claim_Date     = pick("Claim_Date",     ["Claim_Date","claimdate","date_of_claim"])
col_Warr_Period    = pick("Warranty_Period",["Warranty_Period","warranty_period","warr_days"])
col_Claim_Status   = pick("Claim_Status",   ["Claim_Status","status","claim_status"])
col_Issue_Type     = pick("Issue_Type",     ["Issue_Type","issue","problem","fault_type"])
col_Severity       = pick("Severity_Level", ["Severity_Level","severity","damage_severity"])

# Mandatory fields, keyed by canonical name. Issue_Type and Severity_Level are
# optional and get defaults later.
_required_by_name = {
    "Product_Type": col_Product_Type,
    "Region": col_Region,
    "Purchase_Date": col_Purchase_Date,
    "Claim_Date": col_Claim_Date,
    "Warranty_Period": col_Warr_Period,
    "Claim_Status": col_Claim_Status,
}
required_cols = list(_required_by_name.values())

# Validate against the columns resolved above. Looking up only the canonical
# name again would reject a CSV that uses an accepted alias (e.g. "status").
missing = [name for name, resolved in _required_by_name.items() if resolved is None]
if missing:
    raise ValueError(f"Missing required columns in CSV: {missing}. Please ensure these exist in {CSV_PATH}.")

# 7) CLEAN / FEATURE ENGINEERING

# Build a clean frame with canonical column names from the raw CSV.
df = pd.DataFrame()
df["Product_Type"]   = df_raw[col_Product_Type].astype(str).str.strip()
df["Region"]         = df_raw[col_Region].apply(normalize_region)
df["Purchase_Date"]  = df_raw[col_Purchase_Date].apply(parse_date_safe)
df["Claim_Date"]     = df_raw[col_Claim_Date].apply(parse_date_safe)
df["Warranty_Period"] = pd.to_numeric(df_raw[col_Warr_Period], errors="coerce")  # days
# Optional columns: fall back to a constant when the CSV does not have them.
# A scalar is broadcast to every row, so it does not depend on index alignment.
df["Issue_Type"]     = df_raw[col_Issue_Type].astype(str).str.strip() if col_Issue_Type else "General"
df["Severity_Level"] = ensure_severity(df_raw[col_Severity]) if col_Severity else "Medium"
# NOTE(review): binarize_target maps every non-"approved" value to 0, so rows
# with a blank Claim_Status are trained as rejections - confirm this is intended.
df["Target"]         = binarize_target(df_raw[col_Claim_Status])

# Drop rows with essential NaT/NaN
df = df.dropna(subset=["Purchase_Date","Claim_Date","Warranty_Period"])

# Sanity: If Warranty_Period negative (data error), fix to 0.
# This must run BEFORE Within_Warranty is derived; otherwise that flag is
# computed from the uncorrected value and disagrees with the stored period.
df["Warranty_Period"] = df["Warranty_Period"].clip(lower=0)

# Derived features
# Negative gaps (claim dated before purchase) are floored at 0 days.
df["Days_To_Claim"] = (df["Claim_Date"] - df["Purchase_Date"]).dt.days.clip(lower=0)
df["Within_Warranty"] = (df["Days_To_Claim"] <= df["Warranty_Period"]).astype(int)

# Keep only allowed region labels
df["Region"] = df["Region"].where(df["Region"].isin(ALLOWED_REGIONS), "central")

# 8) DEFINE FEATURES / TARGET

# Nominal categoricals (one-hot encoded downstream)
cat_features = ["Product_Type", "Region", "Issue_Type"]

# Severity is ordinal: the category list fixes the low -> high order
ord_features = ["Severity_Level"]
ord_categories = [["Low", "Medium", "High", "Critical"]]

# Numeric inputs (Within_Warranty is a 0/1 flag)
num_features = ["Warranty_Period", "Days_To_Claim", "Within_Warranty"]

# Column order: categorical, then ordinal, then numeric
X = df.loc[:, [*cat_features, *ord_features, *num_features]].copy()
y = df["Target"].astype(int)

# 9) TRAIN/TEST SPLIT
# 80/20 split, stratified on the target, with a fixed seed for repeatability.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# 10) PREPROCESSOR
# One encoder per feature family; columns not listed here are dropped.
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
severity_encoder = OrdinalEncoder(categories=ord_categories)
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", one_hot, cat_features),
        ("ord", severity_encoder, ord_features),
        ("num", scaler, num_features),
    ],
    remainder="drop",
)

# 11) MODEL
# Random forest wrapped in probability calibration, so the predicted
# percentages are calibrated rather than raw vote fractions.
forest_params = {
    "n_estimators": 400,
    "max_depth": None,
    "min_samples_split": 4,
    "min_samples_leaf": 2,
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1,
}
base_clf = RandomForestClassifier(**forest_params)

# Isotonic calibration with 5-fold CV; 'sigmoid' is the alternative when the
# dataset is small.
clf = CalibratedClassifierCV(estimator=base_clf, method="isotonic", cv=5)

# 12) FULL PIPELINE
pipeline_steps = [("prep", preprocessor), ("clf", clf)]
pipeline = Pipeline(steps=pipeline_steps)

# 13) TRAIN
pipeline.fit(X_train, y_train)

# 14) EVALUATE
# Probability of the positive class (Approved = 1) on the held-out split,
# turned into hard labels at a 0.5 threshold.
probs_test = pipeline.predict_proba(X_test)[:,1]
preds_test = (probs_test >= 0.5).astype(int)

# Pin the label order so the confusion matrix is always 2x2 and the report's
# target_names always line up, even if one class is never predicted.
class_labels = [0, 1]

auc = roc_auc_score(y_test, probs_test)
ap  = average_precision_score(y_test, probs_test)
# zero_division=0 keeps the value reported before (0.0) when a class receives
# no predictions, without emitting an undefined-metric warning.
f1  = f1_score(y_test, preds_test, zero_division=0)
cm  = confusion_matrix(y_test, preds_test, labels=class_labels)
brier = brier_score_loss(y_test, probs_test)

print("==== EVALUATION (TEST) ====")
print(f"ROC AUC:              {auc:.4f}")
print(f"Average Precision:    {ap:.4f}")
print(f"F1 Score (0.5 thr):   {f1:.4f}")
print(f"Brier Score (lower=better calibration): {brier:.4f}")
print("Confusion Matrix [ [TN FP]\n                   [FN TP] ]")
print(cm)
print("\nClassification Report:")
print(classification_report(
    y_test,
    preds_test,
    labels=class_labels,
    target_names=["Rejected(0)","Approved(1)"],
    zero_division=0,
))

# 15) SAVE ARTIFACTS
# Persist the fitted pipeline (preprocessing + calibrated model) as one object.
joblib.dump(pipeline, PIPELINE_PATH)

# Describe the expected input columns next to the model, as JSON.
feature_spec = dict(
    categorical=cat_features,
    ordinal=ord_features,
    ordinal_categories=ord_categories,
    numeric=num_features,
    allowed_regions=ALLOWED_REGIONS,
)
Path(FEATURE_SPEC_PATH).write_text(json.dumps(feature_spec, indent=2))

for message in (
    f"\n✅ Saved pipeline to: {PIPELINE_PATH}",
    f"✅ Saved feature spec to: {FEATURE_SPEC_PATH}",
):
    print(message)

# 16) QUICK INFERENCE EXAMPLE (simulate how the app will call it)
# One claim, using exactly the feature columns the pipeline was trained on.
example_claim = {
    "Product_Type": "Laptop",
    "Region": "north",                 # must be one of: central/north/south/east/west
    "Issue_Type": "Battery Failure",
    "Severity_Level": "High",          # Low/Medium/High/Critical
    "Warranty_Period": 365,
    "Days_To_Claim": 120,
    "Within_Warranty": 1,
}
example = pd.DataFrame([example_claim])

# Round-trip through disk to prove the saved artifact is usable on its own.
loaded = joblib.load(PIPELINE_PATH)
example_probs = loaded.predict_proba(example)
p = example_probs[0,1]   # row 0, positive class (Approved)
print(f"\n🔎 Example predicted approval probability: {p*100:.2f}%")

# Reload the fitted pipeline from disk
pipeline = joblib.load("claim_pipeline.pkl")

# Model inputs, in the order used for training
feature_cols = [
    "Product_Type", "Region", "Issue_Type", "Severity_Level",
    "Warranty_Period", "Days_To_Claim", "Within_Warranty",
]
X = df[feature_cols]

# Score every cleaned row. df includes the rows the model was trained on, so
# these are not held-out predictions.
approval_scores = pipeline.predict_proba(X)[:,1]
df["Approval_Probability"] = approval_scores

# Optional: hard label at a 0.5 threshold
status_labels = {True:"Approved", False:"Rejected"}
df["Predicted_Status"] = df["Approval_Probability"].ge(0.5).map(status_labels)

# Write the scored dataset without the index column
df.to_csv("claims_with_predictions.csv", index=False)
print("✅ Predictions saved to claims_with_predictions.csv")


==== EVALUATION (TEST) ====
ROC AUC:              0.5026
Average Precision:    0.3104
F1 Score (0.5 thr):   0.0000
Brier Score (lower=better calibration): 0.2131
Confusion Matrix [ [TN FP]
                   [FN TP] ]
[[38  0]
 [15  0]]

Classification Report:
              precision    recall  f1-score   support

 Rejected(0)       0.72      1.00      0.84        38
 Approved(1)       0.00      0.00      0.00        15

    accuracy                           0.72        53
   macro avg       0.36      0.50      0.42        53
weighted avg       0.51      0.72      0.60        53



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



✅ Saved pipeline to: claim_pipeline.pkl
✅ Saved feature spec to: claim_feature_spec.json

🔎 Example predicted approval probability: 16.82%
✅ Predictions saved to claims_with_predictions.csv


In [None]:
# After training your pipeline
# Writes the in-memory `pipeline` object to claim_pipeline.pkl, overwriting any
# existing file of that name. joblib.dump returns the list of written
# filenames, which is what the notebook echoes as the cell output.
joblib.dump(pipeline, "claim_pipeline.pkl")


['claim_pipeline.pkl']

In [None]:
import joblib
# Load the fitted pipeline saved by the training cell, so this cell and the
# ones after it can run without retraining.
# NOTE: joblib.load unpickles the file - only load artifacts you produced or trust.
pipeline = joblib.load("claim_pipeline.pkl")


In [None]:
# Two hand-written claims for a smoke test: one inside its warranty window and
# one filed after the warranty has expired.
test_examples = pd.DataFrame([
    {"Product_Type":"Mobile","Region":"south","Issue_Type":"Screen","Severity_Level":"Medium",
     "Warranty_Period":180,"Days_To_Claim":50,"Within_Warranty":1},
    {"Product_Type":"Washing Machine","Region":"west","Issue_Type":"Motor Failure","Severity_Level":"High",
     "Warranty_Period":365,"Days_To_Claim":400,"Within_Warranty":0}
])

# Use `pipeline` (loaded in the preceding cell) rather than `loaded`, which is
# only defined inside the training cell and is missing after a kernel restart.
probs = pipeline.predict_proba(test_examples)[:,1]
for i, p in enumerate(probs):
    print(f"Row {i+1} predicted approval probability: {p*100:.2f}%")


Row 1 predicted approval probability: 17.97%
Row 2 predicted approval probability: 32.67%


In [None]:
# Inspect which columns the working DataFrame currently holds (raw fields,
# derived features and, once scoring has run, the prediction columns).
print(df.columns)


Index(['Product_Type', 'Region', 'Purchase_Date', 'Claim_Date',
       'Warranty_Period', 'Issue_Type', 'Severity_Level', 'Target',
       'Days_To_Claim', 'Within_Warranty', 'Approval_Probability',
       'Predicted_Status'],
      dtype='object')


In [None]:
import pandas as pd
import numpy as np

# Ensure dates are datetime
df["Purchase_Date"] = pd.to_datetime(df["Purchase_Date"])
df["Claim_Date"] = pd.to_datetime(df["Claim_Date"])

# Create derived features
# Negative gaps (claim dated before purchase) are floored at 0 days.
df["Days_To_Claim"] = (df["Claim_Date"] - df["Purchase_Date"]).dt.days.clip(lower=0)
df["Within_Warranty"] = (df["Days_To_Claim"] <= df["Warranty_Period"]).astype(int)

# Ensure Severity_Level exists. When it is missing, use the same constant
# default as the training cell ("Medium"). Random labels would make the
# predictions change from run to run and feed the model made-up information.
severity_categories = ["Low", "Medium", "High", "Critical"]
if "Severity_Level" not in df.columns:
    df["Severity_Level"] = "Medium"


In [None]:
from google.colab import files

# Suppose your file is claims_with_predictions.csv
# Triggers a browser download of the scored CSV written by the prediction cell;
# the file must already exist in the Colab working directory.
files.download("claims_with_predictions.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Model inputs, in the order used for training
feature_cols = [
    "Product_Type", "Region", "Issue_Type", "Severity_Level",
    "Warranty_Period", "Days_To_Claim", "Within_Warranty",
]
X_dashboard = df[feature_cols]

# Probability of the positive class (Approved) for every row
dashboard_scores = pipeline.predict_proba(X_dashboard)[:,1]
df["Approval_Probability"] = dashboard_scores

# Optional: hard label at a 0.5 threshold
status_labels = {True:"Approved", False:"Rejected"}
df["Predicted_Status"] = df["Approval_Probability"].ge(0.5).map(status_labels)

# Write the scored rows for the dashboard, without the index column
df.to_csv("claims_with_predictions.csv", index=False)


In [None]:
# ==============================
# Phase 10: Predict Claims (Colab-ready)
# ==============================

# 1️⃣ Imports
import pandas as pd
import numpy as np
import joblib
import io
from google.colab import files

# The only region labels the model accepts
ALLOWED_REGIONS = ["central", "north", "south", "east", "west"]

# Simplified region normalizer for the prediction phase
def normalize_region(raw):
    """Return ``raw`` lower-cased and trimmed if it is an allowed region.

    Missing values and anything outside ALLOWED_REGIONS (city names included,
    since no city lookup is done here) fall back to 'central'.
    """
    if pd.isna(raw):
        return "central"
    candidate = str(raw).strip().lower()
    return candidate if candidate in ALLOWED_REGIONS else "central"

# --------------------------
# 2) Upload pipeline (optional)
# --------------------------
# Best effort: use an uploaded pipeline if one is provided, otherwise fall back
# to the `pipeline` object already in memory from Phase 9.
try:
    print("Upload the trained pipeline file (claim_pipeline.pkl) or skip if running Phase 9 above:")
    uploaded_pipeline = files.upload()
    if uploaded_pipeline:
        pipeline_filename = list(uploaded_pipeline.keys())[0]
        # SECURITY NOTE: joblib.load unpickles the uploaded bytes, which can
        # execute arbitrary code. Only upload pipeline files you trust.
        pipeline = joblib.load(io.BytesIO(uploaded_pipeline[pipeline_filename]))
        print(f"✅ Loaded pipeline: {pipeline_filename}")
    else:
        # A skipped upload yields an empty result without raising.
        print("⚠️ Skipping pipeline upload. Using in-memory pipeline from Phase 9.")
except Exception as e:
    print("⚠️ Skipping pipeline upload. Using in-memory pipeline from Phase 9.")
    print(f"   Reason: {e}")

# Check on every path, not only after an exception. A skipped upload used to
# bypass this check and fail later with a NameError at prediction time.
if 'pipeline' not in globals():
    raise ValueError("No pipeline found in memory. Please run Phase 9 or upload claim_pipeline.pkl.")

# --------------------------
# 3) Upload dataset CSV
# --------------------------
print("\nUpload the dataset CSV (warranty_final.csv or new claims file):")
uploaded_csv = files.upload()

# Only the first uploaded file is read; its bytes are parsed in memory.
csv_filename, csv_bytes = list(uploaded_csv.items())[0]
df = pd.read_csv(io.BytesIO(csv_bytes))
print(f"✅ Loaded dataset: {csv_filename}")

# --------------------------
# 4) Ensure required columns and create derived features
# --------------------------
# Dates
df["Purchase_Date"] = pd.to_datetime(df["Purchase_Date"])
df["Claim_Date"] = pd.to_datetime(df["Claim_Date"])

# Days to claim and within warranty
# Negative gaps (claim dated before purchase) are floored at 0 days.
df["Days_To_Claim"] = (df["Claim_Date"] - df["Purchase_Date"]).dt.days.clip(lower=0)
df["Within_Warranty"] = (df["Days_To_Claim"] <= df["Warranty_Period"]).astype(int)

# Severity_Level: constant default if missing.
# "Medium" matches the default used at training time. Random labels would make
# the predictions non-reproducible and feed the model made-up information.
severity_categories = ["Low", "Medium", "High", "Critical"]
if "Severity_Level" not in df.columns:
    df["Severity_Level"] = "Medium"
else:
    # normalize existing values; astype(str) keeps the .str accessor valid even
    # when the column was parsed as non-text (e.g. entirely empty or numeric)
    df["Severity_Level"] = df["Severity_Level"].astype(str).str.strip().str.capitalize()
    # Anything that is not one of the four known labels (including missing
    # values, which become "Nan" above) falls back to Medium.
    df.loc[~df["Severity_Level"].isin(severity_categories), "Severity_Level"] = "Medium"

# Issue_Type: default if missing
if "Issue_Type" not in df.columns:
    df["Issue_Type"] = "General"
else:
    df["Issue_Type"] = df["Issue_Type"].astype(str).str.strip()

# Product_Type: default if missing
if "Product_Type" not in df.columns:
    df["Product_Type"] = "Unknown"
else:
    df["Product_Type"] = df["Product_Type"].astype(str).str.strip()

# Region: normalize + fallback
if "Region" not in df.columns:
    df["Region"] = "central"
df["Region"] = df["Region"].apply(normalize_region)

# --------------------------
# 5) Select features for pipeline
# --------------------------
# Same columns, in the same order, as the training phase.
feature_cols = [
    "Product_Type", "Region", "Issue_Type", "Severity_Level",
    "Warranty_Period", "Days_To_Claim", "Within_Warranty",
]
X_dashboard = df[feature_cols]

# --------------------------
# 6) Predict approval probabilities
# --------------------------
# Column 1 of predict_proba is the positive class (Approved).
approval_scores = pipeline.predict_proba(X_dashboard)[:,1]
df["Approval_Probability"] = approval_scores

# Hard label at a 0.5 threshold
status_labels = {True:"Approved", False:"Rejected"}
df["Predicted_Status"] = df["Approval_Probability"].ge(0.5).map(status_labels)

# --------------------------
# 7) Save predictions
# --------------------------
output_filename = "claims_with_predictions.csv"
df.to_csv(output_filename, index=False)
print(f"\n✅ Predictions saved to: {output_filename}")

# --------------------------
# 8) Download predictions
# --------------------------
# Hands the file just written to the browser for download.
files.download(output_filename)


Upload the trained pipeline file (claim_pipeline.pkl) or skip if running Phase 9 above:



Upload the dataset CSV (warranty_final.csv or new claims file):


Saving warranty_final.csv to warranty_final (4).csv
✅ Loaded dataset: warranty_final (4).csv

✅ Predictions saved to: claims_with_predictions.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>