# In [None]:
# === Imports ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipaddress
import pyarrow.parquet
from pandas.io.parquet import to_parquet
from datetime import timedelta
from collections import Counter
import shap
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score,
    f1_score, average_precision_score, classification_report
)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# Use one figure resolution (120 DPI) for every matplotlib figure created below
plt.rcParams['figure.dpi'] = 120

# === Utility / Base Classes ===

class DataLoader:
    """Load the raw CSV datasets and apply the initial cleaning steps.

    Args:
        fraud_path: Path of the e-commerce fraud CSV (read by ``load_fraud``).
        ip_path: Path of the IP-range-to-country CSV (read by ``load_ip_country``).
        credit_path: Path of the credit-card CSV (read by ``load_credit``).
    """

    def __init__(self, fraud_path, ip_path, credit_path):
        self.fraud_path = fraud_path
        self.ip_path = ip_path
        self.credit_path = credit_path

    def load_fraud(self):
        """Read the fraud CSV, parse its timestamps and drop exact duplicate rows.

        Returns:
            DataFrame with ``signup_time`` and ``purchase_time`` converted to
            datetimes (unparseable values become ``NaT``) and a fresh 0..n-1 index.
        """
        df = pd.read_csv(self.fraud_path)
        # errors='coerce' turns malformed timestamps into NaT instead of raising
        for col in ('signup_time', 'purchase_time'):
            df[col] = pd.to_datetime(df[col], errors='coerce')
        return df.drop_duplicates().reset_index(drop=True)

    def load_ip_country(self):
        """Read the IP-range table and return it sorted by the lower bound.

        Returns:
            DataFrame with columns ``lower_int``, ``upper_int`` and ``country``,
            sorted ascending by ``lower_int`` with a fresh index.

        Raises:
            ValueError: If a bound is missing/non-finite and cannot be cast to int.
        """
        ip_df = pd.read_csv(self.ip_path)
        # Explicit 64-bit width: plain ``int`` can map to a 32-bit dtype on some
        # platforms, which cannot hold IPv4 values above 2**31 - 1.
        ip_df['lower_int'] = ip_df['lower_bound_ip_address'].astype('int64')
        ip_df['upper_int'] = ip_df['upper_bound_ip_address'].astype('int64')
        ranges = ip_df[['lower_int', 'upper_int', 'country']]
        return ranges.sort_values('lower_int').reset_index(drop=True)

    def load_credit(self):
        """Read the credit-card CSV and drop exact duplicate rows.

        No timestamp conversion is done here: the ``Time`` column is left as read.

        Returns:
            De-duplicated DataFrame with a fresh 0..n-1 index.
        """
        df = pd.read_csv(self.credit_path)
        return df.drop_duplicates().reset_index(drop=True)

class FeatureEngineer:
    """Construct derived features for fraud data."""

    @staticmethod
    def ip_to_int(ip_str):
        """Convert an IP address to its integer value.

        Accepts dotted-decimal strings (anything ``ipaddress.ip_address``
        understands) as well as numeric values; floats such as
        ``732758368.79`` are truncated to their integer part first.

        Returns:
            The address as an ``int``, or ``np.nan`` when the value cannot be
            interpreted as an IP address (malformed string, NaN, negative, ...).
        """
        try:
            if isinstance(ip_str, (float, np.floating, np.integer)):
                # int() truncates the fraction and raises on NaN / infinity,
                # which the except clause below maps to np.nan.
                ip_str = int(ip_str)
            return int(ipaddress.ip_address(ip_str))
        except (ValueError, TypeError, OverflowError):
            return np.nan

    def add_ip_country(self, fraud_df, ip_df, keep_unmatched=False):
        """Attach the country whose IP range contains each row's ``ip_address``.

        The caller's ``fraud_df`` is not modified. Rows whose address cannot be
        converted by ``ip_to_int`` are always dropped.

        Args:
            fraud_df: Frame with an ``ip_address`` column.
            ip_df: Frame with ``lower_int``, ``upper_int`` and ``country`` columns.
            keep_unmatched: When False (default) rows whose address falls outside
                every range are removed. When True they are kept and their
                ``ip_country`` is set to NaN.

        Returns:
            New frame sorted by ``ip_int`` containing the original columns plus
            ``ip_int``, ``lower_int``, ``upper_int`` and ``ip_country``.
        """
        ip_int = fraud_df['ip_address'].apply(self.ip_to_int)
        convertible = ip_int.notna()
        work = fraud_df.loc[convertible].copy()
        work['ip_int'] = ip_int[convertible].astype('int64')

        # Range join: merge_asof picks, for every ip_int, the last range whose
        # lower_int <= ip_int. Both keys must be sorted and share one dtype.
        ip_sorted = (
            ip_df.sort_values('lower_int')
            .reset_index(drop=True)
            .astype({'lower_int': 'int64'})
        )
        fraud_sorted = work.sort_values('ip_int').reset_index(drop=True)

        merged = pd.merge_asof(
            fraud_sorted,
            ip_sorted,
            left_on='ip_int',
            right_on='lower_int',
            direction='backward',
            suffixes=('', '_ip')
        )

        # merge_asof only guarantees lower_int <= ip_int; the upper bound still
        # has to be checked (a missing upper_int compares as False here).
        in_range = merged['ip_int'] <= merged['upper_int']
        if keep_unmatched:
            merged = merged.copy()
            merged.loc[~in_range, 'country'] = np.nan
        else:
            merged = merged[in_range].copy()

        merged.rename(columns={'country': 'ip_country'}, inplace=True)
        return merged

    @staticmethod
    def add_time_features(fraud_df):
        """Add ``hour_of_day``, ``day_of_week`` and ``time_since_signup`` columns.

        The columns are written onto ``fraud_df`` itself, which is also returned.
        ``time_since_signup`` is in seconds; negative values (purchase before
        signup) are replaced with NaN.
        """
        purchase = fraud_df['purchase_time']
        fraud_df['hour_of_day'] = purchase.dt.hour
        fraud_df['day_of_week'] = purchase.dt.day_name()
        fraud_df['time_since_signup'] = (purchase - fraud_df['signup_time']).dt.total_seconds()
        fraud_df.loc[fraud_df['time_since_signup'] < 0, 'time_since_signup'] = np.nan
        return fraud_df

    @staticmethod
    def add_user_velocity(fraud_df):
        """Add per-user purchase-velocity features.

        Returns a new frame sorted by ``user_id`` then ``purchase_time`` with:

        * ``user_txn_count_so_far`` - number of earlier rows of the same user;
        * ``last_purchase_time`` - the same user's previous ``purchase_time``;
        * ``time_since_last_purchase`` - seconds since that purchase, or ``-1``
          when there is no previous purchase.
        """
        fraud_df = fraud_df.sort_values(['user_id', 'purchase_time'])
        per_user = fraud_df.groupby('user_id')
        fraud_df['user_txn_count_so_far'] = per_user.cumcount()
        fraud_df['last_purchase_time'] = per_user['purchase_time'].shift(1)
        gap = (fraud_df['purchase_time'] - fraud_df['last_purchase_time']).dt.total_seconds()
        # -1 marks a user's first purchase (no previous timestamp to diff against)
        fraud_df['time_since_last_purchase'] = gap.fillna(-1)
        return fraud_df

# === EDA / Visualization Helpers ===

class EDA:
    """Exploratory Data Analysis for both datasets.

    Every helper is a static method that builds one matplotlib figure and
    displays it with ``plt.show()``; none of them return a value.
    """

    def __init__(self):
        pass

    @staticmethod
    def plot_class_balance(series, title="Class Distribution"):
        """Bar chart of the value counts of ``series`` with the count above each bar."""
        counts = series.value_counts().sort_index()
        labels = counts.index.astype(str)
        plt.figure()
        plt.bar(labels, counts.values)
        plt.title(title)
        plt.xlabel("Class")
        plt.ylabel("Count")
        # Lift each label 1% of the tallest bar; guard the empty case for max()
        offset = counts.values.max() * 0.01 if len(counts) else 0
        for i, v in enumerate(counts.values):
            plt.text(i, v + offset, str(v), ha='center')
        plt.tight_layout()
        plt.show()

    @staticmethod
    def plot_numeric_distribution(df, column, by_class='class', bins=50):
        """Overlay one histogram of ``column`` per distinct value of ``by_class``."""
        plt.figure()
        has_label = False
        for cls in sorted(df[by_class].dropna().unique()):
            subset = df[df[by_class] == cls]
            if subset.empty:
                continue
            plt.hist(subset[column].dropna(), bins=bins, alpha=0.5, label=f"{by_class}={cls}", density=False)
            has_label = True
        plt.title(f"Distribution of {column} by {by_class}")
        plt.xlabel(column)
        plt.ylabel("Count")
        # plt.legend() warns when no labelled artist exists, so only call it if
        # at least one histogram was drawn
        if has_label:
            plt.legend()
        plt.tight_layout()
        plt.show()

    @staticmethod
    def bar_categorical_rate(df, cat_col, target_col, top_n=10):
        """Horizontal bars of the mean of ``target_col`` for the ``top_n`` most frequent categories."""
        top = df[cat_col].value_counts().nlargest(top_n)
        rates = [df.loc[df[cat_col] == cat, target_col].mean() for cat in top.index]
        names = [str(cat) for cat in top.index]
        plt.figure()
        # Reverse so the most frequent category ends up at the top of the chart
        plt.barh(names[::-1], rates[::-1])
        plt.title(f"Fraud Rate by {cat_col} (top {top_n})")
        plt.xlabel("Fraud Rate")
        plt.tight_layout()
        plt.show()

    @staticmethod
    def time_series_fraud_rate(df, time_col='purchase_time', freq='D', target='class'):
        """Line plot of the mean of ``target`` per ``freq`` time bin of ``time_col``."""
        ts = df.set_index(time_col).resample(freq)[target].mean()
        plt.figure()
        plt.plot(ts.index, ts.values, marker='o')
        plt.title(f"Fraud Rate over Time ({freq} bins)")
        plt.xlabel("Time")
        plt.ylabel("Fraud Rate")
        plt.tight_layout()
        plt.show()

    @staticmethod
    def heatmap_hour_day(df, target='class'):
        """Heatmap of the mean of ``target`` by ``day_of_week`` (rows) and ``hour_of_day`` (columns)."""
        pivot = df.pivot_table(index='day_of_week', columns='hour_of_day', values=target, aggfunc='mean')
        order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        # Force a full 7 x 24 grid: hours or days absent from the data become
        # NaN cells, so column/row k always lines up with tick label k below.
        pivot = pivot.reindex(index=order, columns=range(24))
        plt.figure(figsize=(8, 4))
        plt.imshow(pivot, aspect='auto', origin='lower')
        plt.colorbar(label=f"Avg {target}")
        plt.title("Fraud Rate by Day of Week and Hour")
        plt.xlabel("Hour of Day")
        plt.ylabel("Day of Week")
        plt.xticks(ticks=range(0, 24), labels=range(0, 24))
        plt.yticks(ticks=range(len(order)), labels=order)
        plt.tight_layout()
        plt.show()
class Modelling:
    """Feature preparation, training and evaluation helpers.

    All methods are static, so they work both as ``Modelling.method(...)`` and
    on an instance (``Modelling().method(...)``).
    """

    @staticmethod
    def evaluate_model(model, X_test, y_test, name="model"):
        """Print and return precision, recall, F1, PR-AUC and the confusion matrix.

        Args:
            model: Fitted estimator exposing ``predict`` and ``predict_proba``.
            X_test: Held-out features.
            y_test: Held-out binary labels.
            name: Label used in the printed header.

        Returns:
            Dict with keys ``precision``, ``recall``, ``f1``, ``pr_auc`` and
            ``confusion_matrix``.
        """
        preds = model.predict(X_test)
        probs = model.predict_proba(X_test)[:, 1]
        precision = precision_score(y_test, preds, zero_division=0)
        recall = recall_score(y_test, preds, zero_division=0)
        f1 = f1_score(y_test, preds, zero_division=0)
        pr_auc = average_precision_score(y_test, probs)
        cm = confusion_matrix(y_test, preds)
        print(f"--- {name} evaluation ---")
        print("Precision: {:.4f}, Recall: {:.4f}, F1: {:.4f}, PR-AUC: {:.4f}".format(
            precision, recall, f1, pr_auc
        ))
        print("Confusion Matrix:\n", cm)
        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "pr_auc": pr_auc,
            "confusion_matrix": cm
        }

    @staticmethod
    def _dense_one_hot_encoder():
        """Build a dense-output OneHotEncoder across scikit-learn versions."""
        try:
            # Newer scikit-learn spells the option ``sparse_output``
            return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            # Older scikit-learn only knows the ``sparse`` keyword
            return OneHotEncoder(handle_unknown='ignore', sparse=False)

    # === Preprocessing function for Fraud_Data.csv ===
    @staticmethod
    def prepare_fraud_features(df):
        """Select model inputs from the engineered fraud frame.

        Rows missing ``class``, ``purchase_time``, ``signup_time`` or any of the
        numeric features are dropped, because the resampler and the classifiers
        used downstream cannot handle NaN inputs.

        Returns:
            ``(X, y, preprocessor)`` where ``preprocessor`` is an unfitted
            ColumnTransformer that one-hot encodes the categorical columns and
            standard-scales the numeric ones.
        """
        df = df.copy()
        df = df.dropna(subset=['class', 'purchase_time', 'signup_time'])
        # Only use the columns that earlier steps actually produced
        categorical = [c for c in ('source', 'browser', 'sex', 'ip_country') if c in df.columns]
        numeric = [
            n for n in ('purchase_value', 'time_since_signup',
                        'user_txn_count_so_far', 'time_since_last_purchase')
            if n in df.columns
        ]
        if numeric:
            df = df.dropna(subset=numeric)

        X = df[categorical + numeric]
        y = df['class'].astype(int)

        preprocessor = ColumnTransformer(transformers=[
            ('cat', Modelling._dense_one_hot_encoder(), categorical),
            ('num', StandardScaler(), numeric)
        ], remainder='drop')

        return X, y, preprocessor

    # === Preprocessing for creditcard.csv ===
    @staticmethod
    def prepare_credit_features(df):
        """Select model inputs from the credit-card frame.

        Returns:
            ``(X, y, preprocessor)``. ``X`` holds every column except ``Class``;
            the unfitted ColumnTransformer scales ``Time`` and ``Amount`` and
            passes the remaining columns through unchanged, so the transformed
            column order is ``Time``, ``Amount``, then the rest.
        """
        df = df.copy()
        df = df.dropna(subset=['Class'])
        features = [c for c in df.columns if c != 'Class']
        X = df[features]
        y = df['Class'].astype(int)
        preprocessor = ColumnTransformer(transformers=[
            ('scale', StandardScaler(), ['Time', 'Amount'])
        ], remainder='passthrough')
        return X, y, preprocessor

    @staticmethod
    def _build_pipeline(preprocessor, classifier, random_state):
        """Assemble preprocess -> SMOTE -> classifier.

        SMOTE interpolates between numeric vectors, so it has to run after the
        categorical columns are encoded. The imbalanced-learn pipeline applies
        the sampler during ``fit`` only, never when predicting.
        """
        return ImbPipeline(steps=[
            ('prep', preprocessor),
            ('smote', SMOTE(random_state=random_state)),
            ('clf', classifier)
        ])

    # === Split / Resample / Train for a generic dataset ===
    @staticmethod
    def train_and_compare(X, y, preprocessor, random_state=42):
        """Fit a logistic-regression baseline and an XGBoost model and evaluate both.

        Args:
            X: Feature frame.
            y: Binary target.
            preprocessor: Unfitted transformer applied before resampling. The same
                object is used by both pipelines and is fitted as a side effect.
            random_state: Seed for the split, SMOTE and XGBoost.

        Returns:
            Dict with ``'logistic'`` and ``'xgboost'`` mapped to
            ``(fitted_pipeline, metrics)`` plus the ``X_train``, ``X_test``,
            ``y_train`` and ``y_test`` splits.
        """
        # Stratify so the rare positive class keeps its share in both splits
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, stratify=y, random_state=random_state
        )

        pipe_lr = Modelling._build_pipeline(
            preprocessor,
            LogisticRegression(max_iter=1000, class_weight='balanced'),
            random_state,
        )
        pipe_lr.fit(X_train, y_train)
        metrics_lr = Modelling.evaluate_model(pipe_lr, X_test, y_test, name="Logistic Regression")

        pipe_xgb = Modelling._build_pipeline(
            preprocessor,
            XGBClassifier(
                use_label_encoder=False,
                eval_metric='logloss',
                scale_pos_weight=1,  # SMOTE already balances, so keep 1
                n_estimators=100,
                random_state=random_state
            ),
            random_state,
        )
        pipe_xgb.fit(X_train, y_train)
        metrics_xgb = Modelling.evaluate_model(pipe_xgb, X_test, y_test, name="XGBoost")

        return {
            'logistic': (pipe_lr, metrics_lr),
            'xgboost': (pipe_xgb, metrics_xgb),
            'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test
        }
class Explain:
    """Model selection and SHAP-based explanation helpers.

    All methods are static, so they work both as ``Explain.method(...)`` and on
    an instance (``Explain().method(...)``).
    """

    @staticmethod
    def select_best_model(result_dict):
        """Pick the better of the ``'logistic'`` and ``'xgboost'`` entries.

        Models are ranked by F1, ties broken by PR-AUC; on a complete tie the
        logistic model wins because it is listed first.

        Args:
            result_dict: Mapping whose ``'logistic'`` and ``'xgboost'`` values
                are ``(model, metrics)`` pairs with ``'f1'`` and ``'pr_auc'`` keys.

        Returns:
            ``(best_name, best_model)``.
        """
        candidates = [
            (name, result_dict[name][0], result_dict[name][1])
            for name in ('logistic', 'xgboost')
        ]
        # max() returns the first maximal element, matching a stable descending sort
        best_name, best_model, best_metrics = max(
            candidates, key=lambda c: (c[2]['f1'], c[2]['pr_auc'])
        )
        best_f1, best_pr_auc = best_metrics['f1'], best_metrics['pr_auc']
        print(f"Selected best model: {best_name} with F1={best_f1:.4f}, PR-AUC={best_pr_auc:.4f}")
        return best_name, best_model

    @staticmethod
    def _to_dense(data):
        """Return ``data`` as a dense NumPy array (sparse matrices are expanded)."""
        return data.toarray() if hasattr(data, 'toarray') else np.asarray(data)

    @staticmethod
    def _transformed_feature_names(prep, n_features):
        """Ask a fitted transformer for its output column names.

        Returns None when the transformer cannot report names or when the
        number of names does not match ``n_features``.
        """
        try:
            names = list(prep.get_feature_names_out())
        except (AttributeError, ValueError, TypeError):
            return None
        return names if len(names) == n_features else None

    @staticmethod
    def explain_with_shap(best_model, X_train, X_test, y_test, dataset_label):
        """Compute SHAP values for the positive class and save plots/artifacts.

        Side effects: writes ``shap_summary_<label>.png``, one HTML force plot
        per sampled row under ``shap_force_<label>/`` and
        ``shap_explanation_<label>.pkl``; shows the plots inline when an
        IPython ``display`` function is available.

        Args:
            best_model: Fitted estimator, or a pipeline with a ``'clf'`` step and
                an optional ``'prep'`` step.
            X_train: Training features (background data for KernelExplainer).
            X_test: Features to explain.
            y_test: Labels of ``X_test``, used to sample up to three fraud and
                three non-fraud rows for force plots.
            dataset_label: Suffix used in every output file name.

        Returns:
            ``(shap_values_for_positive_class, expected_value)``.
        """
        import builtins

        if hasattr(best_model, 'named_steps'):
            clf = best_model.named_steps['clf']
            prep = best_model.named_steps.get('prep')
        else:
            clf, prep = best_model, None

        # SHAP must see the same (transformed) feature space the classifier was
        # trained on. Arrays are used throughout so rows can be indexed by position.
        if prep is not None:
            X_train_trans = Explain._to_dense(prep.transform(X_train))
            X_test_trans = Explain._to_dense(prep.transform(X_test))
            feature_names = Explain._transformed_feature_names(prep, X_test_trans.shape[1])
        else:
            feature_names = X_train.columns.tolist() if hasattr(X_train, 'columns') else None
            X_train_trans = Explain._to_dense(X_train)
            X_test_trans = Explain._to_dense(X_test)

        if hasattr(clf, 'feature_importances_'):
            explainer = shap.TreeExplainer(clf)
        else:
            # NOTE(review): KernelExplainer is very slow on a large X_test;
            # only the background set is sub-sampled here.
            background = shap.sample(X_train_trans, 100, random_state=0)
            explainer = shap.KernelExplainer(clf.predict_proba, background)

        shap_values = explainer.shap_values(X_test_trans)
        if isinstance(shap_values, list):
            # One array per class: keep the positive class
            shap_vals_pos = shap_values[1]
            expected_value = explainer.expected_value[1]
        elif np.ndim(shap_values) == 3:
            # Single (samples, features, classes) array: keep the positive class
            shap_vals_pos = shap_values[:, :, 1]
            expected_value = np.ravel(explainer.expected_value)[1]
        else:
            shap_vals_pos = shap_values
            expected_value = explainer.expected_value

        # Summary plot (global)
        print(f"\nSHAP summary plot for {dataset_label} ({best_model.__class__.__name__})")
        shap.summary_plot(shap_vals_pos, X_test_trans, feature_names=feature_names, show=False)
        plt.tight_layout()
        plt.savefig(f"shap_summary_{dataset_label}.png")
        plt.show()

        # Force plots for up to 3 random fraud and 3 random non-fraud rows
        os.makedirs(f"shap_force_{dataset_label}", exist_ok=True)
        # ``display`` exists only inside IPython/Jupyter; skip inline output elsewhere
        display_fn = globals().get('display', getattr(builtins, 'display', None))
        if display_fn is not None:
            shap.initjs()

        labels = np.asarray(y_test)
        rng = np.random.default_rng(42)
        selected = []
        for positions in (np.where(labels == 1)[0], np.where(labels == 0)[0]):
            if len(positions) == 0:
                continue
            chosen = rng.choice(positions, size=min(3, len(positions)), replace=False)
            selected.extend(chosen.tolist())

        for idx in selected:
            force_plot = shap.force_plot(
                expected_value,
                shap_vals_pos[idx],
                X_test_trans[idx],
                feature_names=feature_names,
                matplotlib=False
            )
            shap.save_html(f"shap_force_{dataset_label}/force_{idx}.html", force_plot)
            if display_fn is not None:
                display_fn(force_plot)

        # Save SHAP values for downstream use
        joblib.dump({
            'shap_values': shap_vals_pos,
            'expected_value': expected_value,
            'feature_names': feature_names,
            'X_test_transformed': X_test_trans,
            'y_test': y_test
        }, f"shap_explanation_{dataset_label}.pkl")
        print(f"SHAP artifacts saved for {dataset_label}.")

        return shap_vals_pos, expected_value

# === Execution: Load, preprocess, and run EDA ===
# Everything below runs at import/cell-execution time and leaves its results in
# module-level names (fraud_df, ip_df, credit_df, corr, ...) that later
# statements in this file read.

# Paths - replace with actual file paths if different
fraud_csv = "../data/Fraud_Data.csv"              # e-commerce
ip_csv = "../data/IpAddress_to_Country.csv"
credit_csv = "../data/creditcard.csv"              # bank credit transactions

# Instantiate loaders and engineers
loader = DataLoader(fraud_csv, ip_csv, credit_csv)
fe = FeatureEngineer()
eda = EDA()

# Load datasets (each loader call reads its CSV from disk)
fraud_df = loader.load_fraud()
ip_df = loader.load_ip_country()
credit_df = loader.load_credit()

# Basic overview
print("Fraud Data shape:", fraud_df.shape)
print("Credit Card Data shape:", credit_df.shape)
print("IP-country mapping shape:", ip_df.shape)

# === EDA on Fraud_Data.csv ===
print("\n--- Class imbalance in fraud dataset ---")
# The target column is named 'class' in fraud_df.
# This plot uses the frame as loaded, before the feature-engineering steps below.
eda.plot_class_balance(fraud_df['class'], title="Fraud_Data.csv: class distribution")

# Feature engineering: geolocation, time, velocity.
# fraud_df is rebound to the frame each step returns.
fraud_df = fe.add_ip_country(fraud_df, ip_df)
fraud_df = fe.add_time_features(fraud_df)
fraud_df = fe.add_user_velocity(fraud_df)

# Inspect the ten columns with the most missing values after the merges
print("\nMissing values (fraud_df):")
print(fraud_df.isnull().sum().sort_values(ascending=False).head(10))

# Visualizations
# Numeric: purchase_value
eda.plot_numeric_distribution(fraud_df, 'purchase_value', by_class='class', bins=40)

# Categorical fraud rate: source, browser, ip_country, sex
# (columns that are absent from fraud_df are skipped)
for cat in ['source', 'browser', 'ip_country', 'sex']:
    if cat in fraud_df.columns:
        eda.bar_categorical_rate(fraud_df, cat, 'class', top_n=8)

# Time patterns (rely on hour_of_day / day_of_week added by add_time_features)
eda.heatmap_hour_day(fraud_df, target='class')
eda.time_series_fraud_rate(fraud_df, time_col='purchase_time', freq='D', target='class')

# === EDA on creditcard.csv ===
print("\n--- Class imbalance in credit card dataset ---")
# Target is 'Class' (capitalized)
eda.plot_class_balance(credit_df['Class'], title="creditcard.csv: Class distribution")

# Quick distribution of Amount by class
eda.plot_numeric_distribution(credit_df, 'Amount', by_class='Class', bins=40)

# The credit dataset uses anonymized components (V1..V28), so rank the features
# by the absolute value of their correlation with the target instead.
corr = credit_df.corr()
# 'Class' itself is dropped from the ranking (its self-correlation is trivially 1)
corr_with_class = corr['Class'].abs().sort_values(ascending=False).drop('Class')
print("\nTop features correlated with fraud in creditcard.csv:")
print(corr_with_class.head(10))

# === Summary Statistics ===
def print_basic_stats(df, target_col):
    """Print row count, positive/negative counts and the positive rate of a target column.

    Args:
        df: Frame holding the target column.
        target_col: Name of the target column; its sum is reported as the
            positive (fraud) count and its mean as the fraud rate.
    """
    total = len(df)
    target = df[target_col]
    positives = target.sum()
    fraud_rate = target.mean()

    print(f"\n=== Summary for target={target_col} ===")
    print("Overall count:", total)
    print("Positive (fraud) count:", positives)
    print("Negative count:", total - positives)
    print(f"Fraud rate: {fraud_rate:.4f}")

print_basic_stats(fraud_df, 'class')
print_basic_stats(credit_df, 'Class')

# === Save cleaned intermediate EDA snapshots if desired ===
fraud_df.to_parquet("../data/processed_fraud_data.parquet", index=False)
credit_df.to_parquet("../data/processed_credit_data.parquet", index=False)

# The CSV copies get their own .csv paths: writing them to the .parquet paths
# would replace the Parquet files above with CSV text.
fraud_df.to_csv("../data/processed_fraud_data.csv", index=False)
credit_df.to_csv("../data/processed_credit_data.csv", index=False)

# Train and compare the two classifiers on each dataset; the result dicts are
# kept at module level because the explanation step below reads them.
mod = Modelling()
# === Run on Fraud_Data.csv ===
X_fraud, y_fraud, pre_fraud = mod.prepare_fraud_features(fraud_df)
results_fraud = mod.train_and_compare(X_fraud, y_fraud, pre_fraud)

# === Run on creditcard.csv ===
X_credit, y_credit, pre_credit = mod.prepare_credit_features(credit_df)
results_credit = mod.train_and_compare(X_credit, y_credit, pre_credit)

ex = Explain()

# === Apply to fraud dataset ===
# Pick the better model, then explain it on the held-out split that
# train_and_compare returned.
best_name_fraud, best_model_fraud = ex.select_best_model(results_fraud)
shap_vals_fraud, explainer_expected_fraud = ex.explain_with_shap(
    best_model_fraud,
    results_fraud['X_train'],
    results_fraud['X_test'],
    results_fraud['y_test'],
    dataset_label="fraud"
)

# === Apply to credit card dataset ===
best_name_credit, best_model_credit = ex.select_best_model(results_credit)
shap_vals_credit, explainer_expected_credit = ex.explain_with_shap(
    best_model_credit,
    results_credit['X_train'],
    results_credit['X_test'],
    results_credit['y_test'],
    dataset_label="credit"
)
