In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, roc_auc_score, precision_recall_curve

# Feature engineering utilities

In [None]:
# -----------------------------
# Transactional Features
# -----------------------------

def create_base_features(df):
    """Derive customer-level, recency, calendar and demographic columns.

    Works on a copy of ``df``. ``transaction_date`` is parsed with
    ``pd.to_datetime``; per-customer aggregates (distinct transaction count,
    first/last date, mean/std/max amount, active days, transactions per day)
    are computed over every row passed in and joined back on ``customer_id``.

    NOTE(review): the aggregates use each customer's complete history in
    ``df``, including rows dated after the transaction on a given row --
    confirm that is acceptable for how the model is evaluated.

    Optional columns: ``age`` (adds ``age_bucket``) and ``income`` (adds
    ``log_income``).

    Returns the enriched frame, in the same row order as the input.
    """
    out = df.copy()
    out['transaction_date'] = pd.to_datetime(out['transaction_date'])

    # One summary row per customer.
    per_customer = (
        out.groupby('customer_id')
        .agg(
            total_txns=('transaction_id', 'nunique'),
            first_txn=('transaction_date', 'min'),
            last_txn=('transaction_date', 'max'),
            avg_txn_amount=('amount', 'mean'),
            std_txn_amount=('amount', 'std'),
            max_txn_amount=('amount', 'max'),
        )
        .reset_index()
    )
    history_span = per_customer['last_txn'] - per_customer['first_txn']
    # +1 so a customer whose transactions all fall on one day counts as 1 active day.
    per_customer['cust_active_days'] = history_span.dt.days + 1
    active_days = per_customer['cust_active_days'].replace(0, 1)
    per_customer['txns_per_day'] = per_customer['total_txns'] / active_days

    out = out.merge(per_customer, on='customer_id', how='left')

    # Age of each transaction relative to the newest one in the frame, and
    # its size relative to the customer's mean (epsilon avoids dividing by 0).
    txn_time = out['transaction_date']
    out['days_since_last_txn'] = (txn_time.max() - txn_time).dt.days
    out['amount_over_avg'] = out['amount'] / (out['avg_txn_amount'] + 1e-9)

    # Calendar features.
    out['hour'] = txn_time.dt.hour
    out['dayofweek'] = txn_time.dt.dayofweek
    out['is_weekend'] = out['dayofweek'].isin([5, 6]).astype(int)

    # Demographics, only when the columns are present.
    if 'age' in out.columns:
        out['age_bucket'] = pd.cut(out['age'], bins=[0, 25, 35, 50, 65, 200], labels=False)
    if 'income' in out.columns:
        out['log_income'] = np.log1p(out['income'])

    return out

In [None]:
# -----------------------------
# Rolling (Txn) Features
# -----------------------------

def create_rolling_features(df, windows=(7, 30, 90)):
    """Add per-customer trailing-window transaction features.

    For every window length ``w`` (in days) two columns are added:

    * ``txn_count_{w}d``    -- number of the customer's transactions (rows with
      a non-null ``transaction_id``) in the ``w`` days up to and including the
      current transaction.
    * ``txn_amt_mean_{w}d`` -- mean ``amount`` over the same window.

    The windows are time-based. Passing a bare integer to ``Series.rolling``
    would mean "last ``w`` rows", which does not match the ``*_{w}d`` names.

    Parameters
    ----------
    df : DataFrame with ``customer_id``, ``transaction_id``,
        ``transaction_date`` and ``amount`` columns.
    windows : iterable of window lengths in days.

    Returns
    -------
    A new DataFrame sorted by ``customer_id`` then ``transaction_date`` (the
    original index labels are kept). Rows whose ``customer_id`` is null or
    whose date cannot be parsed get NaN in the new columns.
    """
    df = df.sort_values(['customer_id', 'transaction_date']).copy()
    windows = list(windows)
    n_rows = len(df)

    txn_time = pd.DatetimeIndex(pd.to_datetime(df['transaction_date']))
    has_time = txn_time.notna()
    # 1.0 where a transaction id is present, so a rolling sum is a rolling count.
    has_txn_id = df['transaction_id'].notna().to_numpy(dtype=float)
    amounts = df['amount'].to_numpy(dtype=float)

    counts = {w: np.full(n_rows, np.nan) for w in windows}
    means = {w: np.full(n_rows, np.nan) for w in windows}

    # `.indices` maps each customer to the positional row numbers of its
    # transactions, so results are written back by position and duplicate
    # timestamps or index labels cannot cause misalignment.
    for rows in df.groupby('customer_id').indices.values():
        rows = rows[has_time[rows]]  # time-based rolling cannot handle NaT
        if rows.size == 0:
            continue
        stamps = txn_time[rows]
        # Time-based rolling needs a monotonic index; enforce it explicitly in
        # case the raw column did not sort chronologically (e.g. date strings).
        order = stamps.argsort()
        rows, stamps = rows[order], stamps[order]

        id_flags = pd.Series(has_txn_id[rows], index=stamps)
        amt = pd.Series(amounts[rows], index=stamps)
        for w in windows:
            span = f'{w}D'
            counts[w][rows] = id_flags.rolling(span, min_periods=1).sum().to_numpy()
            means[w][rows] = amt.rolling(span, min_periods=1).mean().to_numpy()

    for w in windows:
        df[f'txn_count_{w}d'] = counts[w]
        df[f'txn_amt_mean_{w}d'] = means[w]

    return df

In [None]:
# -----------------------------
# Behavioral Features
# -----------------------------

def behavioral_features(df):
    """Add per-customer behavioural signals to a copy of ``df``.

    Columns added:

    * ``amount_z``     -- z-score of ``amount`` within the customer's own
      transactions. A zero standard deviation is replaced by 1 so identical
      amounts give 0 rather than a division by zero. Customers with a single
      transaction have an undefined standard deviation and get NaN.
    * ``is_declined``  -- 1 when ``status == 'DECLINED'`` (only if a ``status``
      column exists).
    * ``decline_rate`` -- the customer's mean of ``is_declined`` (only if a
      ``status`` column exists).
    """
    df = df.copy()

    # Vectorised per-customer statistics. `Series.std()` inside a lambda
    # returns a plain scalar with no `.replace`, so the zero-std guard has to
    # be applied to the transformed Series.
    cust_amounts = df.groupby('customer_id')['amount']
    cust_mean = cust_amounts.transform('mean')
    cust_std = cust_amounts.transform('std').replace(0, 1)
    df['amount_z'] = (df['amount'] - cust_mean) / cust_std

    # Fraction of the customer's transactions that were declined (if available).
    if 'status' in df.columns:
        df['is_declined'] = (df['status'] == 'DECLINED').astype(int)
        df['decline_rate'] = df.groupby('customer_id')['is_declined'].transform('mean')

    return df

In [None]:
# -----------------------------
# Device Features
# -----------------------------


def device_ip_features(df):
    """Attach device- and IP-level usage counts to a copy of ``df``.

    When ``device_id`` is present: ``device_txn_count`` (non-null transaction
    ids seen on the device), ``unique_customers_per_device`` and
    ``device_rare_flag`` (1 when the device has fewer than 3 transactions).

    When ``ip_address`` is present: ``ip_txn_count`` and
    ``unique_customers_per_ip``.

    Either group of columns is skipped if its key column is absent.
    """
    out = df.copy()
    available = out.columns

    if 'device_id' in available:
        by_device = out.groupby('device_id')
        out['device_txn_count'] = by_device['transaction_id'].transform('count')
        out['unique_customers_per_device'] = by_device['customer_id'].transform('nunique')
        # Devices seen on fewer than three transactions are treated as rare.
        out['device_rare_flag'] = out['device_txn_count'].lt(3).astype(int)

    if 'ip_address' in available:
        by_ip = out.groupby('ip_address')
        out['ip_txn_count'] = by_ip['transaction_id'].transform('count')
        out['unique_customers_per_ip'] = by_ip['customer_id'].transform('nunique')

    return out


In [None]:
# -----------------------------
# Missing values and encoding
# -----------------------------

# CatBoost handles NaNs natively. For features where NaN is informative,
# create a missing indicator. For other numeric missingness, you can impute.

def add_missing_indicators(df, cols):
    """Return a copy of ``df`` with a ``<col>_missing`` 0/1 flag per column.

    Parameters
    ----------
    df : input DataFrame; it is not modified.
    cols : iterable of column names. Names not present in ``df`` are skipped.

    Returns
    -------
    A new DataFrame where ``<col>_missing`` is 1 on rows whose ``<col>`` value
    is null and 0 otherwise.
    """
    # Copy first so the caller's frame is left untouched, matching the other
    # feature helpers in this notebook.
    df = df.copy()
    for c in cols:
        if c in df.columns:
            df[f'{c}_missing'] = df[c].isna().astype(int)
    return df

In [None]:
# -----------------------------
# End-to-end feature builder
# -----------------------------

def build_features(df, top_k=1000):
    """Run the full feature pipeline and return the model-ready frame.

    Steps: base features -> rolling features -> device/IP features ->
    behavioural features -> missing-value indicators -> cardinality capping of
    ``device_id`` / ``ip_address``.

    The returned frame has the SAME row order and index as ``df``.
    ``create_rolling_features`` sorts rows by customer and date; without
    undoing that sort the features would no longer line up positionally with a
    label vector built from the original frame.

    Parameters
    ----------
    df : raw transaction-level DataFrame.
    top_k : number of most frequent ``device_id`` / ``ip_address`` values to
        keep; every other value (including nulls) becomes ``'OTHER'``.
    """
    original_index = df.index

    # Give rows positional labels so the sort done by the rolling step can be
    # reverted with a plain sort_index().
    feats = create_base_features(df).reset_index(drop=True)
    feats = create_rolling_features(feats).sort_index()
    feats = device_ip_features(feats)
    feats = behavioral_features(feats)

    # Missing indicators for key fields. Computed before the cardinality cap
    # below, which turns null device/IP values into 'OTHER'.
    feats = add_missing_indicators(feats, ['amount', 'device_id', 'ip_address', 'customer_id'])

    # Reduce cardinality for very high-cardinality categoricals (top-k).
    for c in ['device_id', 'ip_address']:
        if c in feats.columns:
            top = feats[c].value_counts().nlargest(top_k).index
            feats[c] = feats[c].where(feats[c].isin(top), 'OTHER')

    # NOTE(review): filling NaN with NaN imputes nothing -- remaining nulls are
    # deliberately left for CatBoost to handle. Replace with a real imputation
    # if a downstream consumer cannot accept NaN.
    feats = feats.fillna(np.nan)

    # Hand back the caller's index so label alignment by index also works.
    if len(feats) == len(original_index):
        feats.index = original_index
    return feats


In [None]:
# -----------------------------
# Modeling with CatBoost (imbalance + missing values)
# -----------------------------

def train_catboost(X, y, categorical_features, n_splits=5):
    """Train one CatBoost classifier per stratified fold and collect OOF scores.

    Parameters
    ----------
    X : DataFrame of features (rows are indexed positionally).
    y : binary labels aligned positionally with ``X``; may be a Series,
        ndarray or list.
    categorical_features : column names (or integer positions) of the
        categorical features, passed to ``Pool(cat_features=...)``.
    n_splits : number of stratified CV folds.

    Returns
    -------
    (models, oof_proba) : the list of fitted fold models and an array with the
        out-of-fold positive-class probability for every row of ``X``.

    Notes
    -----
    * The positive class is up-weighted by ``(1 - pos_rate) / pos_rate``
      computed on the full label vector (simple heuristic).
    * Null values in categorical columns are replaced by the string
      ``'MISSING'`` before training, because CatBoost does not accept NaN in
      categorical features. Apply the same replacement at prediction time.
    * NOTE(review): each fold model is early-stopped / best-iteration-selected
      on the same fold it is then scored on, so the printed CV metrics may be
      somewhat optimistic.
    """
    params = dict(
        iterations=2000,
        depth=6,
        learning_rate=0.03,
        loss_function='Logloss',
        eval_metric='AUC',
        l2_leaf_reg=3.0,
        random_seed=42,
        od_type='Iter',
        od_wait=200,
        task_type='CPU',
        verbose=100
    )

    # Positional label array: works for a Series with any index, an ndarray or
    # a list, and stays aligned with the X.iloc[...] slices below.
    y_arr = np.asarray(y)

    # class weights (simple heuristic) - tune on validation.
    # Skipped for single-class labels, where the ratio is undefined or zero.
    pos = y_arr.mean()
    if 0 < pos < 1:
        params['class_weights'] = [1.0, (1 - pos) / pos]

    # Only columns that actually contain nulls are touched, so clean inputs
    # are passed through unchanged (and uncopied).
    cat_cols = [
        X.columns[c] if isinstance(c, (int, np.integer)) and c not in X.columns else c
        for c in categorical_features
    ]
    null_cats = [c for c in cat_cols if X[c].isna().any()]
    if null_cats:
        X = X.copy()
        for c in null_cats:
            X[c] = X[c].astype(object).where(X[c].notna(), 'MISSING')

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_proba = np.zeros(len(X))
    models = []

    for tr, va in cv.split(X, y_arr):
        train_pool = Pool(X.iloc[tr], y_arr[tr], cat_features=categorical_features)
        valid_pool = Pool(X.iloc[va], y_arr[va], cat_features=categorical_features)
        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
        oof_proba[va] = model.predict_proba(valid_pool)[:, 1]
        models.append(model)

    print('CV AUROC:', roc_auc_score(y_arr, oof_proba))
    print('CV AUPRC:', average_precision_score(y_arr, oof_proba))
    return models, oof_proba


In [None]:
# -----------------------------
# Threshold selection & cost-based optimization
# -----------------------------

def choose_threshold(p, y, tp_gain=100, fp_cost=10, fn_cost=200):
    """Pick the score threshold that maximises a simple business utility.

    Utility at a threshold is ``tp * tp_gain - fp * fp_cost - fn * fn_cost``,
    where a row is flagged when its score is ``>= threshold``. 2000 evenly
    spaced thresholds in [0.0005, 0.9995] are evaluated; on ties the lowest
    threshold wins.

    Parameters
    ----------
    p : predicted positive-class scores (Series, ndarray or list).
    y : true binary labels (1 = positive, 0 = negative), aligned with ``p``
        by POSITION -- any pandas index is ignored.
    tp_gain, fp_cost, fn_cost : per-case gain/costs used in the utility.

    Returns
    -------
    (best_threshold, best_utility)

    Raises
    ------
    ValueError : if ``p`` and ``y`` do not have the same number of elements.
    """
    # Coerce to plain arrays so lists work and two Series with different
    # indexes are not silently re-aligned by label.
    scores = np.asarray(p, dtype=float).ravel()
    labels = np.asarray(y).ravel()
    if scores.shape != labels.shape:
        raise ValueError(
            f'p and y must have the same length, got {scores.size} and {labels.size}'
        )

    # Loop-invariant masks / totals.
    is_pos = labels == 1
    is_neg = labels == 0
    n_pos = is_pos.sum()

    best_th, best_util = 0.5, -1e12
    for th in np.linspace(0.0005, 0.9995, 2000):
        flagged = scores >= th
        tp = (flagged & is_pos).sum()
        fp = (flagged & is_neg).sum()
        fn = n_pos - tp  # positives that were not flagged
        util = tp * tp_gain - fp * fp_cost - fn * fn_cost
        if util > best_util:
            best_th, best_util = th, util
    return best_th, best_util


In [None]:

# -----------------------------
# Improving detection over time
# -----------------------------
# 1) Continuous labeling pipeline: record outcomes for flagged transactions (chargebacks, confirmed frauds).
# 2) Shadow mode and backtesting: run new models in parallel and compare decisions to production.
# 3) Monitoring: track AUPRC, precision@k, false positive rate, and PSI for features.
# 4) Retraining triggers: automated retrain when key metrics degrade beyond threshold (e.g., AUPRC drop > 5%).
# 5) Incremental training: accumulate new labeled batches and retrain or fine-tune weekly/bi-weekly/monthly depending on drift.


# Frequency guidance:
# - High-velocity ecommerce (many daily changes, frequent new payment methods): retrain weekly or bi-weekly.
# - Moderate traffic: monthly retrain is common.
# - Low-volume or very stable domains: quarterly retrain may suffice.
# Always use monitoring to adapt frequency (data drift, feature distribution changes, drop in business KPIs).


# -----------------------------
# MLOps & deployment notes
# -----------------------------
# - Use MLflow to track experiments, metrics, and artifacts.
# - Use Airflow to orchestrate data ingestion, feature engineering, training, evaluation, and model promotion.
# - Deploy models behind a feature flag: shadow -> canary -> full rollout.
# - Monitor post-deployment metrics and have automatic rollback rules.


# -----------------------------
# Example: retrain trigger (pseudo-code)
# -----------------------------
# if current_auprc < baseline_auprc * 0.95:
#     trigger_retraining()


# -----------------------------
# Usage example (assuming df, y available):
# -----------------------------
# df_feat = build_features(df)
# categorical_features = ['interest_category','device_id','ip_address','country']
# models, oof = train_catboost(df_feat[categorical_features + numeric_cols], y, categorical_features)
# th, util = choose_threshold(oof, y)
# print('Selected thresh:', th)
