In [1]:
import numpy as np
import pandas as pd
import joblib
import json
from pathlib import Path

from sklearn.base import BaseEstimator, TransformerMixin

## Random Forest

In [2]:
# TargetEncoder
class TargetEncoder(BaseEstimator, TransformerMixin):
    """
    Smoothed target encoding of categorical columns.

    For each value ``v`` of an encoded column:

        enc(v) = (sum_y + prior * m) / (count + m)

    where ``sum_y`` and ``count`` are the target sum and the number of
    non-null targets observed for ``v`` during ``fit``, ``prior`` is the
    overall target mean and ``m`` is ``smoothing``.
    Unseen -> prior.

    Parameters
    ----------
    cols : list of str
        Columns of ``X`` to encode. Values are compared as strings.
    smoothing : float, default=200
        Pseudo-count ``m`` that pulls rare categories towards the prior.

    Attributes
    ----------
    maps_ : dict[str, pd.Series]
        Per column, a Series indexed by the string category value holding
        its encoding.
    prior_ : float
        Mean of ``y`` seen in ``fit``.
    """
    def __init__(self, cols, smoothing=200):
        self.cols = cols
        self.smoothing = smoothing
        # Placeholders for the fitted state; fit() replaces both.
        # NOTE(review): scikit-learn convention is to create trailing-underscore
        # attributes only in fit(); kept here so existing behaviour is unchanged.
        self.maps_ = {}
        self.prior_ = None

    def fit(self, X, y):
        """
        Learn the smoothed per-category target means.

        ``X`` and ``y`` are matched row by row (by position), regardless of
        the index labels either of them carries.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of rows.
        """
        # Rebuild y on a fresh positional index. Grouping a Series by another
        # Series aligns them on index labels first, which silently corrupts the
        # statistics when y is an array/list and X has a non-default index
        # (e.g. after a train/test split).
        target = pd.Series(np.asarray(y).ravel())
        if len(target) != len(X):
            raise ValueError(
                f"X and y have inconsistent lengths: {len(X)} != {len(target)}"
            )
        self.prior_ = float(target.mean())
        self.maps_ = {}
        for c in self.cols:
            # Plain ndarray keys are used as-is (positionally) by groupby.
            keys = X[c].astype(str).to_numpy()
            stats = target.groupby(keys).agg(['sum', 'count'])
            enc = (stats['sum'] + self.prior_ * self.smoothing) / (stats['count'] + self.smoothing)
            self.maps_[c] = enc
        return self

    def transform(self, X):
        """
        Encode ``self.cols`` of ``X``.

        Returns
        -------
        np.ndarray of shape (n_rows, len(cols)), dtype float32
            One column per encoded feature, in the order of ``self.cols``.
        """
        feats = []
        for c in self.cols:
            # A column without a learned map falls back to the prior everywhere.
            m = self.maps_.get(c, pd.Series(dtype=float))
            v = X[c].astype(str).map(m).fillna(self.prior_)
            feats.append(v.astype('float32').to_numpy().reshape(-1, 1))
        if not feats:
            # np.hstack rejects an empty list; an empty `cols` yields no features.
            return np.empty((len(X), 0), dtype='float32')
        return np.hstack(feats)

# CountEncoder
class CountEncoder(BaseEstimator, TransformerMixin):
    """
    Unsupervised frequency (count) encoding with optional log scaling.

    Parameters
    ----------
    cols : list of str
        Columns of ``X`` to encode. Values are compared as strings.
    normalize : bool, default=False
        Store relative frequencies (count / number of training rows) instead
        of raw counts.
    log1p : bool, default=True
        Emit ``log(1 + count)``. With ``normalize=True`` the stored frequency
        is first scaled back by the number of training rows, so the output is
        approximately the same as with ``normalize=False``.

    Attributes
    ----------
    maps_ : dict[str, pd.Series]
        Per column, a Series indexed by the string category value holding
        its count (or frequency).
    n_train_ : int
        Number of rows seen in ``fit``.
    """
    def __init__(self, cols, normalize=False, log1p=True):
        self.cols = cols
        self.normalize = normalize
        self.log1p = log1p
        # Placeholders for the fitted state; fit() replaces both.
        self.maps_ = {}
        self.n_train_ = None

    def fit(self, X, y=None):
        """Count how often each (stringified) value occurs in every encoded column."""
        self.n_train_ = len(X)
        self.maps_ = {}
        for c in self.cols:
            vc = X[c].astype(str).value_counts()
            if self.normalize:
                vc = vc / self.n_train_
            self.maps_[c] = vc
        return self

    def transform(self, X):
        """
        Encode ``self.cols`` of ``X``; values not seen in ``fit`` get 0.

        Returns
        -------
        np.ndarray of shape (n_rows, len(cols)), dtype float32
            One column per encoded feature, in the order of ``self.cols``.
        """
        feats = []
        for c in self.cols:
            m = self.maps_.get(c, pd.Series(dtype=float))
            enc = X[c].astype(str).map(m).fillna(0.0)
            if self.log1p:
                # Frequencies are turned back into (approximate) counts first,
                # so both modes produce log(1 + count).
                counts = enc * self.n_train_ if self.normalize else enc
                enc = np.log1p(counts)
            feats.append(enc.astype('float32').to_numpy().reshape(-1, 1))
        if not feats:
            # np.hstack rejects an empty list; an empty `cols` yields no features.
            return np.empty((len(X), 0), dtype='float32')
        return np.hstack(feats)

In [5]:
# Columns the saved pipeline is fed with at prediction time
FEATURES = [
    'age_group',
    'gender_clean',
    'category_clean',
    'amount_bin',
    'merchant',
    'customer',
]

# Amount bin edges and their labels (one label per consecutive pair of edges);
# these have to stay identical to the ones used in training
AMOUNT_EDGES = [
    0, 10, 25, 50, 75,
    150, 250, 500, 1000, 2500,
    np.inf,
]
AMOUNT_LABELS = [
    '0–10', '10–25', '25–50', '50–75', '75–150',
    '150–250', '250–500', '500–1000', '1000–2500', '2500+',
]

def make_amount_bin(amount_series: pd.Series) -> pd.Series:
    """
    Map raw amounts onto the AMOUNT_LABELS buckets and return them as strings.

    Bins are left-closed, [a, b). Anything that cannot be parsed as a number
    is replaced by -1, which lies below the first edge; such rows (and any
    other value outside the edges) come out of the cut as missing and end up
    as the literal string 'nan', so the encoders handle them as unseen.
    """
    numeric = pd.to_numeric(amount_series, errors='coerce')
    numeric = numeric.fillna(-1)
    binned = pd.cut(
        numeric,
        bins=AMOUNT_EDGES,
        labels=AMOUNT_LABELS,
        right=False,
        include_lowest=True,
    )
    return binned.astype(str)

# Optional: light normalization for strings
def _norm_str(x):
    if pd.isna(x): return x
    return str(x).strip().replace('\u2014','–').replace('\u2013','–')  # unify dash to en dash

# Artifact locations (edit to your artifact names):
#   - the fitted pipeline
#   - the JSON file with one threshold per amount bin
#   - the text file holding the single global threshold
MODEL_PATH = Path(
    "/content/drive/MyDrive/fraud_model/rf_pipeline_weighted.joblib"
)
THRESHOLDS_PATH = Path(
    "/content/drive/MyDrive/fraud_model/thresholds_by_amount_bin_weighted.json"
)
GLOBAL_THR_PATH = Path(
    "/content/drive/MyDrive/fraud_model/global_threshold_weighted.txt"
)

In [6]:
# Load the trained pipeline (includes OneHot + TE + Count encoders).
# NOTE: joblib.load unpickles arbitrary Python objects, so only point MODEL_PATH
# at an artifact you trust. Presumably the encoder classes above are re-declared
# in this notebook so the pickle can resolve them - TODO confirm.
model = joblib.load(MODEL_PATH)

# Load thresholds if available (per-bin + default). If not present, fall back to 0.5.
thr_map = None
default_thr = 0.5
if THRESHOLDS_PATH.exists():
    # Context managers close the files deterministically instead of leaking the handles.
    with open(THRESHOLDS_PATH) as fh:
        # ensure str keys / float values
        thr_map = {str(k): float(v) for k, v in json.load(fh).items()}
if GLOBAL_THR_PATH.exists():
    with open(GLOBAL_THR_PATH) as fh:
        default_thr = float(fh.read().strip())
print("Loaded model:", MODEL_PATH)
print("Per-bin thresholds:", "yes" if thr_map is not None else "no", "| default thr:", default_thr)

Loaded model: /content/drive/MyDrive/fraud_model/rf_pipeline_weighted.joblib
Per-bin thresholds: yes | default thr: 0.9787487064232085


In [7]:
def prepare_inference_frame(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of ``df_raw`` that contains every column in FEATURES.

    Steps:
      1. Build 'amount_bin' from 'amount' when it is not supplied.
      2. Normalize every feature column with ``_norm_str``; feature columns
         that are absent are created and filled with the string 'nan' so the
         encoders treat them as unseen.
      3. Upper-case 'age_group' / 'gender_clean' and replace anything outside
         their known vocabulary with 'U'.

    The input frame is not modified; extra columns are passed through.

    Raises
    ------
    ValueError
        If neither 'amount_bin' nor 'amount' is present.
    """
    df = df_raw.copy()

    # If amount_bin missing, construct it from 'amount'
    if 'amount_bin' not in df.columns:
        if 'amount' not in df.columns:
            raise ValueError("Incoming data must have either 'amount_bin' or 'amount'.")
        df['amount_bin'] = make_amount_bin(df['amount'])

    # Light normalization for categorical inputs. Iterating FEATURES (rather
    # than a second hard-coded copy of the list) keeps this in sync with what
    # the model is fed, and guarantees every required column exists afterwards.
    for c in FEATURES:
        if c in df.columns:
            df[c] = df[c].map(_norm_str)
        else:
            # Create missing columns as 'nan' strings; encoders will treat as unseen
            df[c] = 'nan'

    # Canonicalize age & gender where possible
    if 'age_group' in df.columns:
        age = df['age_group'].astype(str).str.upper()
        # A numeric age column that contains missing values is read as float,
        # so code 2 arrives as '2.0'; strip the trailing '.0' before the
        # vocabulary check instead of collapsing such rows to 'U'.
        age = age.str.replace(r'\.0+$', '', regex=True)
        df['age_group'] = age.where(age.isin(['0','1','2','3','4','5','6','U']), 'U')

    if 'gender_clean' in df.columns:
        gender = df['gender_clean'].astype(str).str.upper()
        df['gender_clean'] = gender.where(gender.isin(['F','M','E','U']), 'U')

    return df

In [8]:
def predict_proba_only(df_new: pd.DataFrame) -> np.ndarray:
    """Prepare ``df_new`` and return, per row, column 1 of the model's
    ``predict_proba`` output (the fraud probability)."""
    prepared = prepare_inference_frame(df_new)
    class_probs = model.predict_proba(prepared[FEATURES])
    return class_probs[:, 1]

def predict_labels(df_new: pd.DataFrame) -> pd.DataFrame:
    """
    Score ``df_new`` and turn the probabilities into 0/1 labels.

    The threshold for a row is looked up by its 'amount_bin' in ``thr_map``
    when a map was loaded; bins without an entry, or the absence of a map,
    fall back to ``default_thr``.

    Returns the prepared frame plus three columns: 'proba_fraud',
    'threshold_used' and 'pred_fraud'.
    """
    prepared = prepare_inference_frame(df_new)
    scores = model.predict_proba(prepared[FEATURES])[:, 1]

    fallback = float(default_thr)
    if thr_map is None:
        cutoffs = np.full(len(prepared), fallback, dtype=float)
    else:
        per_bin = prepared['amount_bin'].map(thr_map).astype(float)
        cutoffs = per_bin.fillna(fallback).values

    labels = (scores >= cutoffs).astype(int)
    return prepared.assign(
        proba_fraud=scores,
        threshold_used=cutoffs,
        pred_fraud=labels,
    )

In [None]:
# Example: score a CSV of new transactions.
# The file must include either 'amount' or a prebuilt 'amount_bin'.
new_df = pd.read_csv("new_transactions.csv")
scored = predict_labels(new_df)

# Tidy output: the identifying columns that are actually present
# ('amount' drops out by itself when the input had none), then the scores.
cols_to_show = [
    c
    for c in ['customer','merchant','amount','amount_bin','age_group','gender_clean','category_clean']
    if c in scored.columns
]
result = scored[cols_to_show + ['proba_fraud','threshold_used','pred_fraud']]

print(result.head(10))
result.to_csv("new_transactions_scored.csv", index=False)
print("Saved: new_transactions_scored.csv")


## CatBoost

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
import numpy as np
import pandas as pd
import json
from pathlib import Path
from catboost import CatBoostClassifier, Pool

In [3]:
# Columns passed to the model
features = [
    'age_group',
    'gender_clean',
    'category_clean',
    'amount_bin',
    'merchant',
    'customer',
]

# Amount bin edges and their labels (one label per consecutive pair of edges)
amount_edges = [
    0, 10, 25, 50, 75,
    150, 250, 500, 1000, 2500,
    np.inf,
]
amount_labels = [
    '0–10', '10–25', '25–50', '50–75', '75–150',
    '150–250', '250–500', '500–1000', '1000–2500', '2500+',
]

# Artifact locations: model file, per-bin threshold map, global threshold
model_path = Path(
    "/content/drive/MyDrive/fraud_model/catboost_fraud_model.cbm"
)
thresholds_path = Path(
    "/content/drive/MyDrive/fraud_model/thresholds_by_amount_bin_cat.json"
)
global_threshold = Path(
    "/content/drive/MyDrive/fraud_model/global_threshold_cat.txt"
)


In [8]:
#Helper functions
def _norm_dash(s: str) -> str:
    """Normalize hyphens to en dash to keep amount_bin labels consistent."""
    if pd.isna(s): return s
    return str(s).replace('-', '–').replace('\u2013','–').replace('\u2014','–').strip()

def make_amount_bin(amount_series: pd.Series) -> pd.Series:
    """
    Bucket raw amounts with ``amount_edges`` / ``amount_labels`` and return strings.

    Bins are left-closed, [a, b). Unparseable entries are coerced to NaN and,
    like any value outside the edges, come back as the string 'nan'.
    """
    numeric_amounts = pd.to_numeric(amount_series, errors='coerce')
    labelled = pd.cut(
        numeric_amounts,
        bins=amount_edges,
        labels=amount_labels,
        right=False,
        include_lowest=True,
    )
    return labelled.astype(str)

def prepare_inference_frame(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of ``df_raw`` that contains every column in ``features``.

    Steps:
      1. Build 'amount_bin' from 'amount' when it is not supplied.
      2. Cast every feature column to string and normalize it with
         ``_norm_dash``; feature columns that are absent are created and
         filled with the string 'nan'.
      3. Upper-case 'age_group' / 'gender_clean' and replace anything outside
         their known vocabulary with 'U'.

    The input frame is not modified; extra columns are passed through.

    Raises
    ------
    ValueError
        If neither 'amount_bin' nor 'amount' is present.
    """
    df = df_raw.copy()

    # Build amount_bin if you only receive raw 'amount'
    if 'amount_bin' not in df.columns:
        if 'amount' not in df.columns:
            raise ValueError("Incoming data must include either 'amount_bin' or 'amount'.")
        df['amount_bin'] = make_amount_bin(df['amount'])

    # Normalize & coerce categorical strings. Iterating `features` (rather than
    # a second hard-coded copy of the list) keeps this in sync with what the
    # model is fed, and guarantees every required column exists afterwards.
    for c in features:
        if c in df.columns:
            df[c] = df[c].astype(str).map(_norm_dash)
        else:
            df[c] = 'nan'  # model handles unseen values

    # Canonicalize small vocab fields
    if 'age_group' in df.columns:
        age = df['age_group'].astype(str).str.upper()
        # A numeric age column that contains missing values is read as float,
        # so code 2 arrives as '2.0'; strip the trailing '.0' before the
        # vocabulary check instead of collapsing such rows to 'U'.
        age = age.str.replace(r'\.0+$', '', regex=True)
        df['age_group'] = age.where(age.isin(['0','1','2','3','4','5','6','U']), 'U')

    if 'gender_clean' in df.columns:
        gender = df['gender_clean'].astype(str).str.upper()
        df['gender_clean'] = gender.where(gender.isin(['F','M','E','U']), 'U')

    return df

def load_artifacts(model_path=model_path, thr_map_path=thresholds_path, global_thr_path=global_threshold):
    """
    Load the CatBoost model and, when the files exist, its decision thresholds.

    Parameters
    ----------
    model_path : str or Path
        CatBoost model file.
    thr_map_path : str or Path
        JSON object mapping amount-bin label -> threshold. Optional on disk.
    global_thr_path : str or Path
        Text file containing a single float. Optional on disk.

    Returns
    -------
    (model, thr_map, default_thr)
        ``thr_map`` is None when the JSON file is absent; its keys are passed
        through ``_norm_dash``. ``default_thr`` is 0.5 when the text file is
        absent.
    """
    model = CatBoostClassifier()
    model.load_model(str(model_path))

    # Accept plain strings as well as Path objects.
    thr_map_path = Path(thr_map_path)
    global_thr_path = Path(global_thr_path)

    thr_map = None
    if thr_map_path.exists():
        # Context managers close the files deterministically instead of leaking the handles.
        with open(thr_map_path, 'r') as fh:
            raw = json.load(fh)
        # normalize keys (dashes etc.) to be safe
        thr_map = {_norm_dash(k): float(v) for k, v in raw.items()}

    default_thr = 0.5
    if global_thr_path.exists():
        with open(global_thr_path, 'r') as fh:
            default_thr = float(fh.read().strip())

    return model, thr_map, default_thr

In [6]:
#Scoring functions
def predict_proba_only(df_new: pd.DataFrame,
                       model: CatBoostClassifier,
                       cat_features=features) -> np.ndarray:
    """Prepare ``df_new``, wrap the feature columns in a CatBoost ``Pool`` and
    return, per row, column 1 of ``predict_proba`` (the fraud probability)."""
    prepared = prepare_inference_frame(df_new)
    scoring_pool = Pool(prepared[features], cat_features=cat_features)
    class_probs = model.predict_proba(scoring_pool)
    return class_probs[:, 1]

def predict_with_thresholds(df_new: pd.DataFrame,
                            model: CatBoostClassifier,
                            thr_map: dict | None = None,
                            default_thr: float = 0.5,
                            cat_features=features) -> pd.DataFrame:
    """
    Score records and turn the probabilities into 0/1 labels.

    With a non-empty ``thr_map`` the threshold of a row is looked up by its
    'amount_bin'; bins without an entry use ``default_thr``. Without a map
    (None or empty) every row uses ``default_thr``.

    Returns the prepared frame plus three columns: proba_fraud,
    threshold_used, pred_fraud.
    """
    prepared = prepare_inference_frame(df_new)
    scoring_pool = Pool(prepared[features], cat_features=cat_features)
    scores = model.predict_proba(scoring_pool)[:, 1]

    fallback = float(default_thr)
    if not thr_map:
        cutoffs = np.full(len(prepared), fallback, dtype=float)
    else:
        per_bin = prepared['amount_bin'].map(thr_map).astype(float)
        cutoffs = per_bin.fillna(fallback).values

    labels = (scores >= cutoffs).astype(int)
    return prepared.assign(
        proba_fraud=scores,
        threshold_used=cutoffs,
        pred_fraud=labels,
    )

In [9]:
# Example usage
# Load model + thresholds
model, thr_map, default_thr = load_artifacts()

# Fixed seed: the 20-row sample, and therefore the scored output, is the same on every run
new_df = pd.read_csv("/content/drive/MyDrive/cleaned_fraud_df.csv").sample(20, random_state=42)

scored = predict_with_thresholds(new_df, model, thr_map, default_thr)

# View results: identifying columns that are present, followed by the scores
cols_to_show = [c for c in ['customer','merchant','amount','amount_bin','age_group','gender_clean','category_clean'] if c in scored.columns]
result = scored[cols_to_show + ['proba_fraud','threshold_used','pred_fraud']]
result