# In [None]:
# fraud_pipeline_tf.py
"""
End-to-End Fraud Detection Pipeline for Bank Drafts using TensorFlow

Steps:
1. Synthetic Data Generation (prototyping)
2. Persist Historical Drafts into a Database (SQLite/Postgres)
3. Build & Refresh Statistics (RIB-level, Bank-level, Population-level)
4. Train TensorFlow Models:
     - Autoencoder for anomaly detection on amounts
     - MLP classifier for supervised fraud detection
5. Serialize Models & Stats
6. Inference Function to score new OCR-extracted drafts
7. Onboard new RIB statistics after approval

Usage:
  python fraud_pipeline_tf.py generate
  python fraud_pipeline_tf.py train --data <data_csv> --model-dir <model_dir>
  python fraud_pipeline_tf.py predict --ocr <ocr_json> --model-dir <model_dir>
  python fraud_pipeline_tf.py onboard --rib <rib> --amt <amount>

Dependencies:
  pandas, numpy, faker, num2words, sqlalchemy, joblib, tensorflow
"""
import os
import json
import argparse
import random
import pandas as pd
import numpy as np
from faker import Faker
from num2words import num2words
from sqlalchemy import create_engine
import joblib
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# ---------- 0. Configuration ----------
# SQLAlchemy database URI for the drafts store (a local SQLite file by default).
DB_URI = 'sqlite:///drafts.db'  # or your PostgreSQL URI
# Engine built once at import time from DB_URI; the functions in this file
# that read or write the 'drafts' table all go through it.
ENGINE = create_engine(DB_URI)

# ---------- Utility Functions ----------
def is_valid_rib(v: str) -> bool:
    """Return True when *v* is a 20-digit RIB whose key digits are correct.

    Spaces and hyphens are ignored. The expected key is
    ``97 - (int(first_18_digits + "00") % 97)`` and must equal the last
    two digits.

    Input that is not a string, or that does not consist of exactly 20
    ASCII digits once separators are removed, is reported as invalid
    rather than raising.
    """
    if not isinstance(v, str):
        return False
    s = v.replace(' ', '').replace('-', '')
    # Guard before int(): letters or other symbols would raise ValueError.
    if len(s) != 20 or not (s.isascii() and s.isdigit()):
        return False
    body, key = s[:-2], s[-2:]
    expected = 97 - (int(body + '00') % 97)
    return expected == int(key)


def amount_to_words_fr(x: float) -> str:
    """Spell *x* out in French and swap each 'virgule' for 'dinars zéro'.

    The wording comes from ``num2words(x, lang='fr')``; the only
    post-processing is the literal substring replacement.
    """
    spelled = num2words(x, lang='fr')
    rewritten = spelled.replace('virgule', 'dinars zéro')
    return rewritten

# ---------- 1. Synthetic Data Generation ----------
# French-locale Faker instance used by the synthetic data generator.
F = Faker('fr_FR')
# Seed Faker and the stdlib RNG with the same fixed value (42).
Faker.seed(42)
random.seed(42)

def generate_valid_rib():
    """Return a random 20-digit RIB string with a correct two-digit key.

    Eighteen digits are drawn one by one from the module-level ``random``
    generator, then the key ``97 - (int(digits + "00") % 97)`` is appended
    as two zero-padded digits.
    """
    base = ''.join(str(random.randint(0, 9)) for _ in range(18))
    # The key always falls in 1..97, so it fits in two digits and can be
    # computed directly instead of trying every suffix from 00 to 99.
    key = 97 - (int(base + '00') % 97)
    return f"{base}{key:02d}"

def generate_synthetic(n_legit=3500, n_fraud=500):
    """
    Generate synthetic drafts, store them in the 'drafts' table and return them.

    Every row starts out legitimate: an amount drawn uniformly from
    [500, 10000] and rounded to 3 decimals, its French wording, a valid RIB,
    a detected signature and a valid barcode. Each fraud row then receives
    exactly one randomly chosen pattern:
      * missing_signature: signature_detected is set to False
      * invalid_barcode:   barcode_validates_traite is set to False
      * invalid_rib:       the two key digits are replaced by different ones
      * amount_outlier:    the amount is redrawn from [20000, 50000] and the
                           words are regenerated to match
      * words_mismatch:    " erreurs" is appended to the amount words

    As a result amount_words matches amount_digits on every row except
    'words_mismatch' frauds.

    Args:
        n_legit: number of legitimate rows (fraud_label 0), generated first.
        n_fraud: number of fraud rows (fraud_label 1), generated after them.

    Returns:
        DataFrame with one row per draft. The same rows replace the
        'drafts' table through ENGINE.
    """
    rows = []
    fraud_patterns = [
        'missing_signature',
        'invalid_barcode',
        'invalid_rib',
        'amount_outlier',
        'words_mismatch'
    ]
    for fraud in [False] * n_legit + [True] * n_fraud:
        amt = round(random.uniform(500, 10000), 3)
        # always correct literal words
        words = amount_to_words_fr(amt)
        rib = generate_valid_rib()
        sig = True
        barcode_valid = True
        # apply fraud patterns
        if fraud:
            pattern = random.choice(fraud_patterns)
            if pattern == 'missing_signature':
                sig = False
            elif pattern == 'invalid_barcode':
                barcode_valid = False
            elif pattern == 'invalid_rib':
                # Redraw until the key digits really change: a single draw
                # can land on the original (valid) key and would leave a
                # fraud row with a perfectly valid RIB.
                valid_key = rib[-2:]
                bad_key = valid_key
                while bad_key == valid_key:
                    bad_key = f"{random.randint(0,99):02d}"
                rib = rib[:-2] + bad_key
            elif pattern == 'amount_outlier':
                # assign extreme amount
                amt = round(random.uniform(20000, 50000), 3)
                words = amount_to_words_fr(amt)
            elif pattern == 'words_mismatch':
                # intentionally corrupt words
                words = words + " erreurs"
        rows.append({
            'traite_num': str(F.random_number(digits=12)),
            'amount_digits': amt,
            'amount_words': words,
            'bank': F.company(),
            'rib': rib,
            'signature_detected': sig,
            'barcode_validates_traite': barcode_valid,
            'fraud_label': int(fraud)
        })
    df = pd.DataFrame(rows)
    df.to_sql('drafts', ENGINE, if_exists='replace', index=False)
    print("Synthetic data generated with varied fraud patterns.")
    return df

# ---------- 2 & 3. Build/Refresh Stats ----------
def build_stats():
    """Compute amount statistics from the 'drafts' table at three levels.

    Returns:
        A ``(pop_stats, rib_stats, bank_stats)`` tuple:
          * pop_stats: dict with 'mean', 'std', 'lo' (1% quantile) and
            'hi' (99% quantile) of amount_digits over all rows.
          * rib_stats: DataFrame indexed by rib with columns mean_amount,
            std_amount, count, pct_1 and pct_99.
          * bank_stats: DataFrame indexed by bank with columns mean, std,
            lo and hi.
    """
    drafts = pd.read_sql('drafts', ENGINE)
    amounts = drafts['amount_digits']

    pop_stats = {
        'mean': amounts.mean(),
        'std': amounts.std(),
        'lo': amounts.quantile(0.01),
        'hi': amounts.quantile(0.99),
    }

    amounts_by_rib = drafts.groupby('rib')['amount_digits']
    rib_stats = amounts_by_rib.agg(
        mean_amount='mean',
        std_amount='std',
        count='count',
        pct_1=lambda grp: grp.quantile(0.01),
        pct_99=lambda grp: grp.quantile(0.99),
    )

    amounts_by_bank = drafts.groupby('bank')['amount_digits']
    bank_stats = amounts_by_bank.agg(
        mean='mean',
        std='std',
        lo=lambda grp: grp.quantile(0.01),
        hi=lambda grp: grp.quantile(0.99),
    )

    return pop_stats, rib_stats, bank_stats

# ---------- 4. Model Training ----------
def train(data_csv: str, model_dir: str):
    """Train both models on *data_csv* and save them, with the stats, in *model_dir*.

    Steps:
      1. Fit a small dense autoencoder (1-8-4-8-1) on ``amount_digits``.
      2. Derive the classifier features: raw amount, words/digits mismatch,
         missing signature, bad barcode, invalid RIB, amount z-score and a
         ``|z| > 3`` flag.
      3. Fit an MLP (16-8-1, sigmoid output) on those features against
         ``fraud_label``.
      4. Save ``autoencoder``, ``classifier`` and ``stats.pkl`` in *model_dir*.

    The z-score uses RIB-level statistics when the RIB has at least 5 rows,
    otherwise bank-level statistics when the bank is known, otherwise the
    population statistics. All statistics come from build_stats(), i.e.
    from the database rather than from *data_csv*.

    Args:
        data_csv: CSV file with the columns amount_digits, amount_words,
            bank, rib, signature_detected, barcode_validates_traite and
            fraud_label.
        model_dir: output directory; created when missing.
    """
    os.makedirs(model_dir, exist_ok=True)
    # RIBs are 20-digit identifiers, not numbers: read them as text so that
    # leading zeros survive and both is_valid_rib() and the rib_stats index
    # lookups receive strings.
    df = pd.read_csv(data_csv, dtype={'rib': str})
    pop_stats, rib_stats, bank_stats = build_stats()

    # Autoencoder for amount anomaly
    amounts = df[['amount_digits']].values.astype('float32')
    inp = layers.Input(shape=(1,))
    x = layers.Dense(8, activation='relu')(inp)
    x = layers.Dense(4, activation='relu')(x)
    x = layers.Dense(8, activation='relu')(x)
    out = layers.Dense(1)(x)
    autoenc = models.Model(inp, out)
    autoenc.compile(optimizer='adam', loss='mse')
    autoenc.fit(amounts, amounts,
                epochs=20, batch_size=32,
                validation_split=0.2,
                callbacks=[callbacks.EarlyStopping(patience=5)])
    # NOTE(review): predict() loads this exact extension-less path; newer
    # Keras releases may insist on a ".keras"/".h5" suffix - confirm against
    # the installed version before changing either side.
    autoenc.save(f"{model_dir}/autoencoder")

    # Prepare features for MLP classifier
    # For simplicity, only using numeric signals here
    df['mismatch'] = df.apply(
        lambda r: r['amount_words'].strip() != amount_to_words_fr(r['amount_digits']).strip(),
        axis=1,
    )
    # Cast to bool first: on 0/1 integer columns "~" is a bitwise NOT and
    # would yield -1/-2 instead of a logical negation.
    df['sig_missing'] = ~df['signature_detected'].astype(bool)
    df['barcode_bad'] = ~df['barcode_validates_traite'].astype(bool)
    df['rib_invalid'] = ~df['rib'].apply(is_valid_rib)

    # z-score fallback
    def compute_z(r):
        rib, amt, bank = r['rib'], r['amount_digits'], r['bank']
        if rib in rib_stats.index and rib_stats.at[rib, 'count'] >= 5:
            s = rib_stats.loc[rib]
            mu, sigma = s['mean_amount'], s['std_amount']
        elif bank in bank_stats.index:
            s = bank_stats.loc[bank]
            mu, sigma = s['mean'], s['std']
        else:
            mu, sigma = pop_stats['mean'], pop_stats['std']
        # A NaN or zero spread fails the "> 0" test and yields a neutral 0.
        return (amt - mu) / sigma if sigma > 0 else 0

    df['z_score'] = df.apply(compute_z, axis=1)
    df['amount_incompatible'] = df['z_score'].abs() > 3
    # Column order must stay in sync with the feature vector built in predict().
    X = df[['amount_digits', 'mismatch', 'sig_missing', 'barcode_bad',
            'rib_invalid', 'z_score', 'amount_incompatible']].astype('float32')
    y = df['fraud_label'].values

    # MLP classifier
    clf_in = layers.Input(shape=(X.shape[1],))
    y1 = layers.Dense(16, activation='relu')(clf_in)
    y1 = layers.Dense(8, activation='relu')(y1)
    y1 = layers.Dense(1, activation='sigmoid')(y1)
    clf = models.Model(clf_in, y1)
    clf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    clf.fit(X, y, epochs=30, batch_size=32, validation_split=0.2,
            callbacks=[callbacks.EarlyStopping(patience=5)])
    clf.save(f"{model_dir}/classifier")

    # save stats
    joblib.dump({'pop': pop_stats, 'rib': rib_stats, 'bank': bank_stats}, f"{model_dir}/stats.pkl")
    print(f"Models saved in {model_dir}")

# ---------- 5. Inference ----------
def predict(ocr_json: dict, model_dir: str) -> dict:
    """Score one OCR-extracted draft with the models saved in *model_dir*.

    Args:
        ocr_json: mapping with the keys 'amount_digits', 'amount_words',
            'signature_detected', 'barcode_validates_traite', 'rib' and
            'bank'. The legacy key 'signature_detects' is still accepted
            when 'signature_detected' is absent.
        model_dir: directory holding 'autoencoder', 'classifier' and
            'stats.pkl' as written by train().

    Returns:
        dict with 'fraud_score' (classifier probability), 'anomaly_error'
        (absolute autoencoder reconstruction error on the amount) and
        'fraud_label' (True when fraud_score > 0.5).

    Raises:
        KeyError: when a required key is missing from *ocr_json*.
    """
    # load models & stats
    autoenc = tf.keras.models.load_model(f"{model_dir}/autoencoder")
    clf = tf.keras.models.load_model(f"{model_dir}/classifier")
    # NOTE: joblib.load unpickles the file; only point model_dir at trusted data.
    stats = joblib.load(f"{model_dir}/stats.pkl")
    pop, rib_stats, bank_stats = stats['pop'], stats['rib'], stats['bank']

    # prepare features
    amt = float(ocr_json['amount_digits'])
    mismatch = float(ocr_json['amount_words'].strip() != amount_to_words_fr(amt).strip())
    # The rest of the pipeline names this field 'signature_detected'; the old
    # misspelt key is kept only as a fallback for existing payloads.
    sig_key = 'signature_detected' if 'signature_detected' in ocr_json else 'signature_detects'
    sig_missing = float(not ocr_json[sig_key])
    barcode_bad = float(not ocr_json['barcode_validates_traite'])
    rib = ocr_json['rib']
    rib_invalid = float(not is_valid_rib(rib))

    # z-score: RIB-level stats (>= 5 rows), else bank-level, else population
    bank = ocr_json['bank']
    if rib in rib_stats.index and rib_stats.at[rib, 'count'] >= 5:
        s = rib_stats.loc[rib]
        mu, sigma = s['mean_amount'], s['std_amount']
    elif bank in bank_stats.index:
        s = bank_stats.loc[bank]
        mu, sigma = s['mean'], s['std']
    else:
        mu, sigma = pop['mean'], pop['std']
    # A NaN or zero spread fails the "> 0" test and yields a neutral 0.
    z = (amt - mu) / sigma if sigma > 0 else 0
    amount_incompatible = float(abs(z) > 3)

    # anomaly score from autoencoder
    recon = autoenc.predict(np.array([[amt]]))
    # Index the single element explicitly: float() on a 2-D array is
    # deprecated in recent NumPy releases.
    err = float(np.abs(recon - amt)[0, 0])

    # classifier predict (same feature order as in train())
    features = np.array(
        [[amt, mismatch, sig_missing, barcode_bad, rib_invalid, z, amount_incompatible]],
        dtype='float32',
    )
    fraud_prob = float(clf.predict(features)[0, 0])

    # combine signals or return prob
    return {'fraud_score': fraud_prob, 'anomaly_error': err, 'fraud_label': fraud_prob > 0.5}

# ---------- 6. Onboard New RIB ----------
def onboard(rib: str, amt: float):
    """Recompute the amount statistics after a draft for *rib* was stored.

    Nothing is inserted here: the new draft must already be in the
    'drafts' table, and *amt* is accepted but not used by this function.

    Args:
        rib: RIB being onboarded; only used in the confirmation message.
        amt: amount of the approved draft (currently unused).

    Returns:
        The ``(pop_stats, rib_stats, bank_stats)`` tuple from build_stats(),
        so the caller can persist or inspect the refreshed statistics
        instead of losing them.
    """
    # after inserting a new draft record, refresh stats
    refreshed = build_stats()
    print(f"Onboarded {rib}")
    return refreshed

# ---------- 7. CLI Entrypoint ----------
if __name__=='__main__':
    # Command-line dispatcher: one positional command plus the options that
    # command needs (see the usage section of the module docstring).
    p = argparse.ArgumentParser()
    p.add_argument('command', choices=['generate','train','predict','onboard'])
    p.add_argument('--data', help='CSV path')
    p.add_argument('--model-dir', help='Directory for saving/loading TF models')
    p.add_argument('--ocr', help='OCR JSON path')
    p.add_argument('--rib', help='RIB to onboard')
    p.add_argument('--amt', type=float, help='Amount for onboard')
    args = p.parse_args()

    # Fail early with a usage message instead of passing None further down.
    required = {
        'generate': (),
        'train': ('data', 'model_dir'),
        'predict': ('ocr', 'model_dir'),
        'onboard': ('rib',),
    }
    missing = [opt for opt in required[args.command] if getattr(args, opt) is None]
    if missing:
        flags = ', '.join('--' + opt.replace('_', '-') for opt in missing)
        p.error(f"{args.command} requires {flags}")

    if args.command == 'generate':
        generate_synthetic()
    elif args.command == 'train':
        train(args.data, args.model_dir)
    elif args.command == 'predict':
        # Close the file deterministically and decode it as UTF-8 so accented
        # French amount words compare correctly on every platform.
        with open(args.ocr, encoding='utf-8') as fh:
            j = json.load(fh)
        print(predict(j, args.model_dir))
    elif args.command == 'onboard':
        onboard(args.rib, args.amt)
