# Fraud Detection System
## Draft Document Analysis Pipeline

## 1. Environment Setup

## 2. Data Loading & Cleaning

In [4]:
from sklearn.pipeline import Pipeline
# %% [markdown]
# # Fraud Detection System
# Complete implementation with feature balancing and threshold adjustments

# %% [markdown]
# ## 1. Initial Setup

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import xgboost as xgb
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve
from sklearn.utils.class_weight import compute_class_weight

sns.set_style('whitegrid')

# Constants
AMOUNT_HIGH_QUANTILE = 0.95
GAP_LONG_DAYS = 90

# %% [markdown]
# ## 2. Data Loading & Validation

# %%
def load_and_validate_data():
    df = pd.read_sql('SELECT * FROM drafts', 'sqlite:///drafts.db')
    
    required_cols = ['amount_digits', 'date_created', 'date_due', 'rib']
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")
    
    # Type conversions and cleaning
    df['date_created'] = pd.to_datetime(df['date_created'], errors='coerce')
    df['date_due'] = pd.to_datetime(df['date_due'], errors='coerce')
    df['amount_digits'] = pd.to_numeric(df['amount_digits'], errors='coerce')
    
    initial_count = len(df)
    df = df.dropna(subset=required_cols)
    print(f"Removed {initial_count - len(df)} invalid records")
    
    return df

# %% [markdown]
# ## 3. Feature Engineering

# %%
def create_features(df):
    # Time-based features
    df['gap_days'] = (df['date_due'] - df['date_created']).dt.days
    df['gap_negative'] = (df['gap_days'] < 0).astype(int)
    df['gap_long'] = (df['gap_days'] > GAP_LONG_DAYS).astype(int)
    
    # Amount features
    amount_threshold = df['amount_digits'].quantile(AMOUNT_HIGH_QUANTILE)
    df['amount_high'] = (df['amount_digits'] > amount_threshold).astype(int)
    df['amount_log'] = np.log1p(df['amount_digits'])
    joblib.dump(amount_threshold, 'amount_threshold.pkl')
    
    # Validation features
    df['amount_words_match'] = df.apply(
        lambda x: fuzzy_match_amount_words(x['amount_digits'], x.get('amount_words', '')),
        axis=1
    )
    df['sig_missing'] = (~df['signature_detected']).astype(int) if 'signature_detected' in df.columns else 0
    df['barcode_bad'] = (~df['barcode_validates_traite']).astype(int) if 'barcode_validates_traite' in df.columns else 0
    df['rib_invalid'] = df['rib'].apply(lambda x: not is_valid_rib(x))
    
    # Text features
    text_cols = ['payer_name_address', 'drawer_name']
    for col in text_cols:
        df[f'{col}_len'] = df[col].str.len() if col in df.columns else 30
    
    return df

# %% [markdown]
# ## 4. Helper Functions

# %%
def is_valid_rib(rib):
    try:
        rib_clean = ''.join(filter(str.isdigit, str(rib)))
        if len(rib_clean) != 20: return False
        if rib_clean.startswith('000'): return False
        return int(rib_clean[18:20]) > 0  # Basic check digit simulation
    except:
        return False

def fuzzy_match_amount_words(amount, words):
    try:
        from num2words import num2words
        amount = float(amount)
        words = str(words).lower()
        
        if str(int(amount)) in words:
            return True
            
        expected = num2words(amount, lang='fr').replace('-', ' ')
        return any(word in expected for word in words.split())
    except:
        return False

# %% [markdown]
# ## 5. Model Training

# %%
def train_models(X_train, y_train):
    # Class weighting
    classes = np.unique(y_train)
    weights = compute_class_weight('balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))
    
    # Preprocessing pipeline
    numeric_features = ['amount_digits', 'gap_days', 'amount_log']
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    
    categorical_features = ['amount_high', 'gap_long', 'gap_negative',
                           'amount_words_match', 'sig_missing', 'barcode_bad',
                           'rib_invalid']
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', 'passthrough', categorical_features)
        ])
    
    # Initialize models
    rf = RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced_subsample',
        max_depth=5,
        random_state=42
    )
    
    lr = LogisticRegression(
        class_weight=class_weights,
        penalty='l2',
        C=0.1,
        max_iter=1000,
        random_state=42
    )
    
    nn = tf.keras.Sequential([
        tf.keras.layers.Dense(16, activation='relu', kernel_regularizer='l2'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(8, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    nn.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
            tf.keras.metrics.AUC(name='pr_auc', curve='PR')
        ]
    )
    
    return {
        'preprocessor': preprocessor,
        'rf': rf,
        'lr': lr,
        'nn': nn,
        'class_weights': class_weights
    }

# %% [markdown]
# ## 6. Prediction System

# %%
def predict_fraud(input_data, model, preprocessor, threshold=0.5):
    # Feature engineering
    features = pd.DataFrame([{
        'amount_digits': input_data['amount_digits'],
        'gap_days': (pd.to_datetime(input_data['date_due']) - 
                    pd.to_datetime(input_data['date_created'])).days,
        'amount_log': np.log1p(input_data['amount_digits']),
        'amount_high': int(input_data['amount_digits'] > joblib.load('amount_threshold.pkl')),
        'gap_long': int((pd.to_datetime(input_data['date_due']) - 
                       pd.to_datetime(input_data['date_created'])).days > GAP_LONG_DAYS),
        'gap_negative': int(pd.to_datetime(input_data['date_due']) < 
                        pd.to_datetime(input_data['date_created'])),
        'amount_words_match': fuzzy_match_amount_words(input_data['amount_digits'], 
                                                      input_data.get('amount_words', '')),
        'sig_missing': int(not input_data.get('signature_detected', False)),
        'barcode_bad': int(not input_data.get('barcode_validates_traite', False)),
        'rib_invalid': int(not is_valid_rib(input_data.get('rib', '')))
    }])
    
    # Preprocessing and prediction
    X_processed = preprocessor.transform(features)
    
    if isinstance(model, tf.keras.Model):
        prob = model.predict(X_processed)[0][0]
    else:
        prob = model.predict_proba(X_processed)[0][1]
    
    # Conflict resolution
    conflict_score = 0
    if features['amount_words_match'][0] and not features['sig_missing'][0]:
        conflict_score -= 0.3
    if features['rib_invalid'][0] and features['barcode_bad'][0]:
        conflict_score += 0.4
    
    final_prob = np.clip(prob + conflict_score, 0, 1)
    
    return {
        'fraud_probability': final_prob,
        'prediction': final_prob > threshold,
        'confidence': 'High' if abs(final_prob - 0.5) > 0.3 else 'Medium',
        'feature_values': features.to_dict(),
        'threshold': threshold
    }

# %% [markdown]
# ## 7. Execution & Testing

# %%
if __name__ == "__main__":
    # Data pipeline
    df = load_and_validate_data()
    df = create_features(df)
    
    # Prepare features
    features = ['amount_digits', 'gap_days', 'amount_log', 'amount_high',
               'gap_long', 'gap_negative', 'amount_words_match', 'sig_missing',
               'barcode_bad', 'rib_invalid']
    X = df[features]
    y = df['fraud_label'] if 'fraud_label' in df.columns else np.zeros(len(df))
    
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    
    # Model training
    models = train_models(X_train, y_train)
    preprocessor = models['preprocessor']
    
    # Fit preprocessing
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)
    
    # Train classifiers
    models['rf'].fit(X_train_processed, y_train)
    models['lr'].fit(X_train_processed, y_train)
    
    # Neural network training
    models['nn'].fit(
        X_train_processed, y_train,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        class_weight=models['class_weights'],
        callbacks=[
            tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
        ],
        verbose=0
    )
    
    # Save models
    joblib.dump(preprocessor, 'preprocessor.pkl')
    joblib.dump(models['rf'], 'rf_model.pkl')
    joblib.dump(models['lr'], 'lr_model.pkl')
    models['nn'].save('nn_model.keras')

Removed 0 invalid records


ValueError: Invalid dtype: object