In [1]:
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer


In [2]:
def custom_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with derived transaction features added.

    Added columns:
        log_amount: ``log(1 + amount)`` computed with ``np.log1p``. Amounts
            at or below -1 yield ``-inf``/``NaN``, as ``np.log1p`` defines.
        is_night: 1 when ``transaction_hour`` is >= 22 or < 5, else 0. Only
            added when the frame has a ``transaction_hour`` column. Missing
            hours (NaN) fail both comparisons and are therefore flagged 0.

    Args:
        df: Input frame; must contain an ``amount`` column.

    Returns:
        A new DataFrame; the caller's frame is not modified.

    Raises:
        KeyError: If ``amount`` is not a column of ``df``.
    """
    # Work on a copy so the caller's frame never gains the derived columns.
    df = df.copy()

    df['log_amount'] = np.log1p(df['amount'])

    if 'transaction_hour' in df.columns:
        hour = df['transaction_hour']
        # Vectorised mask instead of a per-row lambda: same 0/1 values, and the
        # result is an integer column even when the frame is empty.
        df['is_night'] = ((hour >= 22) | (hour < 5)).astype(int)

    return df


In [None]:
class FraudModelPipeline:
    """Build, train, persist and serve a fraud-detection scikit-learn pipeline.

    The full pipeline is ``custom_feature_engineering`` (wrapped in a
    ``FunctionTransformer``) -> ``ColumnTransformer`` (``StandardScaler`` on
    numeric columns, ``OneHotEncoder`` on categorical columns, every other
    column dropped) -> ``RandomForestClassifier``.
    """

    # File name of the pickled pipeline inside the ``save_artifacts`` directory.
    ARTIFACT_FILENAME = "fraud_detection_pipeline.pkl"

    def __init__(self, numeric_features, categorical_features,
                 include_engineered_features=True):
        """Store the column configuration; nothing is built until training.

        Args:
            numeric_features: Names of the raw columns to standard-scale.
            categorical_features: Names of the raw columns to one-hot encode.
            include_engineered_features: When True (default), the columns
                added by ``custom_feature_engineering`` are also passed to the
                model. Pass False to use only the listed raw columns.
        """
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features
        self.include_engineered_features = include_engineered_features
        # Full pipeline; set by build_full_pipeline()/train()/load_artifacts().
        self.pipeline = None
        # Final estimator step of ``self.pipeline``, kept in sync with it.
        self.model = None

    def _engineered_numeric_features(self):
        """Return the engineered columns the ColumnTransformer must keep.

        ``log_amount`` is always produced by ``custom_feature_engineering``.
        ``is_night`` is only produced when the input has ``transaction_hour``,
        so it is selected only when that column is one of the configured
        features (and therefore guaranteed to be present in the input).
        Columns the caller already listed are not repeated.
        """
        if not self.include_engineered_features:
            return []

        configured = set(self.numeric_features) | set(self.categorical_features)
        engineered = ['log_amount']
        if 'transaction_hour' in configured:
            engineered.append('is_night')
        return [name for name in engineered if name not in configured]

    def _require_pipeline(self):
        """Raise ``ValueError`` when no pipeline has been trained or loaded."""
        if self.pipeline is None:
            raise ValueError(
                "Pipeline not available. Call train() or load_artifacts() first."
            )

    def build_preprocessing_pipeline(self):
        """Build the unfitted preprocessing pipeline.

        Returns:
            A ``Pipeline`` with two steps: ``custom_features`` (feature
            engineering) and ``preprocessor`` (scaling / one-hot encoding).
        """
        custom_transformer = FunctionTransformer(custom_feature_engineering)

        numeric_transformer = Pipeline(steps=[
            ('scaler', StandardScaler())
        ])

        # handle_unknown='ignore': categories unseen during fit do not raise
        # at transform time.
        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # remainder='drop' discards every column that is not listed here, so
        # the engineered columns must be listed explicitly or they never reach
        # the model.
        numeric_columns = (
            list(self.numeric_features) + self._engineered_numeric_features()
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_columns),
                ('cat', categorical_transformer, self.categorical_features)
            ],
            remainder='drop'
        )

        preprocessing_pipeline = Pipeline(steps=[
            ('custom_features', custom_transformer),
            ('preprocessor', preprocessor)
        ])

        return preprocessing_pipeline

    def build_full_pipeline(self):
        """Combine preprocessing and classifier; store them on the instance.

        Sets ``self.pipeline`` to the unfitted full pipeline and ``self.model``
        to its ``RandomForestClassifier`` step.
        """
        preprocessor = self.build_preprocessing_pipeline()

        model = RandomForestClassifier(
            n_estimators=200,
            max_depth=10,
            class_weight='balanced',
            random_state=42
        )

        full_pipeline = Pipeline(steps=[
            ('preprocessing', preprocessor),
            ('model', model)
        ])

        self.pipeline = full_pipeline
        self.model = model

    def train(self, df, target_col):
        """Fit the pipeline on a stratified 80/20 split and print metrics.

        Args:
            df: Frame holding the feature columns and the target column.
            target_col: Name of the label column in ``df``.

        Returns:
            Tuple ``(X_test, y_test, y_pred)`` for the held-out 20% split.
        """
        X = df.drop(columns=[target_col])
        y = df[target_col]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        print(" Training model...")
        self.build_full_pipeline()
        self.pipeline.fit(X_train, y_train)

        print(" Model training complete.")

        y_pred = self.pipeline.predict(X_test)
        # Column 1 is the probability of the second class in ``classes_``
        # order — assumes a binary target; TODO confirm.
        y_prob = self.pipeline.predict_proba(X_test)[:, 1]

        print("\n Evaluation Results:")
        print(classification_report(y_test, y_pred))
        print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

        return X_test, y_test, y_pred

    def save_artifacts(self, path_prefix="../artifacts"):
        """Pickle the full pipeline into the ``path_prefix`` directory.

        The directory is created when it does not exist.

        Raises:
            ValueError: If no pipeline has been trained or loaded; without
                this check a ``None`` placeholder would be written to disk.
        """
        self._require_pipeline()

        if path_prefix:
            os.makedirs(path_prefix, exist_ok=True)

        artifact_path = f"{path_prefix}/{self.ARTIFACT_FILENAME}"
        joblib.dump(self.pipeline, artifact_path)
        print(f" Saved full pipeline to {artifact_path}")

    def load_artifacts(self, path):
        """Load a pipeline previously written by ``save_artifacts``.

        Args:
            path: Path to the pickled pipeline file.
        """
        # SECURITY: joblib.load unpickles the file, which can execute
        # arbitrary code — only load artifacts from a trusted source.
        # The pickle stores custom_feature_engineering by reference, so that
        # function must be defined under the same qualified name when loading.
        self.pipeline = joblib.load(path)
        self.model = getattr(self.pipeline, 'named_steps', {}).get('model')
        print(" Loaded pipeline artifact.")

    def predict(self, new_data: pd.DataFrame):
        """Return class predictions for ``new_data``.

        Raises:
            ValueError: If no pipeline has been trained or loaded.
        """
        self._require_pipeline()
        return self.pipeline.predict(new_data)


In [None]:
# Column configuration handed to FraudModelPipeline.
# NOTE(review): `df` is not created in any cell visible here — confirm a
# data-loading cell runs before this one on a fresh kernel.
numeric_features = [
    'amount',
    'amount_scaled',
    'customer_avg_amount',
    'customer_std_amount',
    'transaction_frequency',
    'distance_from_home',
    'transaction_hour',
]

categorical_features = [
    'device_type',
    'location',
    'merchant_category',
    'channel',
]

# Fit on `df`; keep the held-out split and its predictions for inspection.
fraud_pipeline = FraudModelPipeline(
    numeric_features=numeric_features,
    categorical_features=categorical_features,
)
X_test, y_test, y_pred = fraud_pipeline.train(df, target_col='is_fraud')

# Persist the fitted pipeline (default directory: ../artifacts).
fraud_pipeline.save_artifacts()


In [None]:
# Round-trip check: reload the pickled pipeline from disk.
fraud_pipeline.load_artifacts("../artifacts/fraud_detection_pipeline.pkl")

# Score three reproducibly sampled rows with the label column removed.
# These rows come from the same `df` that was used for training, so this is
# a smoke test of the inference path, not an evaluation.
new_txn = (
    df
    .sample(n=3, random_state=42)
    .drop(columns=['is_fraud'])
)
preds = fraud_pipeline.predict(new_txn)

print("Predictions:", preds)
