# Fraud Detection Model Training

This notebook demonstrates the process of training the fraud detection model used in our API. We'll go through:

1. Data loading and exploration
2. Data preprocessing
3. Feature engineering
4. Model training and evaluation
5. Model saving for use in the API

The trained model is saved as a pickle file that can be loaded by the fraud detection API.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.decomposition import PCA
import joblib
import os

# Set up notebook options
# Apply seaborn's 'whitegrid' style to every figure produced below.
sns.set_style('whitegrid')
# Default figure size as (width, height); individual plots below override it via figsize=.
plt.rcParams['figure.figsize'] = (12, 8)
%matplotlib inline

## 1. Data Loading and Exploration

We'll generate a synthetic sample of transaction data (a stand-in for real transaction records) and examine its structure.

In [2]:
# For demonstration, we'll create a synthetic dataset
# In a real scenario, you would load your transaction data

def generate_synthetic_data(n_samples=10000, fraud_ratio=0.1, seed=42):
    """Generate a shuffled, labelled synthetic transaction dataset.

    Legitimate and fraudulent rows are drawn from different distributions
    (amounts, product codes, card categories, e-mail domains and the numeric
    V/C/D features) so that a classifier has signal to learn from.

    Parameters
    ----------
    n_samples : int
        Total number of rows to generate. The returned frame always has
        exactly this many rows.
    fraud_ratio : float
        Fraction of rows labelled as fraud, between 0 and 1 inclusive.
    seed : int or None
        Seed for the private random generator and for the final shuffle.
        With the default, results match what the global
        ``np.random.seed(42)`` stream would produce.

    Returns
    -------
    pandas.DataFrame
        Columns: TransactionAmt, TransactionDT, card1, ProductCD, card4,
        card6, P_emaildomain, isFraud (0.0/1.0), V1-V10, C1-C5, D1-D5, M1-M5.

    Raises
    ------
    ValueError
        If ``n_samples`` is negative or ``fraud_ratio`` is outside [0, 1].
    """
    if n_samples < 0:
        raise ValueError("n_samples must be non-negative")
    if not 0 <= fraud_ratio <= 1:
        raise ValueError("fraud_ratio must be between 0 and 1")

    # A private generator keeps this function from reseeding NumPy's global
    # random state (which would silently affect unrelated code in the notebook).
    rng = np.random.RandomState(seed)

    # Derive the legitimate count by subtraction: truncating both products
    # independently can lose a row to floating-point error
    # (e.g. int(100 * 0.29) == 28).
    n_fraud = int(round(n_samples * fraud_ratio))
    n_legit = n_samples - n_fraud

    # NOTE: the order of the draws below determines the generated values;
    # keep "legitimate then fraud" for every feature to stay reproducible.

    # Transaction amounts (fraud skews larger and more dispersed)
    legitimate_amounts = rng.lognormal(mean=4.5, sigma=1, size=n_legit)
    fraud_amounts = rng.lognormal(mean=6, sigma=1.5, size=n_fraud)

    # Card IDs
    legitimate_card_ids = rng.randint(1000, 9000, size=n_legit)
    fraud_card_ids = rng.randint(1000, 9000, size=n_fraud)

    # Product codes
    product_codes = ['C', 'H', 'R', 'S', 'W']
    legitimate_products = rng.choice(product_codes, size=n_legit, p=[0.4, 0.1, 0.2, 0.25, 0.05])
    fraud_products = rng.choice(product_codes, size=n_fraud, p=[0.1, 0.4, 0.1, 0.1, 0.3])

    # Card types (same uniform distribution for both classes)
    card_types = ['visa', 'mastercard', 'amex', 'discover']
    legitimate_card_types = rng.choice(card_types, size=n_legit)
    fraud_card_types = rng.choice(card_types, size=n_fraud)

    # Card categories
    card_categories = ['debit', 'credit']
    legitimate_card_categories = rng.choice(card_categories, size=n_legit, p=[0.6, 0.4])
    fraud_card_categories = rng.choice(card_categories, size=n_fraud, p=[0.3, 0.7])

    # Email domains
    email_domains = ['gmail.com', 'yahoo.com', 'hotmail.com', 'aol.com', 'other']
    legitimate_emails = rng.choice(email_domains, size=n_legit, p=[0.5, 0.2, 0.2, 0.05, 0.05])
    fraud_emails = rng.choice(email_domains, size=n_fraud, p=[0.2, 0.1, 0.1, 0.1, 0.5])

    # Transaction timestamps (in seconds since some epoch)
    legitimate_timestamps = rng.randint(1000000, 2000000, size=n_legit)
    fraud_timestamps = rng.randint(1000000, 2000000, size=n_fraud)

    # V-features (numeric)
    v_cols = 10
    legitimate_v = rng.normal(0, 1, size=(n_legit, v_cols))
    fraud_v = rng.normal(0.5, 1.5, size=(n_fraud, v_cols))

    # C-features (numeric)
    c_cols = 5
    legitimate_c = rng.normal(0, 1, size=(n_legit, c_cols))
    fraud_c = rng.normal(0.5, 1.5, size=(n_fraud, c_cols))

    # D-features (numeric)
    d_cols = 5
    legitimate_d = rng.normal(0, 1, size=(n_legit, d_cols))
    fraud_d = rng.normal(0.5, 1.5, size=(n_fraud, d_cols))

    # M-features (categorical); one draw per column, transposed to (rows, columns)
    m_values = ['T', 'F', 'M']
    m_cols = 5
    legitimate_m = np.array([rng.choice(m_values, size=n_legit) for _ in range(m_cols)]).T
    fraud_m = np.array([rng.choice(m_values, size=n_fraud) for _ in range(m_cols)]).T

    # Stack legitimate rows on top of fraud rows for every column
    data = {
        'TransactionAmt': np.concatenate([legitimate_amounts, fraud_amounts]),
        'TransactionDT': np.concatenate([legitimate_timestamps, fraud_timestamps]),
        'card1': np.concatenate([legitimate_card_ids, fraud_card_ids]),
        'ProductCD': np.concatenate([legitimate_products, fraud_products]),
        'card4': np.concatenate([legitimate_card_types, fraud_card_types]),
        'card6': np.concatenate([legitimate_card_categories, fraud_card_categories]),
        'P_emaildomain': np.concatenate([legitimate_emails, fraud_emails]),
        'isFraud': np.concatenate([np.zeros(n_legit), np.ones(n_fraud)]),
    }

    # Add V, C, D, M features as 1-based numbered columns
    feature_groups = (
        ('V', legitimate_v, fraud_v, v_cols),
        ('C', legitimate_c, fraud_c, c_cols),
        ('D', legitimate_d, fraud_d, d_cols),
        ('M', legitimate_m, fraud_m, m_cols),
    )
    for prefix, legit_block, fraud_block, width in feature_groups:
        for i in range(width):
            data[f'{prefix}{i+1}'] = np.concatenate([legit_block[:, i], fraud_block[:, i]])

    df = pd.DataFrame(data)

    # Shuffle so the two classes are interleaved rather than stacked
    return df.sample(frac=1, random_state=seed).reset_index(drop=True)

# Build the demo dataset: 10,000 rows, 10% of them labelled as fraud
transaction_data = generate_synthetic_data(10000, 0.1)

# Preview the first five rows
transaction_data.head(5)

In [3]:
# Get basic information about the dataset
print(f"Dataset shape: {transaction_data.shape}")
# isFraud is stored as float (0.0 / 1.0); cast the count so it prints as a whole number
print(f"Number of fraud transactions: {int(transaction_data['isFraud'].sum())}")
print(f"Fraud rate: {transaction_data['isFraud'].mean() * 100:.2f}%")

# Check data types and missing values
transaction_data.info()

## 2. Data Preprocessing and Exploration

Now let's explore the data and create visualizations to understand the distribution of features.

In [4]:
def _plot_fraud_rate(frame, column, label):
    """Draw a bar chart of the mean of isFraud per value of `column`.

    `label` is the human-readable name used in the title and x-axis.
    Returns the aggregated frame that was plotted.
    """
    rates = frame.groupby(column)['isFraud'].mean().reset_index()
    plt.figure(figsize=(10, 6))
    sns.barplot(data=rates, x=column, y='isFraud')
    plt.title(f'Fraud Rate by {label}')
    plt.xlabel(label)
    plt.ylabel('Fraud Rate')
    plt.show()
    return rates


# Transaction amounts per class, stacked, on a logarithmic x-axis
plt.figure(figsize=(12, 6))
sns.histplot(
    data=transaction_data,
    x='TransactionAmt',
    hue='isFraud',
    multiple='stack',
    bins=30,
    log_scale=True,
)
plt.title('Transaction Amount Distribution by Fraud Status')
plt.xlabel('Transaction Amount (log scale)')
plt.ylabel('Count')
plt.show()

# Share of fraudulent transactions for each product code
product_fraud = _plot_fraud_rate(transaction_data, 'ProductCD', 'Product Code')

# Share of fraudulent transactions for each card network
card_fraud = _plot_fraud_rate(transaction_data, 'card4', 'Card Type')

## 3. Feature Engineering

Let's create the `AdvancedMLPipeline` class, which bundles feature engineering with imputation, categorical encoding, scaling, PCA, model training, and evaluation.

In [5]:
class AdvancedMLPipeline:
    """Preprocessing + Random Forest pipeline for the fraud detection model.

    `fit_model_on_chunk` learns, in order: mean imputation of numeric columns,
    most-frequent imputation of categorical columns, one-hot encoding,
    standard scaling, PCA over the columns whose name starts with 'V', and
    finally the classifier. `transform_for_predict` replays the fitted steps
    on new rows and returns a frame with the training feature columns.

    Input frames are never modified; every method works on a copy.
    """

    # Columns always treated as categorical: they are cast to str before
    # column types are detected during fitting.
    KNOWN_CATEGORICALS = ('ProductCD', 'card4', 'card6', 'P_emaildomain',
                          'M1', 'M2', 'M3', 'M4', 'M5')

    def __init__(self, model_type='rf', n_components=5, remove_outliers=True):
        """Create the unfitted components.

        Parameters
        ----------
        model_type : str
            Recorded on the instance. Only a Random Forest is implemented, so
            the value does not currently change which model is built.
        n_components : int
            Number of principal components kept from the 'V*' columns.
        remove_outliers : bool
            Whether fitting drops rows whose TransactionAmt is an IQR outlier.
        """
        self.model_type = model_type
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.scaler = StandardScaler()
        self.imputer_num = SimpleImputer(strategy='mean')
        self.imputer_cat = SimpleImputer(strategy='most_frequent')
        self.encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        self.pca = PCA(n_components=n_components)
        self.remove_outliers = remove_outliers
        self.cat_columns = []      # categorical input columns, in training order
        self.num_columns = []      # numeric input columns, in training order
        self.feature_columns = []  # final model feature columns, in training order

    def _add_features(self, df):
        """Return a copy of `df` with engineered features added.

        Adds 'hour' and 'day' (day of week) derived from TransactionDT, which
        is then dropped, plus 'C_sum', 'D_missing' and 'V_mean' aggregates
        over the columns starting with 'C', 'D' and 'V' when any exist.
        """
        df = df.copy()

        if 'TransactionDT' in df.columns:
            timestamps = pd.to_datetime(df['TransactionDT'], unit='s', errors='coerce')
            # Stored as float so unparseable timestamps become NaN and are
            # handled by the numeric imputer like any other missing value.
            df['hour'] = timestamps.dt.hour.astype('float64')
            df['day'] = timestamps.dt.dayofweek.astype('float64')
            df = df.drop(columns=['TransactionDT'])

        # Group-level aggregation features
        c_cols = [col for col in df.columns if col.startswith('C')]
        d_cols = [col for col in df.columns if col.startswith('D')]
        v_cols = [col for col in df.columns if col.startswith('V')]

        if c_cols:
            df['C_sum'] = df[c_cols].sum(axis=1)
        if d_cols:
            df['D_missing'] = df[d_cols].isnull().sum(axis=1)
        if v_cols:
            df['V_mean'] = df[v_cols].mean(axis=1)
        return df

    def _remove_outliers(self, df, col='TransactionAmt'):
        """Return the rows of `df` whose `col` lies within 1.5 * IQR of the quartiles.

        Prints how many rows were removed. If `col` is absent, `df` is
        returned unchanged.
        """
        if col not in df.columns:
            return df

        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr

        # .copy() makes the result an independent frame, so later column
        # assignments do not target a slice of the caller's data.
        kept = df[(df[col] >= lower) & (df[col] <= upper)].copy()
        print(f"Removed {df.shape[0] - kept.shape[0]} outliers from {col}")
        return kept

    def _build_features(self, df, fit):
        """Run impute -> encode -> scale -> PCA and return the feature frame.

        `df` must contain every column in self.num_columns and
        self.cat_columns. With `fit=True` each transformer is fitted on `df`
        first; with `fit=False` the already-fitted transformers are applied.
        """
        parts = []

        if self.num_columns:
            numeric = df[self.num_columns].astype('float64')
            imputed = self.imputer_num.fit_transform(numeric) if fit else self.imputer_num.transform(numeric)
            imputed = pd.DataFrame(imputed, columns=self.num_columns, index=df.index)
            scaled = self.scaler.fit_transform(imputed) if fit else self.scaler.transform(imputed)
            parts.append(pd.DataFrame(scaled, columns=self.num_columns, index=df.index))

        if self.cat_columns:
            categorical = df[self.cat_columns]
            filled = self.imputer_cat.fit_transform(categorical) if fit else self.imputer_cat.transform(categorical)
            filled = pd.DataFrame(filled, columns=self.cat_columns, index=df.index)
            encoded = self.encoder.fit_transform(filled) if fit else self.encoder.transform(filled)
            parts.append(pd.DataFrame(
                encoded,
                columns=self.encoder.get_feature_names_out(self.cat_columns),
                index=df.index,
            ))

        features = pd.concat(parts, axis=1) if parts else pd.DataFrame(index=df.index)

        # Replace the 'V*' columns with their principal components.
        # NOTE: the engineered 'V_mean' also matches the prefix and is part of
        # the PCA input; this is kept so the fitted PCA stays compatible with
        # pipelines that were saved earlier.
        v_cols = [col for col in features.columns if col.startswith('V')]
        if v_cols:
            reduced = self.pca.fit_transform(features[v_cols]) if fit else self.pca.transform(features[v_cols])
            pca_df = pd.DataFrame(
                reduced,
                columns=[f'V_PCA_{i}' for i in range(reduced.shape[1])],
                index=df.index,
            )
            features = pd.concat([features.drop(columns=v_cols), pca_df], axis=1)

        return features

    def fit_model_on_chunk(self, df, label_column='isFraud'):
        """Fit all preprocessing steps and the classifier on `df`.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw transactions including `label_column`. Not modified.
        label_column : str
            Name of the target column.

        Returns
        -------
        (X, y)
            The processed feature frame and label series the model was
            trained on (after optional outlier removal).
        """
        df = self._add_features(df)

        if self.remove_outliers:
            df = self._remove_outliers(df)

        # Convert known categoricals to string so they are never mistaken
        # for numeric columns
        for col in self.KNOWN_CATEGORICALS:
            if col in df.columns:
                df[col] = df[col].astype(str)

        # Anything non-numeric is categorical; the rest (minus the label) is numeric
        self.cat_columns = [
            col for col in df.columns
            if col != label_column and not pd.api.types.is_numeric_dtype(df[col])
        ]
        self.num_columns = [
            col for col in df.columns
            if col != label_column and col not in self.cat_columns
        ]

        X = self._build_features(df, fit=True)

        # Remember the exact column layout the model is trained on
        self.feature_columns = list(X.columns)

        y = df[label_column]
        self.model.fit(X, y)

        return X, y

    def transform_for_predict(self, df):
        """Turn raw transactions into the feature frame expected by the model.

        Input columns are aligned with those seen during fitting: missing
        numeric columns become NaN (and are mean-imputed), missing categorical
        columns are filled with the literal "missing" (which the encoder
        ignores as an unknown category), and columns not seen during fitting
        are discarded. `df` is not modified.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before `fit_model_on_chunk`.
        """
        if not self.feature_columns:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "AdvancedMLPipeline must be fitted with fit_model_on_chunk "
                "before calling transform_for_predict."
            )

        # Instances pickled before `num_columns` existed: recover the numeric
        # column list from the fitted imputer.
        if not getattr(self, 'num_columns', None):
            self.num_columns = list(getattr(self.imputer_num, 'feature_names_in_', []))

        df = self._add_features(df)

        # Align with the training schema before touching any fitted transformer;
        # they all require exactly the columns they were fitted on.
        absent_cats = [col for col in self.cat_columns if col not in df.columns]
        df = df.reindex(columns=self.num_columns + self.cat_columns)
        for col in absent_cats:
            df[col] = "missing"

        features = self._build_features(df, fit=False)

        # Safety net: guarantee the exact training column order
        return features.reindex(columns=self.feature_columns, fill_value=0)

    def evaluate(self, X, y_true):
        """Print metrics, plot the ROC curve and top feature importances.

        Parameters
        ----------
        X : pandas.DataFrame
            Processed features (output of `transform_for_predict`).
        y_true : array-like
            True labels for the rows of `X`.

        Returns
        -------
        dict
            Keys 'y_pred', 'y_proba', 'auc' and 'feature_importance' (the
            15 most important features as a DataFrame).
        """
        y_pred = self.model.predict(X)
        y_proba = self.model.predict_proba(X)[:, 1]
        auc = roc_auc_score(y_true, y_proba)  # computed once, reused below

        print(classification_report(y_true, y_pred))
        print("ROC AUC:", auc)
        print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

        # ROC curve
        fpr, tpr, _ = roc_curve(y_true, y_proba)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'ROC curve (AUC = {auc:.3f})')
        plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc='lower right')
        plt.show()

        # Feature importance
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': self.model.feature_importances_
        })
        feature_importance = feature_importance.sort_values('importance', ascending=False).head(15)

        plt.figure(figsize=(10, 6))
        sns.barplot(data=feature_importance, x='importance', y='feature')
        plt.title('Top 15 Feature Importances')
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.show()

        return {
            'y_pred': y_pred,
            'y_proba': y_proba,
            'auc': auc,
            'feature_importance': feature_importance
        }

## 4. Model Training and Evaluation

In [6]:
# Hold out 20% of the rows for testing, keeping the fraud ratio equal in both splits
X_train, X_test, y_train, y_test = train_test_split(
    transaction_data.drop(columns='isFraud'),
    transaction_data['isFraud'],
    test_size=0.2,
    stratify=transaction_data['isFraud'],
    random_state=42,
)

# The pipeline expects features and label together in one frame
train_df = X_train.assign(isFraud=y_train)

# Fit every preprocessing step and the classifier on the training portion
pipeline = AdvancedMLPipeline(model_type='rf', n_components=5, remove_outliers=True)
X_processed, y_processed = pipeline.fit_model_on_chunk(train_df, label_column='isFraud')

# Score the held-out rows with the already-fitted transforms
X_test_processed = pipeline.transform_for_predict(X_test)
evaluation_results = pipeline.evaluate(X_test_processed, y_test)

## 5. Model Saving for Use in the API

In [7]:
# Output location, relative to the notebook's working directory
MODEL_DIR = '../model'
os.makedirs(MODEL_DIR, exist_ok=True)

# Resolve both artefact paths up front
model_path = os.path.join(MODEL_DIR, 'model.pkl')
pipeline_path = os.path.join(MODEL_DIR, 'pipeline.pkl')

# Persist the bare classifier
joblib.dump(pipeline.model, model_path)
print(f"Model saved to {model_path}")

# Persist the whole pipeline object (transformers + classifier).
# NOTE(review): unpickling requires the AdvancedMLPipeline class to be
# importable under the same module path in the loading process — confirm
# how the API provides it.
joblib.dump(pipeline, pipeline_path)
print(f"Pipeline saved to {pipeline_path}")

## 6. Example Prediction

Let's demonstrate how to make a prediction for a single transaction using the trained pipeline (the same objects we just saved to disk).

In [8]:
# Example transaction data (only the fields a caller would typically know)
example_transaction = pd.DataFrame({
    'TransactionAmt': [100.0],
    'ProductCD': ['C'],
    'card1': [1234],
    'card4': ['visa'],
    'card6': ['debit'],
    'P_emaildomain': ['gmail.com'],
    'TransactionDT': [1500000],
})

# The fitted imputer/scaler expect every raw input column seen during training.
# Re-index onto that schema so the absent V/C/D/M fields become NaN and are
# imputed, instead of failing with a column mismatch.
training_schema = transaction_data.columns.drop('isFraud')
example_full = example_transaction.reindex(columns=training_schema)

# Transform the data
X_example = pipeline.transform_for_predict(example_full)

# Make prediction
prediction = pipeline.model.predict(X_example)[0]
probability = pipeline.model.predict_proba(X_example)[0, 1]

print(f"Prediction: {'Fraud' if prediction == 1 else 'Legitimate'}")
print(f"Fraud Probability: {probability:.4f}")

## 7. Conclusion

We've successfully trained a fraud detection model using a Random Forest classifier and created a preprocessing pipeline that handles:

- Feature engineering
- Missing value imputation
- Categorical encoding
- Numerical scaling
- Dimensionality reduction

The model and pipeline are saved as pickle files that can be loaded by our fraud detection API to make real-time predictions on new transactions.