In [5]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.nn import HGTConv, Linear
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_curve, average_precision_score
from flask import Flask, request, jsonify
import os

In [None]:
orders_df = pd.read_csv('orders.csv')
users_df = pd.read_csv('users.csv')
payments_df = pd.read_csv('payments.csv')

In [3]:
# ---------------------------
# Data Processor
# ---------------------------
class DataProcessor:
    def __init__(self, order_data_path, user_data_path, payment_data_path):
        self.order_data_path = order_data_path
        self.user_data_path = user_data_path
        self.payment_data_path = payment_data_path

        self.node_mappings = {}
        self.edge_indices = {}
        self.node_features = {}
        self.labels = {}

    def load_data(self):
        """Load raw data from CSV files."""
        self.orders_df = pd.read_csv(self.order_data_path)
        self.users_df = pd.read_csv(self.user_data_path)
        self.payments_df = pd.read_csv(self.payment_data_path)
        if 'is_fraud' not in self.orders_df.columns:
            raise ValueError("Order data must have 'is_fraud' column")

    def create_node_mappings(self):
        """Create mappings from original IDs to consecutive indices."""
        # Convert all IDs to strings for consistency
        unique_users = self.users_df['user_id'].astype(str).unique()
        self.node_mappings['user'] = {uid: idx for idx, uid in enumerate(unique_users)}

        unique_orders = self.orders_df['order_id'].astype(str).unique()
        self.node_mappings['order'] = {oid: idx for idx, oid in enumerate(unique_orders)}

        unique_payments = self.payments_df['payment_id'].astype(str).unique()
        self.node_mappings['payment'] = {pid: idx for idx, pid in enumerate(unique_payments)}

    def extract_node_features(self):
        """Extract and store features for each node type as tensors."""
        # For users: modify columns as needed
        user_features = self.users_df[['age', 'account_age_days', 'total_past_orders', 'avg_order_value']].values
        self.node_features['user'] = torch.tensor(user_features, dtype=torch.float)

        # For orders: modify columns as needed
        order_features = self.orders_df[['order_amount', 'num_items', 'time_of_day', 'day_of_week']].values
        self.node_features['order'] = torch.tensor(order_features, dtype=torch.float)

        # For payments: use one-hot encoding for payment_type
        payment_types = pd.get_dummies(self.payments_df['payment_type']).values
        self.node_features['payment'] = torch.tensor(payment_types, dtype=torch.float)

        # For orders, we also store fraud labels.
        self.labels['order'] = torch.tensor(self.orders_df['is_fraud'].values, dtype=torch.long)

    def create_edge_indices(self):
        """Create edge indices (relationships) between node types."""
        # User places Order edges
        user_order_df = self.orders_df[['user_id', 'order_id']].drop_duplicates()
        user_indices = [self.node_mappings['user'][str(uid)] for uid in user_order_df['user_id']]
        order_indices = [self.node_mappings['order'][str(oid)] for oid in user_order_df['order_id']]
        self.edge_indices[('user', 'places', 'order')] = torch.tensor([user_indices, order_indices])
        self.edge_indices[('order', 'placed_by', 'user')] = torch.tensor([order_indices, user_indices])

        # Order uses Payment edges
        order_payment_df = self.orders_df[['order_id', 'payment_id']].drop_duplicates()
        order_indices = [self.node_mappings['order'][str(oid)] for oid in order_payment_df['order_id']]
        payment_indices = [self.node_mappings['payment'][str(pid)] for pid in order_payment_df['payment_id']]
        self.edge_indices[('order', 'uses', 'payment')] = torch.tensor([order_indices, payment_indices])
        self.edge_indices[('payment', 'used_by', 'order')] = torch.tensor([payment_indices, order_indices])

    def create_heterograph(self):
        """Create and return a heterogeneous graph using HeteroData."""
        self.load_data()
        self.create_node_mappings()
        self.extract_node_features()
        self.create_edge_indices()

        data = HeteroData()
        # Set node features
        for node_type, features in self.node_features.items():
            data[node_type].x = features

        # Set edge indices
        for edge_type, edge_index in self.edge_indices.items():
            data[edge_type].edge_index = edge_index

        # Set fraud labels for orders
        data['order'].y = self.labels['order']

        return data

    def split_data(self, data, test_size=0.2, val_size=0.1):
        """Split the order nodes into train, validation, and test masks."""
        num_orders = data['order'].x.size(0)
        order_indices = np.arange(num_orders)
        # Split into train+val and test
        train_val_idx, test_idx = train_test_split(order_indices, test_size=test_size, 
                                                    stratify=data['order'].y.numpy())
        # Split train+val into train and validation
        val_size_adjusted = val_size / (1 - test_size)
        train_idx, val_idx = train_test_split(train_val_idx, test_size=val_size_adjusted,
                                              stratify=data['order'].y[train_val_idx].numpy())

        train_mask = torch.zeros(num_orders, dtype=torch.bool)
        val_mask = torch.zeros(num_orders, dtype=torch.bool)
        test_mask = torch.zeros(num_orders, dtype=torch.bool)

        train_mask[train_idx] = True
        val_mask[val_idx] = True
        test_mask[test_idx] = True

        data['order'].train_mask = train_mask
        data['order'].val_mask = val_mask
        data['order'].test_mask = test_mask

        return data

In [6]:
# ---------------------------
# HGNN Model Definition
# ---------------------------
class FraudDetectionHGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, metadata):
        super(FraudDetectionHGNN, self).__init__()
        # metadata is a tuple: (node_feature_dims, edge_types)

        self.embeddings = torch.nn.ModuleDict()
        for node_type, in_channels in metadata[0].items():
            self.embeddings[node_type] = Linear(in_channels, hidden_channels)

        # Two HGTConv layers
        self.conv1 = HGTConv(hidden_channels, hidden_channels, metadata, num_heads=4, group='sum')
        self.conv2 = HGTConv(hidden_channels, hidden_channels, metadata, num_heads=4, group='sum')

        # Output layer for the "order" node
        self.output = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        # Compute initial embeddings
        h_dict = {node_type: self.embeddings[node_type](x) for node_type, x in x_dict.items()}

        # First conv layer and activation
        h_dict = self.conv1(h_dict, edge_index_dict)
        h_dict = {ntype: F.leaky_relu(h) for ntype, h in h_dict.items()}

        # Second conv layer and activation
        h_dict = self.conv2(h_dict, edge_index_dict)
        h_dict = {ntype: F.leaky_relu(h) for ntype, h in h_dict.items()}

        # Use the "order" node embedding for fraud prediction
        out = self.output(h_dict['order'])
        return out

In [7]:
# ---------------------------
# Training Pipeline
# ---------------------------
class FraudDetectionTrainer:
    def __init__(self, model, data, device=None):
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device
        self.model = model.to(self.device)
        self.data = data.to(self.device)

        # Define the optimizer
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001, weight_decay=5e-4)

        # Addressing class imbalance: compute pos_weight for BCE loss
        order_labels = self.data['order'].y
        pos_weight = (order_labels == 0).sum() / (order_labels == 1).sum()
        self.criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    def train(self, epochs=100, patience=10):
        best_val_loss = float('inf')
        counter = 0

        for epoch in range(epochs):
            self.model.train()
            self.optimizer.zero_grad()
            out = self.model(
                {node_type: self.data[node_type].x for node_type in self.data.node_types},
                {edge_type: self.data[edge_type].edge_index for edge_type in self.data.edge_types}
            )
            train_mask = self.data['order'].train_mask
            loss = self.criterion(out[train_mask].squeeze(), self.data['order'].y[train_mask].float())
            loss.backward()
            self.optimizer.step()

            # Evaluate on validation set
            val_loss = self.evaluate(mode='val')
            print(f'Epoch: {epoch+1}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}')

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                counter = 0
                torch.save(self.model.state_dict(), 'best_fraud_model.pt')
            else:
                counter += 1
                if counter >= patience:
                    print(f'Early stopping at epoch {epoch+1}')
                    break

    def evaluate(self, mode='val'):
        self.model.eval()
        with torch.no_grad():
            out = self.model(
                {node_type: self.data[node_type].x for node_type in self.data.node_types},
                {edge_type: self.data[edge_type].edge_index for edge_type in self.data.edge_types}
            )
            mask = self.data['order'].val_mask if mode == 'val' else self.data['order'].test_mask
            loss = self.criterion(out[mask].squeeze(), self.data['order'].y[mask].float())
            if mode == 'test':
                preds = torch.sigmoid(out[mask].squeeze()).cpu().numpy()
                labels = self.data['order'].y[mask].cpu().numpy()
                auc = roc_auc_score(labels, preds)
                ap = average_precision_score(labels, preds)
                precision, recall, thresholds = precision_recall_curve(labels, preds)
                f1_scores = 2 * recall * precision / (recall + precision + 1e-6)
                optimal_idx = np.argmax(f1_scores)
                optimal_threshold = thresholds[optimal_idx] if thresholds.size > 0 else 0.5
                print(f'Test Loss: {loss.item():.4f}, AUC: {auc:.4f}, AP: {ap:.4f}')
                print(f'Optimal threshold: {optimal_threshold:.4f}')
            return loss.item()

    def test(self):
        self.model.load_state_dict(torch.load('best_fraud_model.pt'))
        return self.evaluate(mode='test')

In [9]:
# ---------------------------
# Inference API
# ---------------------------
class FraudDetectionAPI:
    def __init__(self, model_path, data_processor, hidden_channels=64, threshold=0.5):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.data_processor = data_processor

        # Create a sample heterogeneous graph to extract metadata
        sample_data = self.data_processor.create_heterograph()
        metadata = (
            {node_type: sample_data[node_type].x.size(1) for node_type in sample_data.node_types},
            sample_data.edge_types
        )
        self.model = FraudDetectionHGNN(hidden_channels=hidden_channels, out_channels=1, metadata=metadata).to(self.device)
        self.model.load_state_dict(torch.load(model_path, map_location=self.device))
        self.model.eval()
        self.threshold = threshold

    def process_new_order(self, order_data):
        """
        Convert new order JSON data to a mini-graph, perform inference,
        and return the fraud probability along with classification.
        """
        graph_data = self._convert_order_to_graph(order_data)
        with torch.no_grad():
            out = self.model(
                {node_type: graph_data[node_type].x.to(self.device) for node_type in graph_data.node_types},
                {edge_type: graph_data[edge_type].edge_index.to(self.device) for edge_type in graph_data.edge_types}
            )
            # Assume new order is the only order node and at index 0
            fraud_prob = torch.sigmoid(out[0]).item()
            is_fraud = fraud_prob >= self.threshold
        return {
            'fraud_probability': fraud_prob,
            'is_fraud': is_fraud,
            'order_id': order_data['order_id']
        }

    def _convert_order_to_graph(self, order_data):
        """
        Convert a single new order into a minimal HeteroData graph.
        This simplified implementation creates one node for each type.
        """
        data = HeteroData()
        # Create user node feature vector – adjust features as needed
        data['user'].x = torch.tensor([[
            order_data['user_age'],
            order_data['account_age_days'],
            order_data['user_total_orders'],
            order_data['user_avg_order_value']
        ]], dtype=torch.float)
        # Create order node feature vector
        data['order'].x = torch.tensor([[
            order_data['order_amount'],
            order_data['num_items'],
            order_data['time_of_day'],
            order_data['day_of_week']
        ]], dtype=torch.float)
        # Create payment node feature vector using one-hot encoding (assumes payment_type_idx is provided)
        # Here we assume a fixed dimensionality (e.g., 4) for payment one-hot vector; adjust as needed.
        payment_dim = 4
        payment_feature = torch.zeros((1, payment_dim))
        payment_idx = int(order_data['payment_type_idx'])
        if payment_idx < payment_dim:
            payment_feature[0, payment_idx] = 1
        data['payment'].x = payment_feature

        # Create simple edge indices for the mini-graph (only one edge per relation)
        data[('user', 'places', 'order')].edge_index = torch.tensor([[0], [0]])
        data[('order', 'placed_by', 'user')].edge_index = torch.tensor([[0], [0]])
        data[('order', 'uses', 'payment')].edge_index = torch.tensor([[0], [0]])
        data[('payment', 'used_by', 'order')].edge_index = torch.tensor([[0], [0]])
        return data

In [10]:
# ---------------------------
# Flask API Setup
# ---------------------------
def create_fraud_detection_app(model_path, data_processor):
    app = Flask(__name__)
    fraud_api = FraudDetectionAPI(model_path, data_processor)

    @app.route('/predict', methods=['POST'])
    def predict_fraud():
        try:
            order_data = request.get_json()
            required_fields = [
                'order_id', 'user_id', 'payment_id', 'order_amount', 'num_items',
                'time_of_day', 'day_of_week', 'user_age', 'account_age_days',
                'user_total_orders', 'user_avg_order_value', 'payment_type_idx'
            ]
            for field in required_fields:
                if field not in order_data:
                    return jsonify({'error': f'Missing required field: {field}'}), 400

            result = fraud_api.process_new_order(order_data)
            return jsonify(result)
        except Exception as e:
            return jsonify({'error': str(e)}), 500

    return app

In [13]:
# ---------------------------
# Main Application
# ---------------------------
def main():
    # File paths: update these paths based on your environment.
    order_data_path = os.path.join('data', 'orders.csv')
    user_data_path = os.path.join('data', 'users.csv')
    payment_data_path = os.path.join('data', 'payments.csv')

    # Initialize DataProcessor
    data_processor = DataProcessor(order_data_path, user_data_path, payment_data_path)
    data = data_processor.create_heterograph()
    data = data_processor.split_data(data)

    # Define metadata for HGNN: node feature dimensions and edge types.
    metadata = (
        {node_type: data[node_type].x.size(1) for node_type in data.node_types},
        data.edge_types
    )

    # Initialize and train the HGNN model
    hidden_channels = 64
    model = FraudDetectionHGNN(hidden_channels=hidden_channels, out_channels=1, metadata=metadata)
    trainer = FraudDetectionTrainer(model, data)
    trainer.train(epochs=100, patience=10)
    trainer.test()  # Evaluate on the test set

    # Start the Flask API for real-time fraud detection
    app = create_fraud_detection_app('best_fraud_model.pt', data_processor)
    app.run(host='0.0.0.0', port=5000)

if __name__ == '__main__':
    main()

FileNotFoundError: [Errno 2] No such file or directory: 'data\\orders.csv'

In [14]:
print("Haziq siddiqui")

Haziq siddiqui
