In [None]:
# Mount Google Drive at /content/drive; DRIVE_PATH used later in this notebook
# points below that mount point, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Pin qdrant-client to 1.7.0 (the version the training script below documents as its compatibility target)
!pip install --upgrade qdrant-client==1.7.0

Collecting qdrant-client==1.7.0
  Downloading qdrant_client-1.7.0-py3-none-any.whl.metadata (9.3 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant-client==1.7.0)
  Downloading grpcio_tools-1.76.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.3 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client==1.7.0)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting urllib3<2.0.0,>=1.26.14 (from qdrant-client==1.7.0)
  Downloading urllib3-1.26.20-py2.py3-none-any.whl.metadata (50 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m50.1/50.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<7.0.0,>=6.31.1 (from grpcio-tools>=1.41.0->qdrant-client==1.7.0)
  Downloading protobuf-6.33.5-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Downloading qdrant_client-1.7.0-py3-none-any.whl (203 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [None]:
"""
FRAUD DETECTION: XGBoost + Qdrant Hybrid Model with ONNX Export

Installation:
    pip install qdrant-client==1.7.0 xgboost pandas numpy scikit-learn onnx onnxruntime skl2onnx

    # Note: Use specific Qdrant version for compatibility

Qdrant Setup:
    # Option 1: Docker (recommended for production)
    docker run -p 6333:6333 -p 6334:6334 -v $(pwd)/qdrant_storage:/qdrant/storage qdrant/qdrant

    # Option 2: In-memory (for development/testing)
    # No setup needed - code automatically uses this if no server

Architecture:
    1. Extract 5 behavioral features from transaction
    2. Create rich 12D embedding (features + account behavior + network stats)
    3. XGBoost predicts base risk score
    4. Qdrant finds similar historical fraud patterns
    5. Combine scores: 70% XGBoost + 30% Similarity = Final Score
    6. Export XGBoost model to ONNX format for AI agent deployment
"""

import json
import pandas as pd
import numpy as np
import pickle
import os
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score,
                            roc_auc_score, roc_curve, precision_recall_curve, auc)
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue, SearchRequest
import warnings
warnings.filterwarnings('ignore')

# ONNX export imports
# These packages are optional: when any of them cannot be imported,
# ONNX_AVAILABLE is set to False and export_xgboost_to_onnx() reports the
# problem and returns False instead of exporting.
try:
    import onnx
    import onnxruntime as ort
    from skl2onnx import convert_sklearn
    from skl2onnx.common.data_types import FloatTensorType
    ONNX_AVAILABLE = True
except ImportError:
    ONNX_AVAILABLE = False
    print("‚ö†Ô∏è  ONNX libraries not available. Install with: pip install onnx onnxruntime skl2onnx")

# ============================================================================
# CONFIGURATION
# ============================================================================
# Severity score per transaction `type`. train_hybrid_model() maps the `type`
# column through this table to build the regression target; any type that is
# not listed here becomes 0, and a score greater than 0 is treated as fraud.
FRAUD_SEVERITY = {
    'ACCOUNT_TAKEOVER': 95,
    'RING_ACTIVITY': 90,
    'LAYERING': 85,
    'STRUCTURING': 85,
    'MULE_OUT': 80,
    'MULE_IN': 75,
    'SCATTER': 70,
    'GATHER': 70,
    'TRANSFER': 0,
}

# Hybrid model weights
# final score = XGBOOST_WEIGHT * xgboost score + QDRANT_WEIGHT * similarity score
XGBOOST_WEIGHT = 0.70  # 70% weight to XGBoost
QDRANT_WEIGHT = 0.30   # 30% weight to Qdrant similarity

# Drive path for saving models (modify this to your mounted drive path)
# Also used by the __main__ block as the directory that holds fraud_data.json.
DRIVE_PATH = "/content/drive/MyDrive/financialFraudDetectionUsingAIAgent"

# ============================================================================
# QDRANT VECTOR STORE WITH RICH EMBEDDINGS
# ============================================================================
class FraudVectorStore:
    """
    Qdrant vector store with RICH embeddings for fraud detection

    Embedding Structure (12 dimensions):
        [0-4]  : 5 core features (geo_distance, impossible_travel, etc.)
        [5-7]  : Account behavior (age, transaction frequency, amount pattern)
        [8-10] : Network features (unique recipients, device diversity, IP diversity)
        [11]   : Time pattern (hour normalized)

    Scaling contract:
        ``create_rich_embeddings`` returns embeddings that are already
        standardised with ``self.scaler``. The scaler is fitted on the first
        call (normally the training data) and only *applied* afterwards, so
        stored vectors and later query vectors live in the same space.
        ``get_similarity_score_batch`` therefore expects pre-scaled vectors by
        default and must not scale them a second time.
    """

    def __init__(self, collection_name="fraud_embeddings", use_memory=True):
        """
        Initialize Qdrant client

        Args:
            collection_name: Name of the Qdrant collection to use
            use_memory: When False, try a server at localhost:6333 first and
                fall back to the in-memory client if it cannot be reached
        """
        self.collection_name = collection_name
        self.scaler = StandardScaler()

        # Try server, fallback to memory
        if not use_memory:
            try:
                test_client = QdrantClient(host="localhost", port=6333, timeout=2)
                test_client.get_collections()
                self.client = test_client
                print("‚úì Connected to Qdrant server at localhost:6333")
                return
            except Exception:
                # Any connection problem is non-fatal: fall through to memory mode.
                print("‚ö†Ô∏è  Qdrant server not found, using in-memory mode")

        self.client = QdrantClient(":memory:")
        print("‚úì Using Qdrant in-memory mode")

    def _scaler_is_fitted(self):
        """Return True once self.scaler has been fitted (it then exposes n_samples_seen_)."""
        return hasattr(self.scaler, "n_samples_seen_")

    def create_rich_embeddings(self, df, features_df, fit=None):
        """
        Create 12D embeddings combining features + behavioral signals

        Args:
            df: Original dataframe with all transaction data (needs the
                columns timestamp, from_account, to_account, device_id and
                amount; ip_address is optional)
            features_df: DataFrame with 5 core features, row-aligned with df
            fit: Controls the scaler. None (default) fits it only if it has
                not been fitted yet and otherwise re-uses the existing
                statistics; True forces a refit; False only transforms.

        Returns:
            Standardised 12D embeddings (numpy array)

        Raises:
            ValueError: if df and features_df have a different number of rows
        """
        if len(df) != len(features_df):
            raise ValueError(
                f"df has {len(df)} rows but features_df has {len(features_df)} rows"
            )

        # Core 5 features (dimensions 0-4)
        core_features = np.asarray(features_df)

        # Account behavior (dimensions 5-7)
        first_seen = df.groupby('from_account')['timestamp'].transform('min')
        account_age_days = (df['timestamp'] - first_seen).dt.total_seconds() / 86400
        account_age_normalized = np.clip(account_age_days / 365, 0, 1)  # Normalize to 0-1

        account_tx_frequency = df.groupby('from_account').cumcount() + 1
        tx_freq_normalized = np.clip(account_tx_frequency / 100, 0, 1)  # Normalize

        amount_normalized = np.clip(df['amount'] / 10000, 0, 1)  # Normalize amounts

        # Network features (dimensions 8-10)
        unique_recipients = df.groupby('from_account')['to_account'].transform('nunique')
        recipients_normalized = np.clip(unique_recipients / 50, 0, 1)

        device_diversity = df.groupby('from_account')['device_id'].transform('nunique')
        device_normalized = np.clip(device_diversity / 10, 0, 1)

        if 'ip_address' in df.columns:
            ip_diversity = df.groupby('from_account')['ip_address'].transform('nunique')
            ip_normalized = np.clip(ip_diversity / 10, 0, 1)
        else:
            ip_normalized = np.zeros(len(df))

        # Time pattern (dimension 11)
        hour_normalized = df['timestamp'].dt.hour / 24

        behavioral_features = np.column_stack([
            account_age_normalized,
            tx_freq_normalized,
            amount_normalized,
            recipients_normalized,
            device_normalized,
            ip_normalized,
            hour_normalized
        ])

        # Concatenate: [5 core features] + [7 behavioral features] = 12D
        full_embeddings = np.hstack([core_features, behavioral_features])

        # Fit the scaler once and re-use it afterwards. Refitting on every call
        # would scale query/test data with different statistics than the
        # vectors already stored in Qdrant (and leak test-set statistics into
        # the scaler that gets saved).
        if fit is None:
            fit = not self._scaler_is_fitted()
        if fit:
            normalized_embeddings = self.scaler.fit_transform(full_embeddings)
        else:
            normalized_embeddings = self.scaler.transform(full_embeddings)

        print(f"‚úì Created {normalized_embeddings.shape[0]:,} embeddings of {normalized_embeddings.shape[1]}D")

        return normalized_embeddings

    def create_collection(self, vector_size=12):
        """Create the Qdrant collection, dropping any existing one of the same name"""
        try:
            self.client.delete_collection(collection_name=self.collection_name)
        except Exception:
            # Best effort: deleting is allowed to fail; the create call below
            # will surface any real problem.
            pass

        self.client.create_collection(
            collection_name=self.collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
        )
        print(f"‚úì Created collection: {self.collection_name} ({vector_size}D vectors)")

    def add_transactions(self, embeddings, df, fraud_types, risk_scores, transaction_ids):
        """
        Add transactions with rich metadata

        Args:
            embeddings: Array of vectors, one row per transaction
            df: DataFrame with the transaction columns and engineered features
            fraud_types: Series of fraud type labels
            risk_scores: Series of risk scores (a score > 0 marks fraud)
            transaction_ids: Sequence of ids stored in the payload

        All arguments are read by position, so they must be row-aligned.
        The point id in Qdrant is the row position within this call.
        """

        print(f"Uploading {len(embeddings):,} vectors to Qdrant...")

        batch_size = 1000
        # Materialise once so lookups are positional for any sequence type
        # (a pandas Series would otherwise be indexed by label).
        id_values = list(transaction_ids)

        for batch_idx in range(0, len(embeddings), batch_size):
            batch_end = min(batch_idx + batch_size, len(embeddings))

            points = []
            for idx in range(batch_idx, batch_end):
                # Build the row once instead of once per payload field.
                row = df.iloc[idx]
                risk = risk_scores.iloc[idx]

                # Rich payload for AI agent queries
                payload = {
                    "transaction_id": str(id_values[idx]),
                    "fraud_type": str(fraud_types.iloc[idx]),
                    "risk_score": float(risk),
                    "is_fraud": int(risk > 0),

                    # Transaction details
                    "from_account": str(row['from_account']),
                    "to_account": str(row['to_account']),
                    "amount": float(row['amount']),
                    "timestamp": str(row['timestamp']),

                    # Location
                    "country": str(row.get('location.country', 'Unknown')),
                    "city": str(row.get('location.city', 'Unknown')),

                    # Device/IP
                    "device_id": str(row.get('device_id', 'Unknown')),
                    "ip_address": str(row.get('ip_address', 'Unknown')),

                    # Features (for analysis)
                    "geo_distance": float(row['geo_distance_km']),
                    "impossible_travel": int(row['impossible_travel']),
                    "location_changes": int(row['location_changes']),
                    "is_high_amount": int(row['is_high_amount']),
                    "is_round_amount": int(row['is_round_amount']),
                }

                points.append(PointStruct(
                    id=idx,
                    vector=embeddings[idx].tolist(),
                    payload=payload
                ))

            self.client.upsert(collection_name=self.collection_name, points=points)

            if (batch_idx // batch_size + 1) % 10 == 0 or batch_end == len(embeddings):
                print(f"  Progress: {batch_end:,}/{len(embeddings):,} ({batch_end/len(embeddings)*100:.1f}%)")

        print(f"‚úì Uploaded complete!")

    def _search_similar(self, query_vector, top_k, search_filter):
        """Run one nearest-neighbour query, using whichever search API the client offers."""
        if hasattr(self.client, "search"):
            return self.client.search(
                collection_name=self.collection_name,
                query_vector=query_vector,
                limit=top_k,
                query_filter=search_filter
            )

        # Clients without `search` expose `query_points` instead.
        search_result = self.client.query_points(
            collection_name=self.collection_name,
            query=query_vector,
            limit=top_k,
            query_filter=search_filter,
            with_payload=True
        )
        return search_result.points if hasattr(search_result, 'points') else []

    @staticmethod
    def _weighted_risk(results):
        """Similarity-weighted mean of the neighbours' risk_score payloads."""
        # Cosine similarity can be negative; a negative weight would push the
        # average outside the range of the stored risk scores, so dissimilar
        # neighbours simply contribute nothing.
        weights = [max(float(r.score), 0.0) for r in results]
        risks = [float(r.payload['risk_score']) for r in results]
        total_weight = sum(weights)
        if total_weight <= 0:
            return 0.0
        return sum(w * r for w, r in zip(weights, risks)) / total_weight

    def get_similarity_score_batch(self, query_embeddings, top_k=20, fraud_only=True,
                                   pre_scaled=True):
        """
        Get similarity scores for multiple transactions

        Args:
            query_embeddings: numpy array of shape (n_samples, embedding_dim)
            top_k: number of similar transactions to find
            fraud_only: only search fraud transactions
            pre_scaled: True (default) when the embeddings come from
                create_rich_embeddings and are therefore already standardised;
                pass False for raw 12D vectors to have self.scaler applied here

        Returns:
            similarity_scores: array with one similarity-weighted risk score per
                query (0 when no neighbour is found)
        """
        query_array = np.asarray(query_embeddings, dtype=float)
        if not pre_scaled:
            query_array = self.scaler.transform(query_array)
        query_vectors = query_array.tolist()

        # Build filter
        if fraud_only:
            search_filter = Filter(
                must=[FieldCondition(key="is_fraud", match=MatchValue(value=1))]
            )
        else:
            search_filter = None

        similarity_scores = []

        # Queries are issued one at a time; the chunking only drives the
        # progress output.
        batch_size = 100
        for i in range(0, len(query_vectors), batch_size):
            for query_vector in query_vectors[i:i + batch_size]:
                results = self._search_similar(query_vector, top_k, search_filter)
                similarity_scores.append(self._weighted_risk(results) if results else 0)

            # Progress indicator
            if (i + batch_size) % 1000 == 0:
                print(f"  Batch similarity progress: {i + batch_size}/{len(query_vectors)}")

        return np.array(similarity_scores)

    def save_scaler(self, filepath):
        """Save the StandardScaler for later use"""
        target_dir = os.path.dirname(filepath)
        if target_dir:
            # os.makedirs('') raises, so skip it for a bare filename.
            os.makedirs(target_dir, exist_ok=True)
        with open(filepath, 'wb') as f:
            pickle.dump(self.scaler, f)
        print(f"‚úì Saved scaler to: {filepath}")

# ============================================================================
# HYBRID MODEL: XGBoost + Qdrant (OPTIMIZED)
# ============================================================================
class HybridFraudDetector:
    """
    Blend of an XGBoost risk score and a Qdrant nearest-neighbour risk score.

    final = xgb_weight * XGBoost + qdrant_weight * Qdrant similarity
    (0.7 / 0.3 unless other weights are supplied)
    """

    def __init__(self, xgboost_model, qdrant_store, xgb_weight=0.7, qdrant_weight=0.3):
        # Collaborators: a fitted regressor and the vector store to query.
        self.xgboost, self.qdrant = xgboost_model, qdrant_store
        # Linear blending coefficients.
        self.xgb_weight, self.qdrant_weight = xgb_weight, qdrant_weight

    def predict_batch(self, features_5d, embeddings_12d):
        """
        Score a batch of transactions.

        Args:
            features_5d: numpy array (n_samples, 5) fed to the XGBoost model
            embeddings_12d: numpy array (n_samples, 12) sent to the vector store

        Returns:
            Tuple of three arrays, in this order:
            combined scores, XGBoost scores (clipped to 0..100), similarity scores
        """
        # Tree model output, limited to the 0..100 score range.
        base_risk = np.clip(self.xgboost.predict(features_5d), 0, 100)

        # Risk inferred from the 20 closest known-fraud neighbours.
        neighbour_risk = self.qdrant.get_similarity_score_batch(
            embeddings_12d, top_k=20, fraud_only=True
        )

        blended = self.xgb_weight * base_risk + self.qdrant_weight * neighbour_risk
        return blended, base_risk, neighbour_risk

# ============================================================================
# FEATURE ENGINEERING (Same as before)
# ============================================================================
def create_5_features(df, high_amount_threshold=None):
    """
    Create 5 core features

    The input is copied, its `timestamp` column is parsed to datetimes and the
    rows are ordered by (from_account, timestamp) with a fresh 0..n-1 index.
    The five features added are geo_distance_km, impossible_travel,
    location_changes, is_high_amount and is_round_amount (helper columns such
    as prev_lat or travel_speed_kmh are left in the frame as well).

    Args:
        df: DataFrame with at least from_account, timestamp and amount.
            location.latitude / location.longitude are optional; without them
            geo_distance_km is 0 for every row. A `type` column, if present,
            is removed from the result and returned separately.
        high_amount_threshold: Amount at or above which is_high_amount is 1.
            Defaults to the 95th percentile of `amount` in df; pass the value
            used at training time to score new data consistently.

    Returns:
        (df, fraud_types): the engineered frame and the `type` labels as a
        Series in the SAME row order and index as the frame, or None when the
        input has no `type` column.
    """
    df = df.copy()

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values(['from_account', 'timestamp']).reset_index(drop=True)

    # Split the labels off only AFTER sorting. Taking them before the sort
    # leaves them in input order while the features are in sorted order, which
    # silently pairs every row with another row's label.
    fraud_types = df.pop('type') if 'type' in df.columns else None

    # 1. geo_distance_km: haversine distance to the account's previous transaction
    if all(col in df.columns for col in ['location.latitude', 'location.longitude']):
        df['prev_lat'] = df.groupby('from_account')['location.latitude'].shift(1)
        df['prev_lon'] = df.groupby('from_account')['location.longitude'].shift(1)
        # An account's first transaction has no predecessor: distance 0.
        df['prev_lat'] = df['prev_lat'].fillna(df['location.latitude'])
        df['prev_lon'] = df['prev_lon'].fillna(df['location.longitude'])

        lat1 = np.radians(df['prev_lat'])
        lon1 = np.radians(df['prev_lon'])
        lat2 = np.radians(df['location.latitude'])
        lon2 = np.radians(df['location.longitude'])

        dlat = lat2 - lat1
        dlon = lon2 - lon1

        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        # clip guards arcsin against rounding slightly above 1
        c = 2 * np.arcsin(np.sqrt(np.clip(a, 0, 1)))
        df['geo_distance_km'] = 6371 * c
    else:
        df['geo_distance_km'] = 0

    # 2. impossible_travel: implied speed since the previous transaction > 800 km/h
    df['prev_timestamp'] = df.groupby('from_account')['timestamp'].shift(1)
    df['time_diff_hours'] = (df['timestamp'] - df['prev_timestamp']).dt.total_seconds() / 3600
    # First transaction -> 24h; identical timestamps -> tiny value to avoid /0.
    df['time_diff_hours'] = df['time_diff_hours'].fillna(24).replace(0, 0.001)
    df['travel_speed_kmh'] = df['geo_distance_km'] / df['time_diff_hours']
    df['impossible_travel'] = (df['travel_speed_kmh'] > 800).astype(int)

    # 3. location_changes: running count of moves (> 1 km) per account
    df['location_moved'] = (df['geo_distance_km'] > 1).astype(int)
    df['location_changes'] = df.groupby('from_account')['location_moved'].cumsum()

    # 4. is_high_amount
    if high_amount_threshold is None:
        high_amount_threshold = df['amount'].quantile(0.95)
    df['is_high_amount'] = (df['amount'] >= high_amount_threshold).astype(int)

    # 5. is_round_amount: exact multiple of 100
    df['is_round_amount'] = (df['amount'] % 100 == 0).astype(int)

    return df, fraud_types

# ============================================================================
# ONNX EXPORT FUNCTIONS
# ============================================================================
def export_xgboost_to_onnx(xgb_model, output_path, feature_names, n_features=5):
    """
    Export XGBoost model to ONNX format

    The model is converted, written to output_path, re-loaded and checked with
    the ONNX checker, and finally run once through onnxruntime on a random
    input as a smoke test.

    Args:
        xgb_model: Trained XGBoost model
        output_path: Path to save ONNX model (parent directories are created)
        feature_names: List of feature names (only used in the printed summary)
        n_features: Number of input features

    Returns:
        True when export and verification succeeded; False when the ONNX
        libraries are missing or any step raised (the error is printed).
    """
    if not ONNX_AVAILABLE:
        print("‚ùå ONNX libraries not available. Cannot export model.")
        return False

    try:
        print(f"\n{'='*80}")
        print("EXPORTING MODEL TO ONNX FORMAT")
        print(f"{'='*80}")

        # Ensure output directory exists. A bare filename has an empty
        # dirname, and os.makedirs('') raises, so only create real directories.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Define the input type for ONNX
        initial_type = [('float_input', FloatTensorType([None, n_features]))]

        # Convert to ONNX
        # NOTE(review): skl2onnx converts scikit-learn estimators; an XGBoost
        # model presumably needs its converter registered first (the skl2onnx
        # docs do this via onnxmltools). If this step fails with a
        # "no converter" style error, that is the likely cause - confirm.
        print(f"\n[1/3] Converting XGBoost model to ONNX...")
        onnx_model = convert_sklearn(
            xgb_model,
            initial_types=initial_type,
            target_opset=12,
            options={id(xgb_model): {'zipmap': False}}
        )

        # Save ONNX model
        print(f"[2/3] Saving ONNX model to: {output_path}")
        with open(output_path, "wb") as f:
            f.write(onnx_model.SerializeToString())

        # Verify the model
        print(f"[3/3] Verifying ONNX model...")
        onnx_model_check = onnx.load(output_path)
        onnx.checker.check_model(onnx_model_check)

        # Test inference
        print(f"\n[TEST] Testing ONNX inference...")
        ort_session = ort.InferenceSession(output_path)

        # Smoke test only: the run must not raise; the values are not checked.
        dummy_input = np.random.rand(1, n_features).astype(np.float32)
        ort_inputs = {ort_session.get_inputs()[0].name: dummy_input}
        ort_session.run(None, ort_inputs)

        print(f"‚úì ONNX model successfully exported and verified!")
        print(f"\nModel Details:")
        print(f"  - Input shape: (batch_size, {n_features})")
        print(f"  - Output shape: (batch_size, 1)")
        print(f"  - Features: {feature_names}")
        print(f"  - File size: {os.path.getsize(output_path) / 1024:.2f} KB")

        return True

    except Exception as e:
        # Export is best-effort: report the failure and let the caller go on.
        print(f"‚ùå Error exporting to ONNX: {str(e)}")
        return False

def save_model_metadata(metadata_path, model_info):
    """
    Save model metadata as JSON

    Args:
        metadata_path: Destination file; missing parent directories are created
        model_info: JSON-serialisable dictionary to write (indented by 2)
    """
    parent_dir = os.path.dirname(metadata_path)
    if parent_dir:
        # os.makedirs('') raises, so a bare filename must skip this step.
        os.makedirs(parent_dir, exist_ok=True)
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(model_info, f, indent=2)
    print(f"‚úì Saved model metadata to: {metadata_path}")

# ============================================================================
# TRAINING (WITH ONNX EXPORT)
# ============================================================================
def train_hybrid_model(json_file_path, save_path=DRIVE_PATH):
    """
    Train hybrid XGBoost + Qdrant model and export to ONNX

    Steps: load the JSON transactions, engineer the 5 core features, build the
    severity target from the `type` column, split 70/15/15, fit an
    XGBRegressor, index the training rows in an in-memory Qdrant collection,
    score the test set with the hybrid model, then save the ONNX model, the
    native XGBoost model, the embedding scaler and a metadata JSON under
    save_path.

    Args:
        json_file_path: Path to a JSON file holding the transaction records
        save_path: Directory that receives all saved artifacts

    Returns:
        (hybrid_model, qdrant, X_test, y_test, y_test_hybrid, auc_xgb,
         auc_hybrid, metadata)

    Raises:
        ValueError: if the data has no `type` column to derive the target from
    """

    print("="*80)
    print("HYBRID FRAUD DETECTION: XGBoost + Qdrant Vector Similarity")
    print("="*80)

    # Load data
    print("\n[1/7] Loading data...")
    with open(json_file_path, 'r') as f:
        data = json.load(f)
    df_original = pd.json_normalize(data)
    print(f"Loaded {len(df_original):,} transactions")

    if 'type' not in df_original.columns:
        raise ValueError("Training data has no 'type' column; cannot build the fraud target")

    # create_5_features orders rows by (from_account, timestamp). Putting the
    # data into that order up front guarantees the label Series it returns is
    # in the same row order as the engineered feature frame.
    df_original['timestamp'] = pd.to_datetime(df_original['timestamp'])
    df_original = df_original.sort_values(['from_account', 'timestamp']).reset_index(drop=True)

    # Engineer features
    print("\n[2/7] Engineering 5 core features...")
    df, fraud_types = create_5_features(df_original.copy())

    # Create target
    print("\n[3/7] Creating target...")
    y = fraud_types.map(FRAUD_SEVERITY).fillna(0)
    print(f"Fraud: {(y > 0).sum():,} ({(y > 0).mean()*100:.1f}%)")

    # Select 5D features for XGBoost
    feature_names = ['geo_distance_km', 'impossible_travel', 'location_changes',
                     'is_high_amount', 'is_round_amount']
    X_5d = df[feature_names].copy()
    X_5d = X_5d.replace([np.inf, -np.inf], np.nan).fillna(0)

    print(f"\nFeature matrix: {X_5d.shape}")

    # Split data
    print("\n[4/7] Splitting data...")
    X_train, X_temp, y_train, y_temp = train_test_split(
        X_5d, y, test_size=0.3, random_state=42, stratify=(y > 0)
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=(y_temp > 0)
    )

    # These are index LABELS of the split frames, so rows are selected with
    # .loc below rather than by position.
    train_indices = X_train.index
    val_indices = X_val.index
    test_indices = X_test.index

    print(f"Train: {len(X_train):,} | Val: {len(X_val):,} | Test: {len(X_test):,}")

    # Train XGBoost
    print("\n[5/7] Training XGBoost...")
    xgb_model = XGBRegressor(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        random_state=42,
        n_jobs=-1
    )

    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=50)

    print("\n[6/7] Creating Qdrant vector store...")
    qdrant = FraudVectorStore(use_memory=True)
    qdrant.create_collection(vector_size=12)

    # Create 12D embeddings for TRAINING data
    print("\nCreating embeddings for training data...")
    train_rows = df.loc[train_indices]
    train_embeddings = qdrant.create_rich_embeddings(
        train_rows,
        X_train
    )

    # Add training data to Qdrant
    qdrant.add_transactions(
        embeddings=train_embeddings,
        df=train_rows,
        fraud_types=fraud_types.loc[train_indices],
        risk_scores=y_train,
        transaction_ids=train_indices
    )

    print("\n[7/7] Generating hybrid predictions on test set...")

    # Create 12D embeddings for TEST data
    test_embeddings = qdrant.create_rich_embeddings(
        df.loc[test_indices],
        X_test
    )

    # Create hybrid model
    hybrid_model = HybridFraudDetector(
        xgboost_model=xgb_model,
        qdrant_store=qdrant,
        xgb_weight=XGBOOST_WEIGHT,
        qdrant_weight=QDRANT_WEIGHT
    )

    # Generate hybrid predictions
    print("\nGenerating predictions...")
    y_test_hybrid, y_test_xgb, y_test_similarity = hybrid_model.predict_batch(
        X_test.values,
        test_embeddings
    )

    # ========================================================================
    # EVALUATION
    # ========================================================================

    print("\n" + "="*80)
    print("MODEL COMPARISON")
    print("="*80)

    # Any severity above 0 counts as fraud for the ranking metrics.
    y_test_binary = (y_test > 0).astype(int)

    # XGBoost only
    auc_xgb = roc_auc_score(y_test_binary, y_test_xgb)
    precision_xgb, recall_xgb, _ = precision_recall_curve(y_test_binary, y_test_xgb)
    auc_pr_xgb = auc(recall_xgb, precision_xgb)

    # Hybrid
    auc_hybrid = roc_auc_score(y_test_binary, y_test_hybrid)
    precision_hybrid, recall_hybrid, _ = precision_recall_curve(y_test_binary, y_test_hybrid)
    auc_pr_hybrid = auc(recall_hybrid, precision_hybrid)

    print(f"\nXGBoost Only:")
    print(f"  AUC-ROC: {auc_xgb:.4f}")
    print(f"  AUC-PR:  {auc_pr_xgb:.4f}")

    print(f"\nHybrid (XGBoost + Qdrant):")
    print(f"  AUC-ROC: {auc_hybrid:.4f}  {'‚úÖ +' + f'{(auc_hybrid-auc_xgb):.4f}' if auc_hybrid > auc_xgb else '‚ö†Ô∏è  ' + f'{(auc_hybrid-auc_xgb):.4f}'}")
    print(f"  AUC-PR:  {auc_pr_hybrid:.4f}  {'‚úÖ +' + f'{(auc_pr_hybrid-auc_pr_xgb):.4f}' if auc_pr_hybrid > auc_pr_xgb else '‚ö†Ô∏è  ' + f'{(auc_pr_hybrid-auc_pr_xgb):.4f}'}")

    improvement = ((auc_hybrid - auc_xgb) / auc_xgb * 100) if auc_xgb > 0 else 0
    print(f"\n‚ú® Qdrant improves AUC-ROC by {improvement:.2f}%")

    # ========================================================================
    # SAVE MODELS
    # ========================================================================

    print("\n" + "="*80)
    print("SAVING MODELS")
    print("="*80)

    # The ONNX exporter returns early when the ONNX libraries are missing, so
    # make sure the target directory exists before anything is written to it.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # 1. Save XGBoost model as ONNX
    onnx_path = os.path.join(save_path, f"fraud_model_xgboost_{timestamp}.onnx")
    export_success = export_xgboost_to_onnx(
        xgb_model,
        onnx_path,
        feature_names,
        n_features=len(feature_names)
    )

    # 2. Save XGBoost model in native format (backup)
    xgb_native_path = os.path.join(save_path, f"fraud_model_xgboost_{timestamp}.json")
    xgb_model.save_model(xgb_native_path)
    print(f"\n‚úì Saved XGBoost native model to: {xgb_native_path}")

    # 3. Save Qdrant scaler
    scaler_path = os.path.join(save_path, f"qdrant_scaler_{timestamp}.pkl")
    qdrant.save_scaler(scaler_path)

    # 4. Save model metadata
    metadata_path = os.path.join(save_path, f"model_metadata_{timestamp}.json")
    metadata = {
        "model_type": "Hybrid XGBoost + Qdrant",
        "timestamp": timestamp,
        "training_date": datetime.now().isoformat(),
        "feature_names": feature_names,
        "n_features": len(feature_names),
        "embedding_dimension": 12,
        "xgboost_weight": XGBOOST_WEIGHT,
        "qdrant_weight": QDRANT_WEIGHT,
        "performance": {
            "xgboost_auc_roc": float(auc_xgb),
            "xgboost_auc_pr": float(auc_pr_xgb),
            "hybrid_auc_roc": float(auc_hybrid),
            "hybrid_auc_pr": float(auc_pr_hybrid),
            "improvement_percent": float(improvement)
        },
        "training_data": {
            "total_samples": len(df_original),
            "fraud_samples": int((y > 0).sum()),
            "fraud_percentage": float((y > 0).mean() * 100),
            "train_size": len(X_train),
            "val_size": len(X_val),
            "test_size": len(X_test)
        },
        "files": {
            "onnx_model": onnx_path,
            "xgboost_native": xgb_native_path,
            "scaler": scaler_path,
            # The __main__ summary looks this key up.
            "metadata": metadata_path,
        },
        # The ONNX path above is recorded even when the export failed; this
        # flag tells consumers whether that file was actually written.
        "onnx_exported": bool(export_success),
        "qdrant_config": {
            "collection_name": qdrant.collection_name,
            "vector_dimension": 12,
            "distance_metric": "cosine"
        }
    }

    save_model_metadata(metadata_path, metadata)

    return hybrid_model, qdrant, X_test, y_test, y_test_hybrid, auc_xgb, auc_hybrid, metadata

# ============================================================================
# MAIN
# ============================================================================
if __name__ == "__main__":
    # Entry point: train on fraud_data.json from DRIVE_PATH, save every
    # artifact back to DRIVE_PATH, then print a summary and a usage snippet.
    # Update this path to your actual JSON file location
    json_file = os.path.join(DRIVE_PATH, "fraud_data.json")

    model, qdrant, X_test, y_test, predictions, auc_xgb, auc_hybrid, metadata = train_hybrid_model(
        json_file,
        save_path=DRIVE_PATH
    )

    print("\n" + "="*80)
    print("‚úÖ HYBRID MODEL TRAINING & EXPORT COMPLETE!")
    print("="*80)
    print("\nModel Performance:")
    print(f"  XGBoost AUC-ROC: {auc_xgb:.4f}")
    print(f"  Hybrid AUC-ROC:  {auc_hybrid:.4f}")
    # Re-use the value computed during training (already guarded against a
    # zero baseline) and let the format spec choose the sign, so a regression
    # prints as "-0.12%" rather than "+-0.12%".
    improvement_pct = metadata['performance']['improvement_percent']
    print(f"  Improvement:     {improvement_pct:+.2f}%")

    print("\nüìÅ Saved Files:")
    print(f"  - ONNX Model: {metadata['files']['onnx_model']}")
    print(f"  - XGBoost Native: {metadata['files']['xgboost_native']}")
    print(f"  - Scaler: {metadata['files']['scaler']}")
    print(f"  - Metadata: {metadata['files'].get('metadata', 'N/A')}")

    print("\nü§ñ For AI Agent Integration:")
    print("  1. Use ONNX model for inference (fastest)")
    print("  2. Load scaler for embedding normalization")
    print("  3. Connect to Qdrant collection: 'fraud_embeddings'")
    print("  4. Input: 5D feature vector")
    print("  5. Output: Risk score (0-100)")

    print("\nüìö ONNX Model Usage Example:")
    print("```python")
    print("import onnxruntime as ort")
    print("import numpy as np")
    print("")
    print("# Load ONNX model")
    print(f"session = ort.InferenceSession('{os.path.basename(metadata['files']['onnx_model'])}')")
    print("")
    print("# Prepare input (5 features)")
    print("input_data = np.array([[geo_distance, impossible_travel, location_changes,")
    print("                        is_high_amount, is_round_amount]], dtype=np.float32)")
    print("")
    print("# Run inference")
    print("input_name = session.get_inputs()[0].name")
    print("output = session.run(None, {input_name: input_data})")
    print("risk_score = output[0][0]")
    print("```")
# fraud_detection_with_onnx_export.py
# Displaying fraud_detection_with_onnx_export.py.

‚ö†Ô∏è  ONNX libraries not available. Install with: pip install onnx onnxruntime skl2onnx
HYBRID FRAUD DETECTION: XGBoost + Qdrant Vector Similarity

[1/7] Loading data...
Loaded 100,000 transactions

[2/7] Engineering 5 core features...

[3/7] Creating target...
Fraud: 49,243 (49.2%)

Feature matrix: (100000, 5)

[4/7] Splitting data...
Train: 70,000 | Val: 15,000 | Test: 15,000

[5/7] Training XGBoost...
[0]	validation_0-rmse:41.03182
[50]	validation_0-rmse:40.99818
[100]	validation_0-rmse:41.00202
[150]	validation_0-rmse:40.99097
[199]	validation_0-rmse:40.98784

[6/7] Creating Qdrant vector store...
‚úì Using Qdrant in-memory mode
‚úì Created collection: fraud_embeddings (12D vectors)

Creating embeddings for training data...
‚úì Created 70,000 embeddings of 12D
Uploading 70,000 vectors to Qdrant...
  Progress: 10,000/70,000 (14.3%)
  Progress: 20,000/70,000 (28.6%)
  Progress: 30,000/70,000 (42.9%)
  Progress: 40,000/70,000 (57.1%)
  Progress: 50,000/70,000 (71.4%)
  Progress: 60,

In [None]:
import torch

# Report whether PyTorch can see a CUDA device in this runtime and, if so,
# the name of device 0. Purely informational: nothing is stored or returned.
if torch.cuda.is_available():
    print("‚úÖ GPU is available")
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    print("‚ùå GPU is NOT available (using CPU)")


‚úÖ GPU is available
GPU Name: Tesla T4
