In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from typing import Dict, Any, List, Optional, Callable, Tuple
import time
import random
import logging
import asyncio
import hashlib
import json
from collections import defaultdict, deque
from dataclasses import dataclass
import threading

# Scientific & statistical
from scipy import stats
from scipy.stats import wasserstein_distance, ks_2samp, ttest_ind

# ML & data science
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Graphing and analysis
#import networkx as nx

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

print("Setup complete. All dependencies imported.")

Setup complete. All dependencies imported.


### A/B Testing

In [4]:
import time
import random
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

class SimpleABTestRouter:
    """
    Basic A/B test router that randomly splits traffic.
    """
    def __init__(self, legacy_model, candidate_model, 
                 candidate_traffic_fraction=0.1):
        self.legacy_model = legacy_model
        self.candidate_model = candidate_model
        self.traffic_fraction = candidate_traffic_fraction
        
        # Track metrics for both groups
        self.metrics = {
            'legacy': {
                'count': 0,
                'predictions': [],
                'latencies': []
            },
            'candidate': {
                'count': 0,
                'predictions': [],
                'latencies': []
            }
        }
    
    def route_request(self, input_data: np.ndarray, user_id: str) -> Dict[str, Any]:
        """
        Route request to either legacy or candidate model.
        Randomly assign based on traffic fraction.
        """
        start_time = time.time()
        
        # Reshape input if it's 1D (single sample)
        if len(input_data.shape) == 1:
            input_data = input_data.reshape(1, -1)
        
        # Random assignment
        if random.random() < self.traffic_fraction:
            prediction = self.candidate_model.predict(input_data)
            model_used = 'candidate'
        else:
            prediction = self.legacy_model.predict(input_data)
            model_used = 'legacy'
        
        latency = time.time() - start_time
        
        # Log metrics
        self.metrics[model_used]['count'] += 1
        self.metrics[model_used]['predictions'].append(prediction[0])  # assuming predict returns array
        self.metrics[model_used]['latencies'].append(latency)
        
        return {
            'model': model_used,
            'prediction': prediction[0],  # extract scalar
            'latency': latency
        }

    def get_summary_stats(self) -> Dict[str, Any]:
        """Return aggregated metrics for both groups."""
        summary = {}
        for group in ['legacy', 'candidate']:
            metrics = self.metrics[group]
            summary[group] = {
                'count': metrics['count'],
                'avg_prediction': np.mean(metrics['predictions']) if metrics['predictions'] else 0,
                'avg_latency_ms': np.mean(metrics['latencies']) * 1000 if metrics['latencies'] else 0,
                'std_latency_ms': np.std(metrics['latencies']) * 1000 if metrics['latencies'] else 0
            }
        return summary

# Demo usage
print("\n=== A/B Test Router Demo ===")
legacy = LogisticRegression(random_state=42)
candidate = LogisticRegression(random_state=42, C=0.5)

X, y = make_classification(n_samples=1000, n_features=20, 
                          n_informative=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

legacy.fit(X_train, y_train)
candidate.fit(X_train, y_train)

# Create router and simulate requests
router = SimpleABTestRouter(legacy, candidate, candidate_traffic_fraction=0.3)

for i in range(200):
    # Reshape sample into 2D array
    sample = X_test[i].reshape(1, -1)
    result = router.route_request(sample, user_id=f"user_{i}")

print(router.get_summary_stats())



=== A/B Test Router Demo ===
{'legacy': {'count': 145, 'avg_prediction': 0.5379310344827586, 'avg_latency_ms': 1.3351308888402478, 'std_latency_ms': 6.82747961941215}, 'candidate': {'count': 55, 'avg_prediction': 0.509090909090909, 'avg_latency_ms': 0.4689173264936967, 'std_latency_ms': 1.0437011969191194}}


The output of your A/B Test Router Demo shows the key summary statistics for the "legacy" and "candidate" models after routing 200 requests with a 30% candidate traffic fraction:

- Legacy model received 145 requests, candidate model received 55.
- Average prediction values are ~0.538 for legacy and 0.509 for candidate.
- Average latency for legacy is about 1.34 ms with high variance (std ~6.83 ms).
- Candidate model shows faster average response time (~0.47 ms) with less variance (std ~1.04 ms).

***

- **Traffic split:** The fractional count roughly matches 70/30 expected split.
- **Prediction averages:** Both models are producing predictions around 0.5, indicating similar output distributions (likely class probabilities or regression predictions centered near 0.5).
- **Latency:** The candidate model appears more efficient or less loaded, showing lower latency and less variability than the legacy, although legacy's higher variance suggests some outliers or delays in some requests.
- **Performance insight:** This baseline summary indicates the candidate can handle about 1/3rd of traffic with lower response times, and predictions are roughly similar in average value. Further analysis (like accuracy, significance tests) would be needed to conclude if the candidate is preferable.

***

This demonstrates how simple A/B traffic routing with metrics collection can provide actionable operational insights into model behavior in production settings, guiding rollout decisions with real user traffic.

In [5]:
from scipy import stats

def ab_test_significance(group_a_metrics: List[float], 
                        group_b_metrics: List[float],
                        alpha: float = 0.05) -> Dict[str, Any]:
    """
    Perform two‑sample t‑test to determine if groups differ significantly.
    
    Returns:
        - p_value: probability of observing this difference by chance
        - significant: whether difference is statistically significant
        - effect_size: Cohen's d (magnitude of difference)
        - confidence_interval: CI for mean difference
    """
    # t‑test
    t_stat, p_value = stats.ttest_ind(group_a_metrics, group_b_metrics)
    
    # Effect size (Cohen's d)
    mean_a, mean_b = np.mean(group_a_metrics), np.mean(group_b_metrics)
    std_a, std_b = np.std(group_a_metrics), np.std(group_b_metrics)
    n_a, n_b = len(group_a_metrics), len(group_b_metrics)
    
    pooled_std = np.sqrt(((n_a - 1) * std_a**2 + (n_b - 1) * std_b**2) / (n_a + n_b - 2))
    cohens_d = (mean_b - mean_a) / pooled_std if pooled_std > 0 else 0
    
    # Confidence intervals
    sem_a = stats.sem(group_a_metrics)
    sem_b = stats.sem(group_b_metrics)
    ci_a = stats.t.interval(1 - alpha, n_a - 1, 
                           loc=mean_a, scale=sem_a)
    ci_b = stats.t.interval(1 - alpha, n_b - 1, 
                           loc=mean_b, scale=sem_b)
    
    return {
        'p_value': p_value,
        'significant': p_value < alpha,
        'effect_size_cohens_d': cohens_d,
        'mean_a': mean_a,
        'mean_b': mean_b,
        'ci_a': ci_a,
        'ci_b': ci_b,
        'interpretation': 'Significant difference' if p_value < alpha 
                         else 'No significant difference'
    }

# Demo: Simulate A/B test results
legacy_accuracy = np.random.normal(0.85, 0.02, 500)
candidate_accuracy = np.random.normal(0.87, 0.02, 500)

result = ab_test_significance(legacy_accuracy, candidate_accuracy)
print("\n=== A/B Test Significance ===")
for key, value in result.items():
    print(f"{key}: {value}")



=== A/B Test Significance ===
p_value: 6.937972761017257e-49
significant: True
effect_size_cohens_d: 0.9832633766182235
mean_a: 0.8507763942328004
mean_b: 0.8702101723594721
ci_a: (0.8490006148183638, 0.852552173647237)
ci_b: (0.8685100541438253, 0.8719102905751188)
interpretation: Significant difference


The output from your "=== A/B Test Significance ===" shows results from a statistical test comparing two models:

- **p_value ≈ 6.94e-49**: Extremely small p-value indicates the difference in performance between the two models is statistically significant.
- **significant: True**: Confirms the difference is unlikely due to random chance.
- **effect_size_cohens_d ≈ 0.98**: A large effect size, meaning the difference is not just statistically significant but also practically meaningful.
- **mean_a ≈ 0.851 and mean_b ≈ 0.870**: The candidate model (b) has a higher mean accuracy than the legacy (a).
- **Confidence intervals** for each mean are tight and do not overlap, reinforcing the significant difference.
- **Interpretation: Significant difference**: The candidate model outperforms legacy with strong evidence.

***

Your A/B testing shows that the candidate model provides a **statistically and practically significant improvement** over the legacy model’s performance. This supports a strong case for adopting the candidate model in production, assuming other operational metrics (latency, stability) are acceptable.

In [7]:
import time
import hashlib
import numpy as np
from typing import Dict, Any
from collections import defaultdict
import logging

logger = logging.getLogger(__name__)

class CanaryRouter:
    """
    Routes requests to candidate model for canary users,
    legacy for rest. Deterministically assigns users based on hash.
    """
    def __init__(self, legacy_model, candidate_model, 
                 canary_traffic_fraction=0.05):
        self.legacy_model = legacy_model
        self.candidate_model = candidate_model
        self.canary_fraction = canary_traffic_fraction
        
        # Health tracking
        self.canary_metrics = defaultdict(list)
        self.legacy_metrics = defaultdict(list)
        self.user_segments = set()
        self.health_metrics = defaultdict(list)
        self.is_healthy = True
    
    def is_canary_user(self, user_id: str) -> bool:
        """
        Deterministically assign user to canary or not.
        Same user always gets same assignment.
        """
        hash_value = int(hashlib.md5(user_id.encode()).hexdigest(), 16)
        return (hash_value % 100) < (self.canary_fraction * 100)
    
    def route_request(self, user_id: str, input_data: np.ndarray) -> Dict[str, Any]:
        # Reshape if input_data is 1D (single sample)
        if len(input_data.shape) == 1:
            input_data = input_data.reshape(1, -1)
        
        start_time = time.time()
        try:
            if self.is_canary_user(user_id):
                self.user_segments.add(user_id)
                prediction = self.candidate_model.predict(input_data)
                model_type = 'candidate'
            else:
                prediction = self.legacy_model.predict(input_data)
                model_type = 'legacy'
            
            latency = time.time() - start_time
            self.health_metrics['latency'].append(latency)
            return {'model': model_type, 'prediction': prediction[0], 'latency': latency}
        
        except Exception as e:
            self.health_metrics['error_rate'].append(1)
            logger.error(f"Error in route_request: {str(e)}")
            raise e
    
    def check_canary_health(self, 
                            latency_threshold_ms=100,
                            error_threshold=0.01) -> bool:
        """
        Check if canary metrics exceed thresholds.
        Return False if unhealthy (should rollback).
        """
        if not self.canary_metrics['latency']:
            return True  # Not enough data yet
        
        avg_latency = np.mean(self.canary_metrics['latency'])
        error_rate = len(self.canary_metrics['error']) / \
                     (len(self.canary_metrics['latency']) + 
                      len(self.canary_metrics['error']) + 1)
        
        is_healthy = (avg_latency * 1000 < latency_threshold_ms and
                      error_rate < error_threshold)
        
        logger.info(f"Canary health: latency={avg_latency*1000:.2f}ms, "
                    f"error_rate={error_rate:.4f}, healthy={is_healthy}")
        
        self.is_healthy = is_healthy
        return is_healthy
    
    def expand_canary(self, new_fraction: float):
        """Increase canary traffic percentage."""
        old_fraction = self.canary_fraction
        self.canary_fraction = min(new_fraction, 1.0)
        logger.info(f"Expanded canary from {old_fraction*100:.1f}% to "
                    f"{self.canary_fraction*100:.1f}%")

# Demo
print("\n=== Canary Deployment Demo ===")
canary_router = CanaryRouter(legacy, candidate, canary_traffic_fraction=0.1)

for i in range(100):
    user_id = f"user_{i}"
    result = canary_router.route_request(user_id, X_test[i])

canary_router.check_canary_health()


=== Canary Deployment Demo ===


True

In [8]:
class AdvancedCanaryRouter:
    """
    Canary router with automatic rollback based on monitored metrics.
    """
    def __init__(self, legacy_model, candidate_model,
                 error_threshold=0.02,
                 latency_threshold_ms=150,
                 window_size=500):
        self.legacy_model = legacy_model
        self.candidate_model = candidate_model
        self.error_threshold = error_threshold
        self.latency_threshold_ms = latency_threshold_ms
        self.window_size = window_size
        
        # Sliding window of recent metrics
        self.metrics_window = deque(maxlen=window_size)
        self.canary_fraction = 0.05
        self.is_healthy = True
        self.rollback_triggered = False
    
    def route_request(self, user_id: str, input_data: Dict[str, Any]
                     ) -> Dict[str, Any]:
        """Route request and monitor metrics."""
        start_time = time.time()
        is_canary = (int(hashlib.md5(user_id.encode()).hexdigest(), 16) % 100 
                    < self.canary_fraction * 100)
        
        try:
            if is_canary and not self.rollback_triggered:
                pred = self.candidate_model.predict(input_data)
                model = 'candidate'
            else:
                pred = self.legacy_model.predict(input_data)
                model = 'legacy'
            
            latency_ms = (time.time() - start_time) * 1000
            
            self.metrics_window.append({
                'model': model,
                'latency_ms': latency_ms,
                'error': False,
                'timestamp': time.time()
            })
            
            # Check health after each request (batched in practice)
            if len(self.metrics_window) % 50 == 0:
                self._check_and_handle_health()
            
            return {'model': model, 'prediction': pred, 'latency_ms': latency_ms}
        
        except Exception as e:
            latency_ms = (time.time() - start_time) * 1000
            self.metrics_window.append({
                'model': 'candidate' if is_canary else 'legacy',
                'latency_ms': latency_ms,
                'error': True,
                'timestamp': time.time()
            })
            raise
    
    def _check_and_handle_health(self):
        """Check metrics and trigger rollback if needed."""
        if len(self.metrics_window) < 100:
            return
        
        # Calculate metrics on candidate traffic only
        candidate_requests = [m for m in self.metrics_window 
                            if m['model'] == 'candidate']
        
        if not candidate_requests:
            return
        
        error_rate = sum(1 for m in candidate_requests if m['error']) / len(candidate_requests)
        avg_latency = np.mean([m['latency_ms'] for m in candidate_requests])
        
        is_unhealthy = (error_rate > self.error_threshold or
                       avg_latency > self.latency_threshold_ms)
        
        if is_unhealthy and not self.rollback_triggered:
            logger.warning(f"ROLLBACK TRIGGERED: error_rate={error_rate:.4f}, "
                          f"avg_latency_ms={avg_latency:.2f}")
            self.rollback_triggered = True
            self.is_healthy = False

# Demo
print("\n=== Advanced Canary with Rollback ===")
advanced_canary = AdvancedCanaryRouter(legacy, candidate)

for i in range(300):
    user_id = f"user_{i}"
    try:
        result = advanced_canary.route_request(user_id, X_test[i % len(X_test)])
    except:
        pass

print(f"Rollback triggered: {advanced_canary.rollback_triggered}")



=== Advanced Canary with Rollback ===
Rollback triggered: False
