In [None]:
# Comprehensive Model Optimization Strategies for Fraud Detection
# Business Constraints: Decline rate ≤ 30%, Agent alerts < 0.1%, Missed fraud ≤ 2% (0.02 as a fraction)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE
from sklearn.metrics import make_scorer, precision_recall_curve, roc_auc_score
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks
from imblearn.combine import SMOTETomek, SMOTEENN
import xgboost as xgb
import lightgbm as lgb

# Banner shown once when this cell/module is executed.
print("MODEL OPTIMIZATION STRATEGIES FOR FRAUD DETECTION", "=" * 60, sep="\n")

# =============================================================================
# STRATEGY 1: ADVANCED FEATURE ENGINEERING
# =============================================================================

def advanced_feature_engineering(df):
    """Return a copy of ``df`` with extra engineered feature columns.

    Depending on which columns are present, the copy gains:

    * ``poly_<i>`` pairwise interaction terms of V4, V11, V12, V14 and Amount
      (only when all five columns exist);
    * hour/minute/day features, their sine/cosine encodings and night,
      business-hour and weekend flags derived from ``Time``
      (``Time`` is treated as a number of seconds);
    * log/sqrt/cbrt transforms, a percentile rank and size-band flags
      derived from ``Amount``;
    * row-wise statistics over the ``V<number>`` columns (only when at least
      ten such columns exist).

    Only columns named exactly ``V`` followed by digits are treated as
    component columns, so the derived ``V_sum``, ``V_mean``, ... columns are
    never fed back into the statistics when the function is applied to an
    already-enhanced frame (e.g. a re-run notebook cell).

    Args:
        df: pandas DataFrame of transactions. It is not modified.

    Returns:
        A new DataFrame containing the original columns plus the derived ones.
    """
    import re

    print("\n1. ADVANCED FEATURE ENGINEERING")
    print("-" * 40)

    df_enhanced = df.copy()

    # 1.1 Pairwise interaction features for key variables
    print("Creating polynomial features...")
    key_features = ['V4', 'V11', 'V12', 'V14', 'Amount']  # Example key features
    if all(col in df.columns for col in key_features):
        # Imported only when needed so frames lacking the key columns do not
        # require scikit-learn to be importable.
        from sklearn.preprocessing import PolynomialFeatures

        poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
        poly_features = poly.fit_transform(df[key_features])

        # The leading columns of the output repeat the inputs themselves
        # (degree-1 terms); only the interaction columns after them are kept.
        n_inputs = len(key_features)
        for i in range(poly_features.shape[1] - n_inputs):
            df_enhanced[f"poly_{i}"] = poly_features[:, n_inputs + i]

    # 1.2 Time-based feature engineering
    print("Creating advanced time features...")
    if 'Time' in df.columns:
        df_enhanced['Hour'] = (df['Time'] % (24*3600)) // 3600
        df_enhanced['Minute'] = (df['Time'] % 3600) // 60
        df_enhanced['Day_of_transaction'] = df['Time'] // (24*3600)

        # Cyclic encoding so that 23:00 and 00:00 end up close together
        df_enhanced['Hour_sin'] = np.sin(2 * np.pi * df_enhanced['Hour'] / 24)
        df_enhanced['Hour_cos'] = np.cos(2 * np.pi * df_enhanced['Hour'] / 24)
        df_enhanced['Minute_sin'] = np.sin(2 * np.pi * df_enhanced['Minute'] / 60)
        df_enhanced['Minute_cos'] = np.cos(2 * np.pi * df_enhanced['Minute'] / 60)

        # Time-based patterns
        df_enhanced['Is_Night'] = ((df_enhanced['Hour'] >= 23) | (df_enhanced['Hour'] <= 6)).astype(int)
        df_enhanced['Is_Business_Hour'] = ((df_enhanced['Hour'] >= 9) & (df_enhanced['Hour'] <= 17)).astype(int)
        # NOTE(review): this treats day 0 as the first day of a week; the real
        # weekday of the first transaction is not known here - confirm.
        df_enhanced['Is_Weekend'] = ((df_enhanced['Day_of_transaction'] % 7) >= 5).astype(int)

    # 1.3 Amount-based feature engineering
    print("Creating advanced amount features...")
    if 'Amount' in df.columns:
        df_enhanced['Amount_log'] = np.log1p(df['Amount'])
        df_enhanced['Amount_sqrt'] = np.sqrt(df['Amount'])
        df_enhanced['Amount_cbrt'] = np.cbrt(df['Amount'])

        # Percentile rank within the frame that was passed in
        df_enhanced['Amount_percentile'] = df['Amount'].rank(pct=True)

        # Mutually exclusive amount bands
        df_enhanced['Amount_micro'] = (df['Amount'] <= 1).astype(int)
        df_enhanced['Amount_small'] = ((df['Amount'] > 1) & (df['Amount'] <= 100)).astype(int)
        df_enhanced['Amount_medium'] = ((df['Amount'] > 100) & (df['Amount'] <= 1000)).astype(int)
        df_enhanced['Amount_large'] = (df['Amount'] > 1000).astype(int)

    # 1.4 Row-wise statistics over the V<number> component columns
    print("Creating PCA feature combinations...")
    v_columns = [col for col in df.columns if re.fullmatch(r"V\d+", str(col))]
    if len(v_columns) >= 10:
        v_block = df[v_columns]
        df_enhanced['V_sum'] = v_block.sum(axis=1)
        df_enhanced['V_mean'] = v_block.mean(axis=1)
        df_enhanced['V_std'] = v_block.std(axis=1)
        df_enhanced['V_skew'] = v_block.skew(axis=1)
        df_enhanced['V_kurt'] = v_block.kurtosis(axis=1)
        df_enhanced['V_median'] = v_block.median(axis=1)
        df_enhanced['V_max'] = v_block.max(axis=1)
        df_enhanced['V_min'] = v_block.min(axis=1)
        df_enhanced['V_range'] = df_enhanced['V_max'] - df_enhanced['V_min']

        # Count positive, negative and zero values per row
        df_enhanced['V_positive_count'] = (v_block > 0).sum(axis=1)
        df_enhanced['V_negative_count'] = (v_block < 0).sum(axis=1)
        df_enhanced['V_zero_count'] = (v_block == 0).sum(axis=1)

    print(f"Features increased from {df.shape[1]} to {df_enhanced.shape[1]}")
    return df_enhanced

# =============================================================================
# STRATEGY 2: ADVANCED SAMPLING TECHNIQUES
# =============================================================================

def optimize_sampling_strategy(X_train, y_train):
    """Compare resampling strategies on a held-out slice of the training data.

    A stratified 20% of the supplied rows is set aside *before* any
    resampling. Each sampler is fitted on the remaining 80%, a small random
    forest is trained on the resampled rows, and that forest is scored on the
    untouched hold-out. Scoring on rows the forest was trained on (as the
    resampled data is a superset of the original rows) would rate every
    sampler near-perfectly and make the comparison meaningless.

    The winner is the sampler with the highest hold-out ROC AUC; hold-out
    accuracy is still recorded under ``'validation_accuracy'``.

    Args:
        X_train: training features (array or DataFrame).
        y_train: binary training labels; must expose ``.mean()``.

    Returns:
        tuple: ``(best_sampler, sampling_results)`` where ``sampling_results``
        maps each successfully evaluated strategy name to a dict with keys
        ``original_size``, ``resampled_size``, ``original_fraud_rate``,
        ``resampled_fraud_rate`` (all measured on the 80% fitting portion),
        ``validation_accuracy`` and ``validation_roc_auc``.

    Raises:
        RuntimeError: if every sampling strategy failed to evaluate.
    """
    from sklearn.model_selection import train_test_split

    print("\n2. ADVANCED SAMPLING OPTIMIZATION")
    print("-" * 40)

    # Hold-out split made before resampling so no synthetic neighbour of a
    # validation row can leak into the fitting data.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
    )

    sampling_strategies = {
        'SMOTE': SMOTE(random_state=42, k_neighbors=3),
        'ADASYN': ADASYN(random_state=42, n_neighbors=3),
        'BorderlineSMOTE': BorderlineSMOTE(random_state=42, k_neighbors=3),
        'SMOTE_Tomek': SMOTETomek(random_state=42),
        'SMOTE_ENN': SMOTEENN(random_state=42),
    }

    sampling_results = {}

    for name, sampler in sampling_strategies.items():
        # Best-effort: a sampler that cannot handle this data is reported and
        # skipped rather than aborting the whole comparison.
        try:
            print(f"Testing {name}...")
            X_resampled, y_resampled = sampler.fit_resample(X_fit, y_fit)

            # Quick evaluation with a simple model
            rf_temp = RandomForestClassifier(n_estimators=50, random_state=42, class_weight='balanced')
            rf_temp.fit(X_resampled, y_resampled)

            val_accuracy = rf_temp.score(X_holdout, y_holdout)
            val_roc_auc = roc_auc_score(y_holdout, rf_temp.predict_proba(X_holdout)[:, 1])

            sampling_results[name] = {
                'original_size': len(X_fit),
                'resampled_size': len(X_resampled),
                'original_fraud_rate': y_fit.mean(),
                'resampled_fraud_rate': y_resampled.mean(),
                'validation_accuracy': val_accuracy,
                'validation_roc_auc': val_roc_auc,
            }

            print(f"  Original size: {len(X_fit):,} → Resampled: {len(X_resampled):,}")
            print(f"  Fraud rate: {y_fit.mean():.4f} → {y_resampled.mean():.4f}")
            print(f"  Hold-out ROC AUC: {val_roc_auc:.4f}")

        except Exception as e:
            print(f"  Error with {name}: {str(e)}")

    if not sampling_results:
        raise RuntimeError("No sampling strategy could be evaluated; see the errors printed above")

    # Accuracy is dominated by the majority class on imbalanced data, so the
    # ranking metric is ROC AUC.
    best_sampler = max(sampling_results, key=lambda k: sampling_results[k]['validation_roc_auc'])

    print(f"\nBest sampling strategy: {best_sampler}")
    print(f"Validation accuracy: {sampling_results[best_sampler]['validation_accuracy']:.4f}")
    print(f"Validation ROC AUC: {sampling_results[best_sampler]['validation_roc_auc']:.4f}")

    return sampling_strategies[best_sampler], sampling_results

# =============================================================================
# STRATEGY 3: HYPERPARAMETER OPTIMIZATION
# =============================================================================

def optimize_model_hyperparameters(X_train, y_train, X_val, y_val):
    """Tune RandomForest, XGBoost and LightGBM with a randomized search.

    Each model is searched with ``RandomizedSearchCV`` (20 candidates,
    3-fold stratified CV, ROC AUC scoring). The refitted best estimator is
    then scored on the validation data with the same metric (ROC AUC), so the
    cross-validation and validation numbers are directly comparable.

    Args:
        X_train: training features used for the search.
        y_train: training labels.
        X_val: validation features.
        y_val: validation labels.

    Returns:
        dict: model name -> ``{'model', 'params', 'cv_score', 'val_score'}``.
        A model whose search raised is reported on stdout and omitted.
        ``val_score`` is NaN when ROC AUC cannot be computed for ``y_val``.
    """

    print("\n3. HYPERPARAMETER OPTIMIZATION")
    print("-" * 40)

    # Model configurations to test
    model_configs = {
        'RandomForest': {
            'model': RandomForestClassifier(random_state=42),
            'params': {
                'n_estimators': [100, 200, 300, 500],
                'max_depth': [10, 20, 30, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'class_weight': ['balanced', 'balanced_subsample'],
                'bootstrap': [True, False]
            }
        },

        'XGBoost': {
            'model': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
            'params': {
                'n_estimators': [100, 200, 300],
                'max_depth': [3, 6, 9],
                'learning_rate': [0.01, 0.1, 0.2],
                'subsample': [0.8, 0.9, 1.0],
                'colsample_bytree': [0.8, 0.9, 1.0],
                'scale_pos_weight': [1, 10, 50, 100]
            }
        },

        'LightGBM': {
            'model': lgb.LGBMClassifier(random_state=42, verbose=-1),
            'params': {
                'n_estimators': [100, 200, 300],
                'max_depth': [3, 6, 9],
                'learning_rate': [0.01, 0.1, 0.2],
                'num_leaves': [31, 50, 100],
                'subsample': [0.8, 0.9, 1.0],
                'colsample_bytree': [0.8, 0.9, 1.0],
                'class_weight': ['balanced']
            }
        }
    }

    best_models = {}

    for model_name, config in model_configs.items():
        print(f"\nOptimizing {model_name}...")

        # Best-effort: one failing library/model must not stop the others.
        try:
            # Use RandomizedSearchCV for faster optimization
            random_search = RandomizedSearchCV(
                config['model'],
                config['params'],
                n_iter=20,  # Reduced for faster execution
                cv=StratifiedKFold(n_splits=3),
                scoring='roc_auc',
                n_jobs=-1,
                random_state=42,
                verbose=0
            )

            random_search.fit(X_train, y_train)
            best_estimator = random_search.best_estimator_

            # Same metric as the CV score; estimator.score() would report
            # accuracy, which is not comparable with ROC AUC.
            try:
                val_score = roc_auc_score(y_val, best_estimator.predict_proba(X_val)[:, 1])
            except ValueError:
                # roc_auc_score rejects labels containing a single class
                val_score = float('nan')

            best_models[model_name] = {
                'model': best_estimator,
                'params': random_search.best_params_,
                'cv_score': random_search.best_score_,
                'val_score': val_score
            }

            print(f"  Best CV Score: {random_search.best_score_:.4f}")
            print(f"  Validation Score: {val_score:.4f}")
            print(f"  Best Params: {random_search.best_params_}")

        except Exception as e:
            print(f"  Error optimizing {model_name}: {str(e)}")

    return best_models

# =============================================================================
# STRATEGY 4: ENSEMBLE OPTIMIZATION
# =============================================================================

def create_optimized_ensemble(best_models, X_train, y_train):
    """Build the ensemble variants on top of the tuned base models.

    With two or more base models, two soft-voting classifiers are fitted on
    the given data: one unweighted and one weighted by each model's
    ``cv_score``. A ``BusinessOptimizedEnsemble`` that averages the fraud
    probabilities of the already-fitted base models is always added.

    Args:
        best_models: mapping name -> dict holding at least ``'model'`` and
            ``'cv_score'``.
        X_train: features used to fit the voting classifiers.
        y_train: labels used to fit the voting classifiers.

    Returns:
        dict: strategy name -> ensemble object.
    """

    print("\n4. ENSEMBLE OPTIMIZATION")
    print("-" * 40)

    class BusinessOptimizedEnsemble:
        """Weighted average of the fraud probabilities of pre-fitted models."""

        def __init__(self, models, weights=None):
            self.models = dict(models)
            if weights:
                self.weights = weights
            else:
                # No (or empty) weights supplied: every model counts equally
                self.weights = {name: 1.0 for name, _ in models}

        def fit(self, X, y):
            # Nothing to learn: the wrapped models are already fitted
            return self

        def predict_proba(self, X):
            # Fraud-class probability of every wrapped model
            fraud_by_model = {
                name: model.predict_proba(X)[:, 1]
                for name, model in self.models.items()
            }

            # Normalised weighted average of those probabilities
            weight_sum = sum(self.weights.values())
            blended = np.zeros(len(X))
            for name, weight in self.weights.items():
                blended += (weight / weight_sum) * fraud_by_model[name]

            return np.column_stack([1 - blended, blended])

    # (name, estimator) pairs in the order of best_models
    named_estimators = [(name, info['model']) for name, info in best_models.items()]
    ensemble_strategies = {}

    # 1. Soft voting, unweighted and CV-score weighted
    if len(named_estimators) >= 2:
        print("Testing Voting Classifier...")

        equal_vote = VotingClassifier(estimators=named_estimators, voting='soft')
        equal_vote.fit(X_train, y_train)
        ensemble_strategies['voting_equal'] = equal_vote

        cv_weights = [info['cv_score'] for info in best_models.values()]
        weighted_vote = VotingClassifier(
            estimators=named_estimators,
            voting='soft',
            weights=cv_weights,
        )
        weighted_vote.fit(X_train, y_train)
        ensemble_strategies['voting_weighted'] = weighted_vote

    # 2. Custom averaging ensemble; models whose name contains 'XGBoost'
    #    get their CV score boosted by 20%
    business_weights = {}
    for name, info in best_models.items():
        if 'XGBoost' in name:
            business_weights[name] = info['cv_score'] * 1.2
        else:
            business_weights[name] = info['cv_score']

    business_ensemble = BusinessOptimizedEnsemble(named_estimators, business_weights)
    business_ensemble.fit(X_train, y_train)
    ensemble_strategies['business_optimized'] = business_ensemble

    print(f"Created {len(ensemble_strategies)} ensemble strategies")
    return ensemble_strategies

# =============================================================================
# STRATEGY 5: THRESHOLD OPTIMIZATION WITH BUSINESS CONSTRAINTS
# =============================================================================

def optimize_thresholds_with_constraints(models, X_val, y_val, scaler):
    """Grid-search the low/high risk thresholds under the business limits.

    Scores below the low threshold count as low risk, scores at or above the
    high threshold as high risk, everything in between as medium risk.
    Every (low, high) pair with ``high > low`` is evaluated. The feasible
    pair (no limit exceeded) with the smallest business score wins; when no
    pair is feasible, the pair with the smallest total penalty is chosen,
    ties broken by the lowest missed-fraud rate.

    Args:
        models: passed through to ``calculate_risk_scores``.
        X_val: validation features, passed through to ``calculate_risk_scores``.
        y_val: validation labels (1 marks fraud).
        scaler: passed through to ``calculate_risk_scores``.

    Returns:
        tuple: ``(best_threshold_config, threshold_results)`` - the winning
        configuration as a dict and the list of all evaluated configurations.
    """

    print("\n5. ADVANCED THRESHOLD OPTIMIZATION")
    print("-" * 40)

    def calculate_business_metrics(risk_scores, y_true, low_thresh, high_thresh):
        """Rates and constraint penalties for one (low, high) threshold pair."""
        n_total = len(y_true)

        below_low = risk_scores < low_thresh
        in_between = (risk_scores >= low_thresh) & (risk_scores < high_thresh)
        at_or_above_high = risk_scores >= high_thresh

        # Medium and high risk are both declined; only high risk alerts an agent
        decline_rate = (in_between.sum() + at_or_above_high.sum()) / n_total
        agent_alert_rate = at_or_above_high.sum() / n_total

        # A fraud is missed when it lands in the low-risk band
        is_fraud = y_true == 1
        n_missed = (is_fraud & below_low).sum()
        n_fraud = is_fraud.sum()
        if n_fraud > 0:
            missed_fraud_rate = n_missed / n_fraud
        else:
            missed_fraud_rate = 0

        # Penalty grows linearly with the excess over each limit (lower is better)
        penalty = (
            max(0, decline_rate - 0.30) * 1000         # limit: 30% declines
            + max(0, agent_alert_rate - 0.001) * 10000  # limit: 0.1% agent alerts
            + max(0, missed_fraud_rate - 0.02) * 5000   # limit: 0.02 missed-fraud rate
        )

        return {
            'decline_rate': decline_rate,
            'agent_alert_rate': agent_alert_rate,
            'missed_fraud_rate': missed_fraud_rate,
            'constraint_violations': penalty,
            'business_score': penalty + missed_fraud_rate * 1000  # Minimize fraud miss
        }

    print("Performing grid search for optimal thresholds...")

    # Risk scores of the validation rows; the second return value is unused
    risk_scores, _ = calculate_risk_scores(X_val, models, scaler)

    low_grid = np.arange(0.05, 0.50, 0.02)
    high_grid = np.arange(0.20, 0.80, 0.02)

    threshold_results = []
    best_threshold_config = None
    best_business_score = float('inf')

    for low_thresh in low_grid:
        for high_thresh in high_grid:
            if high_thresh > low_thresh:
                metrics = calculate_business_metrics(risk_scores, y_val, low_thresh, high_thresh)

                result = {'low_threshold': low_thresh, 'high_threshold': high_thresh}
                result.update(metrics)
                threshold_results.append(result)

                # Only fully feasible pairs compete for the best slot here
                is_feasible = metrics['constraint_violations'] == 0
                if is_feasible and metrics['business_score'] < best_business_score:
                    best_business_score = metrics['business_score']
                    best_threshold_config = result

    # Fallback when nothing is feasible: least total penalty, then fewest missed frauds
    if best_threshold_config is None:
        print("No threshold combination meets all constraints. Finding best compromise...")
        threshold_df = pd.DataFrame(threshold_results)

        least_penalty = threshold_df['constraint_violations'].min()
        candidates = threshold_df[threshold_df['constraint_violations'] == least_penalty]
        winner = candidates['missed_fraud_rate'].idxmin()
        best_threshold_config = candidates.loc[winner].to_dict()

    return best_threshold_config, threshold_results

# =============================================================================
# STRATEGY 6: FEATURE SELECTION OPTIMIZATION
# =============================================================================

def optimize_feature_selection(X_train, y_train, X_val, y_val):
    """Compare three feature-selection methods with a quick random forest.

    Methods:
        * ``correlation`` - features whose absolute correlation with the
          label exceeds 0.01;
        * ``mutual_info`` - top ``min(50, n_features)`` by mutual information;
        * ``rfe`` - recursive feature elimination down to
          ``min(30, n_features)`` features with a random forest.

    Every method yields integer column *positions*, so the selections can be
    applied uniformly to DataFrames and 2-D arrays.

    Args:
        X_train: training features (DataFrame or 2-D array).
        y_train: training labels, row-aligned with ``X_train``.
        X_val: validation features with the same column layout.
        y_val: validation labels.

    Returns:
        dict: method name -> ``{'n_features', 'features', 'train_score',
        'val_score', 'overfitting'}``. Scores are the forest's accuracy;
        methods that selected no feature are omitted.
    """
    from functools import partial

    print("\n6. FEATURE SELECTION OPTIMIZATION")
    print("-" * 40)

    def _take_columns(X, positions):
        """Select columns by integer position from a DataFrame or 2-D array."""
        # DataFrames also expose .shape, so the presence of .iloc is the
        # reliable way to tell them apart from plain arrays.
        return X.iloc[:, positions] if hasattr(X, 'iloc') else X[:, positions]

    n_features = X_train.shape[1]
    feature_selection_methods = {}

    # 1. Correlation-based selection
    print("Testing correlation-based feature selection...")
    correlation_threshold = 0.01
    # Work positionally: integer column labels and a fresh row index, so the
    # result is a list of positions and rows pair up with y by order rather
    # than by index label.
    X_frame = pd.DataFrame(X_train).reset_index(drop=True)
    X_frame = X_frame.set_axis(list(range(n_features)), axis=1)
    y_positional = pd.Series(np.asarray(y_train), index=X_frame.index)
    correlations = X_frame.corrwith(y_positional).abs()
    selected_features_corr = correlations[correlations > correlation_threshold].index.tolist()
    feature_selection_methods['correlation'] = selected_features_corr

    # 2. Mutual Information (seeded so repeated runs select the same features)
    print("Testing mutual information feature selection...")
    seeded_mutual_info = partial(mutual_info_classif, random_state=42)
    k_best_mi = SelectKBest(seeded_mutual_info, k=min(50, n_features))
    k_best_mi.fit(X_train, y_train)
    selected_features_mi = k_best_mi.get_support(indices=True).tolist()
    feature_selection_methods['mutual_info'] = selected_features_mi

    # 3. Recursive Feature Elimination
    print("Testing recursive feature elimination...")
    rf_selector = RandomForestClassifier(n_estimators=50, random_state=42, class_weight='balanced')
    rfe = RFE(rf_selector, n_features_to_select=min(30, n_features), step=5)
    rfe.fit(X_train, y_train)
    selected_features_rfe = rfe.get_support(indices=True).tolist()
    feature_selection_methods['rfe'] = selected_features_rfe

    # Evaluate each feature selection method
    print("\nEvaluating feature selection methods...")
    feature_selection_results = {}

    for method_name, selected_features in feature_selection_methods.items():
        if not selected_features:
            continue

        X_train_selected = _take_columns(X_train, selected_features)
        X_val_selected = _take_columns(X_val, selected_features)

        # Quick evaluation
        rf_temp = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
        rf_temp.fit(X_train_selected, y_train)

        train_score = rf_temp.score(X_train_selected, y_train)
        val_score = rf_temp.score(X_val_selected, y_val)

        feature_selection_results[method_name] = {
            'n_features': len(selected_features),
            'features': selected_features,
            'train_score': train_score,
            'val_score': val_score,
            'overfitting': train_score - val_score
        }

        print(f"  {method_name}: {len(selected_features)} features, Val Score: {val_score:.4f}")

    return feature_selection_results

# =============================================================================
# STRATEGY 7: COST-SENSITIVE LEARNING
# =============================================================================

def implement_cost_sensitive_learning(X_train, y_train):
    """Train three cost-sensitive fraud models on the given data.

    Models:
        * ``rf_weighted`` - random forest with explicit class weights;
        * ``xgb_weighted`` - XGBoost with ``scale_pos_weight`` set to the
          fraud class weight;
        * ``cost_sensitive`` - random forest wrapped so that its predicted
          probabilities are re-weighted by a false-negative / false-positive
          cost matrix.

    The fraud class weight is ``(normal_rate / fraud_rate) * 10``.

    Args:
        X_train: training features.
        y_train: binary labels (1 marks fraud); must expose ``.mean()``.

    Returns:
        tuple: ``(cost_sensitive_models, class_weights)`` - a dict of fitted
        models keyed by the names above, and the class-weight dict.

    Raises:
        ValueError: if ``y_train`` does not contain both classes, since the
            class weight would be zero or infinite.
    """

    print("\n7. COST-SENSITIVE LEARNING")
    print("-" * 40)

    # Define business costs
    # Cost of missing fraud (false negative): High
    # Cost of false positive (declining good transaction): Medium
    # Cost of agent review: Low but limited capacity

    fraud_rate = y_train.mean()
    # Also rejects NaN, for which both comparisons are False
    if not 0 < fraud_rate < 1:
        raise ValueError(
            "y_train must contain both fraud (1) and normal (0) labels; "
            f"got fraud rate {fraud_rate}"
        )
    normal_rate = 1 - fraud_rate

    # Calculate class weights based on business impact
    # Higher penalty for missing fraud
    class_weights = {
        0: 1.0,  # Normal transactions
        1: (normal_rate / fraud_rate) * 10  # Fraud transactions - 10x penalty for missing
    }

    print(f"Calculated class weights: {class_weights}")

    cost_sensitive_models = {}

    # 1. Weighted Random Forest
    rf_weighted = RandomForestClassifier(
        n_estimators=200,
        class_weight=class_weights,
        random_state=42,
        max_depth=20,
        min_samples_split=5
    )
    rf_weighted.fit(X_train, y_train)
    cost_sensitive_models['rf_weighted'] = rf_weighted

    # 2. Weighted XGBoost
    scale_pos_weight = class_weights[1]
    xgb_weighted = xgb.XGBClassifier(
        n_estimators=200,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        max_depth=6,
        learning_rate=0.1
    )
    xgb_weighted.fit(X_train, y_train)
    cost_sensitive_models['xgb_weighted'] = xgb_weighted

    # 3. Custom cost matrix implementation
    class CostSensitiveClassifier:
        """Wrap a classifier and re-weight its probabilities by error costs."""

        def __init__(self, base_model, cost_matrix):
            self.base_model = base_model
            self.cost_matrix = cost_matrix  # [[TN_cost, FP_cost], [FN_cost, TP_cost]]

        def fit(self, X, y):
            self.base_model.fit(X, y)
            return self

        def predict_proba(self, X):
            base_probs = self.base_model.predict_proba(X)

            fn_cost = self.cost_matrix[1][0]  # Cost of missing fraud
            fp_cost = self.cost_matrix[0][1]  # Cost of false positive

            # Weight each class probability by the cost of getting that class
            # wrong and renormalise. The adjusted fraud probability reaches
            # 0.5 exactly when the base probability reaches
            # fp_cost / (fp_cost + fn_cost), and the ordering of the scores is
            # kept. Multiplying by a constant and clipping would instead
            # collapse every score above roughly 0.5 to 1.0.
            fraud_prob = base_probs[:, 1]
            weighted_fraud = fn_cost * fraud_prob
            weighted_normal = fp_cost * (1 - fraud_prob)
            total = weighted_fraud + weighted_normal

            # Where both weighted terms are zero (only possible with zero
            # costs) the base probability is kept unchanged.
            adjusted_fraud_prob = np.divide(
                weighted_fraud, total, out=fraud_prob.astype(float), where=total > 0
            )
            adjusted_normal_prob = 1 - adjusted_fraud_prob

            return np.column_stack([adjusted_normal_prob, adjusted_fraud_prob])

    # Define cost matrix: [[TN, FP], [FN, TP]]
    cost_matrix = [[0, 1], [50, 0]]  # Missing fraud costs 50x more than false positive

    base_rf = RandomForestClassifier(n_estimators=100, random_state=42)
    cost_sensitive_rf = CostSensitiveClassifier(base_rf, cost_matrix)
    cost_sensitive_rf.fit(X_train, y_train)
    cost_sensitive_models['cost_sensitive'] = cost_sensitive_rf

    return cost_sensitive_models, class_weights

# =============================================================================
# COMPREHENSIVE OPTIMIZATION PIPELINE
# =============================================================================

def comprehensive_model_optimization(df, target_column='Class'):
    """Run the complete optimization pipeline on a labelled transaction frame.

    Steps: stratified train/validation/test split, feature engineering,
    scaler selection, sampler selection and resampling, hyperparameter
    search, ensemble construction, feature-selection comparison and
    cost-sensitive model training.

    The split is computed on row *positions*, so the engineered frame is
    sliced correctly whatever index ``df`` carries (a filtered or shuffled
    frame no longer picks the wrong rows).

    Args:
        df: DataFrame holding the features and the label column.
        target_column: name of the binary label column.

    Returns:
        tuple: ``(optimization_results, artifacts)``. ``optimization_results``
        summarises each step; ``artifacts`` holds the scaled train/val/test
        matrices, the matching labels, the fitted scaler, the chosen sampler
        and a ``'models'`` dict.
    """
    from sklearn.model_selection import train_test_split

    print("STARTING COMPREHENSIVE MODEL OPTIMIZATION")
    print("=" * 60)

    # Prepare data
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Split row positions (70/30, then 80/20 of the training part)
    positions = np.arange(len(df))
    train_pos, test_pos = train_test_split(
        positions, test_size=0.3, stratify=y, random_state=42
    )
    train_pos, val_pos = train_test_split(
        train_pos, test_size=0.2, stratify=y.iloc[train_pos], random_state=42
    )

    X_train = X.iloc[train_pos]
    y_train = y.iloc[train_pos]
    y_val = y.iloc[val_pos]
    y_test = y.iloc[test_pos]

    # 1. Advanced Feature Engineering
    # NOTE(review): features are derived from the full frame before it is
    # sliced, so frame-wide statistics such as the amount percentile rank see
    # validation and test rows - confirm this is acceptable.
    df_enhanced = advanced_feature_engineering(df)
    X_enhanced = df_enhanced.drop(columns=[target_column])

    # Slice the enhanced features with the same positional split
    X_train_enh = X_enhanced.iloc[train_pos]
    X_val_enh = X_enhanced.iloc[val_pos]
    X_test_enh = X_enhanced.iloc[test_pos]

    # 2. Scaling optimization
    scalers = {
        'standard': StandardScaler(),
        'robust': RobustScaler(),
        'minmax': MinMaxScaler()
    }

    best_scaler = None
    # -inf guarantees that the first scaler is always accepted
    best_scaler_score = float('-inf')

    for scaler_name, scaler in scalers.items():
        X_train_scaled = scaler.fit_transform(X_train_enh)
        X_val_scaled = scaler.transform(X_val_enh)

        # Quick evaluation
        rf_temp = RandomForestClassifier(n_estimators=50, random_state=42, class_weight='balanced')
        rf_temp.fit(X_train_scaled, y_train)
        score = rf_temp.score(X_val_scaled, y_val)

        if score > best_scaler_score:
            best_scaler_score = score
            best_scaler = scaler

    X_train_final = best_scaler.fit_transform(X_train_enh)
    X_val_final = best_scaler.transform(X_val_enh)
    X_test_final = best_scaler.transform(X_test_enh)

    # 3. Sampling optimization
    best_sampler, sampling_results = optimize_sampling_strategy(X_train_final, y_train)
    X_train_sampled, y_train_sampled = best_sampler.fit_resample(X_train_final, y_train)

    # 4. Hyperparameter optimization
    best_models = optimize_model_hyperparameters(X_train_sampled, y_train_sampled, X_val_final, y_val)

    # 5. Ensemble creation
    ensemble_models = create_optimized_ensemble(best_models, X_train_sampled, y_train_sampled)

    # 6. Feature selection
    feature_selection_results = optimize_feature_selection(X_train_sampled, y_train_sampled, X_val_final, y_val)

    # 7. Cost-sensitive learning
    cost_models, class_weights = implement_cost_sensitive_learning(X_train_sampled, y_train_sampled)

    # Compile results
    optimization_results = {
        'enhanced_features': df_enhanced.shape[1] - df.shape[1],
        'best_scaler': type(best_scaler).__name__,
        'sampling_strategy': type(best_sampler).__name__,
        'best_models': best_models,
        'ensemble_models': list(ensemble_models.keys()),
        'feature_selection': feature_selection_results,
        'cost_sensitive_models': list(cost_models.keys()),
        'class_weights': class_weights,
        'final_data_shapes': {
            'original': X_train.shape,
            'enhanced': X_train_enh.shape,
            'sampled': X_train_sampled.shape
        }
    }

    print("\n" + "="*60)
    print("OPTIMIZATION COMPLETE - SUMMARY")
    print("="*60)
    print(f"✓ Feature engineering: Added {optimization_results['enhanced_features']} features")
    print(f"✓ Best scaler: {optimization_results['best_scaler']}")
    print(f"✓ Sampling strategy: {optimization_results['sampling_strategy']}")
    print(f"✓ Optimized models: {len(best_models)}")
    print(f"✓ Ensemble strategies: {len(ensemble_models)}")
    print(f"✓ Feature selection methods: {len(feature_selection_results)}")
    print(f"✓ Cost-sensitive models: {len(cost_models)}")

    # NOTE(review): in 'models' the best_models entries are info dicts (the
    # estimator sits under their 'model' key) while the ensemble and
    # cost-sensitive entries are estimator objects - confirm that consumers
    # of this mapping handle both shapes.
    return optimization_results, {
        'X_train': X_train_final,
        'X_val': X_val_final,
        'X_test': X_test_final,
        'y_train': y_train,
        'y_val': y_val,
        'y_test': y_test,
        'scaler': best_scaler,
        'sampler': best_sampler,
        'models': {**best_models, **ensemble_models, **cost_models}
    }

# =============================================================================
# STRATEGY 8: DYNAMIC THRESHOLD ADJUSTMENT
# =============================================================================

def implement_dynamic_thresholds(models, X_val, y_val, scaler):
    """Build a classifier whose risk thresholds adapt to hour and amount.

    A ``DynamicThresholdClassifier`` is created around ``models`` and
    ``scaler`` and its adjustment factors are learned from the validation
    data. Factors can only be learned when ``X_val`` is a DataFrame with an
    ``Hour`` and/or ``Amount`` column; otherwise the base thresholds are
    used unchanged.

    Args:
        models: passed through to ``calculate_risk_scores`` at prediction time.
        X_val: validation features.
        y_val: validation labels (1 marks fraud), row-aligned with ``X_val``.
        scaler: passed through to ``calculate_risk_scores`` at prediction time.

    Returns:
        The fitted ``DynamicThresholdClassifier``.
    """

    print("\n8. DYNAMIC THRESHOLD ADJUSTMENT")
    print("-" * 40)

    class DynamicThresholdClassifier:
        """Three-band risk classifier with per-row threshold scaling."""

        def __init__(self, base_models, scaler, base_low_thresh=0.3, base_high_thresh=0.7):
            self.base_models = base_models
            self.scaler = scaler
            self.base_low_thresh = base_low_thresh
            self.base_high_thresh = base_high_thresh
            # 'time': {hour: factor}; 'amount': {(low, high): factor}
            self.adjustment_factors = {}

        def fit_adjustments(self, X, y):
            """Learn threshold scaling factors per hour and per amount band.

            A band whose fraud rate is above the overall rate gets a factor
            below 1 (lower thresholds, more sensitive) and vice versa. Only
            the labels and the ``Hour``/``Amount`` columns are needed, so no
            risk scores are computed here.
            """
            if not hasattr(X, 'columns'):
                return self

            # Positional view of the labels: rows of X and y pair up by order
            labels = np.asarray(y)
            avg_fraud_rate = labels.mean() if len(labels) else 0.0
            if avg_fraud_rate == 0:
                # Without any fraud the relative rates are undefined; leave
                # the thresholds unadjusted instead of storing NaN factors.
                return self

            # Time-based adjustments
            if 'Hour' in X.columns:
                hours = X['Hour'].to_numpy()
                time_factors = {}
                for hour in range(24):
                    in_hour = hours == hour
                    if in_hour.any():
                        rate = labels[in_hour].mean()
                        # Higher fraud rate hours get lower thresholds (more sensitive)
                        adjustment = 1.0 - ((rate / avg_fraud_rate) - 1.0) * 0.2
                        time_factors[hour] = np.clip(adjustment, 0.7, 1.3)
                self.adjustment_factors['time'] = time_factors

            # Amount-based adjustments
            if 'Amount' in X.columns:
                amounts = X['Amount'].to_numpy()
                edges = np.percentile(amounts, [0, 10, 50, 90, 99, 100])
                # Bands are half-open [low, high); the top band is left
                # open-ended so the largest amount (and larger amounts seen
                # later) still falls into a band.
                edges[-1] = np.inf

                amount_factors = {}
                for low_amt, high_amt in zip(edges[:-1], edges[1:]):
                    in_band = (amounts >= low_amt) & (amounts < high_amt)
                    if in_band.any():
                        amt_fraud_rate = labels[in_band].mean()
                        adjustment = 1.0 - ((amt_fraud_rate / avg_fraud_rate) - 1.0) * 0.15
                        amount_factors[(low_amt, high_amt)] = np.clip(adjustment, 0.8, 1.2)
                self.adjustment_factors['amount'] = amount_factors

            return self

        def predict_risk_levels(self, X):
            """Classify rows as low/medium/high risk with adjusted thresholds.

            Returns:
                tuple: ``(risk_levels, actions, details)`` - two lists with
                one entry per risk score, and a dict holding the risk scores
                and the per-row adjusted thresholds.
            """
            # Get base risk scores
            # NOTE(review): presumably one score per row of X - confirm
            # against calculate_risk_scores.
            risk_scores, _ = calculate_risk_scores(X, self.base_models, self.scaler)

            n_rows = len(X)
            adjustment = np.ones(n_rows)
            is_frame = hasattr(X, 'iloc')

            # Time adjustment (hour 12 is assumed when the column is missing)
            if 'time' in self.adjustment_factors and is_frame:
                time_factors = self.adjustment_factors['time']
                if 'Hour' in X.columns:
                    adjustment *= np.array(
                        [time_factors.get(hour, 1.0) for hour in X['Hour'].to_numpy()],
                        dtype=float,
                    )
                else:
                    adjustment *= time_factors.get(12, 1.0)

            # Amount adjustment (amount 0 is assumed when the column is
            # missing); each row takes the factor of the first band containing it
            if 'amount' in self.adjustment_factors and is_frame:
                if 'Amount' in X.columns:
                    amounts = X['Amount'].to_numpy()
                else:
                    amounts = np.zeros(n_rows)

                unassigned = np.ones(n_rows, dtype=bool)
                for (low_amt, high_amt), amt_adj in self.adjustment_factors['amount'].items():
                    hit = unassigned & (amounts >= low_amt) & (amounts < high_amt)
                    adjustment[hit] *= amt_adj
                    unassigned &= ~hit

            adjusted_low_thresh = self.base_low_thresh * adjustment
            adjusted_high_thresh = self.base_high_thresh * adjustment

            # Classify based on adjusted thresholds
            risk_levels = []
            actions = []

            for score, low_cut, high_cut in zip(risk_scores, adjusted_low_thresh, adjusted_high_thresh):
                if score < low_cut:
                    risk_levels.append('low')
                    actions.append('APPROVE')
                elif score < high_cut:
                    risk_levels.append('medium')
                    actions.append('DECLINE_VALIDATE')
                else:
                    risk_levels.append('high')
                    actions.append('DECLINE_ALERT')

            return risk_levels, actions, {
                'risk_scores': risk_scores,
                'adjusted_low_thresh': adjusted_low_thresh,
                'adjusted_high_thresh': adjusted_high_thresh
            }

    # Create and train dynamic classifier
    dynamic_classifier = DynamicThresholdClassifier(models, scaler)
    dynamic_classifier.fit_adjustments(X_val, y_val)

    return dynamic_classifier

# =============================================================================
# STRATEGY 9: REAL-TIME MODEL MONITORING
# =============================================================================

def setup_model_monitoring():
    """Set up the framework for real-time model monitoring.

    Prints a short status banner and builds a ``ModelMonitor`` configured
    with the business target metrics.

    Returns:
        ModelMonitor: monitor exposing ``log_performance``, ``check_alerts``,
        ``calculate_drift`` and ``generate_monitoring_report``.
    """

    print("\n9. REAL-TIME MODEL MONITORING SETUP")
    print("-" * 40)

    class ModelMonitor:
        """Record metric snapshots, flag threshold breaches and measure drift."""

        def __init__(self, target_metrics):
            # Kept for reference only; alerting is driven by alert_thresholds.
            self.target_metrics = target_metrics
            self.performance_history = []
            self.alert_thresholds = {
                'decline_rate': 0.32,  # Alert if > 32% (above 30% target)
                'agent_alert_rate': 0.0012,  # Alert if > 0.12% (above 0.1% target)
                'missed_fraud_rate': 0.025,  # Alert if > 2.5% (above 2% target)
                'model_drift': 0.05  # Alert if performance drops > 5%
            }

        def log_performance(self, timestamp, metrics):
            """Append one snapshot (``timestamp`` plus the ``metrics`` mapping)."""
            self.performance_history.append({
                'timestamp': timestamp,
                **metrics
            })

        def check_alerts(self, current_metrics):
            """Return one alert dict per metric that exceeds its threshold.

            Metrics missing from ``current_metrics`` are ignored. Severity is
            'HIGH' when the value is more than 20% above the threshold,
            otherwise 'MEDIUM'.
            """
            alerts = []

            for metric, threshold in self.alert_thresholds.items():
                if metric not in current_metrics:
                    continue
                value = current_metrics[metric]
                if value > threshold:
                    alerts.append({
                        'metric': metric,
                        'current_value': value,
                        'threshold': threshold,
                        'severity': 'HIGH' if value > threshold * 1.2 else 'MEDIUM'
                    })

            return alerts

        @staticmethod
        def _calculate_psi(expected, actual, buckets=10):
            """Population Stability Index between two score samples.

            Both samples are binned on ``buckets`` equal-width bins spanning
            their combined range. Empty bins are floored at 0.0001 so the
            logarithm stays finite.

            Raises:
                ValueError: if either sample is empty.
            """
            expected = np.asarray(expected, dtype=float).ravel()
            actual = np.asarray(actual, dtype=float).ravel()
            if expected.size == 0 or actual.size == 0:
                raise ValueError("PSI requires at least one prediction in each sample")

            # Shared bucket edges so both histograms are directly comparable
            min_val = min(expected.min(), actual.min())
            max_val = max(expected.max(), actual.max())
            bucket_bounds = np.linspace(min_val, max_val, buckets + 1)

            expected_dist = np.histogram(expected, bins=bucket_bounds)[0] / expected.size
            actual_dist = np.histogram(actual, bins=bucket_bounds)[0] / actual.size

            # Add small constant to avoid division by zero / log(0)
            expected_dist = np.where(expected_dist == 0, 0.0001, expected_dist)
            actual_dist = np.where(actual_dist == 0, 0.0001, actual_dist)

            return float(np.sum((actual_dist - expected_dist) * np.log(actual_dist / expected_dist)))

        def calculate_drift(self, new_predictions, baseline_predictions):
            """Calculate model drift using PSI (Population Stability Index).

            Returns:
                dict: ``{'psi': float, 'drift_level': 'LOW'|'MEDIUM'|'HIGH'}``
                where PSI > 0.25 is HIGH and PSI > 0.1 is MEDIUM.

            Raises:
                ValueError: if either prediction sample is empty.
            """
            psi = self._calculate_psi(baseline_predictions, new_predictions)

            if psi > 0.25:
                drift_level = 'HIGH'
            elif psi > 0.1:
                drift_level = 'MEDIUM'
            else:
                drift_level = 'LOW'

            return {'psi': psi, 'drift_level': drift_level}

        @staticmethod
        def _format_value(value):
            """Render a metric with 4 decimals; non-numeric values fall back to ``str``."""
            try:
                return f"{value:.4f}"
            except (TypeError, ValueError):
                return str(value)

        def generate_monitoring_report(self):
            """Build a text report from the last 10 logged snapshots.

            The report lists the latest metrics, any alerts they trigger, and
            the direction of change (first vs. last of the recent window) for
            the three business metrics.
            """
            if not self.performance_history:
                return "No performance history available"

            recent_performance = self.performance_history[-10:]  # Last 10 records
            latest = recent_performance[-1]

            parts = [
                "MODEL PERFORMANCE MONITORING REPORT\n",
                "=" * 50 + "\n\n",
                f"LATEST PERFORMANCE (Timestamp: {latest['timestamp']}):\n",
            ]

            # Current metrics (extra, non-numeric fields are printed verbatim)
            for key, value in latest.items():
                if key != 'timestamp':
                    parts.append(f"  {key}: {self._format_value(value)}\n")

            # Check alerts
            alerts = self.check_alerts(latest)
            if alerts:
                parts.append("\n🚨 ALERTS:\n")
                for alert in alerts:
                    parts.append(
                        f"  {alert['severity']}: {alert['metric']} = {alert['current_value']:.4f} (threshold: {alert['threshold']:.4f})\n"
                    )
            else:
                parts.append("\n✅ All metrics within acceptable ranges\n")

            # Trends
            if len(recent_performance) > 1:
                parts.append("\nTREND ANALYSIS:\n")
                for metric in ['decline_rate', 'agent_alert_rate', 'missed_fraud_rate']:
                    if metric not in recent_performance[0]:
                        continue
                    values = [record[metric] for record in recent_performance if metric in record]
                    if len(values) < 2:
                        continue
                    delta = values[-1] - values[0]
                    if delta > 0:
                        trend = "📈 INCREASING"
                    elif delta < 0:
                        trend = "📉 DECREASING"
                    else:
                        # An unchanged metric must not be reported as a decrease
                        trend = "➡️ STABLE"
                    parts.append(f"  {metric}: {trend} (Δ {abs(delta):.4f})\n")

            return "".join(parts)

    # Initialize monitor with target metrics
    target_metrics = {
        'decline_rate': 0.30,
        'agent_alert_rate': 0.001,
        'missed_fraud_rate': 0.02
    }

    monitor = ModelMonitor(target_metrics)

    print("✓ Model monitoring framework initialized")
    print("✓ Alert thresholds configured")
    print("✓ Drift detection methods ready")
    print("✓ Reporting system active")

    return monitor

# =============================================================================
# STRATEGY 10: A/B TESTING FRAMEWORK
# =============================================================================

def setup_ab_testing_framework():
    """Set up the A/B testing framework for model deployment.

    Prints a short status banner and returns a fresh ``ABTestingFramework``
    that can create experiments, assign traffic deterministically, log
    per-transaction outcomes and test the two arms for significant
    differences in the business metrics.
    """

    print("\n10. A/B TESTING FRAMEWORK")
    print("-" * 40)

    class ABTestingFramework:
        """In-memory registry of A/B experiments and their logged results."""

        def __init__(self):
            self.experiments = {}
            self.results = {}

        def create_experiment(self, experiment_name, model_a, model_b, traffic_split=0.5):
            """Create (or overwrite) an experiment.

            Args:
                experiment_name: key under which the experiment is stored.
                model_a: model serving arm A.
                model_b: model serving arm B.
                traffic_split: fraction of traffic routed to arm A, in [0, 1].

            Raises:
                ValueError: if ``traffic_split`` is outside [0, 1].
            """
            if not 0 <= traffic_split <= 1:
                raise ValueError(f"traffic_split must be within [0, 1], got {traffic_split!r}")

            self.experiments[experiment_name] = {
                'model_a': model_a,
                'model_b': model_b,
                'traffic_split': traffic_split,
                'transactions_a': [],
                'transactions_b': [],
                'results_a': [],
                'results_b': []
            }

            # round() strips float noise such as 30.000000000000004
            share_a = round(traffic_split * 100, 6)
            share_b = round((1 - traffic_split) * 100, 6)
            print(f"✓ Experiment '{experiment_name}' created with {share_a}% / {share_b}% split")

        def assign_traffic(self, experiment_name, transaction_id):
            """Return 'A' or 'B' for a transaction, stable across calls.

            The MD5 digest of the transaction id is reduced to a bucket in
            0..99, so the split is honoured at 1% granularity. MD5 is used
            only for bucketing, not for security.
            """

            import hashlib

            # Use transaction ID hash for consistent assignment
            hash_val = int(hashlib.md5(str(transaction_id).encode()).hexdigest(), 16)
            split_point = self.experiments[experiment_name]['traffic_split']

            return 'A' if (hash_val % 100) / 100 < split_point else 'B'

        def log_result(self, experiment_name, transaction_id, model_used, prediction, actual_outcome, business_metrics):
            """Record the outcome of one transaction under the arm that served it.

            ``business_metrics`` is expected to carry an ``'action'`` entry
            (read later by the metric calculation). Any ``model_used`` other
            than 'A' is booked under arm B.
            """

            experiment = self.experiments[experiment_name]

            result = {
                'transaction_id': transaction_id,
                'prediction': prediction,
                'actual_outcome': actual_outcome,
                'business_metrics': business_metrics,
                'timestamp': pd.Timestamp.now()
            }

            arm = 'a' if model_used == 'A' else 'b'
            experiment[f'transactions_{arm}'].append(transaction_id)
            experiment[f'results_{arm}'].append(result)

        @staticmethod
        def _proportion_p_value(events_a, n_a, events_b, n_b):
            """P-value of a chi-square test on the 2x2 table of two proportions.

            Returns 1.0 (no evidence of a difference) when the table is
            degenerate - an arm has no observations, or no arm has any event
            / any non-event - because ``chi2_contingency`` raises
            ``ValueError`` on tables with a zero expected frequency.
            """
            from scipy import stats

            non_events_a = n_a - events_a
            non_events_b = n_b - events_b
            if (n_a == 0 or n_b == 0
                    or events_a + events_b == 0
                    or non_events_a + non_events_b == 0):
                return 1.0

            return float(stats.chi2_contingency([
                [events_a, non_events_a],
                [events_b, non_events_b]
            ])[1])

        def calculate_experiment_results(self, experiment_name, min_sample_size=1000):
            """Compare both arms and test each business metric for significance.

            Returns an ``INSUFFICIENT_DATA`` status dict until both arms hold
            at least ``min_sample_size`` results; otherwise a ``COMPLETE``
            dict with per-arm metrics, per-metric significance tests
            (alpha = 0.05) and a textual recommendation.
            """

            experiment = self.experiments[experiment_name]
            results_a = experiment['results_a']
            results_b = experiment['results_b']

            if len(results_a) < min_sample_size or len(results_b) < min_sample_size:
                return {
                    'status': 'INSUFFICIENT_DATA',
                    'sample_size_a': len(results_a),
                    'sample_size_b': len(results_b),
                    'min_required': min_sample_size
                }

            # Calculate key metrics for both groups
            metrics_a = self._calculate_group_metrics(results_a)
            metrics_b = self._calculate_group_metrics(results_b)

            significance_results = {}

            for metric in ['decline_rate', 'agent_alert_rate', 'missed_fraud_rate']:
                if metric not in metrics_a or metric not in metrics_b:
                    continue

                # missed_fraud_rate is a share of frauds, the other rates are a
                # share of all transactions; the test must use the same base.
                base = 'total_frauds' if metric == 'missed_fraud_rate' else 'total_transactions'
                n_a = metrics_a[base]
                n_b = metrics_b[base]
                # rate * base recovers the integer event count up to float noise
                events_a = int(round(metrics_a[metric] * n_a))
                events_b = int(round(metrics_b[metric] * n_b))

                p_value = self._proportion_p_value(events_a, n_a, events_b, n_b)

                significance_results[metric] = {
                    'model_a': metrics_a[metric],
                    'model_b': metrics_b[metric],
                    'difference': metrics_b[metric] - metrics_a[metric],
                    'p_value': p_value,
                    'significant': p_value < 0.05
                }

            return {
                'status': 'COMPLETE',
                'sample_size_a': len(results_a),
                'sample_size_b': len(results_b),
                'metrics_a': metrics_a,
                'metrics_b': metrics_b,
                'significance_tests': significance_results,
                'recommendation': self._generate_recommendation(significance_results)
            }

        def _calculate_group_metrics(self, results):
            """Aggregate logged results of one arm into business metrics.

            Returns an empty dict for an empty group. ``decline_rate`` and
            ``agent_alert_rate`` are fractions of all transactions;
            ``missed_fraud_rate`` is the fraction of actual frauds
            (``actual_outcome == 1``) that were approved, or 0 when the group
            contains no fraud.
            """

            if not results:
                return {}

            total_transactions = len(results)
            actions = [r['business_metrics'].get('action') for r in results]
            declined = sum(1 for action in actions if action in ('DECLINE_VALIDATE', 'DECLINE_ALERT'))
            agent_alerts = sum(1 for action in actions if action == 'DECLINE_ALERT')

            # Calculate fraud metrics
            actual_frauds = sum(1 for r in results if r['actual_outcome'] == 1)
            missed_frauds = sum(
                1 for r, action in zip(results, actions)
                if r['actual_outcome'] == 1 and action == 'APPROVE'
            )

            return {
                'decline_rate': declined / total_transactions,
                'agent_alert_rate': agent_alerts / total_transactions,
                'missed_fraud_rate': missed_frauds / actual_frauds if actual_frauds > 0 else 0,
                'total_transactions': total_transactions,
                'total_frauds': actual_frauds
            }

        def _generate_recommendation(self, significance_results):
            """Turn the significance tests into a deploy / keep / undecided verdict.

            Only significant metrics are counted. A lower value for model B
            counts in its favour. For the decline and alert rates the
            difference must additionally exceed 0.01 in absolute terms.
            """

            if not significance_results:
                return "Insufficient data for recommendation"

            # Check if Model B is significantly better
            better_metrics = 0
            worse_metrics = 0

            for metric, result in significance_results.items():
                if not result['significant']:
                    continue
                # Lower is generally better for decline and alert rates, but
                # small shifts are ignored to balance business constraints.
                # NOTE(review): 0.01 is large relative to the 0.001 alert-rate
                # target, so alert-rate changes will rarely count - confirm.
                if metric != 'missed_fraud_rate' and abs(result['difference']) <= 0.01:
                    continue
                if result['difference'] < 0:
                    better_metrics += 1
                else:
                    worse_metrics += 1

            if better_metrics > worse_metrics:
                return "DEPLOY MODEL B - Significantly better performance"
            elif worse_metrics > better_metrics:
                return "KEEP MODEL A - Model B performs worse"
            else:
                return "NO CLEAR WINNER - Consider longer test or different models"

    # Initialize A/B testing framework
    ab_framework = ABTestingFramework()

    print("✓ A/B testing framework initialized")
    print("✓ Statistical significance testing ready")
    print("✓ Business metrics tracking configured")
    print("✓ Recommendation engine active")

    return ab_framework

# =============================================================================
# FINAL INTEGRATION AND RECOMMENDATIONS
# =============================================================================

def generate_optimization_recommendations():
    """Print the prioritized optimization roadmap and hand it back.

    Writes five recommendation sections, followed by the list of success
    metrics and a closing note, to stdout.

    Returns:
        dict: category name -> list of five recommendation strings, with keys
        'immediate_actions', 'model_improvements', 'business_alignment',
        'technical_infrastructure' and 'risk_management'.
    """
    banner = "=" * 70

    print("\n" + banner)
    print("COMPREHENSIVE OPTIMIZATION RECOMMENDATIONS")
    print(banner)

    recommendations = {
        'immediate_actions': [
            "1. Implement advanced feature engineering (time-based, amount-based, PCA combinations)",
            "2. Apply SMOTE or BorderlineSMOTE sampling to handle class imbalance",
            "3. Use ensemble methods (XGBoost + RandomForest + LightGBM)",
            "4. Implement cost-sensitive learning with 50:1 fraud penalty ratio",
            "5. Optimize thresholds using grid search with business constraints",
        ],
        'model_improvements': [
            "1. Use gradient boosting models (XGBoost/LightGBM) for better performance",
            "2. Implement dynamic threshold adjustment based on time/amount patterns",
            "3. Create weighted ensemble with performance-based voting",
            "4. Apply recursive feature elimination to reduce overfitting",
            "5. Use RobustScaler for better handling of outliers",
        ],
        'business_alignment': [
            "1. Implement three-tier risk classification system",
            "2. Set up real-time monitoring for constraint violations",
            "3. Create A/B testing framework for model deployment",
            "4. Establish feedback loops for continuous learning",
            "5. Implement customer validation workflows for medium-risk transactions",
        ],
        'technical_infrastructure': [
            "1. Deploy real-time scoring API with < 100ms latency",
            "2. Implement model versioning and rollback capabilities",
            "3. Set up automated retraining pipelines (weekly/monthly)",
            "4. Create monitoring dashboards for business stakeholders",
            "5. Implement data drift detection and alerting",
        ],
        'risk_management': [
            "1. Establish model governance and approval processes",
            "2. Implement challenger model framework",
            "3. Create escalation procedures for high-risk transactions",
            "4. Set up regular model validation and backtesting",
            "5. Implement fairness and bias monitoring",
        ],
    }

    # Heading shown above each category, in display order
    section_headings = (
        ("\n🎯 IMMEDIATE ACTIONS (Next 2 weeks):", 'immediate_actions'),
        ("\n🔧 MODEL IMPROVEMENTS (Next month):", 'model_improvements'),
        ("\n💼 BUSINESS ALIGNMENT (Next quarter):", 'business_alignment'),
        ("\n🏗️ TECHNICAL INFRASTRUCTURE (Next 3 months):", 'technical_infrastructure'),
        ("\n⚖️ RISK MANAGEMENT (Ongoing):", 'risk_management'),
    )
    for heading, category in section_headings:
        print(heading)
        for entry in recommendations[category]:
            print(f"   {entry}")

    print("\n" + banner)
    print("SUCCESS METRICS TO TRACK:")
    print(banner)

    success_metrics = (
        "✅ Decline Rate: Currently vs Target ≤ 30%",
        "✅ Agent Alert Rate: Currently vs Target < 0.1%",
        "✅ Missed Fraud Rate: Currently vs Target ≤ 2%",
        "📈 Model Performance: AUC-ROC improvement",
        "💰 Business Impact: Cost reduction from automation",
        "⏱️ Processing Speed: Average decision time",
        "🎯 Customer Satisfaction: Reduced false positives",
        "🔍 Model Stability: Prediction consistency over time",
    )
    for tracked in success_metrics:
        print(f"   {tracked}")

    # Closing note framed by the same banner
    print(f"\n{banner}")
    print("Remember: Optimization is an iterative process!")
    print("Start with immediate actions and gradually implement advanced features.")
    print(banner)

    return recommendations

# Example usage:
if __name__ == "__main__":
    # Usage hints shown only when the file is executed directly.
    print(
        "To use these optimization strategies:",
        "1. Run comprehensive_model_optimization(your_dataframe)",
        "2. Implement the recommendations based on priority",
        "3. Set up monitoring and A/B testing frameworks",
        "4. Continuously iterate based on performance feedback",
        sep="\n",
    )

    # Print the full roadmap and keep the returned mapping at module level.
    recommendations = generate_optimization_recommendations()