# Classic approach

* Synthetic data (Faker)
* Defined number and types of models
* (simple) EDA
* Simple feature selection

In [3]:
import pandas as pd
import numpy as np
from faker import Faker
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
warnings.filterwarnings('ignore')

class FraudDetectionAutoML:
    def __init__(self, n_samples=500, n_fraud=50, random_state=42):
        """
        Initialize the Fraud Detection AutoML system
        
        Args:
            n_samples: Total number of samples to generate
            n_fraud: Number of fraud cases (positive class)
            random_state: Random seed for reproducibility
        """
        self.n_samples = n_samples
        self.n_fraud = n_fraud
        self.n_normal = n_samples - n_fraud
        self.random_state = random_state
        self.fake = Faker()
        Faker.seed(random_state)
        np.random.seed(random_state)
        
        self.data = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.scaler = StandardScaler()
        self.models = {}
        self.results = {}
        
    def generate_fraud_data(self):
        """Generate synthetic fraud detection dataset using Faker"""
        print("Generating synthetic fraud detection dataset...")
        
        data = []
        
        # Generate normal transactions (0 = not fraud)
        for i in range(self.n_normal):
            record = {
                'transaction_id': self.fake.uuid4(),
                'amount': np.random.normal(100, 50),  # Normal spending pattern
                'merchant_category': np.random.choice(['grocery', 'gas', 'restaurant', 'retail', 'online'], 
                                                    p=[0.3, 0.2, 0.2, 0.2, 0.1]),
                'hour_of_day': np.random.choice(range(6, 23), p=self._get_normal_hour_prob()),
                'day_of_week': np.random.randint(0, 7),
                'age': np.random.randint(18, 80),
                'account_age_days': np.random.randint(365, 3650),  # 1-10 years
                'num_transactions_today': np.random.poisson(3),
                'avg_amount_last_30_days': np.random.normal(95, 30),
                'location_risk_score': np.random.beta(2, 8),  # Lower risk for normal
                'device_risk_score': np.random.beta(2, 8),
                'is_weekend': 0,
                'is_fraud': 0
            }
            
            # Set weekend flag
            if record['day_of_week'] in [5, 6]:
                record['is_weekend'] = 1
                
            data.append(record)
        
        # Generate fraud transactions (1 = fraud)
        for i in range(self.n_fraud):
            record = {
                'transaction_id': self.fake.uuid4(),
                'amount': np.random.choice([
                    np.random.normal(500, 200),  # High amount fraud
                    np.random.normal(50, 20),    # Small amount fraud
                    np.random.normal(1000, 300)  # Very high amount fraud
                ], p=[0.5, 0.3, 0.2]),
                'merchant_category': np.random.choice(['online', 'atm', 'unknown', 'retail', 'gas'], 
                                                    p=[0.4, 0.2, 0.2, 0.1, 0.1]),
                'hour_of_day': np.random.choice(range(0, 24), p=self._get_fraud_hour_prob()),
                'day_of_week': np.random.randint(0, 7),
                'age': np.random.randint(18, 80),
                'account_age_days': np.random.choice([
                    np.random.randint(1, 90),     # New accounts (higher risk)
                    np.random.randint(90, 3650)   # Older accounts
                ], p=[0.7, 0.3]),
                'num_transactions_today': np.random.choice([
                    np.random.poisson(1),         # Few transactions
                    np.random.poisson(10)         # Many transactions (velocity)
                ], p=[0.6, 0.4]),
                'avg_amount_last_30_days': np.random.normal(150, 100),
                'location_risk_score': np.random.beta(5, 3),  # Higher risk for fraud
                'device_risk_score': np.random.beta(6, 2),
                'is_weekend': 0,
                'is_fraud': 1
            }
            
            # Set weekend flag
            if record['day_of_week'] in [5, 6]:
                record['is_weekend'] = 1
                
            data.append(record)
        
        # Convert to DataFrame and shuffle
        self.data = pd.DataFrame(data)
        self.data = self.data.sample(frac=1, random_state=self.random_state).reset_index(drop=True)
        
        print(f"Dataset generated successfully!")
        print(f"Total samples: {len(self.data)}")
        print(f"Fraud cases: {self.data['is_fraud'].sum()}")
        print(f"Normal cases: {len(self.data) - self.data['is_fraud'].sum()}")
        print(f"Fraud ratio: {self.data['is_fraud'].mean():.2%}")
        
        return self.data
    
    def _get_normal_hour_prob(self):
        """Get probability distribution for normal transaction hours"""
        # Normal transactions more likely during business hours
        probs = np.ones(17) * 0.05  # 6-22 hours
        probs[2:10] = 0.08  # 8-16 hours (business hours)
        probs[10:15] = 0.06  # 16-21 hours (evening)
        return probs / probs.sum()
    
    def _get_fraud_hour_prob(self):
        """Get probability distribution for fraud transaction hours"""
        # Fraud more likely during off-hours
        probs = np.ones(24) * 0.03
        probs[0:6] = 0.06   # Late night/early morning
        probs[22:24] = 0.05  # Late evening
        probs[8:17] = 0.02   # Business hours (less likely)
        return probs / probs.sum()
    
    def preprocess_data(self):
        """Preprocess the data for machine learning"""
        print("Preprocessing data...")
        
        # Encode categorical variables
        le = LabelEncoder()
        self.data['merchant_category_encoded'] = le.fit_transform(self.data['merchant_category'])
        
        # Select features for modeling
        feature_cols = [
            'amount', 'merchant_category_encoded', 'hour_of_day', 'day_of_week',
            'age', 'account_age_days', 'num_transactions_today', 
            'avg_amount_last_30_days', 'location_risk_score', 'device_risk_score',
            'is_weekend'
        ]
        
        X = self.data[feature_cols]
        y = self.data['is_fraud']
        
        # Split the data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=self.random_state, stratify=y
        )
        
        # Scale the features
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)
        
        print(f"Training set: {len(self.X_train)} samples")
        print(f"Test set: {len(self.X_test)} samples")
        print(f"Training fraud ratio: {self.y_train.mean():.2%}")
        
    def setup_models(self):
        """Setup different ML models with various approaches to handle imbalanced data.

        Registers six estimators in ``self.models``, keyed by name:

        * ``RF_Balanced``, ``LR_Balanced``, ``SVM_Balanced`` use
          ``class_weight='balanced'``.
        * ``GB_Balanced`` is a plain gradient boosting model; no class
          weighting is configured for it here, the key is kept for
          compatibility with existing result handling.
        * ``RF_SMOTE`` oversamples with SMOTE before a random forest.
        * ``GB_Undersample`` randomly undersamples before gradient boosting.

        Every estimator and sampler is seeded with ``self.random_state``.
        """
        print("Setting up models...")

        # Class imbalance is handled per model, either through
        # class_weight='balanced' or through a resampling step in a pipeline,
        # so no explicit class-weight dictionary is needed here.

        # Model 1: Random Forest with balanced class weights
        self.models['RF_Balanced'] = RandomForestClassifier(
            n_estimators=100,
            class_weight='balanced',
            random_state=self.random_state,
            max_depth=10
        )

        # Model 2: Gradient Boosting.
        # NOTE: no class weighting is applied to this model despite its key.
        self.models['GB_Balanced'] = GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            random_state=self.random_state,
            max_depth=6
        )

        # Model 3: Logistic Regression with balanced class weights
        self.models['LR_Balanced'] = LogisticRegression(
            class_weight='balanced',
            random_state=self.random_state,
            max_iter=1000
        )

        # Model 4: SVM with balanced class weights.
        # probability=True is required so predict_proba is available later.
        self.models['SVM_Balanced'] = SVC(
            class_weight='balanced',
            random_state=self.random_state,
            probability=True,
            kernel='rbf'
        )

        # Model 5: Random Forest with SMOTE
        self.models['RF_SMOTE'] = ImbPipeline([
            ('smote', SMOTE(random_state=self.random_state)),
            ('classifier', RandomForestClassifier(n_estimators=100, random_state=self.random_state))
        ])

        # Model 6: Gradient Boosting with undersampling
        self.models['GB_Undersample'] = ImbPipeline([
            ('undersample', RandomUnderSampler(random_state=self.random_state)),
            ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=self.random_state))
        ])

        print(f"Setup {len(self.models)} models for evaluation")
    
    def evaluate_models(self):
        """Evaluate all models using cross-validation and test set performance.

        For every entry in ``self.models`` this runs 5-fold stratified
        cross-validation (ROC-AUC) on the training data, fits the model on
        the full training set, and computes ROC-AUC and PR-AUC on the test
        set. Results are stored in ``self.results[name]`` with the keys
        ``cv_auc_mean``, ``cv_auc_std``, ``test_roc_auc``, ``test_pr_auc``,
        ``predictions``, ``probabilities`` and ``model``.

        Models whose name contains ``'SVM'`` or ``'LR'`` are trained on
        standardized features. During cross-validation the scaler is
        re-fitted inside each fold, so validation-fold statistics never
        influence the scaling of the corresponding training folds.
        """
        print("Evaluating models...")

        # Use stratified k-fold for cross-validation
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)

        for name, model in self.models.items():
            print(f"\nEvaluating {name}...")

            # Determine which data to use (scaled or original)
            needs_scaling = 'SVM' in name or 'LR' in name
            if needs_scaling:
                X_train_data = self.X_train_scaled
                X_test_data = self.X_test_scaled
                # X_train_scaled was standardized with statistics from the
                # whole training set; cross-validating on it would leak
                # information from each validation fold. Wrapping the model
                # with a fresh scaler makes every fold fit its own scaler.
                # cross_val_score clones the estimator, so `model` itself is
                # left unfitted here.
                cv_estimator = ImbPipeline([
                    ('scaler', StandardScaler()),
                    ('classifier', model),
                ])
            else:
                X_train_data = self.X_train
                X_test_data = self.X_test
                cv_estimator = model

            # Cross-validation scores (always on the unscaled training data;
            # scaling, where needed, happens inside cv_estimator)
            cv_scores = cross_val_score(cv_estimator, self.X_train, self.y_train,
                                        cv=cv, scoring='roc_auc')

            # Fit model and make predictions
            model.fit(X_train_data, self.y_train)
            y_pred = model.predict(X_test_data)
            y_pred_proba = model.predict_proba(X_test_data)[:, 1]

            # Calculate metrics
            roc_auc = roc_auc_score(self.y_test, y_pred_proba)
            precision, recall, _ = precision_recall_curve(self.y_test, y_pred_proba)
            pr_auc = auc(recall, precision)

            # Store results
            self.results[name] = {
                'cv_auc_mean': cv_scores.mean(),
                'cv_auc_std': cv_scores.std(),
                'test_roc_auc': roc_auc,
                'test_pr_auc': pr_auc,
                'predictions': y_pred,
                'probabilities': y_pred_proba,
                'model': model
            }

            print(f"CV ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
            print(f"Test ROC-AUC: {roc_auc:.4f}")
            print(f"Test PR-AUC: {pr_auc:.4f}")
    
    def select_best_model(self):
        """Select the best performing model based on PR-AUC (better for imbalanced data)"""
        print("\n" + "="*50)
        print("MODEL SELECTION RESULTS")
        print("="*50)
        
        # Sort models by PR-AUC (Precision-Recall AUC is better for imbalanced datasets)
        sorted_models = sorted(self.results.items(), 
                              key=lambda x: x[1]['test_pr_auc'], 
                              reverse=True)
        
        print("\nModel Performance Ranking (by PR-AUC):")
        print("-" * 40)
        for i, (name, results) in enumerate(sorted_models, 1):
            print(f"{i}. {name:15} | PR-AUC: {results['test_pr_auc']:.4f} | ROC-AUC: {results['test_roc_auc']:.4f}")
        
        # Select best model
        best_model_name = sorted_models[0][0]
        best_model_results = sorted_models[0][1]
        
        print(f"\n🏆 BEST MODEL: {best_model_name}")
        print(f"   PR-AUC: {best_model_results['test_pr_auc']:.4f}")
        print(f"   ROC-AUC: {best_model_results['test_roc_auc']:.4f}")
        print(f"   CV ROC-AUC: {best_model_results['cv_auc_mean']:.4f} ± {best_model_results['cv_auc_std']:.4f}")
        
        return best_model_name, best_model_results
    
    def detailed_evaluation(self, model_name, model_results):
        """Provide detailed evaluation of the best model.

        Prints the classification report, the confusion matrix, derived
        business metrics (recall, precision, false positive rate) and, when
        the estimator exposes ``feature_importances_``, the five most
        important features.

        Args:
            model_name: Name of the model, used in the printed header.
            model_results: Result dictionary for that model; the keys
                ``'predictions'`` and ``'model'`` are read.
        """
        print(f"\n" + "="*50)
        print(f"DETAILED EVALUATION: {model_name}")
        print("="*50)

        y_pred = model_results['predictions']

        # Classification report
        print("\nClassification Report:")
        print("-" * 30)
        print(classification_report(self.y_test, y_pred))

        # Confusion matrix. labels=[0, 1] forces a 2x2 matrix even when the
        # test labels and predictions contain only one class.
        print("\nConfusion Matrix:")
        print("-" * 20)
        cm = confusion_matrix(self.y_test, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        print(f"True Negatives:  {tn}")
        print(f"False Positives: {fp}")
        print(f"False Negatives: {fn}")
        print(f"True Positives:  {tp}")

        # Business metrics; every ratio falls back to 0 on an empty denominator
        print("\nBusiness Impact Metrics:")
        print("-" * 25)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        false_positive_rate = fp / (tn + fp) if (tn + fp) > 0 else 0

        print(f"Fraud Detection Rate (Recall): {recall:.2%}")
        print(f"Precision (True Fraud / All Flagged): {precision:.2%}")
        print(f"False Positive Rate: {false_positive_rate:.2%}")

        # Feature importance (if available): read it from the model itself or,
        # for a pipeline, from its 'classifier' step.
        model = model_results['model']
        estimator = model
        if not hasattr(estimator, 'feature_importances_'):
            named_steps = getattr(model, 'named_steps', None)
            estimator = named_steps.get('classifier') if named_steps is not None else None

        importances = getattr(estimator, 'feature_importances_', None)
        if importances is None:
            return

        print("\nTop 5 Most Important Features:")
        print("-" * 35)
        # Display names, in the same order as the training feature columns
        feature_names = [
            'amount', 'merchant_category', 'hour_of_day', 'day_of_week',
            'age', 'account_age_days', 'num_transactions_today',
            'avg_amount_last_30_days', 'location_risk_score', 'device_risk_score',
            'is_weekend'
        ]

        ranked = sorted(zip(feature_names, importances),
                        key=lambda pair: pair[1], reverse=True)
        for position, (feature, importance) in enumerate(ranked[:5], 1):
            print(f"{position}. {feature:25} {importance:.4f}")
    
    def run_automl_pipeline(self):
        """Run the complete AutoML pipeline end to end.

        Executes data generation, preprocessing, model setup and model
        evaluation in that order, then selects the best model and prints its
        detailed evaluation.

        Returns:
            tuple: ``(best_model_name, best_model_results)`` as produced by
            ``select_best_model``.
        """
        print("🚀 Starting Fraud Detection AutoML Pipeline")
        print("=" * 50)

        # Each stage relies on state written by the one before it,
        # so the order of this tuple is significant.
        stages = (
            self.generate_fraud_data,
            self.preprocess_data,
            self.setup_models,
            self.evaluate_models,
        )
        for stage in stages:
            stage()

        # Pick the winner and report on it in detail
        best_model_name, best_model_results = self.select_best_model()
        self.detailed_evaluation(best_model_name, best_model_results)

        print(f"\n✅ AutoML Pipeline Complete!")
        print(f"Best Model: {best_model_name}")

        return best_model_name, best_model_results

# Example usage: run the whole pipeline, save the dataset, and show a few
# test-set predictions from the selected model.
if __name__ == "__main__":
    # Build the AutoML system with the demo configuration and run every stage
    fraud_automl = FraudDetectionAutoML(n_samples=500, n_fraud=50, random_state=42)
    best_model_name, best_results = fraud_automl.run_automl_pipeline()

    # Persist the generated dataset so it can be inspected afterwards
    fraud_automl.data.to_csv('fraud_detection_dataset.csv', index=False)
    print(f"\n💾 Dataset saved as 'fraud_detection_dataset.csv'")

    # Show five randomly chosen test rows with the best model's probability;
    # a probability above 0.5 is reported as fraud.
    print(f"\n🔮 Example: Making predictions on test set")
    sample_indices = np.random.choice(len(fraud_automl.X_test), 5, replace=False)
    for idx in sample_indices:
        prob = best_results['probabilities'][idx]
        actual = fraud_automl.y_test.iloc[idx]

        if actual == 1:
            actual_label = "FRAUD"
        else:
            actual_label = "NORMAL"

        if prob > 0.5:
            prediction = "FRAUD"
        else:
            prediction = "NORMAL"

        print(f"Actual: {actual_label:6} | Predicted: {prediction:6} | Probability: {prob:.3f}")

🚀 Starting Fraud Detection AutoML Pipeline
Generating synthetic fraud detection dataset...
Dataset generated successfully!
Total samples: 500
Fraud cases: 50
Normal cases: 450
Fraud ratio: 10.00%
Preprocessing data...
Training set: 400 samples
Test set: 100 samples
Training fraud ratio: 10.00%
Setting up models...
Setup 6 models for evaluation
Evaluating models...

Evaluating RF_Balanced...
CV ROC-AUC: 1.0000 (+/- 0.0000)
Test ROC-AUC: 1.0000
Test PR-AUC: 1.0000

Evaluating GB_Balanced...
CV ROC-AUC: 0.9738 (+/- 0.1049)
Test ROC-AUC: 0.9944
Test PR-AUC: 0.9545

Evaluating LR_Balanced...
CV ROC-AUC: 1.0000 (+/- 0.0000)
Test ROC-AUC: 1.0000
Test PR-AUC: 1.0000

Evaluating SVM_Balanced...
CV ROC-AUC: 1.0000 (+/- 0.0000)
Test ROC-AUC: 1.0000
Test PR-AUC: 1.0000

Evaluating RF_SMOTE...
CV ROC-AUC: 1.0000 (+/- 0.0000)
Test ROC-AUC: 1.0000
Test PR-AUC: 1.0000

Evaluating GB_Undersample...
CV ROC-AUC: 0.9361 (+/- 0.1200)
Test ROC-AUC: 0.9889
Test PR-AUC: 0.9167

MODEL SELECTION RESULTS

Model 