In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path

# Correct file paths based on what we found
correct_files = {
    'borderline': 'balanced_dataset_borderline_smote.csv',  # Corrected name
    'adasyn': 'balanced_dataset_adasyn.csv',                # Corrected name
    'week1': 'enhanced_dataset_week1.csv'                   # This one was correct
}

downloads_path = Path.home() / "Downloads"

def load_and_analyze_datasets():
    """Load and analyze all three datasets"""
    datasets_info = {}
    
    for name, filename in correct_files.items():
        file_path = downloads_path / filename
        print(f"\n{'='*80}")
        print(f"üìä LOADING: {name.upper()}")
        print(f"üìÅ File: {filename}")
        print('='*80)
        
        try:
            # Load dataset
            df = pd.read_csv(file_path)
            
            # Store basic info
            datasets_info[name] = {
                'dataframe': df,
                'shape': df.shape,
                'columns': df.columns.tolist(),
                'file_path': file_path
            }
            
            print(f"‚úÖ Successfully loaded!")
            print(f"üìê Shape: {df.shape} ({df.shape[0]} rows, {df.shape[1]} columns)")
            
            # Display first 2 rows to understand structure
            print(f"\nüìã First 2 rows:")
            print(df.head(2))
            
            # Check for target column candidates
            target_candidates = ['anomaly', 'label', 'class', 'target', 'is_anomaly', 'attack', 'malicious', 'Category']
            found_targets = [col for col in target_candidates if col in df.columns]
            
            if found_targets:
                print(f"\nüéØ Potential target columns: {found_targets}")
                for target_col in found_targets:
                    print(f"\n--- Analysis of '{target_col}' ---")
                    print(f"Unique values: {df[target_col].unique()}")
                    print(f"Value counts:")
                    print(df[target_col].value_counts())
                    
                    # Calculate balance ratio
                    value_counts = df[target_col].value_counts()
                    balance_ratio = value_counts.min() / value_counts.max()
                    print(f"Balance ratio: {balance_ratio:.3f}")
                    
                    if balance_ratio > 0.7:
                        print("‚úÖ Highly balanced dataset")
                    elif balance_ratio > 0.3:
                        print("‚úÖ Reasonably balanced")
                    else:
                        print("‚ö†Ô∏è Imbalanced - may need special handling")
            else:
                print(f"\n‚ùì No standard target columns found.")
                print(f"All columns: {df.columns.tolist()}")
                
            # Data types and missing values
            print(f"\nüìä Data types:")
            print(df.dtypes.value_counts())
            
            missing = df.isnull().sum()
            if missing.sum() > 0:
                print(f"\n‚ö†Ô∏è Missing values found:")
                print(missing[missing > 0])
            else:
                print(f"\n‚úÖ No missing values")
                
            # Basic statistics for numerical columns
            numerical_cols = df.select_dtypes(include=[np.number]).columns
            if len(numerical_cols) > 0:
                print(f"\nüìà Numerical columns ({len(numerical_cols)}): {list(numerical_cols)}")
            
        except Exception as e:
            print(f"‚ùå Error loading {filename}: {e}")
    
    return datasets_info

# Load all three datasets
print("üöÄ LOADING ALL THREE DATASETS...")
datasets_info = load_and_analyze_datasets()

# Summary
print(f"\n{'='*80}")
print("üìã SUMMARY OF AVAILABLE DATASETS")
print('='*80)
for name, info in datasets_info.items():
    print(f"üìÅ {name.upper():<10} | Shape: {info['shape']} | Columns: {len(info['columns'])}")

üöÄ LOADING ALL THREE DATASETS...

üìä LOADING: BORDERLINE
üìÅ File: balanced_dataset_borderline_smote.csv
‚úÖ Successfully loaded!
üìê Shape: (70000, 108) (70000 rows, 108 columns)

üìã First 2 rows:
   Dst Port  Protocol  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  \
0 -0.615849  -0.37982      -0.056439     -0.019356      0.001071   
1  1.892744  -0.37982      -0.109265     -0.019759      0.001071   

   TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  \
0        -0.013169        -0.011306         0.598522        -0.267497   
1        -0.010098        -0.010362         1.696897        -0.267497   

   Fwd Pkt Len Mean  ...  latent_feature_12  latent_feature_13  \
0          0.432227  ...          -0.416912          -0.072620   
1          1.555438  ...          -1.949246           0.060735   

   latent_feature_14  latent_feature_15  latent_feature_16  latent_feature_17  \
0          -0.499757          -0.199525          -0.541627          -0.248089   
1       

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

class ForensicTabularDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.FloatTensor(features)
        self.labels = torch.LongTensor(labels)
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

class FTTransformer(nn.Module):
    def __init__(self, num_features, num_classes, dim=128, depth=6, heads=8, dropout=0.1):
        super().__init__()
        self.dim = dim
        
        # Feature embedding
        self.feature_embedding = nn.Linear(num_features, dim)
        
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim, 
            nhead=heads, 
            dim_feedforward=dim*4,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth)
        
        # Classification head
        self.classifier = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, dim//2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(dim//2, num_classes)
        )
        
        # Initialize weights
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                module.bias.data.zero_()
    
    def forward(self, x):
        # x shape: [batch_size, num_features]
        batch_size = x.shape[0]
        
        # Embed features
        x = self.feature_embedding(x)  # [batch_size, dim]
        
        # Add sequence dimension for transformer
        x = x.unsqueeze(1)  # [batch_size, 1, dim]
        
        # Apply transformer
        x = self.transformer(x)  # [batch_size, 1, dim]
        
        # Pool and classify
        x = x.squeeze(1)  # [batch_size, dim]
        x = self.classifier(x)  # [batch_size, num_classes]
        
        return x

class ForensicTrainer:
    def __init__(self, model, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.model = model.to(device)
        self.device = device
        self.optimizer = torch.optim.AdamW(
            model.parameters(), 
            lr=1e-4, 
            weight_decay=1e-5
        )
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=100)
        
    def train_epoch(self, dataloader, criterion):
        self.model.train()
        total_loss = 0
        all_preds = []
        all_targets = []
        
        for batch_idx, (data, target) in enumerate(dataloader):
            data, target = data.to(self.device), target.to(self.device)
            
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = criterion(output, target)
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            
            total_loss += loss.item()
            preds = output.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(target.cpu().numpy())
        
        accuracy = accuracy_score(all_targets, all_preds)
        avg_loss = total_loss / len(dataloader)
        return avg_loss, accuracy
    
    def validate(self, dataloader, criterion):
        self.model.eval()
        total_loss = 0
        all_preds = []
        all_targets = []
        
        with torch.no_grad():
            for data, target in dataloader:
                data, target = data.to(self.device), target.to(self.device)
                output = self.model(data)
                loss = criterion(output, target)
                
                total_loss += loss.item()
                preds = output.argmax(dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_targets.extend(target.cpu().numpy())
        
        accuracy = accuracy_score(all_targets, all_preds)
        avg_loss = total_loss / len(dataloader)
        return avg_loss, accuracy, all_preds, all_targets

In [3]:
def prepare_and_train(dataset_name, datasets_info, target_accuracy=96.0):
    """Prepare data and train FT-Transformer"""
    print(f"\n{'='*80}")
    print(f"üöÄ TRAINING FT-TRANSFORMER ON: {dataset_name.upper()}")
    print('='*80)
    
    df = datasets_info[dataset_name]['dataframe']
    
    # Prepare features and labels
    X = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'Label' in X:
        X.remove('Label')
    
    features = df[X].values
    labels = df['Label'].values
    
    # Encode labels
    le = LabelEncoder()
    labels_encoded = le.fit_transform(labels)
    num_classes = len(le.classes_)
    
    print(f"üìä Dataset: {dataset_name}")
    print(f"üìê Features: {features.shape[1]}, Samples: {features.shape[0]}")
    print(f"üéØ Classes: {num_classes} ({list(le.classes_)})")
    
    # Scale features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    
    # Split data
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        features_scaled, labels_encoded, 
        test_size=0.2, 
        random_state=42, 
        stratify=labels_encoded
    )
    
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, 
        test_size=0.2, 
        random_state=42, 
        stratify=y_train
    )
    
    print(f"üìä Splits - Train: {X_train.shape[0]}, Val: {X_val.shape[0]}, Test: {X_test.shape[0]}")
    
    # Create datasets
    train_dataset = ForensicTabularDataset(X_train, y_train)
    val_dataset = ForensicTabularDataset(X_val, y_val)
    test_dataset = ForensicTabularDataset(X_test, y_test)
    
    # FIX: Set num_workers=0 for Windows compatibility
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=0)
    
    # Calculate class weights
    class_counts = np.bincount(y_train)
    class_weights = 1.0 / class_counts
    class_weights = class_weights / class_weights.sum() * len(class_counts)
    class_weights = torch.FloatTensor(class_weights).to('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Initialize model
    model = FTTransformer(
        num_features=features.shape[1],
        num_classes=num_classes,
        dim=128,
        depth=6,
        heads=8
    )
    
    # Initialize trainer
    trainer = ForensicTrainer(model)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    
    # Training loop
    best_val_acc = 0
    patience = 10
    patience_counter = 0
    
    print(f"\nüéØ Training started (Target: {target_accuracy}% accuracy)")
    print("Epoch | Train Loss | Train Acc | Val Loss | Val Acc")
    print("-" * 50)
    
    for epoch in range(100):
        train_loss, train_acc = trainer.train_epoch(train_loader, criterion)
        val_loss, val_acc, _, _ = trainer.validate(val_loader, criterion)
        trainer.scheduler.step()
        
        if epoch % 5 == 0 or epoch < 10:
            print(f"{epoch:5d} | {train_loss:9.4f} | {train_acc:8.2f}% | {val_loss:7.4f} | {val_acc:6.2f}%")
        
        # Early stopping and target achievement
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), f'best_model_{dataset_name}.pth')
        else:
            patience_counter += 1
        
        if val_acc >= target_accuracy:
            print(f"\nüéâ TARGET ACHIEVED! Validation Accuracy: {val_acc:.2f}%")
            break
            
        if patience_counter >= patience:
            print(f"\nüõë Early stopping at epoch {epoch}")
            break
    
    # Load best model and test
    model.load_state_dict(torch.load(f'best_model_{dataset_name}.pth'))
    test_loss, test_acc, test_preds, test_targets = trainer.validate(test_loader, criterion)
    
    print(f"\nüìä FINAL RESULTS:")
    print(f"‚úÖ Best Validation Accuracy: {best_val_acc:.2f}%")
    print(f"‚úÖ Test Accuracy: {test_acc:.2f}%")
    
    print(f"\nüìà Classification Report:")
    print(classification_report(test_targets, test_preds, target_names=le.classes_))
    
    return model, scaler, le, test_acc

# Add main guard for Windows compatibility
if __name__ == "__main__":
    print("üîç Choosing the best dataset for training...")
    
    try:
        model, scaler, le, test_acc = prepare_and_train('borderline', datasets_info, target_accuracy=96.0)
        
        if test_acc >= 96.0:
            print(f"\nüéâ SUCCESS! Achieved {test_acc:.2f}% accuracy on borderline dataset!")
        else:
            print(f"\n‚ö†Ô∏è Accuracy {test_acc:.2f}% below target. Trying adasyn dataset...")
            model, scaler, le, test_acc = prepare_and_train('adasyn', datasets_info, target_accuracy=96.0)
            
    except Exception as e:
        print(f"‚ùå Error during training: {e}")
        print("Trying with smaller batch size...")
        
        # Fallback with smaller batch size
        try:
            # You might need to modify the prepare_and_train function to accept batch_size as parameter
            # or create a simplified version here
            pass
        except Exception as e2:
            print(f"‚ùå Final error: {e2}")

üîç Choosing the best dataset for training...

üöÄ TRAINING FT-TRANSFORMER ON: BORDERLINE
üìä Dataset: borderline
üìê Features: 107, Samples: 70000
üéØ Classes: 7 (['Benign', 'Bot', 'DDOS attack-HOIC', 'DDOS attack-LOIC-UDP', 'DoS attacks-Hulk', 'DoS attacks-SlowHTTPTest', 'Infilteration'])
üìä Splits - Train: 44800, Val: 11200, Test: 14000

üéØ Training started (Target: 96.0% accuracy)
Epoch | Train Loss | Train Acc | Val Loss | Val Acc
--------------------------------------------------
    0 |    0.3524 |     0.86% |  0.1698 |   0.91%
    1 |    0.1769 |     0.91% |  0.1593 |   0.91%
    2 |    0.1638 |     0.91% |  0.1582 |   0.91%
    3 |    0.1565 |     0.91% |  0.1535 |   0.92%
    4 |    0.1542 |     0.92% |  0.1557 |   0.92%
    5 |    0.1521 |     0.92% |  0.1489 |   0.92%
    6 |    0.1481 |     0.92% |  0.1520 |   0.92%
    7 |    0.1474 |     0.92% |  0.1494 |   0.92%
    8 |    0.1457 |     0.92% |  0.1479 |   0.92%
    9 |    0.1458 |     0.92% |  0.1508 |   0.92%


In [7]:
import torch
import torch.nn as nn
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from pathlib import Path
import joblib
import os

# =============================================================================
# FT-TRANSFORMER ENSEMBLING & STACKING (FIXED)
# =============================================================================

class FTTransformerEnsemble:
    """Ensemble of multiple FT-Transformer models with different architectures"""
    def __init__(self, num_features, num_classes, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.num_features = num_features
        self.num_classes = num_classes
        self.device = device
        self.models = {}
    
    def create_ft_transformer_variants(self):
        """Create multiple FT-Transformer variants for ensemble - FIXED DIMENSIONS"""
        self.models = {
            'ftt_small': FTTransformer(
                num_features=self.num_features,
                num_classes=self.num_classes,
                dim=64,   # 64 divisible by 4 heads
                depth=4,  
                heads=4,  
                dropout=0.1
            ).to(self.device),
            
            'ftt_medium': FTTransformer(
                num_features=self.num_features,
                num_classes=self.num_classes,
                dim=128,  # 128 divisible by 8 heads
                depth=6,  
                heads=8,  
                dropout=0.1
            ).to(self.device),
            
            'ftt_large': FTTransformer(
                num_features=self.num_features,
                num_classes=self.num_classes,
                dim=256,  # 256 divisible by 16 heads
                depth=8,  
                heads=16, # Changed from 12 to 16
                dropout=0.2
            ).to(self.device),
            
            'ftt_wide': FTTransformer(
                num_features=self.num_features,
                num_classes=self.num_classes,
                dim=128,  # 128 divisible by 8 heads
                depth=4,  
                heads=8,
                dropout=0.1
            ).to(self.device)
        }
        
        print("‚úÖ Created FT-Transformer ensemble variants:")
        for name, model in self.models.items():
            num_params = sum(p.numel() for p in model.parameters())
            print(f"   - {name}: {num_params:,} parameters")
    
    def predict_proba(self, X):
        """Get probability predictions from all FT-Transformer models"""
        all_probs = []
        
        for model_name, model in self.models.items():
            model.eval()
            with torch.no_grad():
                if torch.is_tensor(X):
                    X_tensor = X.to(self.device)
                else:
                    X_tensor = torch.FloatTensor(X).to(self.device)
                
                outputs = model(X_tensor)
                probs = torch.softmax(outputs, dim=1)
                all_probs.append(probs.cpu().numpy())
        
        # Average probabilities (simple ensembling)
        avg_probs = np.mean(all_probs, axis=0)
        return avg_probs
    
    def predict(self, X):
        """Predict classes using ensemble voting"""
        probs = self.predict_proba(X)
        return np.argmax(probs, axis=1)

class FTTransformerStacking:
    """Stacking ensemble using FT-Transformer models as base learners"""
    def __init__(self, base_models, meta_model_type='logistic'):
        self.base_models = base_models
        self.meta_model_type = meta_model_type
        self.meta_model = None
        self.is_fitted = False
    
    def fit(self, X, y, val_ratio=0.3):
        """Fit stacking ensemble"""
        print("üîÑ Training stacking ensemble...")
        
        # Split data for meta-training
        X_train, X_meta, y_train, y_meta = train_test_split(
            X, y, test_size=val_ratio, random_state=42, stratify=y
        )
        
        # Get base model predictions on meta set
        base_predictions = self._get_base_predictions(X_meta)
        
        # Train meta-model
        if self.meta_model_type == 'logistic':
            self.meta_model = LogisticRegression(
                multi_class='multinomial', 
                max_iter=1000,
                random_state=42,
                C=0.1
            )
        elif self.meta_model_type == 'random_forest':
            self.meta_model = RandomForestClassifier(
                n_estimators=100,
                random_state=42,
                max_depth=10
            )
        
        self.meta_model.fit(base_predictions, y_meta)
        self.is_fitted = True
        
        print("‚úÖ Stacking ensemble trained successfully")
        return self
    
    def _get_base_predictions(self, X):
        """Get probability predictions from all base models"""
        base_predictions = []
        
        for model_name, model in self.base_models.items():
            model.eval()
            with torch.no_grad():
                X_tensor = torch.FloatTensor(X).to(next(model.parameters()).device)
                outputs = model(X_tensor)
                probs = torch.softmax(outputs, dim=1).cpu().numpy()
                base_predictions.append(probs)
        
        # Concatenate all predictions
        return np.hstack(base_predictions)
    
    def predict_proba(self, X):
        """Predict probabilities using stacking"""
        if not self.is_fitted:
            raise ValueError("Stacking ensemble not fitted yet!")
        
        # Get base model predictions
        base_preds = self._get_base_predictions(X)
        
        # Meta-model prediction
        return self.meta_model.predict_proba(base_preds)
    
    def predict(self, X):
        probs = self.predict_proba(X)
        return np.argmax(probs, axis=1)

# =============================================================================
# ADVANCED FT-TRANSFORMER ENSEMBLE TRAINER (FIXED)
# =============================================================================

class FTTransformerEnsembleTrainer:
    def __init__(self, num_features, num_classes, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.num_features = num_features
        self.num_classes = num_classes
        self.device = device
        self.ensemble = None
        self.stacking = None
    
    def train_ensemble_models(self, train_loader, val_loader, num_epochs=50, patience=8):
        """Train all FT-Transformer variants"""
        print("üöÄ Training FT-Transformer ensemble variants...")
        
        # Create ensemble variants
        self.ensemble = FTTransformerEnsemble(self.num_features, self.num_classes, self.device)
        self.ensemble.create_ft_transformer_variants()
        
        model_performances = {}
        
        for model_name, model in self.ensemble.models.items():
            print(f"\nüìä Training {model_name}...")
            acc = self._train_single_model(model, train_loader, val_loader, model_name, num_epochs, patience)
            model_performances[model_name] = acc
        
        return model_performances
    
    def _train_single_model(self, model, train_loader, val_loader, model_name, num_epochs, patience):
        """Train a single FT-Transformer model"""
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
        criterion = nn.CrossEntropyLoss()
        
        best_val_acc = 0
        patience_counter = 0
        
        for epoch in range(num_epochs):
            # Training
            model.train()
            train_loss = 0
            for data, target in train_loader:
                data, target = data.to(self.device), target.to(self.device)
                optimizer.zero_grad()
                output = model(data)
                loss = criterion(output, target)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                train_loss += loss.item()
            
            # Validation
            model.eval()
            val_preds, val_targets = [], []
            with torch.no_grad():
                for data, target in val_loader:
                    data, target = data.to(self.device), target.to(self.device)
                    output = model(data)
                    preds = output.argmax(dim=1)
                    val_preds.extend(preds.cpu().numpy())
                    val_targets.extend(target.cpu().numpy())
            
            val_acc = accuracy_score(val_targets, val_preds)
            scheduler.step()
            
            if epoch % 10 == 0:
                print(f"   Epoch {epoch:3d}: Val Acc = {val_acc:.4f}")
            
            # Early stopping
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                patience_counter = 0
                torch.save(model.state_dict(), f'best_{model_name}.pth')
            else:
                patience_counter += 1
            
            if patience_counter >= patience:
                if epoch > 10:  # Only stop if we've trained for a reasonable number of epochs
                    print(f"   Early stopping at epoch {epoch}")
                    break
        
        # Load best model
        if os.path.exists(f'best_{model_name}.pth'):
            model.load_state_dict(torch.load(f'best_{model_name}.pth'))
        print(f"‚úÖ {model_name} training completed. Best Val Acc: {best_val_acc:.4f}")
        
        return best_val_acc
    
    def evaluate_ensemble(self, X_test, y_test):
        """Evaluate ensemble performance"""
        if self.ensemble is None:
            raise ValueError("Ensemble not trained yet!")
        
        print("\nüìä ENSEMBLE EVALUATION:")
        print("-" * 40)
        
        # Individual model performance
        individual_results = {}
        for model_name, model in self.ensemble.models.items():
            model.eval()
            with torch.no_grad():
                X_tensor = torch.FloatTensor(X_test).to(self.device)
                outputs = model(X_tensor)
                preds = outputs.argmax(dim=1).cpu().numpy()
                acc = accuracy_score(y_test, preds)
                individual_results[model_name] = acc
                print(f"   {model_name:12}: {acc:.4f}")
        
        # Ensemble performance
        ensemble_preds = self.ensemble.predict(X_test)
        ensemble_acc = accuracy_score(y_test, ensemble_preds)
        print(f"   {'Ensemble':12}: {ensemble_acc:.4f}")
        
        return {
            'individual': individual_results,
            'ensemble': ensemble_acc
        }
    
    def train_stacking_ensemble(self, X, y, meta_model_type='logistic'):
        """Train stacking ensemble"""
        print(f"\nüéØ Training Stacking Ensemble with {meta_model_type}...")
        
        if self.ensemble is None:
            raise ValueError("Base models not trained yet!")
        
        self.stacking = FTTransformerStacking(self.ensemble.models, meta_model_type)
        self.stacking.fit(X, y)
        
        return self.stacking
    
    def evaluate_stacking(self, X_test, y_test):
        """Evaluate stacking ensemble performance"""
        if self.stacking is None:
            raise ValueError("Stacking ensemble not trained yet!")
        
        stacking_preds = self.stacking.predict(X_test)
        stacking_acc = accuracy_score(y_test, stacking_preds)
        
        print(f"üìä STACKING ENSEMBLE PERFORMANCE:")
        print(f"   Accuracy: {stacking_acc:.4f}")
        
        return stacking_acc

# =============================================================================
# SIMPLE ENSEMBLE WITH EXISTING MODELS (ALTERNATIVE APPROACH)
# =============================================================================

class SimpleFTTransformerEnsemble:
    """Simple ensemble using your already trained FT-Transformer models"""
    def __init__(self, model_paths, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.models = {}
        self.device = device
        
        # Load pre-trained models
        for name, path in model_paths.items():
            if os.path.exists(path):
                # Create model architecture (adjust parameters as needed)
                model = FTTransformer(
                    num_features=107,  # Adjust based on your data
                    num_classes=7,     # Adjust based on your data
                    dim=128,
                    depth=6,
                    heads=8,
                    dropout=0.1
                ).to(device)
                
                # Load trained weights
                model.load_state_dict(torch.load(path, map_location=device))
                model.eval()
                self.models[name] = model
                print(f"‚úÖ Loaded {name} from {path}")
    
    def predict_proba(self, X):
        """Get probability predictions from all models"""
        all_probs = []
        
        for model_name, model in self.models.items():
            with torch.no_grad():
                if torch.is_tensor(X):
                    X_tensor = X.to(self.device)
                else:
                    X_tensor = torch.FloatTensor(X).to(self.device)
                
                outputs = model(X_tensor)
                probs = torch.softmax(outputs, dim=1)
                all_probs.append(probs.cpu().numpy())
        
        # Average probabilities
        avg_probs = np.mean(all_probs, axis=0)
        return avg_probs
    
    def predict(self, X):
        probs = self.predict_proba(X)
        return np.argmax(probs, axis=1)

# =============================================================================
# MAIN EXECUTION - FIXED VERSION
# =============================================================================

def run_ft_transformer_ensembling(datasets_info, dataset_name='borderline'):
    """Run complete FT-Transformer ensembling pipeline"""
    print("üöÄ STARTING FT-TRANSFORMER ENSEMBLING PIPELINE...")
    
    # Prepare data
    df = datasets_info[dataset_name]['dataframe']
    
    # Prepare features and labels
    X = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'Label' in X:
        X.remove('Label')
    
    features = df[X].values
    labels = df['Label'].values
    
    # Encode labels
    le = LabelEncoder()
    labels_encoded = le.fit_transform(labels)
    num_classes = len(le.classes_)
    
    print(f"üìä Dataset: {dataset_name}")
    print(f"üìê Features: {features.shape[1]}, Samples: {features.shape[0]}")
    print(f"üéØ Classes: {num_classes} ({list(le.classes_)})")
    
    # Scale features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        features_scaled, labels_encoded, 
        test_size=0.2, 
        random_state=42, 
        stratify=labels_encoded
    )
    
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, 
        test_size=0.2, 
        random_state=42, 
        stratify=y_train
    )
    
    print(f"üìä Data Splits:")
    print(f"   Train: {X_train.shape[0]} samples")
    print(f"   Val:   {X_val.shape[0]} samples") 
    print(f"   Test:  {X_test.shape[0]} samples")
    
    # Create datasets and loaders
    train_dataset = ForensicTabularDataset(X_train, y_train)
    val_dataset = ForensicTabularDataset(X_val, y_val)
    test_dataset = ForensicTabularDataset(X_test, y_test)
    
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, num_workers=0)
    
    # Initialize and train ensemble
    ensemble_trainer = FTTransformerEnsembleTrainer(
        num_features=features.shape[1],
        num_classes=num_classes
    )
    
    # Train ensemble models
    model_performances = ensemble_trainer.train_ensemble_models(train_loader, val_loader)
    
    # Evaluate ensemble
    results = ensemble_trainer.evaluate_ensemble(X_test, y_test)
    
    # Train stacking ensemble
    stacking = ensemble_trainer.train_stacking_ensemble(
        np.vstack([X_train, X_val]),  # Use all training data for stacking
        np.hstack([y_train, y_val]),
        meta_model_type='logistic'
    )
    
    # Evaluate stacking
    stacking_acc = ensemble_trainer.evaluate_stacking(X_test, y_test)
    
    # Final comparison
    print(f"\nüéØ FINAL RESULTS COMPARISON:")
    print("-" * 50)
    best_individual = max(results['individual'].values())
    print(f"   Best Individual Model: {best_individual:.4f}")
    print(f"   Simple Ensemble:       {results['ensemble']:.4f}")
    print(f"   Stacking Ensemble:     {stacking_acc:.4f}")
    
    improvement_simple = results['ensemble'] - best_individual
    improvement_stacking = stacking_acc - best_individual
    
    print(f"\nüìà IMPROVEMENT OVER BEST INDIVIDUAL MODEL:")
    print(f"   Simple Ensemble:  +{improvement_simple:.4f}")
    print(f"   Stacking Ensemble: +{improvement_stacking:.4f}")
    
    # Save ensemble models
    torch.save({
        'ensemble_models': {name: model.state_dict() for name, model in ensemble_trainer.ensemble.models.items()},
        'stacking_model': stacking.meta_model,
        'scaler': scaler,
        'label_encoder': le,
        'feature_names': X
    }, 'ft_transformer_ensemble.pth')
    
    print("üíæ Ensemble models saved to 'ft_transformer_ensemble.pth'")
    
    return {
        'ensemble_trainer': ensemble_trainer,
        'results': results,
        'stacking_accuracy': stacking_acc,
        'scaler': scaler,
        'label_encoder': le
    }

# =============================================================================
# QUICK ENSEMBLE WITH EXISTING MODEL (If you already have trained models)
# =============================================================================

def quick_ensemble_with_existing_model():
    """Quick ensemble using your existing trained model with different seeds"""
    print("üöÄ CREATING QUICK ENSEMBLE WITH EXISTING MODEL...")
    
    # Check if you have existing trained models
    model_files = [
        'best_model_borderline.pth',
        'best_model_adasyn.pth', 
        'best_model_week1.pth'
    ]
    
    existing_models = {}
    for model_file in model_files:
        if os.path.exists(model_file):
            # Load your existing model architecture
            model = FTTransformer(
                num_features=107,  # Adjust based on your data
                num_classes=7,     # Adjust based on your data  
                dim=128,
                depth=6,
                heads=8,
                dropout=0.1
            )
            model.load_state_dict(torch.load(model_file))
            model.eval()
            existing_models[model_file] = model
            print(f"‚úÖ Loaded {model_file}")
    
    if len(existing_models) > 1:
        print(f"üéØ Found {len(existing_models)} models for ensemble")
        return SimpleFTTransformerEnsemble(existing_models)
    else:
        print("‚ÑπÔ∏è  Not enough pre-trained models found for ensemble")
        return None

# =============================================================================
# RUN THE FIXED ENSEMBLING PIPELINE
# =============================================================================

if __name__ == "__main__":
    # Try the main ensembling approach first
    try:
        ensemble_results = run_ft_transformer_ensembling(datasets_info, 'borderline')
        
        print(f"\nüéâ FT-TRANSFORMER ENSEMBLING COMPLETED SUCCESSFULLY!")
        print(f"üìä Final Stacking Accuracy: {ensemble_results['stacking_accuracy']:.4f}")
        
    except Exception as e:
        print(f"‚ùå Error in main ensembling pipeline: {e}")
        print("üîÑ Trying alternative approach with existing models...")
        
        # Try alternative approach
        try:
            quick_ensemble = quick_ensemble_with_existing_model()
            if quick_ensemble:
                # Test the quick ensemble
                df = datasets_info['borderline']['dataframe']
                X = df.select_dtypes(include=[np.number]).columns.tolist()
                if 'Label' in X:
                    X.remove('Label')
                
                features = df[X].values
                labels = df['Label'].values
                
                le = LabelEncoder()
                labels_encoded = le.fit_transform(labels)
                
                scaler = StandardScaler()
                features_scaled = scaler.fit_transform(features)
                
                X_train, X_test, y_train, y_test = train_test_split(
                    features_scaled, labels_encoded, test_size=0.2, random_state=42, stratify=labels_encoded
                )
                
                preds = quick_ensemble.predict(X_test)
                acc = accuracy_score(y_test, preds)
                
                print(f"üéØ Quick Ensemble Accuracy: {acc:.4f}")
                
        except Exception as e2:
            print(f"‚ùå Alternative approach also failed: {e2}")
            import traceback
            traceback.print_exc()

üöÄ STARTING FT-TRANSFORMER ENSEMBLING PIPELINE...
üìä Dataset: borderline
üìê Features: 107, Samples: 70000
üéØ Classes: 7 (['Benign', 'Bot', 'DDOS attack-HOIC', 'DDOS attack-LOIC-UDP', 'DoS attacks-Hulk', 'DoS attacks-SlowHTTPTest', 'Infilteration'])
üìä Data Splits:
   Train: 44800 samples
   Val:   11200 samples
   Test:  14000 samples
üöÄ Training FT-Transformer ensemble variants...
‚úÖ Created FT-Transformer ensemble variants:
   - ftt_small: 209,287 parameters
   - ftt_medium: 1,212,423 parameters
   - ftt_large: 6,380,039 parameters
   - ftt_wide: 815,879 parameters

üìä Training ftt_small...
   Epoch   0: Val Acc = 0.9024
   Epoch  10: Val Acc = 0.9172
   Epoch  20: Val Acc = 0.9194
   Epoch  30: Val Acc = 0.9196
   Early stopping at epoch 38
‚úÖ ftt_small training completed. Best Val Acc: 0.9196

üìä Training ftt_medium...
   Epoch   0: Val Acc = 0.9096
   Epoch  10: Val Acc = 0.9187
   Epoch  20: Val Acc = 0.9201
   Epoch  30: Val Acc = 0.9198
   Early stopping at ep



‚úÖ Stacking ensemble trained successfully
üìä STACKING ENSEMBLE PERFORMANCE:
   Accuracy: 0.9252

üéØ FINAL RESULTS COMPARISON:
--------------------------------------------------
   Best Individual Model: 0.9256
   Simple Ensemble:       0.9252
   Stacking Ensemble:     0.9252

üìà IMPROVEMENT OVER BEST INDIVIDUAL MODEL:
   Simple Ensemble:  +-0.0004
   Stacking Ensemble: +-0.0004
üíæ Ensemble models saved to 'ft_transformer_ensemble.pth'

üéâ FT-TRANSFORMER ENSEMBLING COMPLETED SUCCESSFULLY!
üìä Final Stacking Accuracy: 0.9252
