In [6]:
import os
import pandas as pd
import numpy as np
from pathlib import Path

# Correct file paths based on what we found
correct_files = {
    'borderline': 'balanced_dataset_borderline_smote.csv',  # Corrected name
    'adasyn': 'balanced_dataset_adasyn.csv',                # Corrected name
    'week1': 'enhanced_dataset_week1.csv'                   # This one was correct
}

downloads_path = Path.home() / "Downloads"

def load_and_analyze_datasets():
    """Load and analyze all three datasets"""
    datasets_info = {}
    
    for name, filename in correct_files.items():
        file_path = downloads_path / filename
        print(f"\n{'='*80}")
        print(f"📊 LOADING: {name.upper()}")
        print(f"📁 File: {filename}")
        print('='*80)
        
        try:
            # Load dataset
            df = pd.read_csv(file_path)
            
            # Store basic info
            datasets_info[name] = {
                'dataframe': df,
                'shape': df.shape,
                'columns': df.columns.tolist(),
                'file_path': file_path
            }
            
            print(f"✅ Successfully loaded!")
            print(f"📐 Shape: {df.shape} ({df.shape[0]} rows, {df.shape[1]} columns)")
            
            # Display first 2 rows to understand structure
            print(f"\n📋 First 2 rows:")
            print(df.head(2))
            
            # Check for target column candidates
            target_candidates = ['anomaly', 'label', 'class', 'target', 'is_anomaly', 'attack', 'malicious', 'Category']
            found_targets = [col for col in target_candidates if col in df.columns]
            
            if found_targets:
                print(f"\n🎯 Potential target columns: {found_targets}")
                for target_col in found_targets:
                    print(f"\n--- Analysis of '{target_col}' ---")
                    print(f"Unique values: {df[target_col].unique()}")
                    print(f"Value counts:")
                    print(df[target_col].value_counts())
                    
                    # Calculate balance ratio
                    value_counts = df[target_col].value_counts()
                    balance_ratio = value_counts.min() / value_counts.max()
                    print(f"Balance ratio: {balance_ratio:.3f}")
                    
                    if balance_ratio > 0.7:
                        print("✅ Highly balanced dataset")
                    elif balance_ratio > 0.3:
                        print("✅ Reasonably balanced")
                    else:
                        print("⚠️ Imbalanced - may need special handling")
            else:
                print(f"\n❓ No standard target columns found.")
                print(f"All columns: {df.columns.tolist()}")
                
            # Data types and missing values
            print(f"\n📊 Data types:")
            print(df.dtypes.value_counts())
            
            missing = df.isnull().sum()
            if missing.sum() > 0:
                print(f"\n⚠️ Missing values found:")
                print(missing[missing > 0])
            else:
                print(f"\n✅ No missing values")
                
            # Basic statistics for numerical columns
            numerical_cols = df.select_dtypes(include=[np.number]).columns
            if len(numerical_cols) > 0:
                print(f"\n📈 Numerical columns ({len(numerical_cols)}): {list(numerical_cols)}")
            
        except Exception as e:
            print(f"❌ Error loading {filename}: {e}")
    
    return datasets_info

# Load all three datasets
print("🚀 LOADING ALL THREE DATASETS...")
datasets_info = load_and_analyze_datasets()

# Summary
print(f"\n{'='*80}")
print("📋 SUMMARY OF AVAILABLE DATASETS")
print('='*80)
for name, info in datasets_info.items():
    print(f"📁 {name.upper():<10} | Shape: {info['shape']} | Columns: {len(info['columns'])}")

🚀 LOADING ALL THREE DATASETS...

📊 LOADING: BORDERLINE
📁 File: balanced_dataset_borderline_smote.csv
✅ Successfully loaded!
📐 Shape: (70000, 108) (70000 rows, 108 columns)

📋 First 2 rows:
   Dst Port  Protocol  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  \
0 -0.615849  -0.37982      -0.056439     -0.019356      0.001071   
1  1.892744  -0.37982      -0.109265     -0.019759      0.001071   

   TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  \
0        -0.013169        -0.011306         0.598522        -0.267497   
1        -0.010098        -0.010362         1.696897        -0.267497   

   Fwd Pkt Len Mean  ...  latent_feature_12  latent_feature_13  \
0          0.432227  ...          -0.416912          -0.072620   
1          1.555438  ...          -1.949246           0.060735   

   latent_feature_14  latent_feature_15  latent_feature_16  latent_feature_17  \
0          -0.499757          -0.199525          -0.541627          -0.248089   
1           4.092486     

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

class ForensicTabularDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.FloatTensor(features)
        self.labels = torch.LongTensor(labels)
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

class FTTransformer(nn.Module):
    def __init__(self, num_features, num_classes, dim=128, depth=6, heads=8, dropout=0.1):
        super().__init__()
        self.dim = dim
        
        # Feature embedding
        self.feature_embedding = nn.Linear(num_features, dim)
        
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim, 
            nhead=heads, 
            dim_feedforward=dim*4,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth)
        
        # Classification head
        self.classifier = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, dim//2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(dim//2, num_classes)
        )
        
        # Initialize weights
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                module.bias.data.zero_()
    
    def forward(self, x):
        # x shape: [batch_size, num_features]
        batch_size = x.shape[0]
        
        # Embed features
        x = self.feature_embedding(x)  # [batch_size, dim]
        
        # Add sequence dimension for transformer
        x = x.unsqueeze(1)  # [batch_size, 1, dim]
        
        # Apply transformer
        x = self.transformer(x)  # [batch_size, 1, dim]
        
        # Pool and classify
        x = x.squeeze(1)  # [batch_size, dim]
        x = self.classifier(x)  # [batch_size, num_classes]
        
        return x

class ForensicTrainer:
    def __init__(self, model, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.model = model.to(device)
        self.device = device
        self.optimizer = torch.optim.AdamW(
            model.parameters(), 
            lr=1e-4, 
            weight_decay=1e-5
        )
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=100)
        
    def train_epoch(self, dataloader, criterion):
        self.model.train()
        total_loss = 0
        all_preds = []
        all_targets = []
        
        for batch_idx, (data, target) in enumerate(dataloader):
            data, target = data.to(self.device), target.to(self.device)
            
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = criterion(output, target)
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            
            total_loss += loss.item()
            preds = output.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(target.cpu().numpy())
        
        accuracy = accuracy_score(all_targets, all_preds)
        avg_loss = total_loss / len(dataloader)
        return avg_loss, accuracy
    
    def validate(self, dataloader, criterion):
        self.model.eval()
        total_loss = 0
        all_preds = []
        all_targets = []
        
        with torch.no_grad():
            for data, target in dataloader:
                data, target = data.to(self.device), target.to(self.device)
                output = self.model(data)
                loss = criterion(output, target)
                
                total_loss += loss.item()
                preds = output.argmax(dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_targets.extend(target.cpu().numpy())
        
        accuracy = accuracy_score(all_targets, all_preds)
        avg_loss = total_loss / len(dataloader)
        return avg_loss, accuracy, all_preds, all_targets

In [9]:
def prepare_and_train(dataset_name, datasets_info, target_accuracy=96.0):
    """Prepare data and train FT-Transformer"""
    print(f"\n{'='*80}")
    print(f"🚀 TRAINING FT-TRANSFORMER ON: {dataset_name.upper()}")
    print('='*80)
    
    df = datasets_info[dataset_name]['dataframe']
    
    # Prepare features and labels
    X = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'Label' in X:
        X.remove('Label')
    
    features = df[X].values
    labels = df['Label'].values
    
    # Encode labels
    le = LabelEncoder()
    labels_encoded = le.fit_transform(labels)
    num_classes = len(le.classes_)
    
    print(f"📊 Dataset: {dataset_name}")
    print(f"📐 Features: {features.shape[1]}, Samples: {features.shape[0]}")
    print(f"🎯 Classes: {num_classes} ({list(le.classes_)})")
    
    # Scale features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    
    # Split data
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        features_scaled, labels_encoded, 
        test_size=0.2, 
        random_state=42, 
        stratify=labels_encoded
    )
    
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, 
        test_size=0.2, 
        random_state=42, 
        stratify=y_train
    )
    
    print(f"📊 Splits - Train: {X_train.shape[0]}, Val: {X_val.shape[0]}, Test: {X_test.shape[0]}")
    
    # Create datasets
    train_dataset = ForensicTabularDataset(X_train, y_train)
    val_dataset = ForensicTabularDataset(X_val, y_val)
    test_dataset = ForensicTabularDataset(X_test, y_test)
    
    # FIX: Set num_workers=0 for Windows compatibility
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=0)
    
    # Calculate class weights
    class_counts = np.bincount(y_train)
    class_weights = 1.0 / class_counts
    class_weights = class_weights / class_weights.sum() * len(class_counts)
    class_weights = torch.FloatTensor(class_weights).to('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Initialize model
    model = FTTransformer(
        num_features=features.shape[1],
        num_classes=num_classes,
        dim=128,
        depth=6,
        heads=8
    )
    
    # Initialize trainer
    trainer = ForensicTrainer(model)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    
    # Training loop
    best_val_acc = 0
    patience = 10
    patience_counter = 0
    
    print(f"\n🎯 Training started (Target: {target_accuracy}% accuracy)")
    print("Epoch | Train Loss | Train Acc | Val Loss | Val Acc")
    print("-" * 50)
    
    for epoch in range(100):
        train_loss, train_acc = trainer.train_epoch(train_loader, criterion)
        val_loss, val_acc, _, _ = trainer.validate(val_loader, criterion)
        trainer.scheduler.step()
        
        if epoch % 5 == 0 or epoch < 10:
            print(f"{epoch:5d} | {train_loss:9.4f} | {train_acc:8.2f}% | {val_loss:7.4f} | {val_acc:6.2f}%")
        
        # Early stopping and target achievement
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), f'best_model_{dataset_name}.pth')
        else:
            patience_counter += 1
        
        if val_acc >= target_accuracy:
            print(f"\n🎉 TARGET ACHIEVED! Validation Accuracy: {val_acc:.2f}%")
            break
            
        if patience_counter >= patience:
            print(f"\n🛑 Early stopping at epoch {epoch}")
            break
    
    # Load best model and test
    model.load_state_dict(torch.load(f'best_model_{dataset_name}.pth'))
    test_loss, test_acc, test_preds, test_targets = trainer.validate(test_loader, criterion)
    
    print(f"\n📊 FINAL RESULTS:")
    print(f"✅ Best Validation Accuracy: {best_val_acc:.2f}%")
    print(f"✅ Test Accuracy: {test_acc:.2f}%")
    
    print(f"\n📈 Classification Report:")
    print(classification_report(test_targets, test_preds, target_names=le.classes_))
    
    return model, scaler, le, test_acc

# Add main guard for Windows compatibility
if __name__ == "__main__":
    print("🔍 Choosing the best dataset for training...")
    
    try:
        model, scaler, le, test_acc = prepare_and_train('borderline', datasets_info, target_accuracy=96.0)
        
        if test_acc >= 96.0:
            print(f"\n🎉 SUCCESS! Achieved {test_acc:.2f}% accuracy on borderline dataset!")
        else:
            print(f"\n⚠️ Accuracy {test_acc:.2f}% below target. Trying adasyn dataset...")
            model, scaler, le, test_acc = prepare_and_train('adasyn', datasets_info, target_accuracy=96.0)
            
    except Exception as e:
        print(f"❌ Error during training: {e}")
        print("Trying with smaller batch size...")
        
        # Fallback with smaller batch size
        try:
            # You might need to modify the prepare_and_train function to accept batch_size as parameter
            # or create a simplified version here
            pass
        except Exception as e2:
            print(f"❌ Final error: {e2}")

🔍 Choosing the best dataset for training...

🚀 TRAINING FT-TRANSFORMER ON: BORDERLINE
📊 Dataset: borderline
📐 Features: 107, Samples: 70000
🎯 Classes: 7 (['Benign', 'Bot', 'DDOS attack-HOIC', 'DDOS attack-LOIC-UDP', 'DoS attacks-Hulk', 'DoS attacks-SlowHTTPTest', 'Infilteration'])
📊 Splits - Train: 44800, Val: 11200, Test: 14000

🎯 Training started (Target: 96.0% accuracy)
Epoch | Train Loss | Train Acc | Val Loss | Val Acc
--------------------------------------------------
    0 |    0.3507 |     0.86% |  0.1703 |   0.91%
    1 |    0.1739 |     0.91% |  0.1632 |   0.91%
    2 |    0.1625 |     0.91% |  0.1558 |   0.92%
    3 |    0.1573 |     0.91% |  0.1536 |   0.92%
    4 |    0.1536 |     0.92% |  0.1523 |   0.92%
    5 |    0.1516 |     0.92% |  0.1493 |   0.92%
    6 |    0.1501 |     0.92% |  0.1548 |   0.92%
    7 |    0.1475 |     0.92% |  0.1481 |   0.92%
    8 |    0.1467 |     0.92% |  0.1475 |   0.92%
    9 |    0.1455 |     0.92% |  0.1459 |   0.92%
   10 |    0.1438 |  