In [None]:
# Cell 1: Imports and Setup
"""
ENHANCED PGTCN - Multi-Strategy Password Generation
GPU-Optimized for CUDA 12.8 with RTX 4500 Ada
Implements 5 key improvements to achieve 20-40% match rate
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
import numpy as np
from pathlib import Path
import json
import math
from tqdm import tqdm

from dataclasses import dataclass
from typing import List, Tuple
from collections import Counter

torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

In [2]:
# Cell 2: GPU Monitoring Functions
# ============================================
# GPU MONITORING & VERIFICATION
# ============================================

def check_gpu_status():
    """Check and display GPU status"""
    print("\n" + "="*80)
    print("GPU STATUS CHECK")
    print("="*80)
    
    if torch.cuda.is_available():
        print(f"‚úÖ CUDA Available: Yes")
        print(f"üéÆ GPU Device: {torch.cuda.get_device_name(0)}")
        print(f"üî¢ CUDA Version: {torch.version.cuda}")
        print(f"üíæ Total VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
        print(f"üîß Device Capability: {torch.cuda.get_device_capability(0)}")
        print(f"üìä Number of GPUs: {torch.cuda.device_count()}")
        print(f"üéØ Current Device: {torch.cuda.current_device()}")
        
        # Enable optimizations
        if hasattr(torch.backends.cuda, 'matmul'):
            torch.backends.cuda.matmul.allow_tf32 = True
            print(f"üöÄ TF32 enabled for matrix multiplication")
        
        if hasattr(torch.backends.cudnn, 'allow_tf32'):
            torch.backends.cudnn.allow_tf32 = True
            print(f"üöÄ TF32 enabled for cuDNN")
        
        torch.backends.cudnn.benchmark = True
        print(f"üöÄ cuDNN benchmark mode enabled")
        
        print("="*80 + "\n")
        return True
    else:
        print("‚ùå CUDA NOT AVAILABLE - Will run on CPU")
        print("="*80 + "\n")
        return False


def monitor_gpu_memory():
    """Monitor GPU memory usage during training"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated(0) / 1e9
        reserved = torch.cuda.memory_reserved(0) / 1e9
        total = torch.cuda.get_device_properties(0).total_memory / 1e9
        free = total - reserved
        
        print(f"\n{'='*60}")
        print("GPU MEMORY STATUS")
        print(f"{'='*60}")
        print(f"Allocated:  {allocated:6.2f} GB ({allocated/total*100:5.1f}%)")
        print(f"Reserved:   {reserved:6.2f} GB ({reserved/total*100:5.1f}%)")
        print(f"Free:       {free:6.2f} GB ({free/total*100:5.1f}%)")
        print(f"Total:      {total:6.2f} GB")
        print(f"{'='*60}\n")


def clear_gpu_cache():
    """Clear GPU cache to free up memory"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        print("‚úì GPU cache cleared")


# Initialize GPU at startup
print("Initializing GPU...")
GPU_AVAILABLE = check_gpu_status()

Initializing GPU...

GPU STATUS CHECK
‚úÖ CUDA Available: Yes
üéÆ GPU Device: NVIDIA RTX 4500 Ada Generation
üî¢ CUDA Version: 13.0
üíæ Total VRAM: 25.25 GB
üîß Device Capability: (8, 9)
üìä Number of GPUs: 1
üéØ Current Device: 0
üöÄ TF32 enabled for matrix multiplication
üöÄ TF32 enabled for cuDNN
üöÄ cuDNN benchmark mode enabled



In [3]:
class Config:
    # Dataset configurations
    DATASETS = {
        # '000webhost': {
        #     'path': 'datasets/000webhost.txt',
        #     'model_dir': 'pgtcn_modelMulti/000webhost/',
        #     'description': '000webhost password leak'
        # },
        # '10-million-password-list-top-1000000': {
        #     'path': 'datasets/10-million-password-list-top-1000000.txt',
        #     'model_dir': 'pgtcn_modelMulti/10-million-password-list-top-1000000/',
        #     'description': '10 million password list top 1M'
        # },
        # 'Ashley-Madison': {
        #     'path': 'datasets/Ashley-Madison.txt',
        #     'model_dir': 'pgtcn_modelMulti/Ashley-Madiso/',
        #     'description': 'Ashley Madison password leak'
        # },
        # 'hashkiller-dict': {
        #     'path': 'datasets/hashkiller-dict.txt',
        #     'model_dir': 'pgtcn_modelMulti/hashkiller-dict/',
        #     'description': 'Hashkiller dictionary'
        # },
        # 'hashmob.net.large.found': {
        #     'path': 'datasets/hashmob.net.large.found.txt',
        #     'model_dir': 'pgtcn_modelMulti/hashmob.net.large.found/',
        #     'description': 'Hashmob large found'
        # },
        # 'honeynet': {
        #     'path': 'datasets/honeynet.txt',
        #     'model_dir': 'pgtcn_modelMulti/honeynet/',
        #     'description': 'Honeynet password leak'
        # },
        # 'hotmail': {
        #     'path': 'datasets/hotmail.txt',
        #     'model_dir': 'pgtcn_modelMulti/hotmail/',
        #     'description': 'Hotmail password leak'
        # },
        'myspace': {
            'path': 'datasets/myspace.txt',
            'model_dir': 'pgtcn_modelMulti/myspace/',
            'description': 'MySpace password leak'}
        # },
        # 'NordVPN': {
        #     'path': 'datasets/NordVPN.txt',
        #     'model_dir': 'pgtcn_modelMulti/NordVPN/',
        #     'description': 'NordVPN password leak'
        # },
        # 'phpbb': {
        #     'path': 'datasets/phpbb.txt',
        #     'model_dir': 'pgtcn_modelMulti/phpbb/',
        #     'description': 'phpBB password leak'
        # },
        # 'rockyou': {
        #     'path': 'datasets/rockyou.txt',
        #     'model_dir': 'pgtcn_modelMulti/rockyou/',
        #     'description': 'RockYou password leak (14M passwords)'
        # },
        # 'singles.org': {
        #     'path': 'datasets/singles.org.txt',
        #     'model_dir': 'pgtcn_modelMulti/singles.org/',
        #     'description': 'Singles.org password leak'
        # }
    }
    
    # Model architecture
    EMBEDDING_DIM = 512
    TCN_CHANNELS = [512, 512, 512, 512, 512, 512]
    KERNEL_SIZE = 3
    DROPOUT = 0.1
    
    # Training parameters
    BATCH_SIZE = 64
    LEARNING_RATE = 3e-4
    EPOCHS = 20  # Increased for better convergence
    WEIGHT_DECAY = 1e-6
    GRAD_CLIP = 0.5
    USE_AMP = True
    USE_FOCAL_LOSS = True
    FOCAL_ALPHA = 0.25
    FOCAL_GAMMA = 2.0
    LABEL_SMOOTHING = 0.1
    
    # Data parameters
    SEQ_LEN = 18  # Increased to accommodate SOS and EOS tokens
    NUM_WORKERS = 4
    PIN_MEMORY = True
    
    # Device
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Dynamic
    VOCAB_SIZE = None
    DATA_PATH = None
    MODEL_DIR = None
    # Generation
    BEAM_WIDTH = 5
    LENGTH_PENALTY = 0.8
    
    @staticmethod
    def set_dataset(dataset_name):
        if dataset_name not in Config.DATASETS:
            raise ValueError(f"Dataset {dataset_name} not found")
        Config.DATA_PATH = Config.DATASETS[dataset_name]['path']
        Config.MODEL_DIR = Config.DATASETS[dataset_name]['model_dir']
    
    @staticmethod
    def get_available_datasets():
        return list(Config.DATASETS.keys())
    
    @staticmethod
    def print_config():
        print("\n" + "="*60)
        print("CONFIGURATION")
        print("="*60)
        print(f"Embedding Dim:     {Config.EMBEDDING_DIM}")
        print(f"TCN Channels:      {Config.TCN_CHANNELS}")
        print(f"Kernel Size:       {Config.KERNEL_SIZE}")
        print(f"Dropout:           {Config.DROPOUT}")
        print(f"Batch Size:        {Config.BATCH_SIZE}")
        print(f"Learning Rate:     {Config.LEARNING_RATE}")
        print(f"Epochs:            {Config.EPOCHS}")
        print(f"Device:            {Config.DEVICE}")
        print(f"Use AMP:           {Config.USE_AMP}")
        print(f"Use Focal Loss:    {Config.USE_FOCAL_LOSS}")
        print("="*60 + "\n")

In [4]:
# Cell 4: Multi-Dataset Utilities
# ============================================
# MULTI-DATASET UTILITIES
# ============================================

def load_dataset(dataset_name):
    """Load and prepare a specific dataset"""
    Config.set_dataset(dataset_name)
    
    print(f"\n{'='*80}")
    print(f"Loading {Config.DATASETS[dataset_name]['description']}")
    print(f"{'='*80}\n")
    
    try:
        with open(Config.DATA_PATH, 'r', encoding='utf-8', errors='ignore') as f:
            passwords = [line.strip() for line in f if line.strip()]
        
        filtered = [pwd for pwd in passwords if 4 <= len(pwd) <= 16 and pwd.isascii()]
        print(f"‚úì Loaded {len(passwords):,} passwords")
        print(f"‚úì Filtered: {len(filtered):,} passwords (4-16 chars, ASCII only)\n")
        
        return filtered
    
    except FileNotFoundError:
        print(f"‚ùå Error: Dataset file not found at {Config.DATA_PATH}")
        return None
    except Exception as e:
        print(f"‚ùå Error loading dataset: {str(e)}")
        return None


def save_dataset_model(model, stoi, itos, dataset_name, epoch=None):
    """Save model and vocabulary for a specific dataset"""
    model_dir = Config.DATASETS[dataset_name]['model_dir']
    Path(model_dir).mkdir(parents=True, exist_ok=True)
    
    # Save model
    if epoch is not None:
        model_path = f"{model_dir}/pgtcn_epoch_{epoch}.pt"
    else:
        model_path = f"{model_dir}/pgtcn_final.pt"
    
    torch.save(model.state_dict(), model_path)
    
    # Save vocabulary
    vocab_path = f"{model_dir}/vocabulary.json"
    with open(vocab_path, 'w') as f:
        json.dump({
            'stoi': stoi,
            'itos': {str(k): v for k, v in itos.items()}
        }, f, indent=2)
    
    # Save config
    config_path = f"{model_dir}/config.json"
    config_data = {
        'dataset': dataset_name,
        'vocab_size': len(stoi),
        'embedding_dim': Config.EMBEDDING_DIM,
        'tcn_channels': Config.TCN_CHANNELS,
        'kernel_size': Config.KERNEL_SIZE,
        'dropout': Config.DROPOUT,
        'seq_len': Config.SEQ_LEN
    }
    with open(config_path, 'w') as f:
        json.dump(config_data, f, indent=2)
    
    print(f"‚úì Model saved: {model_path}")
    print(f"‚úì Vocabulary saved: {vocab_path}")
    print(f"‚úì Config saved: {config_path}")
    
    return model_path


def load_dataset_model(dataset_name, vocab_size):
    """Load a trained model for a specific dataset"""
    model_dir = Config.DATASETS[dataset_name]['model_dir']
    model_path = f"{model_dir}/pgtcn_final.pt"
    vocab_path = f"{model_dir}/vocabulary.json"
    
    # Load vocabulary
    with open(vocab_path, 'r') as f:
        vocab_data = json.load(f)
        stoi = vocab_data['stoi']
        itos = {int(k): v for k, v in vocab_data['itos'].items()}
    
    # Create model
    model = PGTCN(
        vocab_size=vocab_size,
        embedding_dim=Config.EMBEDDING_DIM,
        num_channels=Config.TCN_CHANNELS,
        kernel_size=Config.KERNEL_SIZE,
        dropout=Config.DROPOUT
    ).to(Config.DEVICE)
    
    # Load weights
    model.load_state_dict(torch.load( enabledmodel_path, map_location=Config.DEVICE))
    
    print(f"‚úì Model loaded from: {model_path}")
    
    return model, stoi, itos

In [5]:
# Cell 5: Vocabulary
# ============================================
# VOCABULARY
# ============================================

def build_vocabulary(passwords):
    unique_chars = set()
    for pwd in passwords:
        unique_chars.update(pwd)
    
    unique_chars = sorted(unique_chars)
    
    stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
    
    for idx, char in enumerate(unique_chars, start=4):
        stoi[char] = idx
    
    itos = {v: k for k, v in stoi.items()}
    return stoi, itos


def tokenize_password(pwd, stoi, max_len=18):
    # Add SOS and EOS tokens
    tokens = [stoi.get("<SOS>", 1)] + [stoi.get(c, stoi.get("<UNK>", 3)) for c in pwd] + [stoi.get("<EOS>", 2)]
    
    # Truncate if necessary (keeping SOS and EOS is important, but we prioritize fitting in max_len)
    if len(tokens) > max_len:
        tokens = tokens[:max_len-1] + [stoi.get("<EOS>", 2)]
    
    # Pad
    tokens += [stoi.get("<PAD>", 0)] * (max_len - len(tokens))
    return tokens[:max_len]

In [6]:
# Cell 6: Dataset with Augmentation
# ============================================
# ENHANCED DATASET WITH AUGMENTATION
# ============================================

class PasswordDataset(Dataset):
    def __init__(self, passwords, stoi, max_len=16, augment=True):
        self.passwords = passwords
        self.stoi = stoi
        self.max_len = max_len
        self.augment = augment
    
    def __len__(self):
        return len(self.passwords)
    
    def __getitem__(self, idx):
        pwd = self.passwords[idx]
        
        # Data augmentation - random case flipping
        if self.augment and random.random() < 0.1:
            pwd = pwd.swapcase()
        
        tokens = tokenize_password(pwd, self.stoi, self.max_len)
        return torch.tensor(tokens, dtype=torch.long)


def collate_fn(batch):
    return torch.stack(batch)

In [7]:
# Cell 7: TCN Building Blocks
# ============================================
# TCN BUILDING BLOCKS
# ============================================

class Chomp1d(nn.Module):
    """Remove padding from the end of sequence"""
    def __init__(self, chomp_size):
        super().__init__()
        self.chomp_size = chomp_size
    
    def forward(self, x):
        return x[:, :, :-self.chomp_size].contiguous() if self.chomp_size > 0 else x


class TemporalBlock(nn.Module):
    """Temporal block with dilated causal convolutions"""
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super().__init__()
        
        # First convolution
        self.conv1 = nn.Conv1d(n_inputs, n_outputs, kernel_size,
                               stride=stride, padding=padding, dilation=dilation)
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)
        
        # Second convolution
        self.conv2 = nn.Conv1d(n_outputs, n_outputs, kernel_size,
                               stride=stride, padding=padding, dilation=dilation)
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)
        
        # Residual connection
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        self.relu = nn.ReLU()
        
    def forward(self, x):
        # First conv block
        out = self.conv1(x)
        out = self.chomp1(out)
        out = self.relu1(out)
        out = self.dropout1(out)
        
        # Second conv block
        out = self.conv2(out)
        out = self.chomp2(out)
        out = self.relu2(out)
        out = self.dropout2(out)
        
        # Residual connection (NON-INPLACE!)
        res = x if self.downsample is None else self.downsample(x)
        out = out + res  # Non-inplace addition
        
        return self.relu(out)


class TemporalConvNet(nn.Module):
    """Temporal Convolutional Network (TCN)"""
    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
        super().__init__()
        layers = []
        num_levels = len(num_channels)
        
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]
            
            layers.append(TemporalBlock(
                in_channels, out_channels, kernel_size, stride=1,
                dilation=dilation_size,
                padding=(kernel_size-1) * dilation_size,
                dropout=dropout
            ))
        
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

In [8]:
# Cell 8: PGTCN Model
# ============================================
# ENHANCED PGTCN MODEL
# ============================================

class PGTCN(nn.Module):
    def __init__(self, vocab_size, embedding_dim=256, num_channels=[256]*6, 
                 kernel_size=3, dropout=0.1):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        
        # Embedding with better initialization
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        nn.init.normal_(self.embedding.weight, mean=0, std=0.01)
        self.embedding.weight.data[0].zero_()
        
        # TCN
        self.tcn = TemporalConvNet(
            num_inputs=embedding_dim,
            num_channels=num_channels,
            kernel_size=kernel_size,
            dropout=dropout
        )
        
        # Layer normalization for stability
        self.layer_norm = nn.LayerNorm(num_channels[-1])
        
        # Output projection
        self.fc = nn.Linear(num_channels[-1], vocab_size)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        
    def forward(self, x):
        emb = self.embedding(x).transpose(1, 2)
        tcn_out = self.tcn(emb)
        tcn_out = tcn_out.transpose(1, 2)
        tcn_out = self.layer_norm(tcn_out)
        logits = self.fc(tcn_out)
        
        return logits

In [9]:
# Cell 9: Focal Loss
# ============================================
# FOCAL LOSS FOR HARD EXAMPLES
# ============================================

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0, ignore_index=-100):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ignore_index = ignore_index
    
    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', ignore_index=self.ignore_index)
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()

In [10]:
# Cell 10: Nucleus Sampling
# ============================================
# NUCLEUS (TOP-P) SAMPLING
# ============================================

def nucleus_sampling(logits, top_p=0.9, temperature=1.0):
    """Sample from nucleus (top-p) distribution"""
    logits = logits / temperature
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    
    # Remove tokens with cumulative probability above threshold
    sorted_indices_to_remove = cumulative_probs > top_p
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0
    
    indices_to_remove = sorted_indices[sorted_indices_to_remove]
    logits[indices_to_remove] = float('-inf')
    
    probs = F.softmax(logits, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)
    return next_token.item()

In [11]:
class DiversePasswordGenerator:
    def __init__(self, model, stoi, itos, device='cuda'):
        self.model = model
        self.stoi = stoi
        self.itos = itos
        self.device = device
        self.generated_cache = set()
    
    @torch.no_grad()
    def generate_with_nucleus(self, max_len=16, temperature=1.0, top_p=0.9):
        """Generate with nucleus sampling for diversity"""
        self.model.eval()
        
        x = torch.tensor([[self.stoi["<SOS>"]]], dtype=torch.long, device=self.device)
        password = ""
        
        for _ in range(max_len - 1):
            logits = self.model(x)
            next_logits = logits[0, -1, :]
            
            next_token = nucleus_sampling(next_logits, top_p=top_p, temperature=temperature)
            
            if next_token == self.stoi["<EOS>"]:
                break
            
            char = self.itos.get(next_token, "")
            if char and char not in ['<PAD>', '<SOS>', '<EOS>', '<UNK>']:
                password += char
            
            x = torch.cat([x, torch.tensor([[next_token]], dtype=torch.long, device=self.device)], dim=1)
        
        return password.strip()
    
    @torch.no_grad()
    def generate_with_beam_search(self, max_len=16, beam_width=5, length_penalty=0.8):
        """Generate with beam search for higher quality"""
        self.model.eval()
        
        # Initialize beams: (sequence, score, length)
        beams = [([self.stoi["<SOS>"]], 0.0, 1)]
        
        for _ in range(max_len - 1):
            new_beams = []
            
            for seq, score, length in beams:
                if seq[-1] == self.stoi["<EOS>"]:
                    new_beams.append((seq, score, length))
                    continue
                
                x = torch.tensor([seq], dtype=torch.long, device=self.device)
                logits = self.model(x)
                next_logits = logits[0, -1, :]
                
                # Get top-k candidates
                probs = F.softmax(next_logits, dim=-1)
                top_probs, top_indices = torch.topk(probs, beam_width)
                
                for prob, idx in zip(top_probs, top_indices):
                    new_seq = seq + [idx.item()]
                    new_score = score + torch.log(prob).item()
                    # Apply length penalty
                    penalized_score = new_score / (length ** length_penalty)
                    new_beams.append((new_seq, penalized_score, length + 1))
            
            # Keep top beam_width beams
            new_beams.sort(key=lambda x: x[1], reverse=True)
            beams = new_beams[:beam_width]
        
        # Choose the best beam
        best_seq, _, _ = max(beams, key=lambda x: x[1])
        
        # Convert to password
        password = ""
        for token in best_seq[1:]:  # Skip <SOS>
            if token == self.stoi["<EOS>"]:
                break
            char = self.itos.get(token, "")
            if char and char not in ['<PAD>', '<SOS>', '<EOS>', '<UNK>']:
                password += char
        
        return password.strip()
    
    def generate_diverse_batch(self, num_passwords=10000, max_attempts_per_pwd=5, use_beam_search=False):
        """Generate diverse passwords with deduplication"""
        passwords = []
        temperature_range = [0.7, 0.8, 0.9, 1.0, 1.1, 1.2]
        top_p_range = [0.85, 0.90, 0.95]
        
        pbar = tqdm(total=num_passwords, desc="Generating diverse passwords")
        
        attempts = 0
        max_total_attempts = num_passwords * max_attempts_per_pwd
        
        while len(passwords) < num_passwords and attempts < max_total_attempts:
            if use_beam_search:
                pwd = self.generate_with_beam_search(max_len=Config.SEQ_LEN, beam_width=Config.BEAM_WIDTH, length_penalty=Config.LENGTH_PENALTY)
            else:
                # Vary temperature and top_p for diversity
                temp = random.choice(temperature_range)
                top_p = random.choice(top_p_range)
                pwd = self.generate_with_nucleus(max_len=Config.SEQ_LEN, temperature=temp, top_p=top_p)
            
            # Only add unique passwords
            if pwd and pwd not in self.generated_cache and len(pwd) >= 4:
                passwords.append(pwd)
                self.generated_cache.add(pwd)
                pbar.update(1)
            
            attempts += 1
        
        pbar.close()
        
        return passwords

In [12]:
# Cell 12: Enhanced Trainer
# ============================================
# ENHANCED TRAINER WITH GPU OPTIMIZATION
# ============================================

class PGTCNTrainer:
    def __init__(self, model, config, stoi, itos):
        self.model = model
        self.config = config
        self.stoi = stoi
        self.itos = itos
        self.device = config.DEVICE
        
        self.optimizer = optim.AdamW(
            model.parameters(),
            lr=config.LEARNING_RATE,
            weight_decay=config.WEIGHT_DECAY,
            betas=(0.9, 0.999)
        )
        
        # Cosine annealing with warmup
        self.scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
            self.optimizer,
            T_0=10,
            T_mult=2,
            eta_min=1e-6
        )
        
        self.scaler = torch.amp.GradScaler('cuda') if config.USE_AMP else None
        
        # Use focal loss if enabled
        if config.USE_FOCAL_LOSS:
            self.criterion = FocalLoss(
                alpha=config.FOCAL_ALPHA,
                gamma=config.FOCAL_GAMMA,
                ignore_index=stoi["<PAD>"]
            )
        else:
            self.criterion = None
        
        self.best_loss = float('inf')
        self.history = []
    
    def train_epoch(self, dataloader):
        self.model.train()
        epoch_losses = []
        
        pbar = tqdm(dataloader, desc="Training")
        for batch in pbar:
            input_ids = batch.to(self.device)
            
            with torch.amp.autocast('cuda', enabled=self.config.USE_AMP):
                logits = self.model(input_ids)
                shift_logits = logits[:, :-1, :].contiguous()
                shift_labels = input_ids[:, 1:].contiguous()
                
                if self.criterion:
                    loss = self.criterion(
                        shift_logits.view(-1, self.config.VOCAB_SIZE),
                        shift_labels.view(-1)
                    )
                else:
                    loss = F.cross_entropy(
                        shift_logits.view(-1, self.config.VOCAB_SIZE),
                        shift_labels.view(-1),
                        ignore_index=self.stoi["<PAD>"],
                        label_smoothing=self.config.LABEL_SMOOTHING
                    )
            
            if torch.isnan(loss) or torch.isinf(loss):
                continue
            
            epoch_losses.append(loss.item())
            
            self.optimizer.zero_grad()
            
            if self.scaler:
                self.scaler.scale(loss).backward()
                self.scaler.unscale_(self.optimizer)
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.GRAD_CLIP)
                self.scaler.step(self.optimizer)
                self.scaler.update()
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.GRAD_CLIP)
                self.optimizer.step()
            
            pbar.set_postfix({'loss': f'{loss.item():.4f}'})
        
        return np.mean(epoch_losses) if epoch_losses else float('inf')
    
    def train(self, dataloader):
        print(f"\n{'='*80}")
        print(f"Training ENHANCED PGTCN for {self.config.EPOCHS} epochs")
        print(f"{'='*80}\n")
        
        for epoch in range(self.config.EPOCHS):
            avg_loss = self.train_epoch(dataloader)
            
            if avg_loss < self.best_loss:
                self.best_loss = avg_loss
                # Save best model
                torch.save(self.model.state_dict(), 
                          f"{self.config.MODEL_DIR}/pgtcn_best.pt")
            
            self.history.append({
                'epoch': epoch + 1,
                'loss': avg_loss,
                'lr': self.optimizer.param_groups[0]['lr']
            })
            
            print(f"Epoch {epoch+1}/{self.config.EPOCHS} | Loss: {avg_loss:.4f} | "
                  f"Best: {self.best_loss:.4f} | LR: {self.optimizer.param_groups[0]['lr']:.2e}")
            
            self.scheduler.step()
            
            # Early stopping
            if avg_loss < 0.1:
                print(f"\n‚úÖ Excellent loss achieved! Stopping early.\n")
                break
        
        print(f"\n‚úÖ Training complete! Best loss: {self.best_loss:.4f}\n")

In [13]:
# Cell 13: Evaluation
# ============================================
# EVALUATION
# ============================================

def evaluate_matching(generated, test_set):
    unique_gen = set(generated)
    matches = unique_gen & test_set
    
    # Additional metrics
    gen_lengths = [len(p) for p in generated]
    test_lengths = [len(p) for p in test_set]
    
    results = {
        'total_generated': len(generated),
        'unique_generated': len(unique_gen),
        'uniqueness_rate': len(unique_gen) / len(generated) * 100 if generated else 0,
        'test_set_size': len(test_set),
        'matches': len(matches),
        'match_rate': len(matches) / len(test_set) * 100 if test_set else 0,
        'matched_passwords': list(matches)[:50],
        'avg_gen_length': np.mean(gen_lengths) if gen_lengths else 0,
        'avg_test_length': np.mean(test_lengths) if test_lengths else 0,
        'sample_generated': generated[:20]
    }
    
    return results

In [14]:
# Cell 14: Save Functions
# ============================================
# SAVE GENERATED PASSWORDS TO FILE
# ============================================

def save_passwords_to_file(passwords, output_dir, filename="generated_passwords.txt"):
    """Save generated passwords to a text file"""
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    output_path = Path(output_dir) / filename
    
    with open(output_path, 'w', encoding='utf-8') as f:
        for pwd in passwords:
            f.write(f"{pwd}\n")
    
    return str(output_path)


def save_detailed_results(passwords, test_passwords, output_dir):
    """Save detailed analysis with multiple files"""
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    # 1. Save all generated passwords
    all_pwd_path = save_passwords_to_file(passwords, output_dir, "all_generated_passwords.txt")
    print(f"‚úì All passwords saved: {all_pwd_path}")
    
    # 2. Save unique passwords only
    unique_passwords = list(set(passwords))
    unique_pwd_path = save_passwords_to_file(unique_passwords, output_dir, "unique_generated_passwords.txt")
    print(f"‚úì Unique passwords saved: {unique_pwd_path}")
    
    # 3. Save matched passwords
    matches = set(passwords) & test_passwords
    if matches:
        matched_pwd_path = save_passwords_to_file(sorted(matches), output_dir, "matched_passwords.txt")
        print(f"‚úì Matched passwords saved: {matched_pwd_path}")
    
    # 4. Save passwords by length
    length_groups = {}
    for pwd in unique_passwords:
        length = len(pwd)
        if length not in length_groups:
            length_groups[length] = []
        length_groups[length].append(pwd)
    
    length_stats_path = Path(output_dir) / "passwords_by_length.txt"
    with open(length_stats_path, 'w', encoding='utf-8') as f:
        f.write("PASSWORD LENGTH DISTRIBUTION\n")
        f.write("="*60 + "\n\n")
        
        for length in sorted(length_groups.keys()):
            f.write(f"\nLength {length}: {len(length_groups[length])} passwords\n")
            f.write("-"*60 + "\n")
            for pwd in sorted(length_groups[length])[:20]:
                f.write(f"{pwd}\n")
            if len(length_groups[length]) > 20:
                f.write(f"... and {len(length_groups[length]) - 20} more\n")
    
    print(f"‚úì Length analysis saved: {length_stats_path}")
    
    # 5. Save statistics summary
    stats_path = Path(output_dir) / "generation_statistics.txt"
    with open(stats_path, 'w', encoding='utf-8') as f:
        f.write("PASSWORD GENERATION STATISTICS\n")
        f.write("="*60 + "\n\n")
        
        f.write(f"Total Generated:       {len(passwords):,}\n")
        f.write(f"Unique Passwords:      {len(unique_passwords):,}\n")
        f.write(f"Uniqueness Rate:       {len(unique_passwords)/len(passwords)*100:.2f}%\n\n")
        
        f.write(f"Test Set Size:         {len(test_passwords):,}\n")
        f.write(f"Matched Passwords:     {len(matches):,}\n")
        f.write(f"Match Rate:            {len(matches)/len(test_passwords)*100:.2f}%\n\n")
        
        # Length statistics
        lengths = [len(p) for p in unique_passwords]
        f.write(f"Average Length:        {np.mean(lengths):.2f}\n")
        f.write(f"Min Length:            {min(lengths)}\n")
        f.write(f"Max Length:            {max(lengths)}\n")
        f.write(f"Median Length:         {np.median(lengths):.0f}\n\n")
        
        # Character statistics
        f.write("CHARACTER USAGE:\n")
        all_chars = ''.join(unique_passwords)
        char_counter = Counter(all_chars)
        f.write(f"Total Characters:      {len(all_chars):,}\n")
        f.write(f"Unique Characters:     {len(char_counter)}\n\n")
        
        f.write("Top 20 Most Common Characters:\n")
        for char, count in char_counter.most_common(20):
            f.write(f"  '{char}': {count:,} ({count/len(all_chars)*100:.2f}%)\n")
    
    print(f"‚úì Statistics saved: {stats_path}")
    
    return {
        'all_passwords': all_pwd_path,
        'unique_passwords': unique_pwd_path,
        'matched_passwords': matched_pwd_path if matches else None,
        'length_analysis': str(length_stats_path),
        'statistics': str(stats_path)
    }

In [15]:
# Cell 15: Main Training Functions (Part 1)
# ============================================
# MAIN TRAINING FUNCTIONS
# ============================================

def train_single_dataset(dataset_name, num_passwords_to_generate=500000):
    """Train model on a single dataset with GPU optimization"""
    
    print("\n" + "="*80)
    print(f"ENHANCED PGTCN - Training on {dataset_name.upper()}")
    print(f"Target: Generate {num_passwords_to_generate:,} passwords")
    print("="*80 + "\n")
    
    # Check GPU status
    if GPU_AVAILABLE:
        monitor_gpu_memory()
    
    # Load dataset
    filtered = load_dataset(dataset_name)
    if filtered is None:
        print(f"‚ùå Skipping {dataset_name} due to loading error\n")
        return None
    
    # Build vocabulary
    stoi, itos = build_vocabulary(filtered)
    Config.VOCAB_SIZE = len(stoi)
    print(f"‚úì Vocabulary size: {Config.VOCAB_SIZE}\n")
    
    # Split train/test (80/20)
    random.shuffle(filtered)
    split_idx = int(0.8 * len(filtered))
    train_passwords = filtered[:split_idx]
    test_passwords = set(filtered[split_idx:])
    
    print(f"‚úì Train set: {len(train_passwords):,}")
    print(f"‚úì Test set: {len(test_passwords):,}\n")
    
    # Dataset with augmentation
    train_dataset = PasswordDataset(train_passwords, stoi, Config.SEQ_LEN, augment=True)
    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.BATCH_SIZE,
        shuffle=True,
        num_workers=Config.NUM_WORKERS,
        pin_memory=Config.PIN_MEMORY,
        collate_fn=collate_fn,
        drop_last=True
    )
    
    # Build model
    model = PGTCN(
        vocab_size=Config.VOCAB_SIZE,
        embedding_dim=Config.EMBEDDING_DIM,
        num_channels=Config.TCN_CHANNELS,
        kernel_size=Config.KERNEL_SIZE,
        dropout=Config.DROPOUT
    ).to(Config.DEVICE)
    
    num_params = sum(p.numel() for p in model.parameters())
    print(f"‚úì Model parameters: {num_params:,}")
    print(f"‚úì Model on device: {next(model.parameters()).device}\n")
    
    # Show GPU memory after model loading
    if GPU_AVAILABLE:
        monitor_gpu_memory()
    
    # Print configuration
    Config.print_config()
    
    # Train
    trainer = PGTCNTrainer(model, Config, stoi, itos)
    trainer.train(train_loader)
    
    # Clear GPU cache before generation
    if GPU_AVAILABLE:
        clear_gpu_cache()
    
    # Load best model
    best_model_path = f"{Config.MODEL_DIR}/pgtcn_best.pt"
    model.load_state_dict(torch.load(best_model_path, map_location=Config.DEVICE))
    
    # Save final model
    save_dataset_model(model, stoi, itos, dataset_name)
    
    # Generate passwords
    print("="*80)
    print(f"GENERATING {num_passwords_to_generate:,} DIVERSE PASSWORDS")
    print("="*80 + "\n")
    
    generator = DiversePasswordGenerator(model, stoi, itos, Config.DEVICE)
    generated = generator.generate_diverse_batch(num_passwords=num_passwords_to_generate)
    
    print(f"\n‚úì Successfully generated {len(generated):,} passwords\n")
    
    # Show final GPU memory usage
    if GPU_AVAILABLE:
        monitor_gpu_memory()
    
    # Evaluate
    print("="*80)
    print("EVALUATION RESULTS")
    print("="*80 + "\n")
    
    results = evaluate_matching(generated, test_passwords)
    
    print(f"Generated Passwords:  {results['total_generated']:,}")
    print(f"Unique Generated:     {results['unique_generated']:,} ({results['uniqueness_rate']:.1f}%)")
    print(f"Test Set Size:        {results['test_set_size']:,}")
    print(f"Exact Matches:        {results['matches']:,}")
    print(f"üéØ MATCH RATE:        {results['match_rate']:.2f}%")
    print(f"Avg Gen Length:       {results['avg_gen_length']:.1f}")
    print(f"Avg Test Length:      {results['avg_test_length']:.1f}")
    print(f"{'='*80}\n")
    
    # Save results
    print("="*80)
    print("SAVING RESULTS TO FILES")
    print("="*80 + "\n")
    
    json_path = f"{Config.MODEL_DIR}/results.json"
    with open(json_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"‚úì JSON results saved: {json_path}\n")
    
    saved_files = save_detailed_results(generated, test_passwords, Config.MODEL_DIR)
    
    print("\n" + "="*80)
    print("FILES SAVED:")
    print("="*80)
    for file_type, file_path in saved_files.items():
        if file_path:
            print(f"  ‚Ä¢ {file_type}: {file_path}")
    
    print("\n" + "="*80)
    print(f"‚úÖ {dataset_name.upper()} Complete!")
    print(f"   Generated: {len(generated):,} passwords")
    print(f"   Match Rate: {results['match_rate']:.2f}%")
    print("="*80 + "\n")
    
    # Clean up GPU memory
    if GPU_AVAILABLE:
        clear_gpu_cache()
    
    return {
        'dataset': dataset_name,
        'results': results,
        'files': saved_files,
        'model_dir': Config.MODEL_DIR,
        'num_generated': len(generated)
    }

In [16]:
# Cell 16: Main Training Functions (Part 2)
def test_on_dataset(model, stoi, itos, test_dataset_name, source_dataset_name, num_passwords=100000):
    """Test a trained model on a different dataset"""
    
    print("\n" + "="*80)
    print(f"CROSS-DATASET TESTING - Generating {num_passwords:,} passwords")
    print(f"Model: {source_dataset_name.upper()} ‚Üí Test: {test_dataset_name.upper()}")
    print("="*80 + "\n")
    
    # Load test dataset
    test_passwords = load_dataset(test_dataset_name)
    if test_passwords is None:
        return None
    
    test_set = set(test_passwords)
    
    # Generate passwords
    generator = DiversePasswordGenerator(model, stoi, itos, Config.DEVICE)
    generated = generator.generate_diverse_batch(num_passwords=num_passwords)
    
    # Evaluate
    results = evaluate_matching(generated, test_set)
    
    print(f"Generated Passwords:  {results['total_generated']:,}")
    print(f"Unique Generated:     {results['unique_generated']:,} ({results['uniqueness_rate']:.1f}%)")
    print(f"Test Set Size:        {results['test_set_size']:,}")
    print(f"Exact Matches:        {results['matches']:,}")
    print(f"üéØ MATCH RATE:        {results['match_rate']:.2f}%")
    print(f"{'='*80}\n")
    
    # Save cross-testing results
    output_dir = f"{Config.DATASETS[source_dataset_name]['model_dir']}/cross_test_{test_dataset_name}/"
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    json_path = f"{output_dir}/results.json"
    with open(json_path, 'w') as f:
        json.dump(results, f, indent=2)
    
    saved_files = save_detailed_results(generated, test_set, output_dir)
    
    return {
        'source_dataset': source_dataset_name,
        'test_dataset': test_dataset_name,
        'results': results,
        'files': saved_files
    }


def main(num_passwords_per_dataset=100000):
    """Main function to train and generate passwords on multiple datasets"""
    
    print("\n" + "="*80)
    print("ENHANCED PGTCN - Multi-Dataset Training & Password Generation")
    print(f"GPU-Optimized for CUDA 12.8 with RTX 4500 Ada")
    print(f"Generating {num_passwords_per_dataset:,} passwords per dataset")
    print("="*80 + "\n")
    
    # Show available datasets
    available = Config.get_available_datasets()
    print("Available datasets:")
    for i, dataset in enumerate(available, 1):
        desc = Config.DATASETS[dataset]['description']
        print(f"  {i}. {dataset} - {desc}")
    print()
    
    print("="*80)
    print(f"TRAINING ON ALL DATASETS - {num_passwords_per_dataset:,} passwords each")
    print("="*80 + "\n")
    
    all_results = []
    total_passwords_generated = 0
    
    for dataset_name in available:
        try:
            print(f"\n{'#'*80}")
            print(f"# PROCESSING: {dataset_name.upper()}")
            print(f"{'#'*80}\n")
            
            result = train_single_dataset(dataset_name, num_passwords_per_dataset)
            
            if result:
                all_results.append(result)
                total_passwords_generated += result['num_generated']
                
        except Exception as e:
            print(f"\n{'='*80}")
            print(f"‚ùå Error training on {dataset_name}: {str(e)}")
            print(f"{'='*80}\n")
            import traceback
            traceback.print_exc()
            continue
    
    # Final summary
    print("\n" + "="*80)
    print("TRAINING & GENERATION SUMMARY")
    print("="*80 + "\n")
    
    if not all_results:
        print("‚ùå No datasets were successfully processed!")
        return None
    
    print(f"{'Dataset':<15} {'Generated':>12} {'Unique':>12} {'Match Rate':>12} {'Matches':>10}")
    print("="*80)
    
    for result in all_results:
        dataset = result['dataset']
        num_gen = result['results']['total_generated']
        num_unique = result['results']['unique_generated']
        match_rate = result['results']['match_rate']
        matches = result['results']['matches']
        
        print(f"{dataset.upper():<15} "
              f"{num_gen:>11,} "
              f"{num_unique:>11,} "
              f"{match_rate:>11.2f}% "
              f"{matches:>10,}")
    
    print("="*80)
    print(f"\nTOTAL PASSWORDS GENERATED: {total_passwords_generated:,}")
    print(f"DATASETS PROCESSED: {len(all_results)}/{len(available)}")
    print("\n" + "="*80)
    
    # Save overall summary
    summary_path = "./pgtcn_modelMulti/training_summary.json"
    Path("./pgtcn_modelMulti/").mkdir(parents=True, exist_ok=True)
    
    summary_data = {
        'total_passwords_generated': total_passwords_generated,
        'datasets_processed': len(all_results),
        'datasets_failed': len(available) - len(all_results),
        'passwords_per_dataset': num_passwords_per_dataset,
        'results': [
            {
                'dataset': r['dataset'],
                'generated': r['results']['total_generated'],
                'unique': r['results']['unique_generated'],
                'match_rate': r['results']['match_rate'],
                'matches': r['results']['matches'],
                'model_dir': r['model_dir']
            }
            for r in all_results
        ]
    }
    
    with open(summary_path, 'w') as f:
        json.dump(summary_data, f, indent=2)
    
    print(f"\n‚úì Overall summary saved: {summary_path}\n")
    print("="*80)
    print("‚úÖ ALL DATASETS PROCESSING COMPLETE!")
    print("="*80 + "\n")
    
    return all_results


def train_specific_dataset(dataset_name, num_passwords=500000):
    """Quick function to train on a specific dataset"""
    if dataset_name not in Config.get_available_datasets():
        print(f"‚ùå Dataset '{dataset_name}' not found!")
        print(f"Available: {Config.get_available_datasets()}")
        return None
    
    return train_single_dataset(dataset_name, num_passwords)


def train_specific_datasets(dataset_list, num_passwords=500000):
    """Train on multiple specific datasets"""
    print("\n" + "="*80)
    print(f"TRAINING ON SPECIFIC DATASETS - {num_passwords:,} passwords each")
    print(f"Datasets: {', '.join(dataset_list)}")
    print("="*80 + "\n")
    
    results = []
    total_passwords = 0
    
    for dataset in dataset_list:
        if dataset not in Config.get_available_datasets():
            print(f"‚ö†Ô∏è  Skipping unknown dataset: {dataset}")
            continue
        
        result = train_specific_dataset(dataset, num_passwords)
        if result:
            results.append(result)
            total_passwords += result['num_generated']
    
    # Summary
    print("\n" + "="*80)
    print("SPECIFIC DATASETS SUMMARY")
    print("="*80 + "\n")
    
    for result in results:
        dataset = result['dataset']
        match_rate = result['results']['match_rate']
        num_gen = result['num_generated']
        print(f"{dataset.upper():15} ‚Üí Generated: {num_gen:,} | Match Rate: {match_rate:6.2f}%")
    
    print(f"\nTotal passwords generated: {total_passwords:,}")
    print("="*80 + "\n")
    
    return results

In [None]:
# Cell 17: Run Training
# ============================================
# RUN TRAINING - Choose your option
# ============================================

# OPTION 1: Train ALL datasets with 500K passwords each
results = main(num_passwords_per_dataset=1000000)

# OPTION 2: Train SPECIFIC datasets
# datasets_to_train = ['myspace', 'rockyou']
# results = train_specific_datasets(datasets_to_train, num_passwords=500000)

# OPTION 3: Train SINGLE dataset
# result = train_specific_dataset('myspace', num_passwords=500000)


ENHANCED PGTCN - Multi-Dataset Training & Password Generation
GPU-Optimized for CUDA 12.8 with RTX 4500 Ada
Generating 1,000,000 passwords per dataset

Available datasets:
  1. myspace - MySpace password leak

TRAINING ON ALL DATASETS - 1,000,000 passwords each


################################################################################
# PROCESSING: MYSPACE
################################################################################


ENHANCED PGTCN - Training on MYSPACE
Target: Generate 1,000,000 passwords


GPU MEMORY STATUS
Allocated:    0.00 GB (  0.0%)
Reserved:     0.00 GB (  0.0%)
Free:        25.25 GB (100.0%)
Total:       25.25 GB


Loading MySpace password leak

‚úì Loaded 37,126 passwords
‚úì Filtered: 36,829 passwords (4-16 chars, ASCII only)

‚úì Vocabulary size: 95

‚úì Train set: 29,463
‚úì Test set: 7,366

‚úì Model parameters: 9,541,727
‚úì Model on device: cuda:0


GPU MEMORY STATUS
Allocated:    0.04 GB (  0.2%)
Reserved:     0.04 GB (  0.2%)
Free:       

Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:06<00:00, 76.37it/s, loss=0.3213]


Epoch 1/20 | Loss: 0.3394 | Best: 0.3394 | LR: 3.00e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.54it/s, loss=0.3158]


Epoch 2/20 | Loss: 0.3081 | Best: 0.3081 | LR: 2.93e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.71it/s, loss=0.2878]


Epoch 3/20 | Loss: 0.2956 | Best: 0.2956 | LR: 2.71e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.56it/s, loss=0.2990]


Epoch 4/20 | Loss: 0.2869 | Best: 0.2869 | LR: 2.38e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.63it/s, loss=0.2841]


Epoch 5/20 | Loss: 0.2790 | Best: 0.2790 | LR: 1.97e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.97it/s, loss=0.2711]


Epoch 6/20 | Loss: 0.2724 | Best: 0.2724 | LR: 1.50e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.57it/s, loss=0.2754]


Epoch 7/20 | Loss: 0.2662 | Best: 0.2662 | LR: 1.04e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.22it/s, loss=0.2760]


Epoch 8/20 | Loss: 0.2609 | Best: 0.2609 | LR: 6.26e-05


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.63it/s, loss=0.2377]


Epoch 9/20 | Loss: 0.2566 | Best: 0.2566 | LR: 2.96e-05


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.48it/s, loss=0.2996]


Epoch 10/20 | Loss: 0.2541 | Best: 0.2541 | LR: 8.32e-06


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.96it/s, loss=0.2670]


Epoch 11/20 | Loss: 0.2678 | Best: 0.2541 | LR: 3.00e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 87.32it/s, loss=0.2532]


Epoch 12/20 | Loss: 0.2630 | Best: 0.2541 | LR: 2.98e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.85it/s, loss=0.2619]


Epoch 13/20 | Loss: 0.2583 | Best: 0.2541 | LR: 2.93e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.06it/s, loss=0.2499]


Epoch 14/20 | Loss: 0.2534 | Best: 0.2534 | LR: 2.84e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 87.19it/s, loss=0.2195]


Epoch 15/20 | Loss: 0.2484 | Best: 0.2484 | LR: 2.71e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 87.23it/s, loss=0.2469]


Epoch 16/20 | Loss: 0.2430 | Best: 0.2430 | LR: 2.56e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 87.16it/s, loss=0.2339]


Epoch 17/20 | Loss: 0.2381 | Best: 0.2381 | LR: 2.38e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.68it/s, loss=0.2215]


Epoch 18/20 | Loss: 0.2329 | Best: 0.2329 | LR: 2.18e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 87.14it/s, loss=0.2366]


Epoch 19/20 | Loss: 0.2279 | Best: 0.2279 | LR: 1.97e-04


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 460/460 [00:05<00:00, 86.86it/s, loss=0.2160]


Epoch 20/20 | Loss: 0.2233 | Best: 0.2233 | LR: 1.74e-04

‚úÖ Training complete! Best loss: 0.2233

‚úì GPU cache cleared
‚úì Model saved: pgtcn_modelMulti/myspace//pgtcn_final.pt
‚úì Vocabulary saved: pgtcn_modelMulti/myspace//vocabulary.json
‚úì Config saved: pgtcn_modelMulti/myspace//config.json
GENERATING 1,000,000 DIVERSE PASSWORDS



Generating diverse passwords:   0%|   | 2751/1000000 [01:57<11:28:37, 24.14it/s]

In [None]:
# Cell 17.5: Enhanced Training with Beam Search
# ============================================
# ENHANCED TRAINING OPTIONS
# ============================================

def train_single_dataset_enhanced(dataset_name, num_passwords_to_generate=500000, use_beam_search=False):
    """Enhanced training with beam search option"""
    
    print("\n" + "="*80)
    print(f"ENHANCED PGTCN - Training on {dataset_name.upper()}")
    print(f"Target: Generate {num_passwords_to_generate:,} passwords")
    print(f"Beam Search: {use_beam_search}")
    print("="*80 + "\n")
    
    # Check GPU status
    if GPU_AVAILABLE:
        monitor_gpu_memory()
    
    # Load dataset
    filtered = load_dataset(dataset_name)
    if filtered is None:
        print(f"‚ùå Skipping {dataset_name} due to loading error\n")
        return None
    
    # Build vocabulary
    stoi, itos = build_vocabulary(filtered)
    Config.VOCAB_SIZE = len(stoi)
    print(f"‚úì Vocabulary size: {Config.VOCAB_SIZE}\n")
    
    # Split train/test (80/20)
    random.shuffle(filtered)
    split_idx = int(0.8 * len(filtered))
    train_passwords = filtered[:split_idx]
    test_passwords = set(filtered[split_idx:])
    
    print(f"‚úì Train set: {len(train_passwords):,}")
    print(f"‚úì Test set: {len(test_passwords):,}\n")
    
    # Dataset with augmentation
    train_dataset = PasswordDataset(train_passwords, stoi, Config.SEQ_LEN, augment=True)
    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.BATCH_SIZE,
        shuffle=True,
        num_workers=Config.NUM_WORKERS,
        pin_memory=Config.PIN_MEMORY,
        collate_fn=collate_fn,
        drop_last=True
    )
    
    # Build larger model
    model = PGTCN(
        vocab_size=Config.VOCAB_SIZE,
        embedding_dim=Config.EMBEDDING_DIM,
        num_channels=Config.TCN_CHANNELS,
        kernel_size=Config.KERNEL_SIZE,
        dropout=Config.DROPOUT
    ).to(Config.DEVICE)
    
    num_params = sum(p.numel() for p in model.parameters())
    print(f"‚úì Model parameters: {num_params:,}")
    print(f"‚úì Model on device: {next(model.parameters()).device}\n")
    
    # Show GPU memory after model loading
    if GPU_AVAILABLE:
        monitor_gpu_memory()
    
    # Print configuration
    Config.print_config()
    
    # Train
    trainer = PGTCNTrainer(model, Config, stoi, itos)
    trainer.train(train_loader)
    
    # Clear GPU cache before generation
    if GPU_AVAILABLE:
        clear_gpu_cache()
    
    # Load best model
    best_model_path = f"{Config.MODEL_DIR}/pgtcn_best.pt"
    model.load_state_dict(torch.load(best_model_path, map_location=Config.DEVICE))
    
    # Save final model
    save_dataset_model(model, stoi, itos, dataset_name)
    
    # Generate passwords with beam search if enabled
    print("="*80)
    print(f"GENERATING {num_passwords_to_generate:,} PASSWORDS")
    print(f"Using {'Beam Search' if use_beam_search else 'Nucleus Sampling'}")
    print("="*80 + "\n")
    
    generator = DiversePasswordGenerator(model, stoi, itos, Config.DEVICE)
    generated = generator.generate_diverse_batch(num_passwords=num_passwords_to_generate, use_beam_search=use_beam_search)
    
    print(f"\n‚úì Successfully generated {len(generated):,} passwords\n")
    
    # Show final GPU memory usage
    if GPU_AVAILABLE:
        monitor_gpu_memory()
    
    # Evaluate
    print("="*80)
    print("EVALUATION RESULTS")
    print("="*80 + "\n")
    
    results = evaluate_matching(generated, test_passwords)
    
    print(f"Generated Passwords:  {results['total_generated']:,}")
    print(f"Unique Generated:     {results['unique_generated']:,} ({results['uniqueness_rate']:.1f}%)")
    print(f"Test Set Size:        {results['test_set_size']:,}")
    print(f"Exact Matches:        {results['matches']:,}")
    print(f"üéØ MATCH RATE:        {results['match_rate']:.2f}%")
    print(f"Avg Gen Length:       {results['avg_gen_length']:.1f}")
    print(f"Avg Test Length:      {results['avg_test_length']:.1f}")
    print(f"{'='*80}\n")
    
    # Save results
    print("="*80)
    print("SAVING RESULTS TO FILES")
    print("="*80 + "\n")
    
    json_path = f"{Config.MODEL_DIR}/results_enhanced.json"
    with open(json_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"‚úì JSON results saved: {json_path}\n")
    
    saved_files = save_detailed_results(generated, test_passwords, Config.MODEL_DIR)
    
    print("\n" + "="*80)
    print("FILES SAVED:")
    print("="*80)
    for file_type, file_path in saved_files.items():
        if file_path:
            print(f"  ‚Ä¢ {file_type}: {file_path}")
    
    print("\n" + "="*80)
    print(f"‚úÖ {dataset_name.upper()} Enhanced Complete!")
    print(f"   Generated: {len(generated):,} passwords")
    print(f"   Match Rate: {results['match_rate']:.2f}%")
    print("="*80 + "\n")
    
    # Clean up GPU memory
    if GPU_AVAILABLE:
        clear_gpu_cache()
    
    return {
        'dataset': dataset_name,
        'results': results,
        'files': saved_files,
        'model_dir': Config.MODEL_DIR,
        'num_generated': len(generated),
        'beam_search': use_beam_search
    }

In [None]:
# Cell 18: Run Enhanced Training
# ============================================
# RUN ENHANCED TRAINING FOR ROCKYOU‚úÖ CUDA Available: Yes
# ============================================

# Train RockYou with enhanced model and beam search
result_enhanced = train_single_dataset_enhanced('hashmob.net.large.found', num_passwords_to_generate=1000000, use_beam_search=True)

# Compare with nucleus sampling (also using beam search for higher match rate)
result_nucleus = train_single_dataset_enhanced('hashmob.net.large.found', num_passwords_to_generate=1000000, use_beam_search=True)

In [None]:
"""
if __name__ == "__main__":
    datasets = ['myspace', 'rockyou', 'honeynet']
    results = []
    for dataset in datasets:
        result = train_specific_dataset(dataset)
        if result:
            results.append(result)

"""