In [None]:
# Install required packages
!pip install ebooklib nltk tqdm

# Import all necessary libraries
import os
import re
import pickle
import string
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F
from ebooklib import epub
import ebooklib
import nltk
from nltk.tokenize import word_tokenize



In [None]:
from google.colab import files

# Upload EPUB file
print("Please upload your EPUB file:")
uploaded = files.upload()

# Fail with a readable message instead of an IndexError when nothing was uploaded
# (e.g. the upload dialog was cancelled).
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose an EPUB file.")

# Get the uploaded filename (first entry of the returned mapping)
epub_filename = next(iter(uploaded))
epub_path = f"/content/{epub_filename}"
print(f"Uploaded file: {epub_filename}")

# Set up NLTK data
nltk.download('punkt', quiet=True)

Please upload your EPUB file:


Saving linux.epub to linux.epub
Uploaded file: linux.epub


True

In [None]:
def extract_text_from_epub(epub_path):
    """Extract plain text from every document item of an EPUB file.

    Args:
        epub_path: Path to the EPUB file on disk.

    Returns:
        The text of all document items joined by newlines, with markup
        removed and HTML entities decoded. Returns "" if the EPUB cannot be
        read; items that fail to decode are skipped with a warning.
    """
    import html  # only needed here; kept local so the cell is self-contained

    try:
        book = epub.read_epub(epub_path)
        parts = []
        for item in book.get_items():
            if item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue
            try:
                raw_text = item.get_content().decode('utf-8')
                # Remove HTML tags but preserve text structure
                clean_text = re.sub('<[^<]+?>', '', raw_text)
                # Decode entities (&#13;, &gt;, &amp;, ...) AFTER stripping tags,
                # so escaped angle brackets in the book's text are not mistaken
                # for markup. Without this the corpus is littered with literal
                # "&#13;" / "&gt;" sequences that the model then learns.
                parts.append(html.unescape(clean_text))
            except Exception as e:
                print(f"Warning: Could not decode item: {e}")
                continue
        # Joining once avoids repeated string concatenation in the loop.
        return "\n".join(parts).strip()
    except Exception as e:
        print(f"Error reading EPUB: {e}")
        return ""

def preprocess_text(text):
    """Collapse every run of whitespace into a single space and trim the ends.

    Newlines count as whitespace, so the result is one continuous line. The
    former follow-up step that reduced 3+ consecutive newlines could never
    match (no newline survives the collapse) and has been removed; the output
    is unchanged.
    """
    return re.sub(r'\s+', ' ', text).strip()

# Extract and preprocess text
print("Extracting text from EPUB...")
raw_text = extract_text_from_epub(epub_path)
cleaned_text = preprocess_text(raw_text)

# extract_text_from_epub signals failure by returning "", so stop here instead of
# silently building an empty dataset in the following cells.
if not cleaned_text:
    raise ValueError(f"No text could be extracted from {epub_path}")

print(f"Text extracted: {len(cleaned_text)} characters")
print("Sample text:")
print(cleaned_text[:500])

Extracting text from EPUB...
Text extracted: 429330 characters
Sample text:
&#13; Contents in Detail&#13; &#13; Cover Page&#13; Title Page&#13; Copyright Page&#13; Dedication&#13; About the Author&#13; About the Technical Reviewer&#13; BRIEF CONTENTS&#13; CONTENTS IN DETAIL&#13; ACKNOWLEDGMENTS&#13; INTRODUCTION&#13; &#13; What’s in This Book&#13; What Is Ethical Hacking?&#13; Why Hackers Use Linux&#13; Downloading Kali Linux&#13; Virtual Machines&#13; Setting Up Kali&#13; &#13; 1 GETTING STARTED WITH THE BASICS&#13; &#13; Introductory Terms and Concepts&#13; A Tour of 


In [None]:
class OptimizedBookDataset(Dataset):
    """Character-level next-character dataset over a single text.

    Item ``idx`` is the pair ``(text[idx:idx+L], text[idx+1:idx+L+1])`` encoded
    as ``torch.long`` index tensors, where ``L`` is ``seq_length``.

    Attributes:
        text: the raw text.
        seq_length: number of characters per input/target sequence.
        vocab: sorted list of the distinct characters in ``text``.
        char_to_idx / idx_to_char: mappings between characters and indices.
        vocab_size: ``len(vocab)``.
        encoded: ``text`` encoded once as a 1-D ``torch.long`` tensor.
    """

    def __init__(self, text, seq_length):
        self.text = text
        self.seq_length = seq_length
        # Create vocabulary
        self.vocab = sorted(set(text))
        self.char_to_idx = {char: idx for idx, char in enumerate(self.vocab)}
        self.idx_to_char = {idx: char for idx, char in enumerate(self.vocab)}
        self.vocab_size = len(self.vocab)
        # Encode the whole text once; __getitem__ then only slices this tensor
        # instead of redoing seq_length+1 dict lookups for every sample.
        self.encoded = torch.tensor(
            [self.char_to_idx[char] for char in text], dtype=torch.long
        )
        # Short windows are padded with the space character (index 0 if the
        # text contains no space), matching the original padding rule.
        self._pad_idx = self.char_to_idx.get(' ', 0)
        print(f"Vocabulary size: {self.vocab_size}")

    def __len__(self):
        return max(0, len(self.text) - self.seq_length)

    def __getitem__(self, idx):
        # Windows are still produced on the fly; only the encoding is cached.
        window = self.encoded[idx:idx + self.seq_length + 1]
        missing = self.seq_length + 1 - window.numel()
        if missing > 0:
            # Pad if the window runs past the end of the text
            window = torch.cat([window, window.new_full((missing,), self._pad_idx)])

        # clone() so callers never hold views into the shared encoded tensor
        return window[:-1].clone(), window[1:].clone()

# Build the character-level dataset from the cleaned book text
SEQ_LENGTH = 50  # Reduced for faster training
print("Creating dataset...")
dataset = OptimizedBookDataset(cleaned_text, SEQ_LENGTH)

# Persist everything needed to rebuild the dataset in a later session
dataset_save_path = "/content/book_dataset.pkl"
dataset_payload = {
    'vocab': dataset.vocab,
    'char_to_idx': dataset.char_to_idx,
    'idx_to_char': dataset.idx_to_char,
    'text': cleaned_text,
    'seq_length': SEQ_LENGTH,
}
with open(dataset_save_path, "wb") as f:
    pickle.dump(dataset_payload, f)
print(f"Dataset saved to {dataset_save_path}")

Creating dataset...
Vocabulary size: 109
Dataset saved to /content/book_dataset.pkl


In [None]:
class OptimizedTextGenerator(nn.Module):
    """Sequence model: embedding -> stacked unidirectional LSTM -> dropout -> linear head.

    ``forward`` maps a ``(batch, seq_len)`` tensor of indices to
    ``(batch, seq_len, vocab_size)`` logits and also returns the LSTM state.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2, dropout=0.3):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # Dropout inside the LSTM is only requested when layers are stacked.
        lstm_dropout = dropout if num_layers > 1 else 0

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
            bidirectional=False,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for index tensor ``x`` and optional LSTM state."""
        # (batch, seq_len) -> (batch, seq_len, embedding_dim) -> LSTM features
        features, hidden = self.lstm(self.embedding(x), hidden)
        # Regularise the features, then project them onto the vocabulary.
        logits = self.fc(self.dropout(features))
        return logits, hidden

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Build the network sized to the dataset's vocabulary and move it to the device
model = OptimizedTextGenerator(
    vocab_size=len(dataset.vocab),
    embedding_dim=128,
    hidden_dim=256,
    num_layers=2,
    dropout=0.3,
)
model = model.to(device)

total_params = sum(param.numel() for param in model.parameters())
print(f"Model parameters: {total_params:,}")

Using device: cuda
Model parameters: 963,565


In [None]:
# DataLoader with optimizations
BATCH_SIZE = 64  # Increased for better GPU utilization
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    # Pinned host memory only helps host-to-GPU copies; skip it on CPU-only runs.
    pin_memory=torch.cuda.is_available(),
    persistent_workers=True
)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
# For mixed precision training. torch.cuda.amp.GradScaler() is deprecated (it emits a
# FutureWarning); torch.amp.GradScaler('cuda', ...) is the replacement. The scaler is
# disabled when no GPU is available.
scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available())

print(f"Number of batches: {len(dataloader)}")

Number of batches: 6708


  scaler = GradScaler()  # For mixed precision training


In [None]:
def train_model(model, dataloader, num_epochs=10, start_epoch=0):
    """Train ``model`` on ``dataloader`` and return the mean loss of each epoch.

    Uses the notebook-level ``optimizer``, ``criterion``, ``scheduler`` and
    ``scaler`` objects, and reads the vocabulary from the notebook-level
    ``dataset`` when writing checkpoints.

    Args:
        model: Module whose forward returns ``(logits, hidden)``.
        dataloader: Iterable yielding ``(inputs, targets)`` index tensors.
        num_epochs: Number of epochs to run in this call.
        start_epoch: Number of epochs already completed in earlier calls.
            It only offsets the epoch numbers shown and used in checkpoint
            file names, so a continued run does not overwrite the earlier
            run's checkpoints. The default 0 keeps the original numbering.

    Returns:
        List with one average loss per epoch run in this call.

    Raises:
        ValueError: if the dataloader yields no batches.
    """
    # Training history
    train_losses = []

    # Train on whatever device the model already lives on, so inputs can never
    # end up on a different device than the parameters.
    device = next(model.parameters()).device
    use_amp = device.type == 'cuda'  # mixed precision only on GPU
    total_epochs = start_epoch + num_epochs

    for epoch in range(num_epochs):
        epoch_number = start_epoch + epoch + 1
        model.train()
        total_loss = 0
        num_batches = 0

        # Progress bar
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch_number}/{total_epochs}")

        for batch_idx, (inputs, targets) in enumerate(progress_bar):
            # Move to device
            inputs, targets = inputs.to(device), targets.to(device)

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass; autocast is a no-op when use_amp is False
            with torch.autocast(device_type=device.type, enabled=use_amp):
                outputs, _ = model(inputs)
                # Flatten (batch, seq, vocab) / (batch, seq) for the loss
                loss = criterion(
                    outputs.reshape(-1, outputs.size(-1)),
                    targets.reshape(-1),
                )

            # Backward pass
            if use_amp:
                scaler.scale(loss).backward()

                # Gradients must be unscaled before clipping by norm
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                # Optimizer step
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

            # Update progress bar (read the loss once per batch)
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1
            progress_bar.set_postfix({
                'loss': f'{batch_loss:.4f}',
                'avg_loss': f'{total_loss/num_batches:.4f}'
            })

            # Clear cache periodically to prevent memory issues
            if batch_idx % 500 == 0 and use_amp:
                torch.cuda.empty_cache()

        if num_batches == 0:
            raise ValueError("dataloader produced no batches; nothing to train on")

        # Calculate average loss
        avg_loss = total_loss / num_batches
        train_losses.append(avg_loss)

        # Learning rate scheduling
        scheduler.step(avg_loss)

        print(f"Epoch {epoch_number}/{total_epochs} - Average Loss: {avg_loss:.4f}")

        # Save model checkpoint every second epoch
        if epoch_number % 2 == 0:
            checkpoint_path = f"/content/model_checkpoint_epoch_{epoch_number}.pth"
            torch.save({
                'epoch': epoch_number - 1,  # zero-based index of the finished epoch
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
                'vocab': dataset.vocab,
                'char_to_idx': dataset.char_to_idx,
                'idx_to_char': dataset.idx_to_char
            }, checkpoint_path)
            print(f"Checkpoint saved: {checkpoint_path}")

    return train_losses

# Train the model for an initial 5 epochs and keep the per-epoch average losses
# that train_model returns.
print("Starting training...")
train_losses = train_model(model, dataloader, num_epochs=5)
print("Training completed!")

Starting training...


Epoch 1/5: 100%|██████████| 6708/6708 [01:22<00:00, 80.87it/s, loss=1.1052, avg_loss=1.3545]


Epoch 1/5 - Average Loss: 1.3545


Epoch 2/5: 100%|██████████| 6708/6708 [01:19<00:00, 83.86it/s, loss=1.0418, avg_loss=1.0313]


Epoch 2/5 - Average Loss: 1.0313
Checkpoint saved: /content/model_checkpoint_epoch_2.pth


Epoch 3/5: 100%|██████████| 6708/6708 [01:20<00:00, 82.84it/s, loss=0.9978, avg_loss=0.9635]


Epoch 3/5 - Average Loss: 0.9635


Epoch 4/5: 100%|██████████| 6708/6708 [01:20<00:00, 83.83it/s, loss=0.9223, avg_loss=0.9296]


Epoch 4/5 - Average Loss: 0.9296
Checkpoint saved: /content/model_checkpoint_epoch_4.pth


Epoch 5/5: 100%|██████████| 6708/6708 [01:20<00:00, 83.01it/s, loss=0.9487, avg_loss=0.9080]

Epoch 5/5 - Average Loss: 0.9080
Training completed!





In [None]:
# Continue training for better results
# NOTE: epoch numbering starts again at 1 for this call, so the checkpoint files for
# epochs 2 and 4 written by the previous run are overwritten. The returned list of
# average losses is not assigned, so the notebook shows it as the cell output.
train_model(model, dataloader, num_epochs=10)  # Additional 10 epochs

Epoch 1/10: 100%|██████████| 6708/6708 [01:21<00:00, 81.99it/s, loss=0.8998, avg_loss=0.8925]


Epoch 1/10 - Average Loss: 0.8925


Epoch 2/10: 100%|██████████| 6708/6708 [01:21<00:00, 82.33it/s, loss=0.8862, avg_loss=0.8801]


Epoch 2/10 - Average Loss: 0.8801
Checkpoint saved: /content/model_checkpoint_epoch_2.pth


Epoch 3/10: 100%|██████████| 6708/6708 [01:19<00:00, 84.15it/s, loss=0.8505, avg_loss=0.8700]


Epoch 3/10 - Average Loss: 0.8700


Epoch 4/10: 100%|██████████| 6708/6708 [01:21<00:00, 81.91it/s, loss=0.8854, avg_loss=0.8620]


Epoch 4/10 - Average Loss: 0.8620
Checkpoint saved: /content/model_checkpoint_epoch_4.pth


Epoch 5/10: 100%|██████████| 6708/6708 [01:20<00:00, 83.52it/s, loss=0.8086, avg_loss=0.8549]


Epoch 5/10 - Average Loss: 0.8549


Epoch 6/10: 100%|██████████| 6708/6708 [01:20<00:00, 83.01it/s, loss=0.8132, avg_loss=0.8486]


Epoch 6/10 - Average Loss: 0.8486
Checkpoint saved: /content/model_checkpoint_epoch_6.pth


Epoch 7/10: 100%|██████████| 6708/6708 [01:20<00:00, 83.00it/s, loss=0.9134, avg_loss=0.8430]


Epoch 7/10 - Average Loss: 0.8430


Epoch 8/10: 100%|██████████| 6708/6708 [01:21<00:00, 82.80it/s, loss=0.8348, avg_loss=0.8380]


Epoch 8/10 - Average Loss: 0.8380
Checkpoint saved: /content/model_checkpoint_epoch_8.pth


Epoch 9/10: 100%|██████████| 6708/6708 [01:20<00:00, 83.16it/s, loss=0.7137, avg_loss=0.8330]


Epoch 9/10 - Average Loss: 0.8330


Epoch 10/10: 100%|██████████| 6708/6708 [01:22<00:00, 81.28it/s, loss=0.8533, avg_loss=0.8290]


Epoch 10/10 - Average Loss: 0.8290
Checkpoint saved: /content/model_checkpoint_epoch_10.pth


[0.8925476255929392,
 0.8801363064870282,
 0.8700400313576845,
 0.8619626557002411,
 0.8548643312144294,
 0.8486020285314083,
 0.8429967897078361,
 0.837973459328863,
 0.8330102307887467,
 0.8289728522140779]

In [None]:
# Save the final trained model
final_model_path = "/content/final_text_generator.pth"
torch.save({
    'model_state_dict': model.state_dict(),
    'vocab': dataset.vocab,
    'char_to_idx': dataset.char_to_idx,
    'idx_to_char': dataset.idx_to_char,
    'seq_length': SEQ_LENGTH,
    # Read the architecture from the model itself rather than repeating the
    # constructor literals, so the saved metadata always matches the weights.
    'embedding_dim': model.embedding.embedding_dim,
    'hidden_dim': model.hidden_dim,
    'num_layers': model.num_layers
}, final_model_path)

print(f"Final model saved to {final_model_path}")

Final model saved to /content/final_text_generator.pth


In [None]:
# FIXED Block 9: Text Generation Function
def generate_text(model, dataset, seed_text="The", length=200, temperature=0.8):
    """Sample ``length`` characters from ``model``, continuing ``seed_text``.

    The seed is fed through the model once; afterwards only the newly sampled
    character is fed together with the carried LSTM state. (Re-feeding the
    whole context window while also passing the state, as before, made the
    model see the same context twice on every step.)

    Args:
        model: Module whose forward takes ``(indices, hidden)`` and returns
            ``(logits, hidden)``.
        dataset: Object providing ``char_to_idx`` and ``idx_to_char``.
            Characters missing from ``char_to_idx`` are encoded as index 0.
        seed_text: Non-empty prompt that starts the generated text.
        length: Number of characters to generate after the seed.
        temperature: Positive softmax temperature; lower values make sampling
            more conservative.

    Returns:
        ``seed_text`` followed by the generated characters.

    Raises:
        ValueError: if ``seed_text`` is empty or ``temperature`` is not positive.
    """
    if not seed_text:
        raise ValueError("seed_text must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()  # Important: set to evaluation mode
    device = next(model.parameters()).device
    print(f"Using device: {device}")
    print(f"Seed text: '{seed_text}'")

    # Encode seed text
    encoded_seed = [dataset.char_to_idx.get(char, 0) for char in seed_text]
    input_seq = torch.tensor(encoded_seed, dtype=torch.long, device=device).unsqueeze(0)

    pieces = [seed_text]
    print(f"Starting generation with seed length: {len(seed_text)}")

    # torch.autocast replaces the deprecated torch.cuda.amp.autocast(); it is
    # only enabled on GPU and is a no-op otherwise.
    use_amp = device.type == 'cuda'

    hidden = None
    with torch.no_grad():
        for i in range(length):
            # Get prediction
            with torch.autocast(device_type=device.type, enabled=use_amp):
                output, hidden = model(input_seq, hidden)

            # Apply temperature scaling in float32, since autocast may hand
            # back half-precision logits
            logits = output[0, -1].float() / temperature
            probabilities = F.softmax(logits, dim=0)

            # Sample next character
            next_char_idx = torch.multinomial(probabilities, 1).item()
            pieces.append(dataset.idx_to_char[next_char_idx])

            # The hidden state already summarises everything fed so far, so
            # the next input is just the character that was sampled.
            input_seq = torch.tensor([[next_char_idx]], dtype=torch.long, device=device)

            # Progress indicator
            if (i + 1) % 50 == 0:
                print(f"Generated {i + 1}/{length} characters")

    return "".join(pieces)

import traceback

# Smoke-test generation; on failure show the full traceback instead of aborting the cell
print("=== Testing Text Generation ===")
try:
    sample_text = generate_text(
        model,
        dataset,
        seed_text="The ",
        length=100,
        temperature=0.7,
    )
    print("\n=== GENERATED TEXT ===")
    print(sample_text)
    print("=== END OF GENERATION ===")
except Exception as err:
    print(f"Error in generation: {err}")
    traceback.print_exc()

=== Testing Text Generation ===
Using device: cuda:0
Seed text: 'The '
Starting generation with seed length: 4
Generated 50/100 characters


  with autocast():


Generated 100/100 characters

=== GENERATED TEXT ===
The nl command to see it in the /usr/bin directory. You may need to check whether reners-indivilual newf
=== END OF GENERATION ===


In [None]:
# FIXED Block 10: Load Saved Dataset and Model
class _LoadedCharDataset(Dataset):
    """Character-level dataset rebuilt from the dict pickled next to the model.

    Defined at module level (not inside the loader function) so that every load
    returns the same class and instances can be pickled.
    """

    def __init__(self, data):
        self.text = data['text']
        self.vocab = data['vocab']
        self.char_to_idx = data['char_to_idx']
        self.idx_to_char = data['idx_to_char']
        self.seq_length = data['seq_length']
        self.vocab_size = len(self.vocab)
        print(f"Loaded dataset - Text length: {len(self.text)}, Vocab size: {self.vocab_size}")

    def __len__(self):
        return max(0, len(self.text) - self.seq_length)

    def __getitem__(self, idx):
        chunk = self.text[idx:idx + self.seq_length + 1]
        if len(chunk) < self.seq_length + 1:
            # Pad short windows with spaces
            chunk = chunk + ' ' * (self.seq_length + 1 - len(chunk))
        encoded = [self.char_to_idx.get(char, 0) for char in chunk]
        return torch.tensor(encoded[:-1], dtype=torch.long), torch.tensor(encoded[1:], dtype=torch.long)


def load_dataset_and_model(dataset_path, model_path=None):
    """Load the pickled dataset and, optionally, a saved model checkpoint.

    Args:
        dataset_path: Pickle file holding 'text', 'vocab', 'char_to_idx',
            'idx_to_char' and 'seq_length'.
        model_path: Optional checkpoint written with ``torch.save``; must hold
            'model_state_dict' and may hold 'embedding_dim', 'hidden_dim' and
            'num_layers' (defaults 128 / 256 / 2).

    Returns:
        ``(dataset, model)``. ``(None, None)`` if the dataset cannot be loaded;
        ``(dataset, None)`` if no model path is given, the file does not exist,
        or loading the model fails. A loaded model is returned in eval mode.
    """
    print(f"Loading dataset from: {dataset_path}")

    # Load dataset
    # SECURITY: pickle.load can execute arbitrary code - only open files you created.
    try:
        with open(dataset_path, "rb") as f:
            dataset_data = pickle.load(f)
        print("Dataset loaded successfully!")
        print(f"Dataset keys: {list(dataset_data.keys())}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None, None

    # Recreate dataset object
    dataset = _LoadedCharDataset(dataset_data)

    # Load model if provided
    if model_path is None:
        # Previously this case printed the misleading "Model file not found: None".
        print("No model path given; returning the dataset only")
        return dataset, None
    if not os.path.exists(model_path):
        print(f"Model file not found: {model_path}")
        return dataset, None

    try:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Loading model on device: {device}")
        checkpoint = torch.load(model_path, map_location=device)

        model = OptimizedTextGenerator(
            vocab_size=len(dataset.vocab),
            embedding_dim=checkpoint.get('embedding_dim', 128),
            hidden_dim=checkpoint.get('hidden_dim', 256),
            num_layers=checkpoint.get('num_layers', 2)
        )
        model.load_state_dict(checkpoint['model_state_dict'])
        model.to(device)
        model.eval()
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        import traceback
        traceback.print_exc()
        return dataset, None

    return dataset, model

# Test the loading function
print("\n=== Testing Load Functions ===")
dataset_path = "/content/book_dataset.pkl"
model_path = "/content/final_text_generator.pth"

print("Loading saved dataset and model...")
loaded_dataset, loaded_model = load_dataset_and_model(dataset_path, model_path)

# Compare against None explicitly: the loaded dataset defines __len__, so plain
# truthiness would treat a successfully loaded but empty dataset as a failed load.
if loaded_dataset is not None:
    print(f"Loaded dataset successfully! Vocab size: {len(loaded_dataset.vocab)}")

    # Test generation with loaded model
    if loaded_model is not None:
        print("Testing generation with loaded model...")
        test_text = generate_text(loaded_model, loaded_dataset, "Test: ", 50, 0.8)
        print("Loaded model generation:")
        print(test_text)


=== Testing Load Functions ===
Loading saved dataset and model...
Loading dataset from: /content/book_dataset.pkl
Dataset loaded successfully!
Dataset keys: ['vocab', 'char_to_idx', 'idx_to_char', 'text', 'seq_length']
Loaded dataset - Text length: 429330, Vocab size: 109
Loading model on device: cuda
Model loaded successfully!
Loaded dataset successfully! Vocab size: 109
Testing generation with loaded model...
Using device: cuda:0
Seed text: 'Test: '
Starting generation with seed length: 6
Generated 50/50 characters
Loaded model generation:
Test: Your File Foreming Ports --enable-gre --enable-mpl


  with autocast():


In [None]:
# NEW Block: Comprehensive Testing and Debugging
def debug_model_and_dataset(model, dataset):
    """Print a quick health report for ``dataset`` and ``model``.

    Reports dataset size, vocabulary and sequence length, shows the first
    sample when one exists, reports the model's device and parameter count,
    and finally runs one gradient-free forward pass on the first sample.
    A failing forward pass is reported with its traceback rather than raised.
    """
    import traceback

    print("=== DEBUGGING MODEL AND DATASET ===")

    # --- dataset summary ---
    print(f"Dataset length: {len(dataset)}")
    print(f"Vocabulary size: {len(dataset.vocab)}")
    print(f"First 10 vocab chars: {dataset.vocab[:10]}")
    print(f"Sequence length: {dataset.seq_length}")

    # --- first sample, if the dataset is not empty ---
    has_samples = len(dataset) > 0
    if has_samples:
        first_input, first_target = dataset[0]
        print(f"Sample input shape: {first_input.shape}")
        print(f"Sample target shape: {first_target.shape}")
        print(f"Sample input: {first_input[:10]}")
        print(f"Sample target: {first_target[:10]}")

    # --- model summary ---
    model_device = next(model.parameters()).device
    print(f"Model device: {model_device}")
    param_count = sum(p.numel() for p in model.parameters())
    print(f"Model parameters: {param_count:,}")

    # --- single forward pass on a batch of one ---
    if not has_samples:
        return

    batch = dataset[0][0].unsqueeze(0).to(model_device)
    try:
        with torch.no_grad():
            output, hidden = model(batch)
        print(f"Forward pass successful!")
        print(f"Output shape: {output.shape}")
        print(f"Hidden state type: {type(hidden)}")
    except Exception as err:
        print(f"Forward pass failed: {err}")
        traceback.print_exc()

import traceback

# Print the dataset/model health report
debug_model_and_dataset(model, dataset)

# Then try the shortest possible generation run
print("\n=== SIMPLE GENERATION TEST ===")
try:
    # Very simple test first: one-character seed, 20 characters, temperature 1.0
    simple_text = generate_text(model, dataset, seed_text="A", length=20, temperature=1.0)
    print(f"Simple generation result: '{simple_text}'")
except Exception as err:
    print(f"Simple generation failed: {err}")
    traceback.print_exc()

=== DEBUGGING MODEL AND DATASET ===
Dataset length: 429280
Vocabulary size: 109
First 10 vocab chars: [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')']
Sequence length: 50
Sample input shape: torch.Size([50])
Sample target shape: torch.Size([50])
Sample input: tensor([ 6,  3, 17, 19, 27,  0, 33, 76, 75, 81])
Sample target: tensor([ 3, 17, 19, 27,  0, 33, 76, 75, 81, 66])
Model device: cuda:0
Model parameters: 963,565
Forward pass successful!
Output shape: torch.Size([1, 50, 109])
Hidden state type: <class 'tuple'>

=== SIMPLE GENERATION TEST ===
Using device: cuda:0
Seed text: 'A'
Starting generation with seed length: 1
Simple generation result: 'A allocated to captur'


  with autocast():


In [None]:
# This is what Block 9 does:
# generate 300 characters continuing the seed "The " at temperature 0.7 and print them.
sample_text = generate_text(model, dataset, seed_text="The ", length=300, temperature=0.7)
print(sample_text)

Using device: cuda:0
Seed text: 'The '
Starting generation with seed length: 4
Generated 50/300 characters


  with autocast():


Generated 100/300 characters
Generated 150/300 characters
Generated 200/300 characters
Generated 250/300 characters
Generated 300/300 characters
The kill command to allow the only in order to note that places them the grep command prompt in the first file has been reparated by the configuration files, you can use a new user with the file /etc/resolv.conf file to scan on a process and learning and returns a job to the system is anonymity. The bas


In [None]:
# Next week, you can do this:
loaded_dataset, loaded_model = load_dataset_and_model(
    "/content/book_dataset.pkl",           # Your saved dataset
    "/content/final_text_generator.pth"    # Your trained model
)

# load_dataset_and_model returns None for anything it failed to load, so stop with a
# clear message instead of crashing inside generate_text.
if loaded_dataset is None or loaded_model is None:
    raise RuntimeError("Could not load the saved dataset/model; check the two paths above.")

# Generate more text immediately:
new_text = generate_text(loaded_model, loaded_dataset, "In conclusion", 500)
print(new_text)

# Or continue training for better results:
# train_model(loaded_model, new_dataloader, num_epochs=5)  # Additional training

Loading dataset from: /content/book_dataset.pkl
Dataset loaded successfully!
Dataset keys: ['vocab', 'char_to_idx', 'idx_to_char', 'text', 'seq_length']
Loaded dataset - Text length: 429330, Vocab size: 109
Loading model on device: cuda
Model loaded successfully!
Using device: cuda:0
Seed text: 'In conclusion'
Starting generation with seed length: 13
Generated 50/500 characters


  with autocast():


Generated 100/500 characters
Generated 150/500 characters
Generated 200/500 characters
Generated 250/500 characters
Generated 300/500 characters
Generated 350/500 characters
Generated 400/500 characters
Generated 450/500 characters
Generated 500/500 characters
In conclusion should be used for the SSH service with configuration for PostgreSQL password to transmit to every directory to string into. And other options contains log files in the CPU. Tand and Database&#13; Compressing with gzip&#13; Here, we can supply the permission enter the following:&#13; kali &gt;cat &gt; hackingskillsEveryone with at by previous scripts, traffic to your PATH variable, so you can delital configuration file, you need to change passwords and then explore the security vulnerability, s


In [None]:
# Try different seeds:
# (print every result - a notebook only displays the value of a cell's last
# expression, so the earlier generations used to be silently discarded)
for seed in ("Chapter 1", "Introduction", "In this book"):
    print(generate_text(model, dataset, seed, 200))
    print()

# Try different temperatures:
# 0.5 -> more focused, 1.2 -> more creative
for temperature in (0.5, 1.2):
    print(generate_text(model, dataset, "The", 200, temperature=temperature))
    print()

Using device: cuda:0
Seed text: 'Chapter 1'
Starting generation with seed length: 9
Generated 50/200 characters


  with autocast():


Generated 100/200 characters
Generated 150/200 characters
Generated 200/200 characters
Using device: cuda:0
Seed text: 'Introduction'
Starting generation with seed length: 12
Generated 50/200 characters
Generated 100/200 characters
Generated 150/200 characters
Generated 200/200 characters
Using device: cuda:0
Seed text: 'In this book'
Starting generation with seed length: 12
Generated 50/200 characters
Generated 100/200 characters
Generated 150/200 characters
Generated 200/200 characters
Using device: cuda:0
Seed text: 'The'
Starting generation with seed length: 3
Generated 50/200 characters
Generated 100/200 characters
Generated 150/200 characters
Generated 200/200 characters
Using device: cuda:0
Seed text: 'The'
Starting generation with seed length: 3
Generated 50/200 characters
Generated 100/200 characters
Generated 150/200 characters
Generated 200/200 characters


'Then HackersAriseForiate with a best programmers with git clone, and so account, like Apache is constant. When you are the root user by entering a shortcut use if you have IP addresses associated with th'

## DataLoader issue

In [None]:
# Install required packages
!pip install ebooklib nltk tqdm

# Import all necessary libraries
import os
import re
import pickle
import string
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F
from ebooklib import epub
import ebooklib
import nltk
from nltk.tokenize import word_tokenize



In [None]:
from google.colab import files

# Upload EPUB file
print("Please upload your EPUB file:")
uploaded = files.upload()

# Fail with a readable message instead of an IndexError when nothing was uploaded
# (e.g. the upload dialog was cancelled).
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose an EPUB file.")

# Get the uploaded filename (first entry of the returned mapping)
epub_filename = next(iter(uploaded))
epub_path = f"/content/{epub_filename}"
print(f"Uploaded file: {epub_filename}")

# Set up NLTK data
nltk.download('punkt', quiet=True)

Please upload your EPUB file:


Saving linux.epub to linux (3).epub
Uploaded file: linux (3).epub


True

In [None]:
def extract_text_from_epub(epub_path):
    """Extract plain text from every document item of an EPUB file.

    Args:
        epub_path: Path to the EPUB file on disk.

    Returns:
        The text of all document items joined by newlines, with markup
        removed and HTML entities decoded. Returns "" if the EPUB cannot be
        read; items that fail to decode are skipped with a warning.
    """
    import html  # only needed here; kept local so the cell is self-contained

    try:
        book = epub.read_epub(epub_path)
        parts = []
        for item in book.get_items():
            if item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue
            try:
                raw_text = item.get_content().decode('utf-8')
                # Remove HTML tags but preserve text structure
                clean_text = re.sub('<[^<]+?>', '', raw_text)
                # Decode entities (&#13;, &gt;, &amp;, ...) AFTER stripping tags,
                # so escaped angle brackets in the book's text are not mistaken
                # for markup. Without this the corpus is littered with literal
                # "&#13;" / "&gt;" sequences that end up as tokens.
                parts.append(html.unescape(clean_text))
            except Exception as e:
                print(f"Warning: Could not decode item: {e}")
                continue
        # Joining once avoids repeated string concatenation in the loop.
        return "\n".join(parts).strip()
    except Exception as e:
        print(f"Error reading EPUB: {e}")
        return ""

def preprocess_text(text):
    """Collapse every run of whitespace into a single space and trim the ends.

    Newlines count as whitespace, so the result is one continuous line. The
    former follow-up step that reduced 3+ consecutive newlines could never
    match (no newline survives the collapse) and has been removed; the output
    is unchanged.
    """
    return re.sub(r'\s+', ' ', text).strip()

# Extract and preprocess text
print("Extracting text from EPUB...")
raw_text = extract_text_from_epub(epub_path)
cleaned_text = preprocess_text(raw_text)

# extract_text_from_epub signals failure by returning "", so stop here instead of
# silently building an empty dataset in the following cells.
if not cleaned_text:
    raise ValueError(f"No text could be extracted from {epub_path}")

print(f"Text extracted: {len(cleaned_text)} characters")
print("Sample text:")
print(cleaned_text[:500])

Extracting text from EPUB...
Text extracted: 429330 characters
Sample text:
&#13; Contents in Detail&#13; &#13; Cover Page&#13; Title Page&#13; Copyright Page&#13; Dedication&#13; About the Author&#13; About the Technical Reviewer&#13; BRIEF CONTENTS&#13; CONTENTS IN DETAIL&#13; ACKNOWLEDGMENTS&#13; INTRODUCTION&#13; &#13; What’s in This Book&#13; What Is Ethical Hacking?&#13; Why Hackers Use Linux&#13; Downloading Kali Linux&#13; Virtual Machines&#13; Setting Up Kali&#13; &#13; 1 GETTING STARTED WITH THE BASICS&#13; &#13; Introductory Terms and Concepts&#13; A Tour of 


In [None]:
# NEW IMPROVED APPROACH - Token-level modeling
import re
from collections import Counter

class ImprovedBookDataset(Dataset):
    """Token-level next-token dataset over a single text.

    The text is split into lower-cased word and punctuation tokens. Tokens
    seen fewer than ``min_freq`` times are encoded as ``<UNK>``; windows that
    run past the end of the token list are padded with ``<PAD>``.

    Attributes:
        seq_length: number of tokens per input/target sequence.
        tokens: list of lower-cased tokens.
        vocab: ``['<PAD>', '<UNK>']`` followed by the frequent tokens in order
            of first appearance.
        token_to_idx / idx_to_token: mappings between tokens and indices.
        vocab_size: ``len(vocab)`` (same attribute the character-level
            dataset exposes).
        encoded: ``tokens`` encoded once as a 1-D ``torch.long`` tensor.
    """

    def __init__(self, text, seq_length, min_freq=2):
        self.seq_length = seq_length

        # Better tokenization
        self.tokens = self.tokenize_text(text)

        # Build vocabulary with minimum frequency filtering
        token_counts = Counter(self.tokens)
        self.vocab = ['<PAD>', '<UNK>'] + [token for token, count in token_counts.items() if count >= min_freq]

        self.token_to_idx = {token: idx for idx, token in enumerate(self.vocab)}
        self.idx_to_token = {idx: token for idx, token in enumerate(self.vocab)}
        self.vocab_size = len(self.vocab)

        # Encode the token stream once; __getitem__ then only slices this tensor
        # instead of redoing seq_length+1 dict lookups for every sample.
        unk_idx = self.token_to_idx['<UNK>']
        self.encoded = torch.tensor(
            [self.token_to_idx.get(token, unk_idx) for token in self.tokens],
            dtype=torch.long,
        )

        print(f"Vocabulary size: {len(self.vocab)}")
        print(f"Total tokens: {len(self.tokens)}")
        print(f"Unique tokens (before filtering): {len(token_counts)}")

    def tokenize_text(self, text):
        """Split ``text`` into lower-cased word tokens and single punctuation marks."""
        # \b\w+\b matches words/numbers; [^\w\s] matches one punctuation character
        tokens = re.findall(r'\b\w+\b|[^\w\s]', text)
        return [token.lower() for token in tokens if token.strip()]

    def __len__(self):
        return max(0, len(self.tokens) - self.seq_length)

    def __getitem__(self, idx):
        # Window of seq_length + 1 token indices
        window = self.encoded[idx:idx + self.seq_length + 1]

        # Pad if the window runs past the end of the token list
        missing = self.seq_length + 1 - window.numel()
        if missing > 0:
            pad = window.new_full((missing,), self.token_to_idx['<PAD>'])
            window = torch.cat([window, pad])

        # clone() so callers never hold views into the shared encoded tensor
        return window[:-1].clone(), window[1:].clone()

# Create improved dataset
# Sequences are 100 tokens long; tokens seen fewer than 2 times are encoded as <UNK>.
print("Creating improved token-level dataset...")
improved_dataset = ImprovedBookDataset(cleaned_text, seq_length=100, min_freq=2)

Creating improved token-level dataset...
Vocabulary size: 3248
Total tokens: 105649
Unique tokens (before filtering): 5377


In [None]:
# Function to save the improved token-level dataset
def save_improved_dataset(dataset, filepath="/content/improved_book_dataset.pkl"):
    """Pickle the token-level dataset's state to ``filepath``.

    Stores the dataset's tokens, vocabulary, both index mappings and sequence
    length in a single dict. Returns True when the file was written; on any
    error the traceback is printed and False is returned.
    """
    import traceback

    saved_fields = ('tokens', 'vocab', 'token_to_idx', 'idx_to_token', 'seq_length')
    try:
        # Collect the attributes in the same key order the loader prints them
        payload = {field: getattr(dataset, field) for field in saved_fields}

        with open(filepath, "wb") as out_file:
            pickle.dump(payload, out_file)

        print(f"Improved dataset saved successfully to {filepath}")
        print(f"Dataset size: {len(dataset.tokens)} tokens")
        print(f"Vocabulary size: {len(dataset.vocab)} tokens")
    except Exception as err:
        print(f"Error saving dataset: {err}")
        traceback.print_exc()
        return False
    return True

# Function to load the improved token-level dataset
def load_improved_dataset(filepath="/content/improved_book_dataset.pkl"):
    """Load a pickled token-level dataset and wrap it in a map-style ``Dataset``.

    The file must hold a dict with the keys ``tokens``, ``vocab``,
    ``token_to_idx``, ``idx_to_token`` and ``seq_length``.

    Args:
        filepath: path of the pickle file to read.

    Returns:
        A ``Dataset`` exposing those five attributes plus ``tokenize_text``,
        or None if reading or reconstruction fails (the error and traceback
        are printed).
    """
    try:
        # SECURITY: pickle.load can execute arbitrary code. Only open files
        # that this notebook wrote itself, never untrusted uploads.
        with open(filepath, "rb") as f:
            dataset_data = pickle.load(f)

        print(f"Improved dataset loaded from {filepath}")

        # Recreate dataset object
        class LoadedImprovedDataset(Dataset):
            """Sliding-window next-token dataset rebuilt from saved data."""

            def __init__(self, data):
                self.tokens = data['tokens']
                self.vocab = data['vocab']
                self.token_to_idx = data['token_to_idx']
                self.idx_to_token = data['idx_to_token']
                self.seq_length = data['seq_length']

                print(f"Loaded dataset - Tokens: {len(self.tokens)}, Vocab: {len(self.vocab)}")

            def tokenize_text(self, text):
                """Split text into lower-cased word and punctuation tokens."""
                # Reuse the same tokenization logic
                tokens = re.findall(r'\b\w+\b|[^\w\s]', text)
                return [token.lower() for token in tokens if token.strip()]

            def __len__(self):
                """Number of window start positions leaving seq_length + 1 tokens."""
                return max(0, len(self.tokens) - self.seq_length)

            def __getitem__(self, idx):
                """Return (input, target) index tensors for the window at ``idx``.

                Raises IndexError for positions outside the dataset so that
                plain iteration over the dataset terminates.
                """
                num_windows = len(self)
                position = idx + num_windows if idx < 0 else idx
                if not 0 <= position < num_windows:
                    raise IndexError(
                        f"index {idx} is out of range for dataset with {num_windows} samples"
                    )

                # In-range positions always yield a full window; no padding needed.
                window = self.tokens[position:position + self.seq_length + 1]
                unk_index = self.token_to_idx['<UNK>']
                indices = [self.token_to_idx.get(token, unk_index) for token in window]

                return torch.tensor(indices[:-1], dtype=torch.long), torch.tensor(indices[1:], dtype=torch.long)

        loaded_dataset = LoadedImprovedDataset(dataset_data)
        return loaded_dataset

    except Exception as e:
        print(f"Error loading dataset: {e}")
        import traceback
        traceback.print_exc()
        return None

# Save the improved dataset
print("Saving improved dataset...")
save_success = save_improved_dataset(improved_dataset, "/content/improved_book_dataset.pkl")

if save_success:
    print("Dataset saved successfully!")

    # Round-trip check: read the file back and inspect what was stored
    print("\nTesting dataset loading...")
    loaded_improved_dataset = load_improved_dataset("/content/improved_book_dataset.pkl")

    # Compare against None explicitly: the loader returns None on failure, and a
    # Dataset defines __len__, so a successfully loaded but empty dataset would
    # be falsy in a bare `if` and be mistaken for a failed load.
    if loaded_improved_dataset is not None:
        print("Dataset loaded successfully!")
        print("Loaded dataset info:")
        print(f"  - Tokens: {len(loaded_improved_dataset.tokens)}")
        print(f"  - Vocabulary: {len(loaded_improved_dataset.vocab)}")
        print(f"  - Sample vocab: {loaded_improved_dataset.vocab[:10]}")

        # Test a sample from loaded dataset
        if len(loaded_improved_dataset) > 0:
            sample_input, sample_target = loaded_improved_dataset[0]
            print(f"  - Sample input shape: {sample_input.shape}")
            print(f"  - Sample target shape: {sample_target.shape}")

Saving improved dataset...
Improved dataset saved successfully to /content/improved_book_dataset.pkl
Dataset size: 105649 tokens
Vocabulary size: 3248 tokens
Dataset saved successfully!

Testing dataset loading...
Improved dataset loaded from /content/improved_book_dataset.pkl
Loaded dataset - Tokens: 105649, Vocab: 3248
Dataset loaded successfully!
Loaded dataset info:
  - Tokens: 105649
  - Vocabulary: 3248
  - Sample vocab: ['<PAD>', '<UNK>', '&', '#', '13', ';', 'contents', 'in', 'detail', 'cover']
  - Sample input shape: torch.Size([100])
  - Sample target shape: torch.Size([100])


In [None]:
class EnhancedTextGenerator(nn.Module):
    """Next-token model: embedding -> stacked LSTM -> LayerNorm -> two-layer head.

    Args:
        vocab_size: number of rows in the embedding and size of the output logits.
        embedding_dim: width of the token embeddings.
        hidden_dim: LSTM hidden size; the head narrows to ``hidden_dim // 2``
            before projecting to the vocabulary.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability used after the LSTM and inside the head;
            also applied between LSTM layers when ``num_layers > 1``.
    """

    def __init__(self, vocab_size, embedding_dim=256, hidden_dim=512, num_layers=3, dropout=0.3):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # Index 0 is the padding row and stays at zero.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # nn.LSTM only applies dropout between layers, so it is disabled for a single layer.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
            bidirectional=False,
        )

        # Regularisation, normalisation and the projection head
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(hidden_dim)
        bottleneck_dim = hidden_dim // 2
        self.fc1 = nn.Linear(hidden_dim, bottleneck_dim)
        self.fc2 = nn.Linear(bottleneck_dim, vocab_size)
        self.relu = nn.ReLU()

    def forward(self, x, hidden=None):
        """Map token indices to per-position vocabulary logits.

        Args:
            x: integer tensor of token indices, batch first.
            hidden: optional LSTM state to continue from; passed through to ``nn.LSTM``.

        Returns:
            ``(logits, hidden)`` where ``logits`` has a trailing dimension of
            ``vocab_size`` and ``hidden`` is the updated LSTM state.
        """
        features, hidden = self.lstm(self.embedding(x), hidden)

        # Normalise, then regularise, the recurrent features
        features = self.dropout(self.layer_norm(features))

        # Bottleneck MLP head
        projected = self.dropout(self.relu(self.fc1(features)))
        return self.fc2(projected), hidden

# Initialize enhanced model
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
enhanced_model = EnhancedTextGenerator(
    # One embedding row / output logit per vocabulary entry.
    vocab_size=len(improved_dataset.vocab),
    embedding_dim=256,
    hidden_dim=512,
    num_layers=3,
    dropout=0.3
).to(device)

# Total number of parameter elements, printed with thousands separators.
print(f"Enhanced model parameters: {sum(p.numel() for p in enhanced_model.parameters()):,}")

Enhanced model parameters: 7,578,032


In [None]:
# Debug information before training
print("=== DEBUG INFORMATION ===")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

print(f"Model device: {next(enhanced_model.parameters()).device}")
print(f"Dataset size: {len(improved_dataset)}")
print(f"Vocabulary size: {len(improved_dataset.vocab)}")

# Both checks below need a real sample, so they live under the same guard.
# Previously the forward-pass test ran unconditionally and, with an empty
# dataset, either failed with a NameError or silently reused a `sample_input`
# left over from an earlier cell.
if len(improved_dataset) > 0:
    # Test dataset sample
    sample_input, sample_target = improved_dataset[0]
    print(f"Sample input shape: {sample_input.shape}")
    print(f"Sample target shape: {sample_target.shape}")
    print(f"Sample input[:10]: {sample_input[:10]}")
    print(f"Sample target[:10]: {sample_target[:10]}")

    # Test model forward pass on a batch of one
    try:
        test_input = sample_input.unsqueeze(0).to(device)
        with torch.no_grad():
            test_output, _ = enhanced_model(test_input)
            print(f"Test output shape: {test_output.shape}")
            print("✅ Model forward pass successful!")
    except Exception as e:
        print(f"❌ Model forward pass failed: {e}")
        import traceback
        traceback.print_exc()
else:
    print("Dataset is empty - skipping sample and forward-pass checks")

=== DEBUG INFORMATION ===
CUDA available: False
Model device: cpu
Dataset size: 105549
Vocabulary size: 3248
Sample input shape: torch.Size([100])
Sample target shape: torch.Size([100])
Sample input[:10]: tensor([2, 3, 4, 5, 6, 7, 8, 2, 3, 4])
Sample target[:10]: tensor([3, 4, 5, 6, 7, 8, 2, 3, 4, 5])
Test output shape: torch.Size([1, 100, 3248])
✅ Model forward pass successful!


In [None]:
# FIXED Enhanced Text Generator Model
class FixedEnhancedTextGenerator(nn.Module):
    """Next-token model: embedding -> stacked LSTM -> dropout -> linear projection.

    Args:
        vocab_size: number of embedding rows and size of the output logits.
        embedding_dim: width of the token embeddings.
        hidden_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the LSTM output and, when
            ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim=256, hidden_dim=512, num_layers=2, dropout=0.3):
        super(FixedEnhancedTextGenerator, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        # Embedding layer (index 0 is the padding row)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # nn.LSTM only applies dropout between layers, so disable it for a single layer
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=False
        )

        # Output layers
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def init_hidden(self, batch_size, device):
        """Return zeroed ``(hidden, cell)`` states for a batch.

        The tensors are allocated directly on ``device`` and in the dtype of
        the embedding weights, so they match the LSTM input when the model has
        been cast (e.g. ``model.double()``) instead of always being float32.

        Args:
            batch_size: number of sequences in the batch.
            device: device on which to allocate the states.

        Returns:
            Tuple ``(hidden, cell)``, each of shape
            ``(num_layers, batch_size, hidden_dim)``.
        """
        state_shape = (self.num_layers, batch_size, self.hidden_dim)
        state_dtype = self.embedding.weight.dtype
        hidden = torch.zeros(state_shape, device=device, dtype=state_dtype)
        cell = torch.zeros(state_shape, device=device, dtype=state_dtype)
        return (hidden, cell)

    def forward(self, x, hidden=None):
        """Map token indices to per-position vocabulary logits.

        Args:
            x: integer tensor of token indices with shape ``(batch, seq_len)``.
            hidden: optional ``(hidden, cell)`` LSTM state to continue from;
                a zero state is created when omitted.

        Returns:
            ``(logits, hidden)`` where ``logits`` has shape
            ``(batch, seq_len, vocab_size)`` and ``hidden`` is the updated state.
        """
        # Embedding
        embedded = self.embedding(x)  # (batch, seq_len) -> (batch, seq_len, embedding_dim)

        # Initialize hidden state if not provided
        if hidden is None:
            hidden = self.init_hidden(x.size(0), x.device)

        # LSTM forward pass
        lstm_out, hidden = self.lstm(embedded, hidden)

        # Dropout
        lstm_out = self.dropout(lstm_out)

        # Output projection
        output = self.fc(lstm_out)

        return output, hidden

# Create the fixed model
# Re-derive the device here so this cell does not depend on the earlier
# model-initialisation cell having been run.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

fixed_enhanced_model = FixedEnhancedTextGenerator(
    # One embedding row / output logit per vocabulary entry.
    vocab_size=len(improved_dataset.vocab),
    embedding_dim=256,
    hidden_dim=512,
    num_layers=2,  # Reduced from 3 to 2 for stability
    dropout=0.3
).to(device)

# Total number of parameter elements, printed with thousands separators.
print(f"Fixed model parameters: {sum(p.numel() for p in fixed_enhanced_model.parameters()):,}")

Using device: cpu
Fixed model parameters: 6,175,920


In [None]:
# Build the shuffled training dataloader
print("Creating enhanced dataloader...")
enhanced_dataloader = DataLoader(
    dataset=improved_dataset,
    batch_size=32,
    shuffle=True,
    # Worker processes, pinned memory and persistent workers are only enabled
    # on a CUDA runtime; on CPU everything loads in the main process.
    num_workers=0 if not torch.cuda.is_available() else 2,
    pin_memory=torch.cuda.is_available(),
    persistent_workers=torch.cuda.is_available(),
)

print("Dataloader created successfully!")
print(f"Number of batches: {len(enhanced_dataloader)}")
print(f"Batch size: {enhanced_dataloader.batch_size}")

Creating enhanced dataloader...
Dataloader created successfully!
Number of batches: 3299
Batch size: 32


In [None]:
# Define loss function and other components
# Targets equal to index 0 contribute nothing to the loss; index 0 is the
# '<PAD>' entry of the vocabulary and the padding_idx of the embeddings.
enhanced_criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding tokens

print("Enhanced components initialized!")
print(f"Vocabulary size: {len(improved_dataset.vocab)}")

Enhanced components initialized!
Vocabulary size: 3248


In [None]:
# Test the fixed model
print("=== Testing Fixed Model ===")

try:
    # Pull exactly one batch. The None default distinguishes an empty
    # dataloader from a real batch, so success is never reported when
    # nothing was actually run.
    first_batch = next(iter(enhanced_dataloader), None)

    if first_batch is None:
        print("⚠️  Dataloader produced no batches - nothing to test")
    else:
        inputs, targets = first_batch
        inputs, targets = inputs.to(device), targets.to(device)
        print(f"Input shape: {inputs.shape}")
        print(f"Target shape: {targets.shape}")

        # Evaluate with dropout disabled so the reported loss is deterministic,
        # then restore whatever mode the model was in.
        was_training = fixed_enhanced_model.training
        fixed_enhanced_model.eval()
        try:
            # Test forward pass
            with torch.no_grad():
                outputs, hidden = fixed_enhanced_model(inputs)
                print(f"Output shape: {outputs.shape}")
                print(f"Hidden state type: {type(hidden)}")

                # Calculate loss over all positions of the batch
                loss = enhanced_criterion(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))
                print(f"Test loss: {loss.item():.4f}")
        finally:
            fixed_enhanced_model.train(was_training)

        print("✅ Fixed model test successful!")

except Exception as e:
    print(f"❌ Fixed model test failed: {e}")
    import traceback
    traceback.print_exc()

=== Testing Fixed Model ===
Input shape: torch.Size([32, 100])
Target shape: torch.Size([32, 100])
Output shape: torch.Size([32, 100, 3248])
Hidden state type: <class 'tuple'>
Test loss: 8.0829
✅ Fixed model test successful!


In [None]:
# FIXED Training Function
from torch.amp import autocast

def fixed_enhanced_train_model(model, dataloader, num_epochs=10,
                               checkpoint_path="/content/best_fixed_model.pth"):
    """Train ``model`` for next-token prediction and checkpoint the best epoch.

    Uses AdamW (lr 0.001, weight decay 0.01), ``ReduceLROnPlateau``
    (factor 0.5, patience 2) driven by the epoch's mean training loss,
    cross-entropy that ignores target index 0, and gradient-norm clipping at
    1.0. Mixed precision is enabled only when the model's parameters live on
    a CUDA device.

    Args:
        model: module whose forward returns ``(logits, hidden)``.
        dataloader: iterable of ``(inputs, targets)`` batches. If its
            ``dataset`` exposes ``vocab``, ``token_to_idx`` and
            ``idx_to_token``, they are stored in the checkpoint.
        num_epochs: number of passes over ``dataloader``.
        checkpoint_path: where the best-so-far checkpoint is written.

    Returns:
        List with the mean training loss of every epoch that processed at
        least one batch.

    Raises:
        RuntimeError: re-raised when a batch fails with a runtime error other
            than out-of-memory. Out-of-memory batches and other exception
            types are reported and skipped.
    """

    # Optimizer and scheduler
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    device = next(model.parameters()).device
    # Decide on AMP from where the model actually lives. A CPU model on a
    # machine that merely has CUDA available must not go through GradScaler.
    use_amp = device.type == 'cuda'
    scaler = torch.cuda.amp.GradScaler() if use_amp else None

    # Store the vocabulary of the dataset being trained on, rather than
    # whatever a notebook global happens to hold.
    source_dataset = getattr(dataloader, 'dataset', None)
    vocab_state = {
        key: getattr(source_dataset, key)
        for key in ('vocab', 'token_to_idx', 'idx_to_token')
        if hasattr(source_dataset, key)
    }

    train_losses = []
    best_loss = float('inf')

    print(f"Training on device: {device}")
    print(f"Using AMP: {use_amp}")

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        num_batches = 0

        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch_idx, (inputs, targets) in enumerate(progress_bar):
            try:
                inputs, targets = inputs.to(device), targets.to(device)

                optimizer.zero_grad()

                # Forward pass
                if use_amp:
                    with autocast('cuda'):
                        outputs, _ = model(inputs)
                        # Flatten (batch, seq) so every position is one classification
                        outputs = outputs.reshape(-1, outputs.size(-1))
                        targets = targets.reshape(-1)
                        loss = criterion(outputs, targets)

                    scaler.scale(loss).backward()
                    # Unscale first so clipping sees the true gradient norm
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    outputs, _ = model(inputs)
                    outputs = outputs.reshape(-1, outputs.size(-1))
                    targets = targets.reshape(-1)
                    loss = criterion(outputs, targets)

                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    optimizer.step()

                total_loss += loss.item()
                num_batches += 1

                # Update progress bar
                progress_bar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'avg_loss': f'{total_loss/num_batches:.4f}'
                })

                # Clear cache periodically
                if batch_idx % 200 == 0 and torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except RuntimeError as e:
                if "out of memory" in str(e):
                    print(f"⚠️  Out of memory at batch {batch_idx}, skipping...")
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    continue
                else:
                    raise e
            except Exception as e:
                # Best effort: report the failing batch and keep training
                print(f"⚠️  Error in batch {batch_idx}: {e}")
                continue

        # Calculate epoch statistics
        if num_batches > 0:
            avg_loss = total_loss / num_batches
            train_losses.append(avg_loss)

            # Update learning rate
            scheduler.step(avg_loss)

            print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

            # Save best model
            if avg_loss < best_loss:
                best_loss = avg_loss
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': avg_loss,
                    **vocab_state
                }, checkpoint_path)
                print(f"⭐ New best model saved! Loss: {avg_loss:.4f}")
        else:
            print(f"Epoch {epoch+1}/{num_epochs} - No batches processed")

    return train_losses

# Test single batch before full training
print("\n=== Pre-training Test ===")
try:
    # Fetch one batch; the None default marks an empty dataloader so the
    # check cannot report success without having run the model.
    first_batch = next(iter(enhanced_dataloader), None)
    if first_batch is None:
        print("⚠️  Pre-training test skipped: dataloader produced no batches")
    else:
        inputs, targets = first_batch
        inputs, targets = inputs.to(device), targets.to(device)
        print(f"Test batch - Input: {inputs.shape}, Target: {targets.shape}")

        with torch.no_grad():
            outputs, _ = fixed_enhanced_model(inputs)
            print(f"Model output: {outputs.shape}")
        print("✅ Pre-training test passed!")
except Exception as e:
    print(f"❌ Pre-training test failed: {e}")
    import traceback
    traceback.print_exc()

# Start training
print("\n🚀 Starting fixed enhanced training...")
try:
    fixed_losses = fixed_enhanced_train_model(fixed_enhanced_model, enhanced_dataloader, num_epochs=10)
    print("🎉 Training completed successfully!")
except Exception as e:
    print(f"❌ Training failed: {e}")
    import traceback
    traceback.print_exc()


=== Pre-training Test ===
Test batch - Input: torch.Size([32, 100]), Target: torch.Size([32, 100])


KeyboardInterrupt: 

In [None]:
# Test text generation with the trained model
def test_generation(model, dataset, seed_text="The", max_length=50):
    """Simple generation test"""
    model.eval()
    device = next(model.parameters()).device

    # Tokenize seed
    seed_tokens = dataset.tokenize_text(seed_text)
    seed_indices = [dataset.token_to_idx.get(token, dataset.token_to_idx['<UNK>']) for token in seed_tokens]

    input_seq = torch.tensor([seed_indices], dtype=torch.long).to(device)
    generated_tokens = seed_tokens.copy()

    hidden = None
    with torch.no_grad():
        for _ in range(max_length):
            output, hidden = model(input_seq, hidden)
            # Get last token prediction
            logits = output[0, -1]
            # Get most likely token
            next_token_idx = torch.argmax(logits).item()

            if next_token_idx < len(dataset.idx_to_token):
                next_token = dataset.idx_to_token[next_token_idx]
                if next_token != '<PAD>':
                    generated_tokens.append(next_token)

            # Update input
            new_input = torch.tensor([[next_token_idx]], device=device)
            input_seq = torch.cat([input_seq, new_input], dim=1)
            if input_seq.size(1) > 100:
                input_seq = input_seq[:, -100:]

    return ' '.join(generated_tokens)

# Test generation
# Greedy-decodes 30 tokens from the seed "The" with the fixed model. Any
# failure is printed with its traceback instead of stopping the notebook.
print("\n=== Testing Text Generation ===")
try:
    test_text = test_generation(fixed_enhanced_model, improved_dataset, "The", 30)
    print("Generated text:")
    print(test_text)
except Exception as e:
    print(f"Generation test failed: {e}")
    import traceback
    traceback.print_exc()


=== Testing Text Generation ===
Generated text:
the # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #


In [None]:
# Save the trained model and dataset info
print("\n=== Saving Model and Dataset ===")
try:
    # Save model
    torch.save({
        'model_state_dict': fixed_enhanced_model.state_dict(),
        'vocab': improved_dataset.vocab,
        'token_to_idx': improved_dataset.token_to_idx,
        'idx_to_token': improved_dataset.idx_to_token,
        # Read the hyper-parameters from the model itself so the saved config
        # cannot drift from the weights if the constructor call changes.
        'model_config': {
            'vocab_size': fixed_enhanced_model.vocab_size,
            'embedding_dim': fixed_enhanced_model.embedding.embedding_dim,
            'hidden_dim': fixed_enhanced_model.hidden_dim,
            'num_layers': fixed_enhanced_model.num_layers,
            'dropout': fixed_enhanced_model.dropout.p
        }
    }, "/content/final_trained_model.pth")

    # Save dataset info
    import pickle
    with open("/content/dataset_info.pkl", "wb") as f:
        pickle.dump({
            'vocab': improved_dataset.vocab,
            'token_to_idx': improved_dataset.token_to_idx,
            'idx_to_token': improved_dataset.idx_to_token,
            'text_sample': cleaned_text[:1000]  # Save a sample for reference
        }, f)

    print("✅ Model and dataset saved successfully!")
    print("Files saved:")
    print("  - /content/final_trained_model.pth")
    print("  - /content/dataset_info.pkl")

except Exception as e:
    print(f"❌ Failed to save: {e}")
    import traceback
    traceback.print_exc()

# Resources check-up

In [None]:
import subprocess
import psutil
import torch
import os
import time

def check_gpu():
    """Check whether a GPU is visible to both ``nvidia-smi`` and PyTorch.

    Returns:
        A dict that always has ``"available"`` (bool) and ``"message"`` (str).
        On success it also has ``"name"``; on failure it carries ``"details"``
        or ``"error"`` with more context.
    """
    try:
        # Check via nvidia-smi; the timeout keeps a hung driver from blocking the notebook
        result = subprocess.run(
            ['nvidia-smi'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=10,
        )
        if result.returncode != 0:
            return {"available": False, "message": "GPU not available (nvidia-smi failed)", "details": result.stderr}

        # Check PyTorch CUDA
        if torch.cuda.is_available():
            gpu_name = torch.cuda.get_device_name(0)
            return {"available": True, "message": "GPU is available", "name": gpu_name}
        return {"available": False, "message": "CUDA not available in PyTorch", "details": "nvidia-smi works but PyTorch can't access GPU"}
    except FileNotFoundError:
        # No nvidia-smi binary at all: the runtime has no NVIDIA driver,
        # which is different from an unexpected failure while probing.
        return {
            "available": False,
            "message": "GPU not available (nvidia-smi not found)",
            "details": "nvidia-smi executable is not installed or not on PATH",
        }
    except subprocess.TimeoutExpired:
        return {
            "available": False,
            "message": "GPU not available (nvidia-smi timed out)",
            "details": "nvidia-smi did not respond within 10 seconds",
        }
    except Exception as e:
        return {"available": False, "message": "Error checking GPU", "error": str(e)}

def check_ram():
    """Report the currently available RAM in GiB, rounded to two decimals.

    Returns:
        Dict with the single key ``"available_gb"``.
    """
    bytes_per_gb = 1024**3
    free_bytes = psutil.virtual_memory().available
    return {"available_gb": round(free_bytes / bytes_per_gb, 2)}

def check_disk():
    """Report the free space on the root filesystem in GiB, rounded to two decimals.

    Returns:
        Dict with the single key ``"available_gb"``.
    """
    bytes_per_gb = 1024**3
    free_bytes = psutil.disk_usage("/").free
    return {"available_gb": round(free_bytes / bytes_per_gb, 2)}

def check_execution_speed(size=1000, iterations=10):
    """Time a small CPU matrix workload as a rough throttling heuristic.

    Creates two random ``size x size`` matrices and multiplies them
    ``iterations`` times. The elapsed time is measured with
    ``time.perf_counter()``, which is monotonic and unaffected by system
    clock adjustments.

    Args:
        size: side length of the square matrices.
        iterations: number of matrix multiplications to run.

    Returns:
        ``{"duration_seconds": float, "fast": bool}`` where ``fast`` means the
        run took less than 2.0 seconds, or ``{"error": str}`` if the workload
        raised.
    """
    try:
        start = time.perf_counter()
        # Perform a small matrix operation
        x = torch.randn(size, size)
        y = torch.randn(size, size)
        for _ in range(iterations):
            torch.mm(x, y)
        duration = time.perf_counter() - start
        return {"duration_seconds": round(duration, 3), "fast": duration < 2.0}
    except Exception as e:
        return {"error": str(e)}

def check_tpu():
    """Probe for a TPU by trying to initialise one through TensorFlow.

    Returns:
        ``{"available": True, "message": ...}`` when TensorFlow can resolve,
        connect to and initialise a TPU system; otherwise
        ``{"available": False, "message": ..., "error": ...}`` carrying the
        text of whatever exception occurred (including a missing TensorFlow).
    """
    try:
        import tensorflow as tf
        cluster = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
        tf.config.experimental_connect_to_cluster(cluster)
        tf.tpu.experimental.initialize_tpu_system(cluster)
    except Exception as exc:
        return {"available": False, "message": "TPU not available", "error": str(exc)}
    else:
        return {"available": True, "message": "TPU is available"}

def check_runtime_type():
    """Report whether the notebook is running inside Google Colab.

    Returns:
        ``"colab"`` if the ``google.colab`` module can be imported,
        otherwise ``"not_colab"``.
    """
    try:
        # This module is only importable inside a Colab runtime
        import google.colab
        return "colab"
    except Exception:
        # Any import-time failure means "not Colab". Unlike a bare `except:`,
        # this lets KeyboardInterrupt and SystemExit propagate.
        return "not_colab"

def check_usage_limits():
    """Print a Colab resource report (GPU, RAM, disk, CPU speed, TPU) and a verdict.

    Returns early, after one message, when not running inside Colab. The
    verdict reports the first problem found, checked in this order: GPU
    unavailable, speed test slower than 3 seconds (or failed), less than
    5 GB of free RAM; otherwise the session is reported as usable.
    """
    def _availability_label(flag):
        # Shared wording for the GPU and TPU status lines
        return '✅ Available' if flag else '❌ Not Available'

    print("🔍 Checking Google Colab Resource Availability...\n")

    # 1. Confirm we're in Colab
    if check_runtime_type() != "colab":
        print("❌ This script is not running in Google Colab.")
        return

    # 2. Check GPU
    gpu = check_gpu()
    gpu_ok = gpu['available']
    print(f"🎮 GPU: {_availability_label(gpu_ok)}")
    gpu_detail = f"   → Model: {gpu['name']}" if gpu_ok else f"   → {gpu['message']}"
    print(gpu_detail)

    # 3. Check RAM
    ram = check_ram()
    print(f"🧠 RAM Available: {ram['available_gb']} GB")

    # 4. Check Disk
    disk = check_disk()
    print(f"💾 Disk Available: {disk['available_gb']} GB")

    # 5. Check Execution Speed (heuristic for throttling)
    print("⏱️ Running speed test (small matrix ops)...")
    speed = check_execution_speed()
    if "error" not in speed:
        print(f"   → Speed test took {speed['duration_seconds']} seconds")
        speed_note = (
            "   → ✅ Likely normal performance"
            if speed['fast']
            else "   → ⚠️  Slow execution — possible resource throttling or limitation"
        )
        print(speed_note)
    else:
        print(f"   → Speed test failed: {speed['error']}")

    # 6. Optional: Check TPU
    print("⚡ Checking TPU...")
    tpu = check_tpu()
    print(f"   → TPU: {_availability_label(tpu['available'])}")

    # Final Assessment: the first matching problem decides the verdict
    print("\n📝 Final Assessment:")
    if not gpu_ok:
        verdict = "⚠️  Limited or restricted runtime: GPU not accessible — likely under cool-down or usage limit."
    elif speed.get("duration_seconds", 10) > 3.0:
        # A failed speed test has no duration and falls back to 10, i.e. "slow"
        verdict = "⚠️  Performance is slow — possible throttling or low-tier allocation."
    elif ram['available_gb'] < 5:
        verdict = "⚠️  Low RAM — may not support large models."
    else:
        verdict = "✅ This Colab session appears to have good resource availability and is usable."
    print(verdict)

    print("\n💡 Tip: If resources are limited, try reconnecting or using Colab Pro.")

# Run the check
# Prints the full resource report; outside Colab it prints a single notice and returns.
check_usage_limits()

🔍 Checking Google Colab Resource Availability...

🎮 GPU: ❌ Not Available
   → Error checking GPU
🧠 RAM Available: 11.47 GB
💾 Disk Available: 69.4 GB
⏱️ Running speed test (small matrix ops)...
   → Speed test took 0.705 seconds
   → ✅ Likely normal performance
⚡ Checking TPU...
   → TPU: ❌ Not Available

📝 Final Assessment:
⚠️  Limited or restricted runtime: GPU not accessible — likely under cool-down or usage limit.

💡 Tip: If resources are limited, try reconnecting or using Colab Pro.


In [None]:
import subprocess
import torch
import time
import requests
from datetime import datetime, timedelta

def check_gpu_usability():
    """Report, step by step, whether the GPU can actually be used.

    Three checks run in order: ``nvidia-smi`` must succeed, PyTorch must see
    CUDA, and a short matrix-multiplication benchmark must run on the GPU.
    The first failing check prints its reason, calls ``_suggest_cool_down()``
    and ends the function. Nothing is returned; all results are printed.
    """
    print("🔍 Assessing GPU Usability and Cool-Down Status...\n")

    # 1. Check if nvidia-smi is accessible
    try:
        smi = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=10)
        if smi.returncode != 0:
            print("❌ nvidia-smi failed — GPU likely not allocated.")
            print(f"   Error: {smi.stderr.strip()}")
            _suggest_cool_down()
            return
        print("✅ nvidia-smi succeeded — GPU device detected.")
    except Exception as e:
        print(f"❌ Error running nvidia-smi: {e}")
        _suggest_cool_down()
        return

    # 2. Check PyTorch CUDA
    if not torch.cuda.is_available():
        print("❌ CUDA is not available in PyTorch — GPU cannot be used.")
        _suggest_cool_down()
        return
    gpu_name = torch.cuda.get_device_name(0)
    print("✅ CUDA is available!")
    print(f"   → GPU Model: {gpu_name}")

    # 3. Run a small GPU computation test
    try:
        print("🧪 Running GPU compute test...")
        lhs = torch.randn(1000, 1000).cuda()
        rhs = torch.randn(1000, 1000).cuda()
        started = time.time()
        for _ in range(10):
            torch.mm(lhs, rhs)
        # Wait for the queued kernels to finish before reading the clock
        torch.cuda.synchronize()
        elapsed = time.time() - started
        print(f"   → GPU test completed in {elapsed:.3f} seconds")
        verdict = (
            "⚠️  Slow GPU — possible throttling or low-priority allocation."
            if elapsed > 5.0
            else "✅ GPU is responsive and usable."
        )
        print(verdict)
    except Exception as e:
        print(f"❌ GPU computation failed: {e}")
        _suggest_cool_down()

def _suggest_cool_down():
    """
    Provide educated guess about cool-down status and recovery time.
    """
    print("\n🚨 Likely GPU Unavailable Due To:")
    print("   • Usage limit reached")
    print("   • Session duration or compute quota exceeded")
    print("   • Account-level restrictions (especially on free tier)")

    print("\n📅 Estimated Recovery Time:")

    # Heuristic based on common Colab behavior
    print("   • ⏳ Free Tier: 6–24 hours (often ~12 hours)")
    print("   • 💠 Colab Pro: 2–8 hours (depends on usage)")
    print("   • 💎 Pro+: 1–4 hours (shorter cooldowns)")

    print("\n💡 Tips to Recover Faster:")
    print("   • Avoid running heavy workloads for > 8–12 hours continuously.")
    print("   • Disconnect and close the notebook for several hours.")
    print("   • Try accessing Colab from a different browser/incognito after 6+ hours.")
    print("   • Consider upgrading to Colab Pro/Pro+ for better access.")
    print("   • Use lightweight models or CPU when possible during cooldown.")

    # Optional: Show current time and estimate
    now = datetime.now()
    est_recovery = now + timedelta(hours=12)
    print(f"\n📌 Estimated earliest recovery: {est_recovery.strftime('%Y-%m-%d %H:%M')} (local time)")
    print("   → This is an estimate — actual time may vary.")

# Run the check
# Prints the step-by-step GPU diagnosis; on the first failed step it also
# prints the cool-down guidance.
check_gpu_usability()

🔍 Assessing GPU Usability and Cool-Down Status...

❌ Error running nvidia-smi: [Errno 2] No such file or directory: 'nvidia-smi'

🚨 Likely GPU Unavailable Due To:
   • Usage limit reached
   • Session duration or compute quota exceeded
   • Account-level restrictions (especially on free tier)

📅 Estimated Recovery Time:
   • ⏳ Free Tier: 6–24 hours (often ~12 hours)
   • 💠 Colab Pro: 2–8 hours (depends on usage)
   • 💎 Pro+: 1–4 hours (shorter cooldowns)

💡 Tips to Recover Faster:
   • Avoid running heavy workloads for > 8–12 hours continuously.
   • Disconnect and close the notebook for several hours.
   • Try accessing Colab from a different browser/incognito after 6+ hours.
   • Consider upgrading to Colab Pro/Pro+ for better access.
   • Use lightweight models or CPU when possible during cooldown.

📌 Estimated earliest recovery: 2025-08-14 22:02 (local time)
   → This is an estimate — actual time may vary.


In [None]:
import psutil
import platform
import subprocess
from datetime import datetime

def get_cpu_info():
    print("🧮 CPU Information\n")

    # Basic system info
    print(f"System: {platform.system()} {platform.release()}")
    print(f"Architecture: {platform.machine()} ({platform.architecture()[0]})")
    print(f"Node (Hostname): {platform.node()}")
    print(f"Python Version: {platform.python_version()}")
    print()

    # CPU model
    try:
        # Try to get CPU info from /proc/cpuinfo (Linux, including Colab)
        if platform.system() == "Linux":
            result = subprocess.run(
                ["cat", "/proc/cpuinfo"],
                stdout=subprocess.PIPE,
                text=True
            )
            cpu_info = result.stdout
            for line in cpu_info.split("\n"):
                if "model name" in line:
                    model_name = line.split(":")[1].strip()
                    print(f"CPU Model: {model_name}")
                    break
        else:
            # Fallback for non-Linux
            print(f"CPU Model: {platform.processor()}")
    except Exception as e:
        print(f"CPU Model: Could not retrieve ({e})")

    # Number of cores
    physical_cores = psutil.cpu_count(logical=False)
    logical_cores = psutil.cpu_count(logical=True)
    print(f"Physical Cores: {physical_cores}")
    print(f"Logical Cores (Hyperthreading): {logical_cores}")

    # CPU frequency
    cpufreq = psutil.cpu_freq()
    if cpufreq:
        print(f"Current Frequency: {cpufreq.current:.2f} MHz")
        print(f"Max Frequency: {cpufreq.max:.2f} MHz")
        print(f"Min Frequency: {cpufreq.min:.2f} MHz")
    else:
        print("CPU Frequency: Not available")

    # CPU usage
    cpu_usage = psutil.cpu_percent(percpu=False, interval=1)
    print(f"Overall CPU Usage: {cpu_usage}%")

    # Per-core usage (optional, can be long)
    print("CPU Usage Per Core:", end=" ")
    for i, usage in enumerate(psutil.cpu_percent(percpu=True, interval=1)):
        print(f"Core {i}: {usage}%", end=" | ")
    print("\n")

    # Boot time
    boot_time = datetime.fromtimestamp(psutil.boot_time())
    print(f"System Boot Time: {boot_time.strftime('%Y-%m-%d %H:%M:%S')}")

# Run the function: the CPU report is printed as this cell's output
# (the call blocks briefly while CPU usage is sampled).
get_cpu_info()

🧮 CPU Information

System: Linux 6.1.123+
Architecture: x86_64 (64bit)
Node (Hostname): bd1ce50d9fcf
Python Version: 3.11.13

CPU Model: AMD EPYC 7B12
Physical Cores: 1
Logical Cores (Hyperthreading): 2
Current Frequency: 2250.00 MHz
Max Frequency: 0.00 MHz
Min Frequency: 0.00 MHz
Overall CPU Usage: 58.6%
CPU Usage Per Core: Core 0: 64.0% | Core 1: 37.8% | 

System Boot Time: 2025-08-14 09:59:42


In [None]:
!lscpu  # to see CPU info
!nvidia-smi  # to check GPU

Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   2
  On-line CPU(s) list:    0,1
Vendor ID:                GenuineIntel
  Model name:             Intel(R) Xeon(R) CPU @ 2.00GHz
    CPU family:           6
    Model:                85
    Thread(s) per core:   2
    Core(s) per socket:   1
    Socket(s):            1
    Stepping:             3
    BogoMIPS:             4000.30
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge m
                          ca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht sysc
                          all nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xt
                          opology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq
                           ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt
                           aes xsave avx f16c rdrand hypervisor 