In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import string

# Step 1: Preprocess the text data
class TextPreprocessor:
    """Word-level tokenizer and vocabulary for one raw text string.

    The text is lower-cased, stripped of ASCII punctuation and split on
    whitespace. Vocabulary ids follow descending word frequency, so id 0
    belongs to the most common word.
    """

    def __init__(self, text, seq_length=10):
        strip_punctuation = str.maketrans('', '', string.punctuation)
        self.text = text.lower().translate(strip_punctuation)
        self.seq_length = seq_length
        self.words = self.text.split()
        self.word_counts = Counter(self.words)
        # most_common() sorts by descending count and keeps first-seen order for ties.
        self.vocab = [word for word, _ in self.word_counts.most_common()]
        self.vocab_to_int = {}
        self.int_to_vocab = {}
        for index, word in enumerate(self.vocab):
            self.vocab_to_int[word] = index
            self.int_to_vocab[index] = word
        self.n_vocab = len(self.vocab)

    def create_sequences(self):
        """Return every sliding window as ``(context_ids, next_word_id)``.

        Each context holds ``seq_length`` consecutive word ids; the label is
        the id of the word that follows. The list is empty when the text has
        no word left after a full window.
        """
        encoded = [self.vocab_to_int[word] for word in self.words]
        window = self.seq_length
        return [
            (encoded[start:start + window], encoded[start + window])
            for start in range(len(encoded) - window)
        ]

# Step 2: Create Dataset
class TextDataset(Dataset):
    """Map-style dataset over pre-built ``(token_ids, next_id)`` pairs."""

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as ``(inputs, label)`` tensors."""
        token_ids, next_id = self.sequences[idx]
        inputs = torch.tensor(token_ids)
        # The label is wrapped in a list, so it comes back with shape (1,).
        label = torch.tensor([next_id])
        return inputs, label

# Step 3: Define LSTM Model
class NextWordLSTM(nn.Module):
    """Embedding -> stacked LSTM -> linear projection onto the vocabulary.

    ``forward`` scores every time step: the LSTM output is flattened, so the
    returned logits have one row per (batch element, time step) pair.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, n_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """Return ``(logits, hidden)`` for a batch of token-id sequences."""
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)
        # Merge the batch and time axes so the linear layer sees one row per step.
        flat = lstm_out.reshape(-1, lstm_out.size(2))
        return self.fc(flat), hidden

    def init_hidden(self, batch_size, device):
        """Build a zeroed ``(h0, c0)`` pair on ``device`` for ``batch_size`` rows."""
        reference = next(self.parameters())
        shape = (self.lstm.num_layers, batch_size, self.lstm.hidden_size)
        h0 = torch.zeros(shape, dtype=reference.dtype, device=device)
        c0 = torch.zeros(shape, dtype=reference.dtype, device=device)
        return (h0, c0)

# Step 4: Training Function
def train_model(model, dataloader, epochs=10, lr=0.001):
    """Train ``model`` for next-word prediction with Adam and cross-entropy.

    Args:
        model: module exposing ``init_hidden(batch_size, device)`` and a
            ``forward(inputs, hidden)`` that returns ``(logits, hidden)``.
            Logits may cover every time step; only the last step is scored.
        dataloader: iterable of ``(inputs, targets)`` batches; ``targets``
            holds one class id per sample (any trailing singleton axis is
            flattened away).
        epochs: number of passes over ``dataloader``.
        lr: Adam learning rate.

    Returns:
        A list with the mean batch loss of each epoch (``nan`` for an epoch
        that saw no batches).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    epoch_losses = []
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        n_batches = 0

        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            n_rows = inputs.size(0)

            # Batches are independent shuffled windows, so start each one from a
            # fresh state sized to the batch actually received. A state carried
            # over from a full batch would not fit a smaller final batch.
            hidden = model.init_hidden(n_rows, device)

            optimizer.zero_grad()
            outputs, _ = model(inputs, hidden)

            # Take the class count from the logits instead of a notebook global,
            # and keep only the prediction made after the final context word.
            last_step = outputs.reshape(n_rows, -1, outputs.size(-1))[:, -1, :]
            # reshape(-1) keeps a 1-D target even for a batch of one sample,
            # where squeeze() would collapse it to a 0-d tensor.
            loss = criterion(last_step, targets.reshape(-1))
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        avg_loss = running_loss / n_batches if n_batches else float('nan')
        epoch_losses.append(avg_loss)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

    return epoch_losses

# Step 5: Prediction Function
def predict_next_word(model, preprocessor, initial_text, device, top_k=5):
    """Rank the most likely next words for ``initial_text``.

    The prompt is normalized like the training text (lower-cased, ASCII
    punctuation removed) and cut to its last ``preprocessor.seq_length``
    words. Words missing from the vocabulary are mapped to id 0.

    Args:
        model: module with ``init_hidden(batch_size, device)`` and
            ``forward(inputs, hidden)`` returning ``(logits, hidden)``.
        preprocessor: object providing ``seq_length``, ``vocab_to_int`` and
            ``int_to_vocab``.
        initial_text: prompt string.
        device: device the model lives on.
        top_k: maximum number of candidates; capped at the number of classes.

    Returns:
        ``(word, probability)`` pairs, most likely first, or ``[]`` when the
        prompt contains no words. The model is left in eval mode.
    """
    model.eval()

    cleaned = initial_text.lower().translate(str.maketrans('', '', string.punctuation))
    words = cleaned.split()[-preprocessor.seq_length:]
    if not words:
        # Nothing to condition on; callers treat an empty result as "stop".
        return []

    seq = [preprocessor.vocab_to_int.get(word, 0) for word in words]
    seq = torch.tensor(seq, dtype=torch.long).unsqueeze(0).to(device)

    hidden = model.init_hidden(1, device)
    with torch.no_grad():
        outputs, _ = model(seq, hidden)
        # The last row is the prediction made after the final prompt word.
        probs = torch.softmax(outputs[-1], dim=0)

    # topk raises when asked for more entries than exist, so cap the request.
    k = min(top_k, probs.size(0))
    top_probs, top_indices = torch.topk(probs, k)

    top_words = [preprocessor.int_to_vocab[index] for index in top_indices.tolist()]
    top_probs = top_probs.cpu().numpy()

    return list(zip(top_words, top_probs))

# Example usage
if __name__ == "__main__":
    # Tiny demo corpus; a realistic run needs far more text than this.
    text = """The quick brown fox jumps over the lazy dog. 
              The dog barked at the fox, but the fox kept running. 
              A quick movement caught the dog's attention."""

    # Run settings for the demo.
    seq_length = 5
    batch_size = 2

    # Tokenize the corpus and cut it into sliding-window training pairs.
    preprocessor = TextPreprocessor(text, seq_length=seq_length)
    sequences = preprocessor.create_sequences()

    # Wrap the pairs for shuffled mini-batch iteration.
    dataset = TextDataset(sequences)
    dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

    # One output class per vocabulary entry.
    vocab_size = preprocessor.n_vocab
    model = NextWordLSTM(vocab_size)

    train_model(model, dataloader, epochs=30)

    # Query the trained model with a short prompt.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    test_text = "the quick brown fox"
    predictions = predict_next_word(model, preprocessor, test_text, device)

    print(f"\nNext word predictions for '{test_text}':")
    for word, prob in predictions:
        print(f"{word}: {prob:.2%}")

In [1]:
import torch
# Use the CUDA GPU when one is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import string
from datasets import load_dataset

# Step 1: Preprocess the text data using WikiText
class WikiTextPreprocessor:
    """Word-level vocabulary and training windows built from WikiText-2.

    The training split of ``wikitext-2-raw-v1`` is loaded, its non-empty
    lines are joined, lower-cased and stripped of ASCII punctuation, and the
    result is split on whitespace. Vocabulary ids follow descending word
    frequency, so id 0 belongs to the most common word.

    Args:
        seq_length: number of context words per training example.
        max_entries: how many non-empty training lines to keep, counted from
            the start of the split; ``None`` keeps all of them. The default
            of 100 keeps the corpus small.
    """

    def __init__(self, seq_length=10, max_entries=100):
        self.seq_length = seq_length
        self.max_entries = max_entries
        self.dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
        self.text = self._combine_dataset()
        self.words = self._process_text()
        self.word_counts = Counter(self.words)
        self.vocab = sorted(self.word_counts, key=self.word_counts.get, reverse=True)
        self.vocab_to_int = {word: i for i, word in enumerate(self.vocab)}
        self.int_to_vocab = {i: word for i, word in enumerate(self.vocab)}
        self.n_vocab = len(self.vocab)

    def _combine_dataset(self):
        """Join the kept training lines into one normalized string."""
        train_texts = [item for item in self.dataset['train']['text'] if item.strip()]
        # Slicing with None keeps the whole list, so None means "no cap".
        text = "\n".join(train_texts[:self.max_entries])
        return text.lower().translate(str.maketrans('', '', string.punctuation))

    def _process_text(self):
        """Split the normalized text into words."""
        # str.split() without arguments never yields empty strings.
        return self.text.split()

    def create_sequences(self):
        """Return every sliding window as ``(context_ids, next_word_id)``."""
        sequences = []
        for i in range(len(self.words) - self.seq_length):
            seq = self.words[i:i + self.seq_length]
            next_word = self.words[i + self.seq_length]
            sequences.append(([self.vocab_to_int[word] for word in seq],
                              self.vocab_to_int[next_word]))
        return sequences

# Step 2: Create Dataset (same as before)
class TextDataset(Dataset):
    """Indexable view over ``(context_ids, next_word_id)`` training pairs."""

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        """Convert the ``idx``-th pair to tensors: context ids and a length-1 label."""
        context_ids, next_word_id = self.sequences[idx]
        context = torch.tensor(context_ids)
        label = torch.tensor([next_word_id])
        return context, label

# Step 3: Define LSTM Model (forward now returns logits for the last time step only)
class NextWordLSTM(nn.Module):
    """Embedding -> stacked LSTM -> linear layer scoring the next word.

    Only the LSTM output at the final time step is projected, so ``forward``
    yields one row of vocabulary logits per sequence in the batch.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, n_layers=2):
        super(NextWordLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, batch first.
            hidden: optional ``(h0, c0)`` pair; when omitted the LSTM starts
                from its default zero state.
        """
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out[:, -1, :])  # Only use the last output
        return out, hidden

    def init_hidden(self, batch_size, device):
        """Build a zeroed ``(h0, c0)`` pair on ``device`` for ``batch_size`` rows."""
        reference = next(self.parameters())
        shape = (self.lstm.num_layers, batch_size, self.lstm.hidden_size)
        # Allocate directly on the target device with the parameters' dtype.
        return (torch.zeros(shape, dtype=reference.dtype, device=device),
                torch.zeros(shape, dtype=reference.dtype, device=device))

# Step 4: Training Function (hidden state re-initialized per batch; reports the mean epoch loss)
def train_model(model, dataloader, epochs=10, lr=0.001):
    """Train ``model`` for next-word prediction with Adam and cross-entropy.

    Args:
        model: module exposing ``init_hidden(batch_size, device)`` and a
            ``forward(inputs, hidden)`` that returns ``(logits, hidden)`` with
            one row of logits per sample.
        dataloader: iterable of ``(inputs, targets)`` batches; ``targets``
            holds one class id per sample (any trailing singleton axis is
            flattened away).
        epochs: number of passes over ``dataloader``.
        lr: Adam learning rate.

    Returns:
        A list with the mean batch loss of each epoch (``nan`` for an epoch
        that saw no batches).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    epoch_losses = []
    for epoch in range(epochs):
        # Switch back to training mode in case a prediction call left eval mode on.
        model.train()
        total_loss = 0.0
        n_batches = 0
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            # A fresh zero state per batch, sized to the batch actually received.
            hidden = model.init_hidden(inputs.size(0), device)

            optimizer.zero_grad()
            outputs, _ = model(inputs, hidden)
            # reshape(-1) keeps a 1-D target even for a batch of one sample,
            # where squeeze() would collapse it to a 0-d tensor.
            loss = criterion(outputs, targets.reshape(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batches += 1

        avg_loss = total_loss / n_batches if n_batches else float('nan')
        epoch_losses.append(avg_loss)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

    return epoch_losses

# Step 5: Prediction Function (same as before)
def predict_next_word(model, preprocessor, initial_text, device, top_k=5):
    """Rank the most likely next words for ``initial_text``.

    The prompt is lower-cased, stripped of ASCII punctuation and cut to its
    last ``preprocessor.seq_length`` words. Words missing from the vocabulary
    are mapped to id 0.

    Args:
        model: module with ``init_hidden(batch_size, device)`` and
            ``forward(inputs, hidden)`` returning ``(logits, hidden)``.
        preprocessor: object providing ``seq_length``, ``vocab_to_int`` and
            ``int_to_vocab``.
        initial_text: prompt string.
        device: device the model lives on.
        top_k: maximum number of candidates; capped at the number of classes.

    Returns:
        ``(word, probability)`` pairs, most likely first, or ``[]`` when the
        prompt contains no words. The model is left in eval mode.
    """
    model.eval()
    initial_text = initial_text.lower().translate(str.maketrans('', '', string.punctuation))
    words = initial_text.split()[-preprocessor.seq_length:]
    if not words:
        # Callers already treat an empty prediction list as "stop generating".
        return []

    seq = [preprocessor.vocab_to_int.get(word, 0) for word in words]
    seq = torch.tensor(seq, dtype=torch.long).unsqueeze(0).to(device)

    hidden = model.init_hidden(1, device)
    with torch.no_grad():
        outputs, _ = model(seq, hidden)
        # Batch size is 1, so the last row is the only prediction.
        probs = torch.softmax(outputs[-1], dim=0)

    # topk raises when asked for more entries than exist, so cap the request.
    k = min(top_k, probs.size(0))
    top_probs, top_indices = torch.topk(probs, k)
    top_words = [preprocessor.int_to_vocab[index] for index in top_indices.tolist()]
    top_probs = top_probs.cpu().numpy()

    return list(zip(top_words, top_probs))

# Example usage
if __name__ == "__main__":
    # Initialize with WikiText
    seq_length = 5
    preprocessor = WikiTextPreprocessor(seq_length)
    sequences = preprocessor.create_sequences()

    # Create dataloader
    batch_size = 128  # Increased batch size for better training
    dataset = TextDataset(sequences)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Initialize model with larger dimensions for WikiText
    vocab_size = preprocessor.n_vocab
    model = NextWordLSTM(vocab_size, embedding_dim=256, hidden_dim=512, n_layers=2)

    # Train model with more epochs
    train_model(model, dataloader, epochs=30, lr=0.001)

    # train_model moves the model to CUDA or the CPU in place. Predict on the
    # device the weights actually sit on; choosing a device independently
    # (for example MPS) would put inputs and weights on different devices.
    device = next(model.parameters()).device
    num_predictions = 60  # Number of words to predict
    current_text = "the united states"

    # Greedy decoding: append the single most probable word each step.
    for _ in range(num_predictions):
        predictions = predict_next_word(model, preprocessor, current_text, device)
        if predictions:
            best_word, _ = max(predictions, key=lambda x: x[1])
            current_text += " " + best_word
        else:
            break

    print(f"\nFinal sentence after {num_predictions} predictions:")
    print(current_text)


ModuleNotFoundError: No module named 'datasets'

In [4]:
num_predictions = 60  # upper bound on the number of words appended
current_text = "the united states"

# Greedy decoding: re-feed the growing text and append the top-ranked word,
# stopping early if the predictor returns no candidates.
for _ in range(num_predictions):
    predictions = predict_next_word(model, preprocessor, current_text, device)
    if not predictions:
        break
    best_word, _ = max(predictions, key=lambda pair: pair[1])
    current_text = current_text + " " + best_word

print(f"\nFinal sentence after {num_predictions} predictions:")
print(current_text)



Final sentence after 60 predictions:
the united states government and began a desperate but ultimately futile dispatch of letters and telegrams asking for reinforcements although rumors were widely spread that they were already coming the first telegraph wire to span between little rock and memphis had recently been completed local attorney john m harrel was asked to compose the first telegraph dispatched from arkansas s capital in his


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import string
from datasets import load_dataset
import math

# Step 1: Preprocess the text data using WikiText (same as before)
class WikiTextPreprocessor:
    """Word-level vocabulary and training windows built from WikiText-2.

    The training split of ``wikitext-2-raw-v1`` is loaded, its non-empty
    lines are joined, lower-cased and stripped of ASCII punctuation, and the
    result is split on whitespace. Vocabulary ids follow descending word
    frequency, so id 0 belongs to the most common word.

    Args:
        seq_length: number of context words per training example.
        max_entries: optional cap on how many non-empty training lines are
            kept, counted from the start of the split. The default ``None``
            uses every line.
    """

    def __init__(self, seq_length=10, max_entries=None):
        self.seq_length = seq_length
        self.max_entries = max_entries
        self.dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
        self.text = self._combine_dataset()
        self.words = self._process_text()
        self.word_counts = Counter(self.words)
        self.vocab = sorted(self.word_counts, key=self.word_counts.get, reverse=True)
        self.vocab_to_int = {word: i for i, word in enumerate(self.vocab)}
        self.int_to_vocab = {i: word for i, word in enumerate(self.vocab)}
        self.n_vocab = len(self.vocab)

    def _combine_dataset(self):
        """Join the non-empty training lines into one normalized string."""
        train_texts = [item for item in self.dataset['train']['text'] if item.strip()]
        # Slicing with None keeps the whole list, so None means "no cap".
        text = "\n".join(train_texts[:self.max_entries])
        return text.lower().translate(str.maketrans('', '', string.punctuation))

    def _process_text(self):
        """Split the normalized text into words."""
        # str.split() without arguments never yields empty strings.
        return self.text.split()

    def create_sequences(self):
        """Return every sliding window as ``(context_ids, next_word_id)``."""
        sequences = []
        for i in range(len(self.words) - self.seq_length):
            seq = self.words[i:i + self.seq_length]
            next_word = self.words[i + self.seq_length]
            sequences.append(([self.vocab_to_int[word] for word in seq],
                              self.vocab_to_int[next_word]))
        return sequences

# Step 2: Create Dataset (same as before)
class TextDataset(Dataset):
    """Dataset wrapper that serves ``(context_ids, next_word_id)`` pairs as tensors."""

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(context, label)``; the label tensor has shape (1,)."""
        pair = self.sequences[idx]
        context = torch.tensor(pair[0])
        label = torch.tensor([pair[1]])
        return context, label

# Positional Encoding for Transformer
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position signals to batch-first embeddings.

    Even feature columns hold sines and odd columns hold cosines of the
    position scaled by a geometric frequency schedule. The table is stored as
    a buffer, so it follows the module across devices without being trained.

    Args:
        d_model: embedding width; odd widths are supported.
        max_len: number of positions precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even column,
        # so trim the frequency list to the number of cosine slots.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # leading axis of size 1 broadcasts over the batch
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

# Transformer Model
class NextWordTransformer(nn.Module):
    """Next-word classifier built from a causally masked Transformer encoder.

    Token embeddings receive positional encodings, pass through a stack of
    batch-first encoder layers under a square subsequent mask, and the
    representation at the final position is projected onto the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim=128, nhead=8, num_layers=2, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = PositionalEncoding(embedding_dim)

        # One template layer; TransformerEncoder stacks num_layers copies of it.
        layer_template = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=nhead,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer_template, num_layers=num_layers)

        # Projection from the model width to one logit per vocabulary entry.
        self.fc = nn.Linear(embedding_dim, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Re-draw embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
        bound = 0.1
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)
        nn.init.uniform_(self.fc.weight, -bound, bound)

    def forward(self, x):
        """Return one row of vocabulary logits per input sequence."""
        embedded = self.pos_encoder(self.embedding(x))

        # Square subsequent mask sized to the sequence length, placed on the
        # same device as the activations.
        seq_len = embedded.size(1)
        causal_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(embedded.device)
        encoded = self.transformer_encoder(embedded, mask=causal_mask)

        # Score only the final position.
        return self.fc(encoded[:, -1, :])

# Step 4: Training Function (modified for Transformer)
def train_model(model, dataloader, epochs=10, lr=0.001):
    """Train ``model`` for next-word prediction with Adam and cross-entropy.

    Args:
        model: module whose ``forward(inputs)`` returns one row of class
            logits per sample.
        dataloader: iterable of ``(inputs, targets)`` batches; ``targets``
            holds one class id per sample (any trailing singleton axis is
            flattened away).
        epochs: number of passes over ``dataloader``.
        lr: Adam learning rate.

    Returns:
        A list with the mean batch loss of each epoch (``nan`` for an epoch
        that saw no batches).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        n_batches = 0
        model.train()
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()

            outputs = model(inputs)
            # reshape(-1) keeps a 1-D target even for a batch of one sample,
            # where squeeze() would collapse it to a 0-d tensor.
            loss = criterion(outputs, targets.reshape(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batches += 1

        avg_loss = total_loss / n_batches if n_batches else float('nan')
        epoch_losses.append(avg_loss)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

    return epoch_losses

# Step 5: Prediction Function (modified for Transformer)
def predict_next_word(model, preprocessor, initial_text, device, top_k=5):
    """Rank the most likely next words for ``initial_text``.

    The prompt is lower-cased, stripped of ASCII punctuation and cut to its
    last ``preprocessor.seq_length`` words. Words missing from the vocabulary
    are mapped to id 0.

    Args:
        model: module whose ``forward(inputs)`` returns one row of class
            logits per sequence.
        preprocessor: object providing ``seq_length``, ``vocab_to_int`` and
            ``int_to_vocab``.
        initial_text: prompt string.
        device: device the model lives on.
        top_k: maximum number of candidates; capped at the number of classes.

    Returns:
        ``(word, probability)`` pairs, most likely first, or ``[]`` when the
        prompt contains no words. The model is left in eval mode.
    """
    model.eval()
    initial_text = initial_text.lower().translate(str.maketrans('', '', string.punctuation))
    words = initial_text.split()[-preprocessor.seq_length:]
    if not words:
        # Callers already treat an empty prediction list as "stop generating".
        return []

    seq = [preprocessor.vocab_to_int.get(word, 0) for word in words]
    seq = torch.tensor(seq, dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(seq)
        probs = torch.softmax(outputs, dim=1)

    # topk raises when asked for more entries than exist, so cap the request.
    k = min(top_k, probs.size(1))
    top_probs, top_indices = torch.topk(probs, k)
    top_words = [preprocessor.int_to_vocab[index] for index in top_indices[0].tolist()]
    top_probs = top_probs[0].cpu().numpy()

    return list(zip(top_words, top_probs))

# Example usage
if __name__ == "__main__":
    # Run settings: a 40-word context window and mini-batches of 128 windows.
    seq_length = 40
    batch_size = 128

    # Build the vocabulary and sliding-window pairs from WikiText.
    preprocessor = WikiTextPreprocessor(seq_length)
    sequences = preprocessor.create_sequences()

    # Shuffled mini-batch iteration over the pairs.
    dataset = TextDataset(sequences)
    dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

    # Four-layer, eight-head encoder with 256-wide embeddings.
    vocab_size = preprocessor.n_vocab
    model = NextWordTransformer(vocab_size, embedding_dim=256, nhead=8, num_layers=4, dropout=0.1)

    train_model(model, dataloader, epochs=40, lr=0.0001)

    # Generate on the same kind of device train_model selects.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    num_predictions = 60  # upper bound on the number of words appended
    current_text = "the united states"

    # Greedy decoding: append the top-ranked word until the budget is spent
    # or the predictor returns no candidates.
    for _ in range(num_predictions):
        predictions = predict_next_word(model, preprocessor, current_text, device)
        if not predictions:
            break
        best_word, _ = max(predictions, key=lambda pair: pair[1])
        current_text = current_text + " " + best_word

    print(f"\nFinal sentence after {num_predictions} predictions:")
    print(current_text)

  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at C:\Users\araut1\.cache\huggingface\datasets\wikitext\wikitext-2-raw-v1\0.0.0\b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Fri Apr  4 13:48:47 2025).


Epoch 1/40, Loss: 7.2463
Epoch 2/40, Loss: 6.5955


KeyboardInterrupt: 

In [None]:
# Persist the trained weights together with the preprocessor object in a
# single checkpoint file.
torch.save(
    obj={'model_state_dict': model.state_dict(), 'preprocessor': preprocessor},
    f='transformer_word_predictor.pth',
)

# Then later when you want to load and use it:
def load_and_generate(initial_text, num_predictions=60, model_path='transformer_word_predictor.pth'):
    """Reload a saved Transformer checkpoint and greedily extend ``initial_text``.

    Args:
        initial_text: prompt; it is lower-cased and stripped of ASCII
            punctuation before generation starts.
        num_predictions: maximum number of words to append.
        model_path: checkpoint file holding ``'model_state_dict'`` and
            ``'preprocessor'`` entries.

    The generated text is printed; nothing is returned.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # SECURITY: the checkpoint embeds a pickled preprocessor object, so it can
    # only be read with full unpickling (weights_only=False), which executes
    # arbitrary code from the file. Load checkpoints from trusted sources only.
    # map_location lets a checkpoint written on a GPU open on a CPU-only host.
    checkpoint = torch.load(model_path, map_location=device, weights_only=False)
    preprocessor = checkpoint['preprocessor']

    # These hyperparameters must match the ones the saved weights were trained with.
    model = NextWordTransformer(
        preprocessor.n_vocab,
        embedding_dim=256,
        nhead=8,
        num_layers=4,
        dropout=0.1
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    model = model.to(device)
    model.eval()

    current_text = initial_text.lower().translate(str.maketrans('', '', string.punctuation))

    # Greedy decoding: append the single most probable word each step.
    for _ in range(num_predictions):
        predictions = predict_next_word(model, preprocessor, current_text, device)
        if predictions:
            best_word, _ = max(predictions, key=lambda x: x[1])
            current_text += " " + best_word
        else:
            break

    print(f"\nGenerated text after {num_predictions} predictions:")
    print(current_text)

# Example usage of loading and generating
load_and_generate("the united states", num_predictions=60)

In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook: it would be saved with the
# file and leak through version control or sharing. Read it from the HF_TOKEN
# environment variable instead; when the variable is unset, token=None makes
# login() ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))
