In [7]:
!pip install torch --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-win_amd64.whl.metadata (29 kB)
Downloading https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-win_amd64.whl (113.7 MB)
   ---------------------------------------- 0.0/113.7 MB ? eta -:--:--
    --------------------------------------- 2.4/113.7 MB 16.8 MB/s eta 0:00:07
   -- ------------------------------------- 5.8/113.7 MB 16.8 MB/s eta 0:00:07
   --- ------------------------------------ 9.2/113.7 MB 16.8 MB/s eta 0:00:07
   ---- ----------------------------------- 13.1/113.7 MB 17.1 MB/s eta 0:00:06
   ----- ---------------------------------- 16.8/113.7 MB 17.3 MB/s eta 0:00:06
   ------- -------------------------------- 20.7/113.7 MB 17.9 MB/s eta 0:00:06
   -------- ------------------------------- 23.3/113.7 MB 17.0 MB/s eta 0:00:06
   --------- ------------------------------ 26.2/113.7 MB 16.9 MB/s eta 0:00:06


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.24.1 requires torch==2.9.1, but you have torch 2.10.0+cpu which is incompatible.

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
!pip install datasets




[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import numpy as np
import datasets

# Check for GPU availability
# If 'cuda' is present, we use the graphics card for faster training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Set random seed for reproducibility
# This ensures the model learns the same way every time we restart
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

Using device: cpu


In [None]:
import os
import requests
from datasets import load_dataset, DatasetDict

# 1. Download the file manually from a public GitHub repo
# Using Book 1 (Sorcerer's Stone).
url = "https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt"
file_path = "harry_potter_book1.txt"

print(f"Downloading from {url}...")
response = requests.get(url)

if response.status_code == 200:
    with open(file_path, 'wb') as f:
        f.write(response.content)
    print("Download successful.")
else:
    print("Failed to download. Using dummy data for testing.")
    with open(file_path, 'w') as f:
        f.write("Harry Potter is a wizard. He goes to Hogwarts. He has friends named Ron and Hermione.")

# 2. Load the local file into the library
dataset = load_dataset('text', data_files={'train': file_path})

# 3. Split the data (Train 80% / Validation 10% / Test 10%)
# First split: 80% Train, 20% Temporary
train_testvalid = dataset['train'].train_test_split(test_size=0.2, seed=1234)

# Second split: Split the 20% into half (10% Valid, 10% Test)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=1234)

# Combine into final structure
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'validation': test_valid['train'],
    'test': test_valid['test']
})

print("\nSuccess! Data structure:")
print(dataset)
print("\nExample text:")
print(dataset['train'][0])

Downloading from https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt...
Download successful.


Generating train split: 0 examples [00:00, ? examples/s]


Success! Data structure:
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 8561
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1070
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1071
    })
})

Example text:
{'text': ''}


In [None]:
import collections
import re
import pickle
import os

# --- 1. Create a Custom Tokenizer ---

def tokenizer(text):
    text = text.lower()
    # Find all words (alphanumeric) OR punctuation marks
    tokens = re.findall(r"\w+|[^\w\s]", text)
    return tokens

print(f"Tokenizer Test: {tokenizer('Harry Potter is a wizard!')}")

# --- 2. Tokenize the Dataset ---
def tokenize_function(example):
    return {'tokens': tokenizer(example['text'])}

tokenized_dataset = dataset.map(tokenize_function, remove_columns=['text'])
print("Tokenization complete.")

# --- 3. Build the Vocabulary Manually ---
class Vocabulary:
    def __init__(self, vocab_dict, unk_idx):
        self.vocab_dict = vocab_dict
        self.unk_idx = unk_idx
        # Create a reverse lookup (Number -> Word)
        self.itos = {v: k for k, v in vocab_dict.items()}
        
    def __getitem__(self, token):
        # Returns the ID for a word. If unknown, returns the <unk> ID.
        return self.vocab_dict.get(token, self.unk_idx)
    
    def __len__(self):
        return len(self.vocab_dict)
    
    def get_itos(self):
        # Returns a list of all words
        return [self.itos[i] for i in range(len(self))]

# Count all words in the training set
token_counts = collections.Counter()
for tokens in tokenized_dataset['train']['tokens']:
    token_counts.update(tokens)

# Create the dictionary starting with special tokens
# <unk> = 0 (Unknown), <eos> = 1 (End of Sentence)
vocab_dict = {'<unk>': 0, '<eos>': 1}
index = 2

# Add words that appear at least 3 times
for token, count in token_counts.items():
    if count >= 3:
        vocab_dict[token] = index
        index += 1

# Initialize the Vocabulary object
vocab = Vocabulary(vocab_dict, unk_idx=0)

print(f"Total words in vocabulary: {len(vocab)}")
print(f"First 10 words: {vocab.get_itos()[:10]}")

# --- 4. Save the Vocabulary ---
if not os.path.exists('models'):
    os.makedirs('models')
    
with open('models/vocab_lm.pkl', 'wb') as f:
    pickle.dump(vocab, f)
print("Vocabulary saved to models/vocab_lm.pkl")

Tokenizer Test: ['harry', 'potter', 'is', 'a', 'wizard', '!']


Map:   0%|          | 0/8561 [00:00<?, ? examples/s]

Map:   0%|          | 0/1070 [00:00<?, ? examples/s]

Map:   0%|          | 0/1071 [00:00<?, ? examples/s]

Tokenization complete.
Total words in vocabulary: 2154
First 10 words: ['<unk>', '<eos>', 'bell', '-', 'hit', 'hard', 'in', 'the', 'face', 'by']
Vocabulary saved to models/vocab_lm.pkl


In [None]:
import torch

def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example['tokens']:
            # We add <eos> to the end so the model learns when a sentence finishes 
            tokens = example['tokens'] + ['<eos>']
            # Convert words to their ID numbers using our vocab dictionary 
            tokens = [vocab[token] for token in tokens]
            data.extend(tokens)
            
    # Convert the big list of numbers into a PyTorch "Tensor"
    data = torch.LongTensor(data)
    
    # Calculate how many full batches we can make 
    num_batches = data.shape[0] // batch_size
    
    # Trim off the extra numbers that don't fit into a full batch 
    data = data[:num_batches * batch_size]
    
    # Reshape the data so it's organized by batch 
    data = data.view(batch_size, num_batches)
    return data

# We will use a batch size of 128
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'], vocab, batch_size)

print(f"Train data shape: {train_data.shape}")

Train data shape: torch.Size([128, 695])


In [None]:

import sys
sys.path.append('./app')
from lstm import LSTMLanguageModel

# 1. Define Settings (Hyperparameters)

vocab_size = len(vocab)
emb_dim = 1024 # The 'size' of each word vector [cite: 10]
hid_dim = 1024 # The 'memory' capacity of the LSTM [cite: 10]
num_layers = 2 # How many LSTMs are stacked together [cite: 10]
dropout_rate = 0.65 # Prevents the model from just memorizing the text [cite: 10]
lr = 1e-3 # Learning rate: how big the 'steps' are during learning [cite: 10]

# 2. Initialize the model on your CPU
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)

# 3. Define Optimizer and Loss Function
# Adam is the tool that adjusts the weights to fix mistakes 
optimizer = optim.Adam(model.parameters(), lr=lr)
# CrossEntropyLoss measures how wrong the word predictions are 
criterion = nn.CrossEntropyLoss()

# Count how many total 'brain cells' the model has
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 21,207,146 trainable parameters


In [7]:
from tqdm import tqdm # For the progress bar

# Helper to grab a sequence of words for the model to read
def get_batch(data, seq_len, idx):
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1] # The next word we want to predict [cite: 12]
    return src, target

def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    epoch_loss = 0
    model.train() # Set to learning mode
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]
    
    hidden = model.init_hidden(batch_size, device) # Start with fresh memory
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ', leave=False):
        optimizer.zero_grad() # Reset the mistake counter
        hidden = model.detach_hidden(hidden) # Clean memory for this step

        src, target = get_batch(data, seq_len, idx)
        src, target = src.to(device), target.to(device)
        
        prediction, hidden = model(src, hidden)               

        # Reshape to compare predicted word vs actual word
        prediction = prediction.reshape(batch_size * seq_len, -1)  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward() # Calculate how to fix the mistakes
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # Prevent math errors
        optimizer.step() # Update the brain weights
        epoch_loss += loss.item() * seq_len
        
    return epoch_loss / num_batches

def evaluate(model, data, criterion, batch_size, seq_len, device):
    epoch_loss = 0
    model.eval() # Set to testing mode (no learning)
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad(): # Don't update weights during testing
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [None]:
import math
import os

n_epochs = 5
seq_len  = 35 # Words to read in one breath
clip     = 0.25
best_valid_loss = float('inf')

# Learning rate scheduler: slows down learning if progress stops 
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        if not os.path.exists('models'): os.makedirs('models')
        torch.save(model.state_dict(), 'models/best-val-lstm_lm.pt')

    print(f'Epoch: {epoch+1}')
    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                         

Epoch: 1
	Train Perplexity: 647.617
	Valid Perplexity: 205.719


                                                         

Epoch: 2
	Train Perplexity: 224.915
	Valid Perplexity: 148.214


                                                         

Epoch: 3
	Train Perplexity: 156.073
	Valid Perplexity: 112.385


                                                         

Epoch: 4
	Train Perplexity: 121.784
	Valid Perplexity: 92.675


                                                         

Epoch: 5
	Train Perplexity: 101.807
	Valid Perplexity: 81.458


In [None]:

# map_location=device ensures it loads correctly on our CPU
model.load_state_dict(torch.load('models/best-val-lstm_lm.pt', map_location=device))

# 2. Run the model on the Test Data
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)

# 3. Print the final score
print(f'Final Test Perplexity: {math.exp(test_loss):.3f}')

Final Test Perplexity: 77.570


In [None]:
def generate_text(prompt, max_seq_len, temperature, model, tokenizer, vocab, device):
    model.eval() # Set to evaluation mode
    
    # 1. Convert prompt into numbers
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    
    # 2. Start predicting words one by one
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            # 3. Use 'Temperature' to control creativity
            # Lower temperature = safe/boring. Higher = creative/risky.
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            # If the model predicts <unk>, try again to get a real word
            while prediction == vocab['<unk>']:
                prediction = torch.multinomial(probs, num_samples=1).item()

            # If the model predicts <eos>, the sentence is finished
            if prediction == vocab['<eos>']:
                break

            indices.append(prediction) # Feed the new word back in to get the next one

    # 4. Convert the numbers back into words
    itos = vocab.get_itos()
    result_tokens = [itos[i] for i in indices]
    return " ".join(result_tokens)



Result: harry potter is one sobbed shelves wailed glanced tracks jacket wailed cheek men feathers halfway jacket planets countryside strangely sleeve glanced stumped planets planets astronomy whipped shout sweaty immediately countryside planets stumped apothecary


In [None]:
# 1. Setup the parameters for the experiment
prompt = 'Harry potter is '
max_seq_len = 30
seed = 0 # Using a seed ensures we can compare the temperatures fairly

# 2. Define the temperatures to test
# Smaller = more predictable; Higher = more diverse/random
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]

print(f"Prompt: {prompt}\n" + "="*30)

for temperature in temperatures:
   
    # We split it into a list of words
    generation = generate_text(prompt, max_seq_len, temperature, model, tokenizer, vocab, device)
    
    # Printing the result for each temperature
    print(f"Temperature: {temperature}")
    print(generation)
    print("-" * 30)

Prompt: Harry potter is 
Temperature: 0.5
harry potter is clothes complain smiles ollivanders excitement excitement planets tape jacket bezoar weather planets scream excitement excitement planets shelf shelf bushy nightmare jacket planets excitement bezoar shelves excitement countryside planets jacket planets
------------------------------
Temperature: 0.7
harry potter is pay jacket setting bushy planets sleeve men wailed shelf fortune men sixteen countryside shelf fireplace flopped excitement whistle computer planets planets marks jacket bezoar easier quicker excitement greatest lip sleeve
------------------------------
Temperature: 0.75
harry potter is top constrictor damp bushy whistle rare ignore flushed planets whistled stumped planets countryside whistled excitement trousers greatest scales cheek search shelves bezoar daily yell postcard planets men countryside rise curtains
------------------------------
Temperature: 0.8
harry potter is mercy babble tucked stumped bezoar earlier