In [1]:
# Install necessary libraries
%pip install torch transformers datasets matplotlib seaborn

Note: you may need to restart the kernel to use updated packages.


## Step 1: Load and Explore the Dataset

In [2]:
from datasets import load_dataset

# Load the dataset
print("Loading dataset...")
dataset = load_dataset("open-web-math/open-web-math")

dataset

Loading dataset...


Resolving data files:   0%|          | 0/114 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/113 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['url', 'text', 'date', 'metadata'],
        num_rows: 6315233
    })
})

In [3]:


# Display a sample from the dataset
print("Sample from the dataset:")
print(dataset["train"][0])

# Check dataset structure
print("Dataset structure:")
print(dataset)

Sample from the dataset:
{'url': 'https://telescoper.wordpress.com/2010/11/23/bayes-and-hi-theorem/', 'text': 'Bayes and his\xa0Theorem\n\nMy earlier post on Bayesian probability seems to have generated quite a lot of readers, so this lunchtime I thought I’d add a little bit of background. The previous discussion started from the result\n\n$P(B|AC) = K^{-1}P(B|C)P(A|BC) = K^{-1} P(AB|C)$\n\nwhere\n\n$K=P(A|C).$\n\nAlthough this is called Bayes’ theorem, the general form of it as stated here was actually first written down, not by Bayes but by Laplace. What Bayes’ did was derive the special case of this formula for “inverting” the binomial distribution. This distribution gives the probability of x successes in n independent “trials” each having the same probability of success, p; each “trial” has only two possible outcomes (“success” or “failure”). Trials like this are usually called Bernoulli trials, after Daniel Bernoulli. If we ask the question “what is the probability of exactly x s

In [4]:
dataset['train'][0]['text']

'Bayes and his\xa0Theorem\n\nMy earlier post on Bayesian probability seems to have generated quite a lot of readers, so this lunchtime I thought I’d add a little bit of background. The previous discussion started from the result\n\n$P(B|AC) = K^{-1}P(B|C)P(A|BC) = K^{-1} P(AB|C)$\n\nwhere\n\n$K=P(A|C).$\n\nAlthough this is called Bayes’ theorem, the general form of it as stated here was actually first written down, not by Bayes but by Laplace. What Bayes’ did was derive the special case of this formula for “inverting” the binomial distribution. This distribution gives the probability of x successes in n independent “trials” each having the same probability of success, p; each “trial” has only two possible outcomes (“success” or “failure”). Trials like this are usually called Bernoulli trials, after Daniel Bernoulli. If we ask the question “what is the probability of exactly x successes from the possible n?”, the answer is given by the binomial distribution:\n\n$P_n(x|n,p)= C(n,x) p^x (

## Step 2: Tokenizer Visualization

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

# Tokenize a sample text
sample_text = dataset["train"][0]["text"]
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print("Sample text:")
print(sample_text)

print("Tokens:")
print(tokens)

print("Token IDs:")
print(token_ids)

Loading tokenizer...
Sample text:
Bayes and his Theorem

My earlier post on Bayesian probability seems to have generated quite a lot of readers, so this lunchtime I thought I’d add a little bit of background. The previous discussion started from the result

$P(B|AC) = K^{-1}P(B|C)P(A|BC) = K^{-1} P(AB|C)$

where

$K=P(A|C).$

Although this is called Bayes’ theorem, the general form of it as stated here was actually first written down, not by Bayes but by Laplace. What Bayes’ did was derive the special case of this formula for “inverting” the binomial distribution. This distribution gives the probability of x successes in n independent “trials” each having the same probability of success, p; each “trial” has only two possible outcomes (“success” or “failure”). Trials like this are usually called Bernoulli trials, after Daniel Bernoulli. If we ask the question “what is the probability of exactly x successes from the possible n?”, the answer is given by the binomial distribution:

$P_n(x|

## Step 3: Load the Model

In [6]:
from transformers import AutoModelForCausalLM

# Load the model
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
print("Model loaded successfully!")

Loading model...
Model loaded successfully!


## Step 4: Train the Model

In [14]:
import torch
from transformers import Trainer, TrainingArguments

split_dataset = dataset["train"].train_test_split(test_size=0.05, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

small_train_dataset = train_dataset.select(range(100))
small_eval_dataset = eval_dataset.select(range(50))

# Assign a padding token to the tokenizer
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use the eos_token as the pad_token

# Tokenize the small sample dataset
def tokenize_function(examples):
    tokenized = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    tokenized["labels"] = tokenized["input_ids"].copy()  # Add labels for loss computation
    return tokenized

tokenized_train_dataset = small_train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = small_eval_dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./llama3-math-pretrain-small",
    evaluation_strategy="steps",
    eval_steps=10, 
    save_steps=10, 
    per_device_train_batch_size=1, 
    num_train_epochs=1,  # Train for 1 epoch
    logging_dir="./logs",
    logging_steps=5,
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=10,
    fp16=torch.cuda.is_available(),  # Use mixed precision if supported
    report_to="none",  # Disable reporting to external services
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

# Train the model
print("Starting training on small sample...")
trainer.train()
print("Training complete!")

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Starting training on small sample...


RuntimeError: MPS backend out of memory (MPS allocated: 16.91 GB, other allocations: 1.05 GB, max allowed: 18.13 GB). Tried to allocate 1002.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

## Step 5: Visualize Attention Scores

In [20]:
import matplotlib.pyplot as plt
import seaborn as sns

# Example input
inputs = tokenizer("Solve for x: 2x + 3 = 7", return_tensors="pt")

# Forward pass with attention outputs
outputs = model(**inputs, output_attentions=True)

# Access attention scores
attention_scores = outputs.attentions  # List of attention scores for each layer

# Visualize attention for the first layer and first head
attention = attention_scores[0][0].detach().numpy()  # First layer, first head
sns.heatmap(attention, cmap="viridis")
plt.title("Attention Scores")
plt.show()



ValueError: Must pass 2-d input. shape=(32, 15, 15)