# Training a causal language model from scratch (PyTorch)

Install the Transformers, Datasets, Evaluate, and Accelerate libraries to run this notebook.

In [None]:
# Create python 3.10 greedyLR virtual env

# 1. git clone https://github.com/balak4/transformers /home/ec2-user/SageMaker/git/balak4/transformers
# 2. conda env create -f git/balak4/transformers/examples/greedy-lr/conda/pytorch_p310_greedy_v2.yml
# 3. Install modified transformers fork in local env:
#     1. source ~/.bashrc
#     2. conda activate py310-greedy
#     3. python3 -m pip install -e /home/ec2-user/SageMaker/git/balak4/transformers

In [None]:
import torch
import transformers
import accelerate

from datetime import datetime

# Report the library versions this notebook is running against.
for label, module in (
    ("pytorch", torch),
    ("transformers", transformers),
    ("accelerate", accelerate),
):
    print(label, module.__version__)

In [42]:
# SET SEED
# Single seed value for the notebook; handed to transformers.set_seed below.
RANDOM_SEED = 42
transformers.set_seed(RANDOM_SEED)

In [43]:
def any_keyword_in_string(string, keywords):
    """Return True when at least one entry of ``keywords`` is a substring of ``string``.

    An empty ``keywords`` iterable yields False.
    """
    return any(keyword in string for keyword in keywords)

In [44]:
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
example_1 = "import numpy as np"
example_2 = "import pandas as pd"

# The numpy import matches no filter keyword; the pandas import matches one.
print(*(any_keyword_in_string(example, filters) for example in (example_1, example_2)))

False True


In [45]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset


def filter_streaming_dataset(dataset, filters):
    """Keep only the samples whose ``"content"`` field mentions a filter keyword.

    Args:
        dataset: Iterable of dict-like samples, each with a ``"content"`` string.
        filters: Keywords passed to ``any_keyword_in_string``.

    Returns:
        A ``Dataset`` built from the columns of the matching samples.

    Prints the fraction of samples kept. An empty input stream reports 0.00%
    instead of raising ``ZeroDivisionError``.
    """
    filtered_dict = defaultdict(list)
    total = 0
    for sample in tqdm(iter(dataset)):
        total += 1
        if any_keyword_in_string(sample["content"], filters):
            for k, v in sample.items():
                filtered_dict[k].append(v)
    kept = len(filtered_dict["content"])
    # Guard the ratio: `total` stays 0 when the stream yields nothing.
    kept_ratio = kept / total if total else 0.0
    print(f"{kept_ratio:.2%} of data after filtering.")
    return Dataset.from_dict(filtered_dict)

In [None]:
# This cell will take a very long time to execute, so you should skip it and go to
# the next one!
# from datasets import load_dataset

# split = "train"  # "valid"
# filters = ["pandas", "sklearn", "matplotlib", "seaborn"]

# data = load_dataset(f"transformersbook/codeparrot-{split}", split=split, streaming=True)
# filtered_data = filter_streaming_dataset(data, filters)

In [5]:
from datasets import load_dataset, DatasetDict

# Load the train and validation splits from the Hub.
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

# Bundle both splits under the keys "train" and "valid"; the trailing comments
# show how to subsample for a quick experiment.
raw_datasets = DatasetDict(
    {
        "train": ds_train,  # .shuffle().select(range(50000)),
        "valid": ds_valid,  # .shuffle().select(range(500))
    }
)

raw_datasets

Downloading data: 100%|██████████| 8.25G/8.25G [02:11<00:00, 62.7MB/s]


Generating train split: 0 examples [00:00, ? examples/s]

Downloading data: 100%|██████████| 46.1M/46.1M [00:00<00:00, 63.4MB/s]


Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 606720
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 3322
    })
})

In [64]:
# Test - small data batch

# raw_datasets_small = DatasetDict(
#     {
#         "train": ds_train.shuffle().select(range(50_000)),
#         "valid": ds_valid.shuffle().select(range(500)),
#     }
# )

# raw_datasets_small

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 50000
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 500
    })
})

In [6]:
# Read the first training row once rather than re-fetching it for every column,
# then print each field truncated to its first 200 characters.
first_sample = raw_datasets["train"][0]
for key, value in first_sample.items():
    print(f"{key.upper()}: {value[:200]}")

REPO_NAME: kmike/scikit-learn
PATH: sklearn/utils/__init__.py
COPIES: 3
SIZE: 10094
CONTENT: """
The :mod:`sklearn.utils` module includes various utilites.
"""

from collections import Sequence

import numpy as np
from scipy.sparse import issparse

from .murmurhash import murm
LICENSE: bsd-3-clause


In [7]:
from transformers import AutoTokenizer

# Maximum number of tokens per training chunk.
context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

# Tokenize the first two files. With truncation plus return_overflowing_tokens
# each file is split into several chunks of at most `context_length` tokens.
outputs = tokenizer(
    raw_datasets["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

# Number of chunks, the length of each chunk, and which input file each chunk came from.
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")



tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/448k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Input IDs length: 34
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 117, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 41]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [8]:
def tokenize(element):
    """Split a batch of files into fixed-size token chunks.

    Tokenizes ``element["content"]`` with overflow enabled and keeps only the
    chunks whose length equals ``context_length``; shorter chunks are dropped.

    Returns:
        ``{"input_ids": [...]}`` with one entry per retained chunk.
    """
    encoded = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_chunks = [
        chunk_ids
        for chunk_len, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if chunk_len == context_length
    ]
    return {"input_ids": full_chunks}

In [68]:
# Test - small

# tokenized_datasets = raw_datasets_small.map(
#     tokenize, batched=True, remove_columns=raw_datasets_small["train"].column_names
# )
# tokenized_datasets

Map: 100%|██████████| 50000/50000 [01:54<00:00, 438.17 examples/s]
Map: 100%|██████████| 500/500 [00:01<00:00, 461.05 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1383736
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 13310
    })
})

In [13]:
# Tokenize every split in batches. The original columns are removed because
# `tokenize` returns a different number of rows (chunks) than it receives (files).
tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

Map:   0%|          | 0/606720 [00:00<?, ? examples/s]

Map:   0%|          | 0/3322 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 16702061
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 93164
    })
})

In [49]:
import os

# Persist the full tokenized dataset to disk under ./logs/codeparrot-ds/.
dataset_name = 'tokenized_dataset_full'
save_dir = f'./logs/codeparrot-ds/{dataset_name}/'

os.makedirs(save_dir, exist_ok=True)
tokenized_datasets.save_to_disk(save_dir)

Saving the dataset (0/18 shards):   0%|          | 0/16702061 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/93164 [00:00<?, ? examples/s]

In [55]:
# Reduce size of tokenized datasets

from datasets import DatasetDict

# For every split: shuffle with a fixed seed, then keep the first half of the rows.
# The literal 42 equals RANDOM_SEED and the "seed_42" suffix of the directory
# this subset is later saved to.
half_size_datasets = DatasetDict({
    split_name: dataset.shuffle(seed=42).select(range(len(dataset) // 2))
    for split_name, dataset in tokenized_datasets.items()
})

In [56]:
# Print original sizes
for split_name, dataset in tokenized_datasets.items():
    print(f"Original {split_name} size:", len(dataset))

# Print new sizes (each split should be half of the original, rounded down)
for split_name, dataset in half_size_datasets.items():
    print(f"New {split_name} size:", len(dataset))

Original train size: 16702061
Original valid size: 93164
New train size: 8351030
New valid size: 46582


In [57]:
# Persist the halved dataset; the directory name records the shuffle seed used.
dataset_name = 'tokenized_dataset_half_seed_42'
save_dir = f'./logs/codeparrot-ds/{dataset_name}/'

os.makedirs(save_dir, exist_ok=True)
half_size_datasets.save_to_disk(save_dir)

Saving the dataset (0/9 shards):   0%|          | 0/8351030 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/46582 [00:00<?, ? examples/s]

In [58]:
# Re-assign dataset to re-use variable names below
# NOTE: from here on `tokenized_datasets` refers to the half-size subset, not the full data.
tokenized_datasets = half_size_datasets

In [59]:
# Check that `tokenized_datasets` now shows the halved row counts
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 8351030
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 46582
    })
})

In [60]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the stock "gpt2" configuration and override the vocabulary size and
# special-token ids so they match the code tokenizer loaded above.
# NOTE(review): newer GPT2Config versions appear to size position embeddings from
# `n_positions` rather than `n_ctx` - confirm `n_ctx` is honoured by the pinned fork.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

## LR SCHEDULER: GREEDYLR

In [61]:
# Build a freshly initialised (untrained) GPT-2 from `config`. The instantiation
# must be live: with it commented out, `model` is undefined on a fresh kernel.
# Re-seeding first makes the random initial weights reproducible, so every
# scheduler run that seeds the same way starts from the same parameters.
transformers.set_seed(RANDOM_SEED)
model = GPT2LMHeadModel(config)

# Total number of parameters, reported in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.2M parameters


In [62]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token for padding, then build a collator for
# causal language modelling (mlm=False disables masked-LM masking).
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [63]:
# Sanity check: collate five training examples and print the shape of every
# tensor in the resulting batch.
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [68]:
# Identify this run: experiment / run number / scheduler name / today's date.
exp_name = "codeparrot-ds"
date = datetime.now().strftime("%Y-%m-%d")
run_num = 2
run_name = "greedylr"

# All artefacts of the run live under one root directory.
run_root = f"./logs/{exp_name}/run{run_num}/{run_name}/{date}"
logging_dir = f"{run_root}/tensorboard"
output_dir = f"{run_root}/output"
model_dir = f"{run_root}/pretrained-model-dir"

print(logging_dir, output_dir, model_dir, sep="\n")

./logs/codeparrot-ds/run2/greedylr/2025-01-25/tensorboard
./logs/codeparrot-ds/run2/greedylr/2025-01-25/output
./logs/codeparrot-ds/run2/greedylr/2025-01-25/pretrained-model-dir


In [78]:
from transformers import Trainer, TrainingArguments

# greedyLR
# Training configuration for the GreedyLR run. With a per-device batch of 48 and
# 8 accumulation steps, each optimizer update sees 384 sequences per device.
# Evaluation and checkpointing both happen every 500 steps; only the 3 most
# recent checkpoints are kept.
train_args = TrainingArguments(
    per_device_train_batch_size=48, # 32, 48
    per_device_eval_batch_size=48, # 32, 48
    logging_dir=logging_dir,
    logging_steps=10,
    num_train_epochs=1,
    learning_rate=2e-4,
    gradient_accumulation_steps=8,
    weight_decay=0.1,
    bf16=False, # Use to lower memory requirement
    evaluation_strategy="steps",
    eval_steps=500,
    warmup_steps=1_000,
    save_steps=500,
    save_total_limit=3,
    output_dir=output_dir,
    report_to="tensorboard",
    lr_scheduler_type="greedy",
    # greedy
    # NOTE(review): min_lr / smooth / factor are presumably options added by the
    # modified transformers fork installed in the setup cell - confirm their meaning there.
    min_lr=1.85e-05,
    smooth=True,
    factor=0.95
)

In [70]:
from accelerate import Accelerator, DataLoaderConfiguration

# For accelerate = 0.28.0
# Accelerator created with default settings; used by the Trainer cells below.
accelerator = Accelerator()

# For newer versions of accelerate, e.g. accelerate = 1.3.0
# Define DataLoaderConfiguration
# dataloader_config = DataLoaderConfiguration(
#     dispatch_batches=False,  # Each process fetches its own batch
#     split_batches=True       # Split fetched batches across processes
# )

# # Initialize Accelerator with DataLoaderConfiguration
# accelerator = Accelerator(dataloader_config=dataloader_config)

In [None]:
# Build the Trainer for the GreedyLR run.
# NOTE(review): the Trainer is passed through accelerator.prepare - confirm this
# wrapper is actually required. Unlike the cosine run, no `tokenizer` is passed here.
trainer = accelerator.prepare(Trainer(
    model=model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    args=train_args,
    data_collator=data_collator
))

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

trainer.train()

# Save the trained weights to model_dir; safe_serialization=False opts out of
# the safetensors format.
trainer.model.save_pretrained(
    model_dir,
    safe_serialization=False
)

GreedyLR settings: patience=10 smooth=True min_lr=1.85e-05 factor=0.95


Step,Training Loss,Validation Loss
500,2.4568,2.357604
1000,1.8629,1.784082
1500,1.6163,1.529091
2000,1.4814,1.419977
2500,1.4521,1.386506
3000,1.4463,1.378297
3500,1.446,1.371534




## LR SCHEDULER: COSINE

In [None]:
%%time
# INITALIZE MODEL

model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

In [None]:
from transformers import DataCollatorForLanguageModeling

# Same collator setup as in the GreedyLR section: pad with the EOS token and
# collate for causal language modelling (mlm=False).
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Sanity check: collate five training examples and print each tensor's shape.
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

In [None]:
# Identify this run: experiment / run number / scheduler name / today's date.
exp_name = "codeparrot-ds"
date = datetime.now().strftime("%Y-%m-%d")
run_num = 2
run_name = "cosine"

# All artefacts of the run live under one root directory.
run_root = f"./logs/{exp_name}/run{run_num}/{run_name}/{date}"
logging_dir = f"{run_root}/tensorboard"
output_dir = f"{run_root}/output"
model_dir = f"{run_root}/pretrained-model-dir"

print(logging_dir, output_dir, model_dir, sep="\n")

In [None]:
# Original: cosine
# Baseline run: the values mirror the GreedyLR configuration, but the scheduler
# is "cosine" and the GreedyLR-specific arguments are absent.

train_args = TrainingArguments(
    per_device_train_batch_size=48,  # 16, 8
    per_device_eval_batch_size=48,
    logging_dir=logging_dir,
    logging_steps=10,
    num_train_epochs=1,
    learning_rate=2e-4,
    gradient_accumulation_steps=8,
    weight_decay=0.1,
    bf16=False,
    evaluation_strategy="steps",
    eval_steps=500,
    warmup_steps=1_000,
    save_steps=500,
    save_total_limit=3,
    output_dir=output_dir,
    report_to="tensorboard",
    lr_scheduler_type="cosine",
)


# NOTE(review): the Trainer is passed through accelerator.prepare - confirm this
# wrapper is actually required.
trainer = accelerator.prepare(Trainer(
    model=model,
    tokenizer=tokenizer,
    args=train_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
))

trainer.train()

# Save the trained weights to model_dir; safe_serialization=False opts out of
# the safetensors format.
trainer.model.save_pretrained(
    model_dir,
    safe_serialization=False
)

In [None]:
# trainer.push_to_hub()

## Test Model Inference ## 

In [None]:
import torch
from transformers import pipeline

# Use the GPU when available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# NOTE: this loads the published "huggingface-course/codeparrot-ds" checkpoint
# from the Hub, not the weights saved to `model_dir` by the training cells above.
pipe = pipeline(
    "text-generation", model="huggingface-course/codeparrot-ds", device=device
)

In [None]:
# Prompt 1: ask the model to continue a snippet that should produce a scatter plot.
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

In [None]:
# Prompt 2: same data, but the model should build a DataFrame from x and y.
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

In [None]:
# Prompt 3: the model should complete a per-profession mean-income computation.
txt = """\
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

In [None]:
# Prompt 4: the model should fit a random forest regressor.
# NOTE(review): unlike the other prompts, this literal has no trailing backslash
# after the opening quotes, so the prompt begins with a newline - confirm intended.
txt = """
# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

## Sample Training Loop with Accelerate 

In [None]:
# Keywords whose occurrences should be up-weighted in the loss. Each is tried
# with and without a leading space; "testtest" is included as well.
candidate_keywords = [
    "plt",
    "pd",
    "sk",
    "fit",
    "predict",
    " plt",
    " pd",
    " sk",
    " fit",
    " predict",
    "testtest",
]

# Keep only the keywords that the tokenizer encodes as exactly one token.
keytoken_ids = []
for keyword in candidate_keywords:
    ids = tokenizer([keyword]).input_ids[0]
    if len(ids) != 1:
        print(f"Keyword has not single token: {keyword}")
        continue
    keytoken_ids.append(ids[0])

In [None]:
from torch.nn import CrossEntropyLoss
import torch


def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    """Causal-LM cross-entropy where samples containing key tokens weigh more.

    Args:
        inputs: Integer tensor of token ids, indexed ``[sample, position]``.
        logits: Model outputs, indexed ``[sample, position, vocab]``.
        keytoken_ids: Iterable of token ids to up-weight. May be empty, in which
            case every sample gets the same weight ``alpha``.
        alpha: Global scale applied to all sample weights.

    Returns:
        Scalar tensor: mean over samples of
        ``per-sample mean token loss * alpha * (1 + number of key-token hits)``.
    """
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Per-token loss. `reduction="none"` is the supported spelling of the
    # deprecated `reduce=False`.
    loss_fct = CrossEntropyLoss(reduction="none")
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(dim=1)
    # Count key-token occurrences per sample. Accumulating in a loop (instead of
    # torch.stack) also works when `keytoken_ids` is empty.
    keytoken_counts = torch.zeros(inputs.size(0), dtype=torch.float, device=inputs.device)
    for kt in keytoken_ids:
        keytoken_counts += (inputs == kt).sum(dim=1)
    weights = alpha * (1.0 + keytoken_counts)
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

In [None]:
from torch.utils.data.dataloader import DataLoader

# The dataset variable defined earlier is `tokenized_datasets` (plural);
# the singular name used previously does not exist and raised NameError.
# Switch its output format to torch tensors so the DataLoader yields tensors.
tokenized_datasets.set_format("torch")
# Shuffle only the training data; evaluation order stays fixed.
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=32, shuffle=True)
eval_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=32)

In [None]:
weight_decay = 0.1


def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    """Split ``model``'s parameters into optimizer groups with and without weight decay.

    A parameter is exempt from decay when its name contains any substring in
    ``no_decay``; all others use the module-level ``weight_decay`` value.

    Returns:
        Two param-group dicts: decayed parameters first, exempt parameters second.
    """
    # NOTE(review): GPT-2 appears to name its layer norms ln_1 / ln_2 / ln_f, so
    # "LayerNorm.weight" may not match any parameter of that model - confirm.
    decayed, exempt = [], []
    for name, param in model.named_parameters():
        is_exempt = any(marker in name for marker in no_decay)
        (exempt if is_exempt else decayed).append(param)
    return [
        {"params": decayed, "weight_decay": weight_decay},
        {"params": exempt, "weight_decay": 0.0},
    ]

In [None]:
def evaluate():
    """Compute mean loss and perplexity of ``model`` over ``eval_dataloader``.

    Puts the model into eval mode (callers must switch back to train mode
    themselves) and runs every evaluation batch without gradients.

    Returns:
        ``(loss, perplexity)`` as Python floats; perplexity is ``exp(loss)`` and
        becomes ``inf`` if the loss is large enough to overflow.
    """
    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])

        # The batch loss is a 0-dim scalar. Give it one dimension before
        # gathering, so torch.cat below always receives 1-D tensors (a 0-dim
        # tensor cannot be concatenated).
        losses.append(accelerator.gather(outputs.loss.reshape(1)))
    loss = torch.mean(torch.cat(losses))
    # torch.exp saturates to inf instead of raising OverflowError, so no
    # try/except is needed. The old fallback assigned a plain float, which
    # would have failed on `.item()` below.
    perplexity = torch.exp(loss)
    return loss.item(), perplexity.item()

In [None]:
# Fresh, untrained GPT-2 for the manual Accelerate training loop below.
model = GPT2LMHeadModel(config)

In [None]:
from torch.optim import AdamW

# AdamW over the parameter groups from get_grouped_params (with / without weight decay).
optimizer = AdamW(get_grouped_params(model), lr=5e-4)

In [None]:
from accelerate import Accelerator

# Request fp16 mixed precision via `mixed_precision`. The `fp16=True` keyword
# used previously is no longer accepted by current Accelerate releases.
accelerator = Accelerator(mixed_precision="fp16")

# Let Accelerate wrap the model, optimizer and dataloaders for the current
# device / process setup.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 1
# NOTE(review): this counts dataloader batches, while the training loop below
# steps the scheduler only once per `gradient_accumulation_steps` batches -
# confirm the schedule length is what is intended.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule with 1,000 warmup steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1_000,
    num_training_steps=num_training_steps,
)

In [None]:
from huggingface_hub import Repository, get_full_repo_name

# Resolve the full Hub repository name for this model name.
model_name = "codeparrot-ds-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

In [None]:
# NOTE: this overwrites the `output_dir` set for the Trainer runs above.
output_dir = "codeparrot-ds-accelerate"
# Local working copy of the Hub repository; checkpoints are saved here and pushed.
repo = Repository(output_dir, clone_from=repo_name)

In [None]:
# Baseline (loss, perplexity) of the untrained model before the training loop runs.
evaluate()

In [None]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8
eval_steps = 5_000
# Samples consumed per dataloader step across all processes. 32 mirrors the
# `batch_size` the dataloaders were created with. This name was previously
# used without ever being defined.
samples_per_step = accelerator.num_processes * 32


def get_lr():
    """Return the learning rate currently set on the optimizer's first parameter group."""
    return optimizer.param_groups[0]["lr"]


model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=num_training_steps
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % 100 == 0:
            accelerator.print(
                {
                    "lr": get_lr(),
                    "samples": step * samples_per_step,
                    "steps": completed_steps,
                    # `loss` has not been divided by the accumulation factor
                    # yet, so it is logged as-is (multiplying here inflated it).
                    "loss/train": loss.item(),
                }
            )
        # Scale so that gradients summed over the accumulation window average out.
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        # Update the weights only once every `gradient_accumulation_steps` batches.
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        # Evaluate and checkpoint every `eval_steps` optimizer updates.
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            # evaluate() leaves the model in eval mode; switch back before continuing.
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            # Only the main process writes the tokenizer and pushes to the Hub.
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress step {step}", blocking=False
                )