# Finetune GPT2 on the `codeparrot` dataset and build a code generation model (PyTorch)

## Setup

In [1]:
import torch
torch.cuda.is_available()

True

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
# !pip install accelerate
# !apt install git-lfs

In [3]:
# # Setup git
# !git config --global user.email ""
# !git config --global user.name ""

In [4]:
# Logged in to HuggingFace Hub
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Gathering the data

In [5]:
# Function to filter the code samples using the matplotlib, seaborn, pandas, and scikit-learn libraries
def any_keyword_in_string(string, keywords):
    for keyword in keywords:
        if keyword in string:
            return True
    return False

In [6]:
# Let’s test it on two examples
filters = ["matplotlib", "seaborn", "pandas", "sklearn"]
example_1 = "import numpy as np"
example_2 = "import pandas as pd"

print(
    any_keyword_in_string(example_1, filters),
    any_keyword_in_string(example_2, filters),
)

False True


In [7]:
# Create a Function to filter desired elements
from datasets import Dataset
from collections import defaultdict
from tqdm import tqdm

def filter_streaming_dataset(dataset, filters):
    filtered_dict = defaultdict(list)
    total = 0
    for sample in tqdm(iter(dataset)):
        total += 1
        if any_keyword_in_string(sample["content"], filters):
            for k, v in sample.items():
                filtered_dict[k].append(v)
    print(f"{len(filtered_dict['content'])/total:.2%} of data after filtering.")
    return Dataset.from_dict(filtered_dict)

In [8]:
# # Skipping this cell for execution time

# # Apply this function to the streaming dataset
# from datasets import load_dataset

# split = "train"
# filters = ["pandas", "sklearn", "matplotlib", "seaborn"]

# data = load_dataset(f"transformersbook/codeparrot-{split}", split=split, streaming=True)
# filtered_data = filter_streaming_dataset(data, filters)


In [9]:
# Loading  filtered dataset from the HuggingFace Hub
from datasets import load_dataset, DatasetDict

ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/8.25G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/46.1M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/3322 [00:00<?, ? examples/s]

In [10]:
raw_datasets = DatasetDict(
    {
        "train": ds_train.shuffle().select(range(5000)),  # 50000
        "valid": ds_valid.shuffle().select(range(50))  # 500
    }
)

In [11]:
# Let’s look at an example from the dataset
for key in raw_datasets["train"][0]:
    print(f"{key.upper()}: {raw_datasets['train'][0][key][:200]}")

REPO_NAME: eyadsibai/rep
PATH: rep/estimators/utils.py
COPIES: 3
SIZE: 5406
CONTENT: from __future__ import division, print_function, absolute_import

import numpy
import pandas
from scipy.special import expit, logit
from sklearn.base import BaseEstimator, TransformerM
LICENSE: apache-2.0


## Preparing the dataset

In [12]:
# Let’s see exactly how this works by looking at the first two examples
from transformers import AutoTokenizer

context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

outputs = tokenizer(
    raw_datasets["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/448k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Input IDs length: 34
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 8, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 96]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [13]:
# Tokenize
def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )

    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

tokenized_datasets

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 139491
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 1392
    })
})

## Initializing a new model

In [14]:
# let’s initialize a GPT-2 model
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [15]:
# load a new model with above configuration
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 Size: {model_size/1000**2:.1f}M parameters")

GPT-2 Size: 124.2M parameters


In [16]:
# Setup a data collator for creating the batches
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [17]:
# Let’s have a look at an example
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [18]:
# Configure the training arguments and Trainer
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="gpt2-finetuned-codeparrot",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,  # 5k, Increased to reduce the frequency of evaluations
    logging_steps=5_000,  # 5k, Increased to reduce logging frequency
    gradient_accumulation_steps=8,  # 8, Reduced to 4 to lower memory usage
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,  # Increased to save less frequently
    fp16=True,  # Keep FP16 for better performance but consider turning it off if instability occurs
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)




In [19]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=545, training_loss=5.283514908256881, metrics={'train_runtime': 1535.0196, 'train_samples_per_second': 90.872, 'train_steps_per_second': 0.355, 'total_flos': 9111971708928000.0, 'train_loss': 5.283514908256881, 'epoch': 1.0})

In [20]:
# Push the model and tokenizer to the Hub
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/Ashaduzzaman/gpt2-finetuned-codeparrot/commit/e3834d4351bf2f8fdea0173c9df317ec4a8e8962', commit_message='End of training', commit_description='', oid='e3834d4351bf2f8fdea0173c9df317ec4a8e8962', pr_url=None, pr_revision=None, pr_num=None)

## Code generation with a pipeline

In [21]:
# text generation pipeline
import torch
from transformers import pipeline

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pipe = pipeline(
    "text-generation",
    model="Ashaduzzaman/gpt2-finetuned-codeparrot",
    device=device
)

config.json:   0%|          | 0.00/898 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/497M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/448k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

In [22]:
# Task 1: Let’s start with the simple task of creating a scatter plot
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
y = plt.figure() # Plot with mean (d)


In [23]:
# Task 1: Create a DataFrame from two arrays
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
y = np.random.randn(10, 8, 1)


In [24]:
# Task 1: use the groupby operation
txt = """\
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
df = pd.DataFrame


In [25]:
# Task 1: use scikit-learn and set up a Random Forest model
txt = """
# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])


# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
X = rng.randint(50, 1, 2, 'o',


## Training with 🤗 Accelerate

In [26]:
# keytoken_ids = []
# for keyword in [
#     "plt",
#     "pd",
#     "sk",
#     "fit",
#     "predict",
#     "plt",
#     "pd",
#     "sk",
#     "fit",
#     "predict",
#     "testtest"
# ]:
#     ids = tokenizer([keyword]).input_ids[0]
#     if len(ids) == 1:
#         keytoken_ids.append(ids[0])
#     else:
#         print(f"keyword has not single token: {keyword}")

In [27]:
# # Write a custom loss function
# from torch.nn import CrossEntropyLoss
# import torch

# def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
#     # Shift so that tokens < n predict n
#     shift_labels = inputs[..., 1:].contiguous()
#     shift_logits = logits[..., :-1, :].contiguous()
#     # Calculate per-token loss
#     loss_fct = CrossEntropyLoss(reduce=False)
#     loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
#     # Resize and Average loss per sample
#     loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
#     # Calculate and scale weighting
#     weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(
#         axis=[0, 2]
#     )
#     weights = alpha * (1.0 + weights)
#     # Calculate weighted average
#     weighted_loss = (loss_per_sample * weights).mean()
#     return weighted_loss

In [28]:
# # Load dataloaders to load the data in batches
# from torch.utils.data.dataloader import DataLoader

# tokenized_datasets.set_format("torch")
# train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=32)
# eval_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=32)

In [29]:
# # Set up weight decay parameters
# weight_decay = 0.1
# def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
#     params_with_wd, params_without_wd = [], []
#     for n, p in model.named_parameters():
#         if any(nd in n for nd in no_decay):
#             params_without_wd.append(p)
#         else:
#             params_with_wd.append(p)
#     return [
#         {"params": params_with_wd, "weight_decay": weight_decay},
#         {"params": params_without_wd, "weight_decay": 0.0},
#     ]

In [30]:
# # Define the evaluate function
# def evaluate():
#     model.eval()
#     losses = []
#     for step, batch in enumerate(eval_dataloader):
#         with torch.no_grad():
#             outputs = model(batch["input_ids"], labels=batch["input_ids"])

#         losses.append(accelerator.gather(outputs.loss))
#     loss = torch.mean(torch.cat(losses))
#     try:
#         perplexity = torch.exp(loss)
#     except OverflowError:
#         perplexity = float("inf")
#     return loss.item(), perplexity.item()


In [31]:
# model = GPT2LMHeadModel(config)

In [32]:
# # Define the optimizer using the function
# from torch.optim import AdamW

# optimizer = AdamW(get_grouped_params(model), lr=5e-4)

In [33]:
# # Prepare the model
# from accelerate import Accelerator

# accelerator = Accelerator()

# model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
#     model, optimizer, train_dataloader, eval_dataloader
# )

In [34]:
# # Define a learning rate Scheduler
# from transformers import get_scheduler

# num_train_epochs = 1
# num_update_steps_per_epoch = len(train_dataloader)
# num_training_steps = num_train_epochs * num_update_steps_per_epoch

# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=1_000,
#     num_training_steps=num_training_steps,
# )

In [35]:
# # Push our model to the Hub
# from huggingface_hub import Repository, get_full_repo_name, create_repo

# model_name = "codeparrot-ds-accelerate"
# create_repo(model_name)

# repo_name = get_full_repo_name(model_name)
# repo_name

In [36]:
# # Clone that repository in a local folder
# output_dir = "codeparrot-ds-accelerate"
# repo = Repository(output_dir, clone_from=repo_name)

In [37]:
# evaluate()

In [38]:
# # Train the model
# from tqdm.notebook import tqdm

# gradient_accumulation_steps = 8
# eval_steps = 5_000

# model.train()
# completed_steps = 0
# for epoch in range(num_train_epochs):
#     for step, batch in tqdm(
#         enumerate(train_dataloader, start=1), total=num_training_steps
#     ):
#         logits = model(batch["input_ids"]).logits
#         loss = keytoken_weighted_loss(
#             batch["input_ids"],
#             logits,
#             keytoken_ids,
#         )
#         if step % 100 == 0:
#             accelerator.print(
#                 {
#                     "samples": step * samples_per_step,
#                     "steps": completed_steps,
#                     "loss/epoch": loss.item() * gradient_accumulation_steps,
#                 }
#             )
#         loss = loss / gradient_accumulation_steps
#         accelerator.backward(loss)
#         if step % gradient_accumulation_steps == 0:
#             accelerator.clip_grad_norm_(model.parameters(), 1.0)
#             optimizer.step()
#             lr_scheduler.step()
#             optimizer.zero_grad()
#             completed_steps += 1
#         if (step % eval_steps == 0) or (step == num_training_steps):
#             eval_loss, perplexity = evaluate()
#             accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
#             model.train()
#             accelerator.wait_for_everyone()
#             unwrapped_model = accelerator.unwrap_model(model)
#             unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
#             if accelerator.is_main_process:
#                 tokenizer.save_pretrained(output_dir)
#                 repo.push_to_hub(
#                     commit_message=f"Training in progress step {step}", blocking=False
#                 )