In [2]:
!pip uninstall pyarrow -q
!pip install --upgrade pyarrow -q
!pip install transformers -q
!pip install datasets -q
!pip install evaluate -q

Proceed (Y/n)? Y
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.10.1 requires pyarrow<18.0.0a0,>=14.0.0, but you have pyarrow 18.0.0 which is incompatible.
pylibcudf-cu12 24.10.1 requires pyarrow<18.0.0a0,>=14.0.0, but you have pyarrow 18.0.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.6 MB/

In [1]:
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoModelForCausalLM, AutoModelForMaskedLM
from transformers import GPT2LMHeadModel, AutoConfig
from transformers import AdamW
from transformers import get_scheduler
from transformers import DataCollatorWithPadding, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict
import evaluate

import wandb
wandb.init(mode='disabled')

import torch
from torch.utils.data import DataLoader

import numpy as np
import math

from tqdm.auto import tqdm
import html
import requests

### Section 1: Data Prep

In [2]:
ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

raw_datasets = DatasetDict(
    {
        "train": ds_train.shuffle().select(range(1000)),
        "valid": ds_valid.shuffle().select(range(100))
    }
)

raw_datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 1000
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 100
    })
})

In [3]:
#print(raw_datasets["train"][42]['content'])

In [4]:
context_length = 128

In [5]:
def tokenize_func(examples):
  outputs = tokenizer(
      examples['content'],
      truncation=True,
      max_length=context_length,
      return_overflowing_tokens=True,
      return_length=True)

  input_batch = []
  for length,  input_id in zip(outputs["length"], outputs["input_ids"]):
    if length == context_length:
      input_batch.append(input_id)

  return {'input_ids': input_batch}

In [None]:
tokenized_datasets = raw_datasets["train"].map(tokenize_func, batched=True, remove_columns=raw_datasets["train"].column_names)
tokenized_datasets

In [None]:
print(len(tokenized_datasets[0]['input_ids']))

In [6]:
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

In [7]:
outputs = tokenizer(
    raw_datasets["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True
)

In [8]:
for key in outputs:
  print(key)
  print(outputs[key])

input_ids
[[973, 1893, 978, 1893, 173, 173, 2745, 34130, 173, 2745, 1601, 442, 635, 173, 173, 2745, 3857, 442, 2604, 173, 2745, 3857, 14, 1959, 14, 13880, 442, 10399, 173, 2745, 3857, 14, 1959, 337, 1824, 63, 20813, 442, 11150, 173, 973, 3857, 978, 308, 20778, 12, 1170, 63, 1446, 12, 6226, 12, 271, 526, 63, 1446, 12, 3544, 12, 370, 9012, 1508, 2643, 12, 3634, 12, 292, 63, 1881, 9, 173, 973, 3857, 14, 48495, 14, 7817, 978, 7193, 631, 12, 785, 15305, 4871, 12, 28763, 4871, 173, 173, 8566, 12, 10029, 233, 1893, 8, 19518, 12, 396, 12, 396, 491, 1893, 8, 11183, 12, 396, 12, 396, 9, 4391, 173, 692, 7252, 20778, 1494, 28786, 8, 1023, 274, 232, 5632, 233, 404, 325, 12, 269, 10936, 340], [269, 25474, 15, 33423, 21145, 340, 269, 2880, 15, 37, 30790, 340, 269, 33614, 15, 25474, 15, 51, 36698, 10368, 340, 880, 269, 33614, 15, 2880, 15, 37034, 358, 312, 1111, 34280, 14, 2175, 14, 343, 263, 991, 3254, 439, 4066, 485, 5632, 9, 232, 509, 1737, 63, 7358, 8, 248, 12, 5632, 274, 222, 9927, 17, 233, 2604,

In [9]:
tokenized_datasets = raw_datasets.map(
    tokenize_func, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 29706
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 2338
    })
})

### Section 2: Model Prep

In [10]:
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [11]:
model = GPT2LMHeadModel(config)

In [12]:
print(sum(t.numel() for t in model.parameters()))

124242432


In [13]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [14]:
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [15]:
print(out['input_ids'][0][-10:])
print(out['labels'][0][-10:])

tensor([  274,   232,  5632,   233,   404,   325,    12,   269, 10936,   340])
tensor([  274,   232,  5632,   233,   404,   325,    12,   269, 10936,   340])


In [16]:
args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
)



In [17]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

  trainer = Trainer(


In [18]:
tokenized_datasets.set_format('torch')

In [19]:
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=32, shuffle=True)
eval_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=32)

In [20]:
def evaluate():
  model.eval()
  losses = []

  for step, batch in enumerate(eval_dataloader):
    with torch.inference_mode():
      outputs = model(batch["input_ids"], labels=batch["input_ids"])

    losses.append(outputs.loss.detach().cpu().item())

  loss = torch.mean(torch.cat(losses))
  perplexity = torch.exp(loss)

  return loss.item(), perplexity.item()

In [21]:
opt = torch.optim.AdamW(model.parameters(), lr=5e-4)

In [None]:
evaluate()