In [None]:
def any_keyword_in_string(string, keywords):
    """Report whether `string` contains at least one entry of `keywords`.

    Args:
        string: Text to search.
        keywords: Iterable of substrings to look for.

    Returns:
        True if any keyword occurs as a substring of `string`, otherwise
        False (including when `keywords` is empty).
    """
    return any(needle in string for needle in keywords)

In [None]:
# Substrings used to decide whether a script is kept by the keyword filter.
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
# Sample inputs for any_keyword_in_string: example_1 contains none of the
# filter keywords, example_2 contains "pandas".
example_1 = "import numpy as np"
example_2 = "import pandas as pd"

In [None]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset

def filter_streaming_dataset(dataset, filters):
    """Keep only the samples whose "content" field contains one of `filters`.

    Iterates over `dataset` once, showing a tqdm progress bar, and gathers the
    matching samples column by column. After the pass it prints the fraction
    of samples that were kept.

    Args:
        dataset: Iterable of dict-like samples, each with a "content" entry.
        filters: Keywords forwarded to `any_keyword_in_string`.

    Returns:
        A `Dataset` built with `Dataset.from_dict` from the matching samples.
    """
    filtered_dict = defaultdict(list)
    total = 0
    kept = 0
    for sample in tqdm(iter(dataset)):
        total += 1
        if any_keyword_in_string(sample["content"], filters):
            kept += 1
            for k, v in sample.items():
                filtered_dict[k].append(v)
    # `total` used to be counted but never read; use it to report the retained
    # share. Skipped for an empty stream to avoid dividing by zero.
    if total:
        print(f"{kept / total:.2%} of data after filtering.")
    return Dataset.from_dict(filtered_dict)

In [None]:
# NOTE: this cell takes a long time to run; it passes the whole split through
# filter_streaming_dataset.
from datasets import load_dataset

split = "train"
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]

# The dataset is opened with streaming=True and handed to the keyword filter.
# NOTE(review): `filtered_data` is not referenced by any later cell visible
# here; the following cells read `raw_datasets` instead.
data = load_dataset(f"transformersbook/codeparrot-{split}", split=split, streaming=True)
filtered_data = filter_streaming_dataset(data, filters)

After filtering, 3.26% of the data remains: about 6 GB, consisting of 600,000 Python scripts.

Filtering the full dataset takes 2-3 hours, depending on your machine and bandwidth.

In [None]:
# Load the prefiltered train/validation datasets so the whole-dataset filtering
# does not have to be run. This cell defines `raw_datasets`, which the
# tokenization cells below read, so it must be executed.
from datasets import load_dataset, DatasetDict

ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

# Uncomment the trailing `.shuffle().select(...)` parts to work on a small
# sample of each split.
raw_datasets = DatasetDict(
    {
        "train": ds_train,  # .shuffle().select(range(50000)),
        "valid": ds_valid,  # .shuffle().select(range(500))
    }
)

raw_datasets

Pretraining the language model will take a while. We suggest that you first run the training loop on a sample of the data by uncommenting the two partial lines above, and make sure that the training successfully completes and the models are stored. Nothing is more frustrating than a training run failing at the last step because you forgot to create a folder or because there’s a typo at the end of the training loop!

## tokenizing the data

In [None]:
from transformers import AutoTokenizer

# Maximum number of tokens per chunk; reused by `tokenize` and the model config.
context_length = 128
# Hub repo ids have the form "namespace/name"; the original leading "/" made
# this look like an absolute local path.
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

# Smoke test on the first two training scripts: truncate to `context_length`
# and request the overflowing chunks plus each chunk's length.
outputs = tokenizer(
    raw_datasets["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

#print(f"Input IDs length: {len(outputs['input_ids'])}")
#print(f"Input chunk lengths: {(outputs['length'])}")
#print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

In [None]:
def tokenize(element):
    """Tokenize a batch of scripts into chunks of exactly `context_length` tokens.

    Args:
        element: Batch mapping with a "content" entry holding the source texts.

    Returns:
        A dict with a single "input_ids" key listing the token-id sequences of
        every chunk whose length equals `context_length`; shorter chunks are
        dropped.
    """
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        # Ask for the overflowing chunks as well, not just the first
        # `context_length` tokens of each text.
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        # Only full-length chunks are kept.
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


# Apply `tokenize` in batched mode to raw_datasets. The columns named by the
# "train" split are removed, so the result carries what `tokenize` returns
# ("input_ids").
tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
#tokenized_datasets

In [None]:
#DatasetDict({
#    train: Dataset({
#        features: ['input_ids'],
#        num_rows: 16702061
#    })
#    valid: Dataset({
#        features: ['input_ids'],
#        num_rows: 93164
#    })
#})

We now have 16.7 million examples with 128 tokens each, which corresponds to about 2.1 billion tokens in total. For reference, OpenAI’s GPT-3 and Codex models are trained on 300 and 100 billion tokens, respectively, where the Codex models are initialized from the GPT-3 checkpoints. Our goal in this section is not to compete with these models, which can generate long, coherent texts, but to create a scaled-down version providing a quick autocomplete function for data scientists.

## initializing a new model
Our first step is to freshly initialize a GPT-2 model. We’ll use the same configuration for our model as for the small GPT-2 model, so we load the pretrained configuration, make sure that the tokenizer size matches the model vocabulary size, and pass the BOS and EOS (beginning and end of sequence) token IDs.

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the "gpt2" configuration and override it so that the vocabulary
# size equals len(tokenizer), n_ctx equals context_length, and the BOS/EOS
# token ids are the tokenizer's.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
# Build the model from `config` alone (no pretrained checkpoint is loaded
# here). The class imported above is GPT2LMHeadModel; `GPTLMHeadModel` is not
# defined anywhere in this notebook.
model = GPT2LMHeadModel(config)
# Sum the element counts of all parameter tensors and report them in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.2M parameters


In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False turns off the collator's masked-language-modeling mode.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Sanity check: collate the first five tokenized training examples and print
# the shape of every entry in the resulting batch.
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")


#input_ids shape: torch.Size([5, 128])
#attention_mask shape: torch.Size([5, 128])
#labels shape: torch.Size([5, 128])

In [None]:
from huggingface_hub import notebook_login

# Log in from inside the notebook (presumably to the Hugging Face Hub, which
# the training arguments below push to).
notebook_login()

In [None]:
# if not working from a notebook just type in terminal:

# huggingface-cli login

1. Configure the training arguments and fire up the `Trainer`.
2. Use a cosine learning rate schedule with some warmup.
3. Use an effective batch size of 256 (`per_device_train_batch_size * gradient_accumulation_steps`).

Gradient accumulation is used when a single batch does not fit in memory: it incrementally builds up the gradient through several forward/backward passes.

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration. Per-device batch size 32 with 8 gradient
# accumulation steps gives an effective batch size of 256 per device.
args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    # The parameter is `eval_steps`; `eval_step` is rejected as an unexpected
    # keyword argument.
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    # Cosine learning-rate schedule with 1,000 warmup steps.
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    push_to_hub=True,
)

# Wire the model, tokenizer, training arguments, collator and the tokenized
# "train"/"valid" splits into a Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"]
)

In [None]:
# Run training with the arguments configured above.
trainer.train()

In [None]:
# Push the trainer's results to the Hub once training has finished.
trainer.push_to_hub()