In [1]:
def any_keyword_in_string(string, keywords):
    """Return True if at least one entry of `keywords` is a substring of `string`."""
    return any(keyword in string for keyword in keywords)

In [2]:
# Keywords used to select files: a file is kept if its text contains any of them.
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
# Two sample lines for trying out any_keyword_in_string:
# example_1 contains none of the keywords, example_2 contains "pandas".
example_1 = "import numpy as np"
example_2 = "import pandas as pd"

In [3]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset

def filter_streaming_dataset(dataset, filters):
    """Collect every sample whose "content" field mentions one of `filters`.

    Args:
        dataset: Iterable of dict-like samples, each with a "content" string
            (for example a dataset loaded with ``streaming=True``).
        filters: Keywords forwarded to ``any_keyword_in_string``.

    Returns:
        A ``datasets.Dataset`` built column-wise from the matching samples.
    """
    filtered_dict = defaultdict(list)
    # tqdm only reports progress; the dataset is consumed in a single pass.
    for sample in tqdm(dataset):
        if any_keyword_in_string(sample["content"], filters):
            # Append each field of the sample to its column list.
            for k, v in sample.items():
                filtered_dict[k].append(v)
    return Dataset.from_dict(filtered_dict)

In [4]:
##### takes long time to run
#from datasets import load_dataset

#split = "train"
#filters = ["pandas", "sklearn", "matplotlib", "seaborn"]

#data = load_dataset(f"transformersbook/codeparrot-{split}", split=split, streaming=True)
#filtered_data = filter_streaming_dataset(data, filters)

After filtering, 3.26% of the data remains: about 6 GB, consisting of 600,000 Python scripts.

Filtering the full dataset takes 2-3 hours, depending on your machine and bandwidth.

In [5]:
# Load the prefiltered train/validation datasets so that the filtering pass
# over the whole dataset can be skipped.
from datasets import load_dataset, DatasetDict

ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

# Bundle both splits under the keys "train" and "valid". The trailing comments
# hold an optional subsampling step for quick trial runs.
raw_datasets = DatasetDict(
    {
        "train": ds_train,  # .shuffle().select(range(50000)),
        "valid": ds_valid,  # .shuffle().select(range(500))
    }
)

raw_datasets

Found cached dataset json (C:/Users/Christian/.cache/huggingface/datasets/huggingface-course___json/huggingface-course--codeparrot-ds-train-7e9fc5dfe436a81a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
Found cached dataset json (C:/Users/Christian/.cache/huggingface/datasets/huggingface-course___json/huggingface-course--codeparrot-ds-valid-65557c3279496c87/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 606720
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 3322
    })
})

Pretraining the language model will take a while. We suggest that you first run the training loop on a sample of the data by uncommenting the two partial lines above, and make sure that the training successfully completes and the models are stored. Nothing is more frustrating than a training run failing at the last step because you forgot to create a folder or because there’s a typo at the end of the training loop!

## tokenizing the data

In [6]:
from transformers import AutoTokenizer

# Maximum number of tokens per chunk; passed as max_length to the tokenizer.
context_length = 128
tokenizer = AutoTokenizer.from_pretrained('huggingface-course/code-search-net-tokenizer')

# Tokenize the first two training files. With return_overflowing_tokens=True,
# text beyond max_length comes back as additional chunks, and
# overflow_to_sample_mapping records which input file each chunk came from.
outputs = tokenizer(
    raw_datasets["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

# Number of chunks, the token count of each chunk, and the chunk -> file mapping.
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 34
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 117, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 41]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [7]:
def tokenize(element):
    """Tokenize a batch of files into fixed-size chunks of token IDs.

    ``element["content"]`` is tokenized into chunks of at most
    ``context_length`` tokens; only chunks that are exactly
    ``context_length`` tokens long are kept, shorter ones are dropped.

    Returns:
        A dict with the single key "input_ids" holding the kept chunks.
    """
    encoded = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_chunks = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}


# Run tokenize over every split in batches. All original columns are removed,
# so the result contains only the "input_ids" column.
tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

Loading cached processed dataset at C:\Users\Christian\.cache\huggingface\datasets\huggingface-course___json\huggingface-course--codeparrot-ds-train-7e9fc5dfe436a81a\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-cf462dbd021072f8.arrow
Loading cached processed dataset at C:\Users\Christian\.cache\huggingface\datasets\huggingface-course___json\huggingface-course--codeparrot-ds-valid-65557c3279496c87\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-5580b16c22b3b9e6.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 16702061
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 93164
    })
})

In [8]:
#DatasetDict({
#    train: Dataset({
#        features: ['input_ids'],
#        num_rows: 16702061
#    })
#    valid: Dataset({
#        features: ['input_ids'],
#        num_rows: 93164
#    })
#})

We now have 16.7 million examples with 128 tokens each, which corresponds to about 2.1 billion tokens in total. For reference, OpenAI’s GPT-3 and Codex models are trained on 300 and 100 billion tokens, respectively, where the Codex models are initialized from the GPT-3 checkpoints. Our goal in this section is not to compete with these models, which can generate long, coherent texts, but to create a scaled-down version providing a quick autocomplete function for data scientists.

## initializing a new model
Our first step is to freshly initialize a GPT-2 model. We’ll use the same configuration for our model as for the small GPT-2 model, so we load the pretrained configuration, make sure that the tokenizer size matches the model vocabulary size, and pass the bos and eos (beginning and end of sequence) token IDs.

In [9]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the stock "gpt2" configuration and override the fields that
# depend on our tokenizer and chunk size.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),  # vocabulary size taken from our tokenizer
    # NOTE(review): confirm the installed transformers version still honors
    # n_ctx for GPT-2; newer configs may only read n_positions.
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [10]:
# Build the model from the configuration alone (freshly initialized, no
# pretrained weights are loaded here).
model = GPT2LMHeadModel(config)
# Total number of elements across all parameter tensors.
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.2M parameters


GPT-2 size: 124.2M parameters


In [11]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: collate for causal language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [12]:
# Sanity check: collate the first five training examples and print the shape
# of every tensor in the resulting batch.
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")


# Expected output:
#input_ids shape: torch.Size([5, 128])
#attention_mask shape: torch.Size([5, 128])
#labels shape: torch.Size([5, 128])

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [13]:
from huggingface_hub import notebook_login

# Shows the Hugging Face login widget. Enter the access token in the widget;
# do not type or paste it into a code cell, where it would be saved with the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

NameError: name 'hf_<REDACTED>' is not defined

1. Configure the training arguments and fire up the Trainer.
2. Use a cosine learning rate schedule with some warmup.
3. Use an effective batch size of 128 (per_device_train_batch_size × gradient_accumulation_steps = 16 × 8).

Gradient accumulation is used when a single batch does not fit in memory; it incrementally builds up the gradient through several forward/backward passes.

In [15]:
from transformers.integrations import NeptuneCallback
import neptune

# Authentication: no api_token is passed, so Neptune falls back to the
# NEPTUNE_API_TOKEN environment variable. Never hardcode the token in the
# notebook; any token that was previously saved here should be revoked.
run = neptune.init_run(
    project="cdreetz/casual-hug",
    capture_hardware_metrics=True,
    capture_stderr=True,
    capture_stdout=True,
)

# Trainer callback that logs to the run created above.
neptune_callback = NeptuneCallback(run=run, log_parameters=True)



https://app.neptune.ai/cdreetz/casual-hug/e/CAS-11


In [16]:
from transformers import Trainer, TrainingArguments

# Training configuration. Evaluation, logging and checkpoint saving all happen
# every 500 steps.
args = TrainingArguments(
    output_dir="codeparrot-ds2",
    # 16 samples per device x 8 accumulation steps = 128 samples per optimizer
    # step on each device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    # Cosine learning-rate schedule with 200 warmup steps.
    warmup_steps=200,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=500,
    fp16=True,
    push_to_hub=True,
    # Built-in reporting integrations are turned off; Neptune logging comes
    # from the callback passed to Trainer below.
    report_to="none"
)

# Wire model, tokenizer, collator and the tokenized train/valid splits together.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    callbacks=[neptune_callback]
)

c:\Users\Christian\Desktop\datasci_gpt\codeparrot-ds2 is already a clone of https://huggingface.co/cdreetz/codeparrot-ds2. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
from pynvml import *


def print_gpu_utilization(device_index=0):
    """Print the amount of memory currently in use on one GPU, in MB.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device that was previously hard-coded.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Balance the nvmlInit() above so repeated calls do not leave NVML
        # initialized, even when the query raises.
        nvmlShutdown()


def print_summary(result):
    """Print runtime and throughput of a training run, then GPU memory usage.

    Args:
        result: Object exposing a ``metrics`` mapping with the keys
            "train_runtime" and "train_samples_per_second" (presumably the
            value returned by ``trainer.train()``; confirm against the caller).
    """
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [None]:
# GPU memory in use at this point, before trainer.train() is called.
print_gpu_utilization()

GPU memory occupied: 1498 MB.


In [17]:
# Start training with the arguments configured above (the recorded run shows
# 130484 total steps and was interrupted manually).
trainer.train()

        Convert the value to a supported type, such as a string or float, or use stringify_unsupported(obj)
        for dictionaries or collections that contain unsupported values.
        For more, see https://docs.neptune.ai/help/value_of_unsupported_type
        Convert the value to a supported type, such as a string or float, or use stringify_unsupported(obj)
        for dictionaries or collections that contain unsupported values.
        For more, see https://docs.neptune.ai/help/value_of_unsupported_type


  0%|          | 0/130484 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [23]:
import numpy
import torch
# Scratch check that a random (100, 1, 28, 28) torch tensor converts to a
# NumPy array; not used by the training cells.
# NOTE(review): this prints the whole array; printing .shape would keep the
# output short.
test_torch = torch.rand(100,1,28,28)
print(test_torch.numpy())

[[[[8.82269263e-01 9.15003955e-01 3.82863760e-01 ... 2.69494832e-01
    3.58812630e-01 1.99363768e-01]
   [5.47191560e-01 6.16043806e-03 9.51554537e-01 ... 9.10269439e-01
    6.44015670e-01 7.07106769e-01]
   [6.58130586e-01 4.91302013e-01 8.91304135e-01 ... 1.59144104e-01
    7.65289068e-01 2.97897756e-01]
   ...
   [8.02882731e-01 2.66210496e-01 2.61398315e-01 ... 6.68269873e-01
    6.77897573e-01 8.37045908e-02]
   [1.49900913e-02 2.40555465e-01 8.42273831e-01 ... 4.93099749e-01
    9.57616389e-01 1.99889958e-01]
   [5.03931105e-01 7.37799764e-01 1.54821873e-01 ... 3.01824689e-01
    6.30125940e-01 6.88570201e-01]]]


 [[[2.36630499e-01 4.21047211e-03 7.61717200e-01 ... 1.94603682e-01
    2.53947854e-01 5.96131444e-01]
   [6.35630608e-01 6.92236125e-01 7.74437606e-01 ... 4.58340883e-01
    6.07877910e-01 2.25802660e-01]
   [6.44235015e-01 1.17883086e-02 1.42245770e-01 ... 9.18389320e-01
    8.87405157e-01 6.51075661e-01]
   ...
   [3.88001800e-01 3.18637729e-01 6.96419597e-01 ... 6.

In [None]:
# Upload the trainer's output to the Hugging Face Hub (requires the login above).
trainer.push_to_hub()