Check for Cuda

In [4]:
import torch

print(torch.cuda.is_available())

True


Load Dataset

In [5]:
from datasets import load_dataset

ds = load_dataset('andrewy1n/pseudocode-decompiled-samples-small', token="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", split='train')
ds = ds.train_test_split(test_size=0.2)

print(ds)

DatasetDict({
    train: Dataset({
        features: ['Name', 'Text'],
        num_rows: 1788
    })
    test: Dataset({
        features: ['Name', 'Text'],
        num_rows: 448
    })
})


Create BPE Tokenizer

In [6]:
from tokenizers import Tokenizer
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Punctuation, Whitespace, Digits
from transformers import PreTrainedTokenizerFast

checkpoint = "bigcode/starcoder2-3b"
tokenizer = Tokenizer.from_pretrained(checkpoint,) # we use an existing Vocabulary from starcoder2

tokenizer.pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True), Punctuation()]) # we use whitespace, Digits and punctuations to split the tokens first

tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)

special_tokens = {"mask_token":"<MASK>", "sep_token":"<SEP>", "unk_token":"<UNK>", "cls_token":"<CLS>", "pad_token": "<PAD>"}

tokenizer.add_special_tokens(special_tokens)

5

In [7]:
with open("cp_o3.cpp", "r") as f:
    input = f.readlines()

print(tokenizer("".join(input)))

{'input_ids': [429, 119, 576, 100, 41692, 45, 429, 772, 100, 54, 49, 429, 772, 100, 55, 49, 15463, 772, 100, 56, 46, 128, 3239, 113, 1901, 54, 64, 429, 100, 100, 3016, 64, 1792, 122, 1901, 55, 64, 429, 41844, 56, 64, 429, 47, 1314, 1901, 57, 64, 3239, 285, 100, 6900, 100, 10153, 64, 15463, 1717, 100, 56, 61, 64, 113, 1901, 54, 66, 47, 45, 3239, 47, 46, 45, 285, 100, 6900, 100, 10153, 48, 53, 125, 55, 61, 46, 64, 344, 45, 772, 100, 55, 66, 66, 53, 46, 128, 100, 100, 3016, 66, 41692, 45, 772, 100, 54, 49, 53, 49, 772, 100, 56, 43, 53, 7469, 46, 64, 7768, 9796, 100, 53, 53, 54, 54, 61, 103, 59, 53, 64, 130, 344, 45, 772, 100, 55, 66, 66, 53, 125, 57, 53, 59, 46, 128, 344, 45, 8799, 100, 12137, 3016, 100, 6772, 2510, 51, 56, 62, 60, 53, 65, 53, 46, 128, 100, 100, 3016, 66, 41692, 45, 772, 100, 54, 49, 53, 49, 772, 100, 56, 43, 53, 7469, 46, 64, 344, 45, 45, 100, 100, 3016, 65, 53, 46, 129, 129, 45, 8799, 100, 12137, 3016, 100, 6772, 2510, 51, 56, 62, 60, 53, 38, 66, 50, 54, 46, 46, 7768, 9

Preprocess the Dataset

In [8]:
import multiprocessing

num_proc = multiprocessing.cpu_count()

def group_texts(examples):
    return tokenizer(examples['Text'], padding=True, truncation=True, return_tensors='pt', max_length=512)

ds = ds.filter(lambda example: example['Text'] is not None)

tokenized_ds = ds.map(
    group_texts,
    batched=True,
    remove_columns=ds['train'].column_names,
    num_proc=num_proc
)

print(tokenized_ds)

Filter:   0%|          | 0/1788 [00:00<?, ? examples/s]

Filter:   0%|          | 0/448 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/1743 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/437 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1743
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 437
    })
})


Do not run this if already truncating

In [9]:
from itertools import chain

max_length = 512    
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= max_length:
        total_length = (total_length // max_length) * max_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }
    return result

tokenized_ds = tokenized_ds.map(group_texts, batched=True, num_proc=num_proc)
# shuffle dataset
tokenized_ds= tokenized_ds.shuffle(seed=34)

Map (num_proc=8):   0%|          | 0/1743 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/437 [00:00<?, ? examples/s]

In [10]:
print(f"the training dataset contains in total {len(tokenized_ds['train'])*max_length} tokens")

the training dataset contains in total 892416 tokens


Create Data Collator

In [11]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability= 0.15
)

Initialize Roberta model

In [12]:
from transformers import RobertaConfig, RobertaForMaskedLM

config = RobertaConfig(
    vocab_size=50_000,
    max_position_embeddings=514,
    num_attention_heads=8,
    num_hidden_layers=6,
    type_vocab_size=1,
)

model = RobertaForMaskedLM(config=config)

In [13]:
print(model.num_parameters())

81966416


In [14]:
import accelerate

accelerate.__version__

'0.29.2'

In [15]:
import evaluate
    
def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]
    return logits.argmax(dim=-1)

metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # preds have the same shape as the labels, after the argmax(-1) has been calculated
    # by preprocess_logits_for_metrics
    labels = labels.reshape(-1)
    preds = preds.reshape(-1)
    mask = labels != -100
    labels = labels[mask]
    preds = preds[mask]
    
    return metric.compute(predictions=preds, references=labels)

In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,
    num_train_epochs=10,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

print(tokenized_ds)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1743
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 437
    })
})


In [17]:
trainer.train()

trainer.save_model("model")

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,3.844404
2,No log,3.421346
3,No log,3.331994
4,No log,3.232864
5,3.925100,3.174699
6,3.925100,3.116555
7,3.925100,3.049663
8,3.925100,3.014463
9,3.925100,2.994556
10,3.055500,2.993242


In [18]:
import math

metrics = trainer.evaluate()

metrics["eval_samples"] = len(ds['test'])

try:
    perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
    perplexity = float("inf")

print(perplexity)

metrics["perplexity"] = perplexity

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)


19.96444596023164
***** eval metrics *****
  epoch                   =       10.0
  eval_loss               =      2.994
  eval_runtime            = 0:00:03.92
  eval_samples            =        437
  eval_samples_per_second =    111.269
  eval_steps_per_second   =     14.004
  perplexity              =    19.9644
