In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset
# Use the model (inference pipeline and seeding)
from transformers import pipeline, set_seed
import wandb

## Let's Tokenize

***Model***: `GPT2LMHeadModel` is the GPT-2 transformer with a language modeling head on top (a linear layer whose weights are tied to the input embeddings).

**Tokenizer**:  A tokenizer is in charge of preparing the inputs for a model.

***PreTrainedTokenizer*** and ***PreTrainedTokenizerFast*** thus implement the main methods for using all the tokenizers:

- Tokenizing (splitting strings in sub-word token strings), converting tokens strings to ids and back, and encoding/decoding (i.e., tokenizing and converting to integers).
- Adding new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece…).
- Managing special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization. 
    
Here is the link to [documentation](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/tokenizer#transformers.PreTrainedTokenizer)


- GPT-2 Small ('gpt2'): 124 million parameters.
- GPT-2 Medium ('gpt2-medium'): 355 million parameters.
- GPT-2 Large ('gpt2-large'): 774 million parameters.
- GPT-2 XL ('gpt2-xl'): 1.5 billion parameters.

***Byte-Pair Encoding (BPE)*** vs ***Word Level Encoding***

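BPE splits text into subword units, so rare or unseen words can still be encoded as a sequence of known pieces. The trade-off is that an individual subword may carry little semantic meaning on its own. 
Word-level encoding assigns one token per word, which preserves word-level semantics better, but it cannot represent words that are missing from the vocabulary (the out-of-vocabulary problem).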


In [4]:
# Fetch the pre-trained GPT-2 checkpoint ("gpt2"): tokenizer first, then the
# language-modeling-head model. Both are resolved from the same checkpoint name.
model_name = "gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


***Data collators*** are objects that will form a batch by using a list of dataset elements as input. These elements are of the same type as the elements of train_dataset or eval_dataset.

To be able to build batches, data collators may apply some processing (like padding). Some of them (like DataCollatorForLanguageModeling) also apply some random data augmentation (like random masking) on the formed batch.

In [5]:

# Build the training set from the Shakespeare text file using the GPT-2 tokenizer.
# block_size=128 is the per-example length passed to TextDataset.
shakespeare_path = "shakespeare_dataset.txt"
tokens_per_block = 128
dataset = TextDataset(tokenizer=tokenizer, file_path=shakespeare_path, block_size=tokens_per_block)

# The string below is an inert reference note (not executed logic) describing
# the arguments accepted by DataCollatorForLanguageModeling.
"""
tokenizer (PreTrainedTokenizer or PreTrainedTokenizerFast) — The tokenizer used for encoding the data.

mlm (bool, optional, defaults to True) — Whether or not to use masked language modeling.
 If set to False, the labels are the same as the inputs with the padding tokens ignored (by setting them to -100). 
 Otherwise, the labels are -100 for non-masked tokens and the value to predict for the masked token.

mlm_probability (float, optional, defaults to 0.15) — The probability with which to (randomly) mask tokens in the input, when mlm is set to True.

pad_to_multiple_of (int, optional) — If set will pad the sequence to a multiple of the provided value.

return_tensors (str) — The type of Tensor to return. Allowable values are “np”, “pt” and “tf”.
"""

# Batch builder for training; masked language modeling is switched off for GPT-2.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)




In [6]:
# Inspect the first training example; each item is a tensor of token ids
dataset[0]

tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,
         3285,   502,  2740,    13,   198,   198,  3237,    25,   198,  5248,
          461,    11,  2740,    13,   198,   198,  5962, 22307,    25,   198,
         1639,   389,   477, 12939,  2138,   284,  4656,   621,   284,  1145,
          680,    30,   198,   198,  3237,    25,   198,  4965,  5634,    13,
        12939,    13,   198,   198,  5962, 22307,    25,   198,  5962,    11,
          345,   760,   327,  1872,   385,  1526, 28599,   318,  4039,  4472,
          284,   262,   661,    13,   198,   198,  3237,    25,   198,  1135,
          760,   470,    11,   356,   760,   470,    13,   198,   198,  5962,
        22307,    25,   198,  5756,   514,  1494,   683,    11,   290,   356,
         1183,   423, 11676,   379,   674,   898,  2756,    13,   198,  3792,
          470,   257, 15593,    30,   198,   198,  3237,    25,   198,  2949,
          517,  3375,   319,   470,    26,  1309,   340,   307])

In [7]:
# Decode the first example's token ids back to text as a sanity check
tokenizer.decode(dataset[0])

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be"

Here is the example notebook from Hugging Face about finetuning a model. [Notebook Link](https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb)

In [8]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./fine-tuned-shakespeare",
    overwrite_output_dir=True,
    num_train_epochs=3,  # Adjust the number of epochs based on your needs
    per_device_train_batch_size=4,  # Adjust batch size based on GPU memory
    save_steps=10_000,  # Adjust save steps based on your needs
    report_to="wandb",  # Log to Weights & Biases explicitly rather than relying on the library default
)

# Start the W&B run. `to_dict()` yields a serializable config (enums replaced by
# their values, token fields obfuscated) instead of handing wandb the raw object.
wandb.init(config=training_args.to_dict())
# Track the model in W&B; log_freq=2 logs every 2 batches, which is verbose --
# raise it for long runs.
wandb.watch(model, log_freq=2)


# Create Trainer and fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/novus/.netrc


In [None]:
trainer.train() # report to Weights & Biases
                # (wandb)
# TODO: what would the initial loss of an untrained model be?
# TODO: we said a loss of 3.7 is high -- why, and what should it have been?


In [18]:


# Wrap the fine-tuned model and its tokenizer in a text-generation pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Fix the RNG seed so the sampled continuation is reproducible
set_seed(42)

# Passing eos as pad_token_id explicitly is the same value generation falls
# back to on its own, and avoids the "Setting `pad_token_id` to `eos_token_id`"
# warning on every call.
response_model = generator(
    "Before we proceed any further, hear me speak,",
    max_length=200,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
)



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [19]:
# Show the first generated sequence (the text starts with the prompt itself)
print(response_model[0]["generated_text"])

Before we proceed any further, hear me speak,
My lord: this is very hot of grief.

LUCENTIO:
Marry, God you hear it! how came ye to
It was so; how it could be so fast-rotted
In a day-sleeve, is too fast! Yet this my lord, you are apt to
Do it; for there was a goodly harvest,
And of the harvest of a goodly crop.

BRUTUS:
Nay.

LUCENTIO:
My Lord, the good of this time is too tedious
To keep a simple succession of days.

Provost:
The time?

VINCENTIO:
I mean the day of our departure.

NATHANUS:
What!

Provost:
Not the goodly harvest, but the very fruit of our grace
As we could well do it. By


In [36]:
# Write the fine-tuned model and the tokenizer files into the same directory
# so both can later be restored from that single path.
export_dir = "outputs/finetuned_shakespeare"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

Non-default generation parameters: {'max_length': 50, 'do_sample': True}


('outputs/finetuned_shakespeare/tokenizer_config.json',
 'outputs/finetuned_shakespeare/special_tokens_map.json',
 'outputs/finetuned_shakespeare/vocab.json',
 'outputs/finetuned_shakespeare/merges.txt',
 'outputs/finetuned_shakespeare/added_tokens.json')

***Let's load the fine-tuned model back from disk and run inference to check that it was saved correctly***

In [37]:
# Reload the saved model and tokenizer from disk
loaded_model = GPT2LMHeadModel.from_pretrained("outputs/finetuned_shakespeare")
loaded_tokenizer = GPT2Tokenizer.from_pretrained("outputs/finetuned_shakespeare")

# Now you can use the loaded model and tokenizer as before
loaded_generator = pipeline('text-generation', model=loaded_model, tokenizer=loaded_tokenizer)

# Seed before sampling so repeated runs of this check produce the same text
set_seed(42)

# Explicit pad_token_id avoids the per-call "Setting `pad_token_id`" warning
response_model = loaded_generator(
    "Before we proceed any further, hear me speak,",
    max_length=50,
    num_return_sequences=1,
    pad_token_id=loaded_tokenizer.eos_token_id,
)
print(response_model[0]["generated_text"])