In [None]:
# !pip install datasets # install the datasets package


In [None]:
import tensorflow as tf
from datasets import load_dataset
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel, DataCollatorForLanguageModeling, create_optimizer, AdamWeightDecay, TrainingArguments
from transformers.keras_callbacks import PushToHubCallback


In [None]:
# Fetch the 'wikitext-2-raw-v1' configuration of the 'wikitext' dataset.
dataset = load_dataset(path='wikitext', name='wikitext-2-raw-v1')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:

# Load the pretrained 'gpt2' tokenizer; tokenize_function below relies on it.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Args:
        examples: Mapping that exposes the raw strings under the ``'text'`` key.

    Returns:
        The tokenizer's output for those strings, requested with
        ``return_special_tokens_mask=True``.
    """
    raw_texts = examples['text']
    encoded_batch = tokenizer(raw_texts, return_special_tokens_mask=True)
    return encoded_batch

In [None]:
from datasets import DatasetDict

# `dataset` is expected to be a DatasetDict. For every split, keep only the
# first 500 rows, tokenize them in batches and drop the raw "text" column.
tokenized_datasets = DatasetDict({
    split_name: split_data.select(range(500)).map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
    )
    for split_name, split_data in dataset.items()
})


In [None]:
# tokenized_datasets = dataset.select.map(tokenize_function, batched=True, remove_columns=["text"])


In [None]:
def group_texts(examples, block_size=128):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

    Every column of ``examples`` is flattened into one long list and cut into
    consecutive chunks of ``block_size`` items. The trailing remainder that
    does not fill a whole block is dropped. A ``labels`` column holding an
    independent copy of the chunked ``input_ids`` is added to the result.

    Args:
        examples: Mapping from column name to a list of per-example lists.
            It must contain an ``input_ids`` column.
        block_size: Number of items in each output block. Defaults to 128,
            the value that used to be hard-coded.

    Returns:
        dict mapping each input column, plus ``labels``, to a list of blocks.

    Raises:
        ValueError: If ``block_size`` is not positive.
        KeyError: If ``examples`` has no ``input_ids`` column.
    """
    from itertools import chain

    if block_size <= 0:
        raise ValueError(f"block_size must be a positive integer, got {block_size!r}")

    column_names = list(examples.keys())
    # chain.from_iterable flattens in linear time; sum(list_of_lists, [])
    # re-copies the accumulated list on every addition (quadratic).
    concatenated_examples = {
        name: list(chain.from_iterable(examples[name])) for name in column_names
    }
    # The first column determines the usable length; round it down to a whole
    # number of blocks so every chunk has exactly block_size items.
    total_length = len(concatenated_examples[column_names[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        name: [tokens[start:start + block_size] for start in range(0, total_length, block_size)]
        for name, tokens in concatenated_examples.items()
    }
    # Copy each block so that labels and input_ids do not share list objects.
    result["labels"] = [list(block) for block in result["input_ids"]]
    return result

In [None]:
# Apply group_texts batch-wise to every split and keep the result under the same name.
tokenized_datasets = tokenized_datasets.map(function=group_texts, batched=True)


In [None]:
# Columns handed to the model; shared by the training and validation pipelines.
_TF_COLUMNS = ("input_ids", "attention_mask", "labels")


def _collate_batch(features):
    """Stack a list of per-example dicts into one dict of batched tensors."""
    return {
        column: tf.constant([example[column] for example in features])
        for column in _TF_COLUMNS
    }


def _as_tf_dataset(split_name, shuffle):
    """Turn one split of ``tokenized_datasets`` into a tf.data pipeline with batches of 8."""
    return tokenized_datasets[split_name].to_tf_dataset(
        columns=list(_TF_COLUMNS),
        shuffle=shuffle,
        batch_size=8,
        collate_fn=_collate_batch,
    )


# Training batches are shuffled; validation batches keep their order.
train_dataset = _as_tf_dataset("train", shuffle=True)
val_dataset = _as_tf_dataset("validation", shuffle=False)

# Start from the pretrained 'gpt2' checkpoint.
model = TFGPT2LMHeadModel.from_pretrained('gpt2')

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
# Define the loss function explicitly
def compute_loss(labels, logits):
    """Per-token next-token cross-entropy for causal language modelling.

    The logits at position ``t`` are scored against the label at position
    ``t + 1``. Comparing them with the label at the same position would reward
    the model for echoing its own input instead of predicting what follows.

    Args:
        labels: Integer token ids, expected shape ``(batch, seq_len)``.
        logits: Unnormalized scores, expected shape ``(batch, seq_len, vocab)``.

    Returns:
        Tensor of shape ``(batch, seq_len - 1)`` with one loss value per
        predicted position.
    """
    # The last logit step has no following token to predict and the first
    # label has no preceding logit, so both are dropped.
    shifted_logits = logits[:, :-1, :]
    shifted_labels = labels[:, 1:]
    return tf.keras.losses.sparse_categorical_crossentropy(
        shifted_labels, shifted_logits, from_logits=True
    )

In [None]:
# Build the optimizer, then compile the model with it and the custom loss.
optimizer = AdamWeightDecay(
    learning_rate=5e-5,
    weight_decay_rate=0.01,
)
model.compile(loss=compute_loss, optimizer=optimizer)

In [None]:
# # Custom training loop
# epochs = 3
# for epoch in range(epochs):
#     print(f"Epoch {epoch + 1}/{epochs}")
#     for batch in train_dataset:
#         with tf.GradientTape() as tape:
#             inputs = {
#                 "input_ids": batch["input_ids"],
#                 "attention_mask": batch["attention_mask"],
#                 "labels": batch["labels"],
#             }
#             logits = model(inputs, training=True).logits
#             loss = compute_loss(inputs["labels"], logits)

#         gradients = tape.gradient(loss, model.trainable_variables)
#         optimizer.apply_gradients(zip(gradients, model.trainable_variables))
#         print(f"Loss: {loss.numpy().mean()}")

In [None]:
# Early stopping parameters
patience = 2  # Number of epochs to wait for improvement
best_val_loss = float('inf')
epochs_without_improvement = 0
epochs = 5


def _batch_loss(batch, training):
    """Run one forward pass on ``batch`` and return its scalar mean loss.

    Only ``input_ids`` and ``attention_mask`` are forwarded to the model,
    because nothing but its logits is used here; the labels go directly to
    ``compute_loss``.
    """
    model_inputs = {
        "input_ids": batch["input_ids"],
        "attention_mask": batch["attention_mask"],
    }
    logits = model(model_inputs, training=training).logits
    # Reduce to a scalar so that the quantity being differentiated is the same
    # mean that gets logged, instead of an implicit sum over every token.
    return tf.reduce_mean(compute_loss(batch["labels"], logits))


for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # Training loop
    train_losses = []
    for batch in train_dataset:
        with tf.GradientTape() as tape:
            loss = _batch_loss(batch, training=True)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        train_losses.append(loss.numpy())

    avg_train_loss = sum(train_losses) / len(train_losses)
    print(f"Training Loss: {avg_train_loss}")

    # Validation loop (no gradient tape, model in inference mode)
    val_losses = [
        _batch_loss(batch, training=False).numpy() for batch in val_dataset
    ]

    avg_val_loss = sum(val_losses) / len(val_losses)
    print(f"Validation Loss: {avg_val_loss}")

    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_without_improvement = 0
        # Save the model with the best validation loss.
        # NOTE: the checkpoint is only written to disk; the in-memory `model`
        # keeps training and ends up with the weights of the last epoch run.
        model.save_pretrained('best_model')
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f"Early stopping triggered. No improvement for {patience} epochs.")
            break


Epoch 1/5
Training Loss: 1.6279491338019187
Validation Loss: 0.05509876326790878
Epoch 2/5
Training Loss: 0.07697614912803356
Validation Loss: 0.0516012255102396
Epoch 3/5
Training Loss: 0.0526533737205542
Validation Loss: 0.05032658004867179
Epoch 4/5
Training Loss: 0.04478792982319227
Validation Loss: 0.04883518322770085
Epoch 5/5
Training Loss: 0.04004255567605679
Validation Loss: 0.04724022267120225


In [None]:

# model.save_pretrained("./fine_tuned_gpt2")
# tokenizer.save_pretrained("./fine_tuned_gpt2")


In [None]:
from transformers import pipeline

# Load the fine-tuned model
# model = TFGPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2")
# tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2")

# Create a text generation pipeline

# Text-generation pipeline around the in-memory model and tokenizer.
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # Set the padding id explicitly to the EOS id. The library otherwise falls
    # back to the same value and logs a notice on every call.
    pad_token_id=tokenizer.eos_token_id,
    # Sampling settings.
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.92,
    # Beam settings.
    num_beams=3,
    early_stopping=True,
    length_penalty=1.5,
    # Repetition control. The penalty was 3.0, which penalizes any already-seen
    # token so hard that the output drifts through near-duplicates
    # ("time times days day ..."). A mild penalty plus the n-gram block is enough.
    no_repeat_ngram_size=3,
    repetition_penalty=1.2,
    max_length=50,
    num_return_sequences=1,
)

# Run the generator on a sample prompt and show the first returned candidate.
prompt = "Once upon a time"
generated_texts = text_generator(prompt)
first_result = generated_texts[0]

print(first_result['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time time time times times times days days days day day day night night night nights nights nights evenings nights nights Nights Nights Nights Night Night Night Star Star StarStarStarStarstarstar star star star stars stars stars Stars Stars Starsstarsstarsstars


In [None]:
# Second sample prompt; print the first returned candidate.
prompt = "I had to go  there with my mates to start our project"
generated_texts = text_generator(prompt)
first_result = generated_texts[0]

print(first_result['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I had to go  there with my mates to start our project project project Project Project Project Projects Projects Projects projects projects projectsprojectsprojectsprojectsprojectprojectProjectProjectProjectProProProproproprororororosrosrosososososesosesoses


In [None]:
# Third sample prompt; print the first returned candidate.
prompt = "Time to go , see you!"
generated_texts = text_generator(prompt)
first_result = generated_texts[0]

print(first_result['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Time to go , see you!!!...... : : ::::,,,,",",",',',',...,...,..................................................................................................


In [None]:
# Fourth sample prompt; print the first returned candidate.
prompt = "The ministery decided to disable the athentication of the papers"
generated_texts = text_generator(prompt)
first_result = generated_texts[0]

print(first_result['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The ministery decided to disable the athentication of the papers papers newspapers newspapers newspapers newspaper newspaper newspaper paper paper paper Paper Paper PaperPaperPaperPaperpaperpaperpaperpaperspaperspapersppppppppp pp pp pp p p p P P P
