In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch

# Step 1: Load the Poetry Dataset
dataset = load_dataset("merve/poetry")

# Step 2: Load Pre-trained GPT-2 Model and Tokenizer
model_name = "openai-community/gpt2"  # You can choose different variants like "gpt2-medium" for more capacity
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set padding token to EOS token for GPT-2 compatibility
tokenizer.pad_token = tokenizer.eos_token

# Step 3: Tokenize the Dataset using the 'content' column
def tokenize_function(examples):
    return tokenizer(examples["content"], padding="max_length", truncation=True, max_length=50)

# Tokenize the poetry dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Step 4: Define Data Collator for Language Modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Set to False because GPT-2 is a causal language model
)

# Step 5: Set Training Arguments
training_args = TrainingArguments(
    output_dir="./poem_generator_results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate a larger batch size
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=1000,
    save_total_limit=2,
    fp16=torch.cuda.is_available()  # Enable mixed precision if using GPU
)

# Step 6: Set Up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator
)

# Step 7: Fine-Tune the Model
trainer.train()

# Step 8: Save the Fine-Tuned Model
model.save_pretrained("./poem_generator")
tokenizer.save_pretrained("./poem_generator")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.81k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


poetry.csv:   0%|          | 0.00/606k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/573 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/573 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
10,6.5578
20,6.5212
30,6.2917
40,6.1867
50,5.9336
60,5.7153
70,5.5562
80,5.5179
90,5.122
100,4.9113


('./poem_generator/tokenizer_config.json',
 './poem_generator/special_tokens_map.json',
 './poem_generator/vocab.json',
 './poem_generator/merges.txt',
 './poem_generator/added_tokens.json')

In [None]:
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./poem_generator")
tokenizer = GPT2Tokenizer.from_pretrained("./poem_generator")

# Set up the text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Define the list of themes
themes = ["nature", "love", "seasons", "dreams", "hope", "adventure"]
num_poems_per_theme = 1  # Number of poems to generate for each theme

# Generate poems for each theme
for theme in themes:
    print(f"Theme: {theme.capitalize()}")
    generated_poems = generator(
        theme,
        max_length=50,
        num_return_sequences=num_poems_per_theme,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )
    for i, poem in enumerate(generated_poems, start=1):
        print(f"Poem {i} for theme '{theme}':")
        print(poem["generated_text"])
        print("\n" + "-" * 80 + "\n")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Theme: Nature
Poem 1 for theme 'nature':
nature, the artifice of men, and the arts that produce, and those artifice that lead to glory: so that men may be praised, when they are not, and by which they are well pleased.And how many, that are known

--------------------------------------------------------------------------------

Theme: Love
Poem 1 for theme 'love':
love in her breast,

What thou hadst to look upon in the dark;

And at the same time, after a yearlong journey,

I began, with an unwholesome sigh, to see

The

--------------------------------------------------------------------------------

Theme: Seasons
Poem 1 for theme 'seasons':
seasons in the past, they have no sense of it or of their own; nor can they comprehend it unless with themselves; to what end they aspire to. They cannot know either the value of aught except with themselves; nor know that there

--------------------------------------------------------------------------------

Theme: Dreams
Poem 1 for theme 'dre

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from datasets import load_dataset

# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./poem_generator")
tokenizer = GPT2Tokenizer.from_pretrained("./poem_generator")

# Load your dataset (ensure it's in the same format as fine-tuning)
dataset = load_dataset("merve/poetry", split="train")

# Evaluate perplexity
def calculate_perplexity(model, tokenizer, dataset):
    model.eval()
    total_log_likelihood = 0
    total_tokens = 0

    for data in dataset:
        input_text = data["content"]  # Adjust based on the actual column name
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            log_likelihood = outputs.loss.item()
            total_log_likelihood += log_likelihood * inputs["input_ids"].size(1)
            total_tokens += inputs["input_ids"].size(1)

    perplexity = torch.exp(torch.tensor(total_log_likelihood / total_tokens))
    return perplexity.item()

# Calculate and print perplexity
perplexity = calculate_perplexity(model, tokenizer, dataset)
print(f"Perplexity: {perplexity}")

Repo card metadata block was not found. Setting CardData to empty.


Perplexity: 76.67526245117188


In [None]:
!pip install nltk rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=ab8467dd50992ffff86832f10bb457d45edaf097d4024a0b640cafe7dd914ed6
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

# Define a reference poem and a generated poem
reference_poem = ["Nature's beauty is profound, it's all around."]
generated_poem = "Nature's beauty lies in the sky, where birds freely fly."

# BLEU Score
reference = [reference_poem[0].split()]  # Tokenize the reference
candidate = generated_poem.split()  # Tokenize the generated poem
bleu_score = sentence_bleu(reference, candidate)
print(f"BLEU Score: {bleu_score:.4f}")

# ROUGE Scores
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
scores = scorer.score(reference_poem[0], generated_poem)

print("ROUGE Scores:")
for metric, score in scores.items():
    print(f" {metric}: {score.fmeasure:.4f}")

BLEU Score: 0.0000
ROUGE Scores:
 rouge1: 0.3000
 rouge2: 0.2222
 rougeL: 0.3000


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
from huggingface_hub import login

# Log in to your Hugging Face account
login()

# Push the model to the Hub
model.push_to_hub("mehwish67/poem_Generator")
tokenizer.push_to_hub("mehwish67/poem_Generator")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

README.md:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mehwish67/poem_Generator/commit/83cd74ff9321072590516c9aeec8856906af0ca6', commit_message='Upload tokenizer', commit_description='', oid='83cd74ff9321072590516c9aeec8856906af0ca6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mehwish67/poem_Generator', endpoint='https://huggingface.co', repo_type='model', repo_id='mehwish67/poem_Generator'), pr_revision=None, pr_num=None)