In [None]:
# Mount Google Drive into the Colab filesystem so later cells can read and
# write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import re

# --- Load the poem table and keep only usable rows --------------------------
# Raw string literal: in a normal string the "\n" inside "\nlp-poetry-project"
# is parsed as a newline (and "\G", "\d", "\p", "\P" are invalid escapes), which
# silently corrupts the path.
# NOTE(review): this is a local Windows path; it will not resolve on Colab even
# though the notebook mounts Google Drive — confirm which input file is intended.
file_path = r"D:\GitHub\nlp-poetry-project\data\processed\PoetryFoundationData_Cleaned.csv"
df = pd.read_csv(file_path)

# Quick look at what was loaded before anything is dropped.
print("Initial shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nMissing values:\n", df.isnull().sum())

# Guarantee a 'Tags' column exists, then keep only the columns used downstream.
if 'Tags' not in df.columns:
    df['Tags'] = ''
df = df[['Title', 'Poet', 'Poem', 'Tags']]

# Drop rows whose poem is missing or whitespace-only.
df = df.dropna(subset=['Poem'])
df = df[df['Poem'].str.strip() != '']

# Keep a single row per distinct poem text.
df = df.drop_duplicates(subset=['Poem'])

def clean_text(text):
    """Strip markup and URLs from a poem and tidy whitespace, keeping line breaks.

    Steps, in order:
      1. Normalise every line ending (``\\r\\n``, ``\\r\\r\\n``, lone ``\\r``) to ``\\n``.
      2. Remove ``<...>`` tags, ``[...]`` bracketed spans and ``http...`` URLs.
      3. Collapse runs of spaces/tabs to one space without touching newlines.
      4. Trim spaces around each line break and cap consecutive newlines at two,
         so at most one blank line separates blocks of lines.
      5. Strip leading/trailing whitespace.

    The earlier version collapsed *all* whitespace (including newlines) to a
    single space, which made its later newline normalisation a no-op and
    flattened every poem onto one line.

    Args:
        text: Raw poem text. Non-string values (e.g. NaN/None) are treated as
            empty.

    Returns:
        The cleaned text as a ``str``; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    # Unify line endings first so the later steps only have to reason about '\n'.
    text = re.sub(r'\r+\n?', '\n', text)

    text = re.sub(r'<.*?>', '', text)      # HTML-like tags
    text = re.sub(r'\[.*?\]', '', text)    # bracketed spans
    text = re.sub(r'http\S+', '', text)    # URLs

    # Collapse horizontal whitespace only: [^\S\n] is "whitespace except newline".
    text = re.sub(r'[^\S\n]+', ' ', text)
    # After the collapse at most one space can sit on either side of a newline.
    text = re.sub(r' ?\n ?', '\n', text)
    # Keep at most one blank line between blocks of lines.
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()

def clean_tags(tag_str):
    """Turn a comma-separated tag string into a list of normalised tags.

    Each tag is stripped of surrounding whitespace and lower-cased; pieces that
    are empty after stripping are discarded.

    Args:
        tag_str: The raw tags value. Missing values (as judged by ``pd.isna``)
            and the empty string yield an empty list; anything else is
            converted with ``str()`` before splitting.

    Returns:
        A list of tag strings, in their original order.
    """
    if pd.isna(tag_str) or tag_str == '':
        return []

    stripped_pieces = (piece.strip() for piece in str(tag_str).split(','))
    return [piece.lower() for piece in stripped_pieces if piece]

# --- Derive cleaned columns --------------------------------------------------
df['Cleaned_Poem'] = df['Poem'].map(clean_text)
df['Cleaned_Tags'] = df['Tags'].map(clean_tags)

# Keep only poems with more than 10 whitespace-separated words, then renumber
# the rows from zero.
words_per_poem = df['Cleaned_Poem'].str.split().map(len)
df = df[words_per_poem > 10]
df = df.reset_index(drop=True)

# --- Summary of the cleaned table ---------------------------------------------
mean_word_count = df['Cleaned_Poem'].str.split().map(len).mean()
mean_tag_count = df['Cleaned_Tags'].map(len).mean()

print("\nAfter cleaning:")
print("Shape:", df.shape)
print("Unique poets:", df['Poet'].nunique())
print("Average poem length in words:", int(mean_word_count))
print("Average number of tags per poem:", round(mean_tag_count, 2))

# --- Persist the result to the mounted Drive ----------------------------------
cleaned_path = "/content/drive/MyDrive/nlp/PoetryFoundationData_Cleaned.csv"
df.to_csv(cleaned_path, index=False)

print(f"\nCleaned dataset saved to: {cleaned_path}")


Initial shape: (13854, 5)

Columns: ['Unnamed: 0', 'Title', 'Poem', 'Poet', 'Tags']

Missing values:
 Unnamed: 0      0
Title           0
Poem            0
Poet            0
Tags          955
dtype: int64

After cleaning:
Shape: (13674, 6)
Unique poets: 3093
Average poem length in words: 252
Average number of tags per poem: 4.87

Cleaned dataset saved to: /content/drive/MyDrive/nlp/PoetryFoundationData_Cleaned.csv


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import pandas as pd
import torch

# --- Load the cleaned poems ---------------------------------------------------
# Same path the cleaning cell writes to; only the 'Cleaned_Poem' column is used.
file_path = "/content/drive/MyDrive/nlp/PoetryFoundationData_Cleaned.csv"
df = pd.read_csv(file_path)
texts = df['Cleaned_Poem'].tolist()

# Wrap the poems in a Hugging Face Dataset with a single "text" column.
dataset = Dataset.from_dict({"text": texts})

# --- Model and tokenizer ------------------------------------------------------
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token, on both the tokenizer
# and the model config, so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

def tokenize_function(examples):
    # Tokenize a batch of poems; anything beyond 128 tokens is truncated.
    return tokenizer(examples["text"], truncation=True, max_length=128)

# Tokenize in batches and drop the raw "text" column from the result.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --- Training configuration ---------------------------------------------------
# 3 epochs, batch size 4 per device; a checkpoint every 500 steps with at most
# 2 kept; loss logged every 100 steps; fp16 only when a CUDA device is present.
training_args = TrainingArguments(
    output_dir="./distilgpt2-poetry",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    prediction_loss_only=True,
    report_to="none",
    fp16=torch.cuda.is_available()
)

# No eval_dataset is supplied, so this run only trains.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# Fine-tune the model on the tokenized poems.
trainer.train()

# --- Sample a poem from the fine-tuned model ----------------------------------
prompt = "A serene morning in the mountains"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# The model has just been through trainer.train(); switch to inference mode so
# dropout is disabled while sampling.
model.eval()

# Let the tokenizer produce the attention mask. Deriving it from
# `inputs != pad_token_id` is fragile here because the pad token is the EOS
# token, so any EOS in the prompt would be masked out as if it were padding.
encoded = tokenizer(prompt, return_tensors="pt").to(device)
inputs = encoded["input_ids"]
attention_mask = encoded["attention_mask"]

# Nucleus/top-k sampling. max_length counts the prompt tokens as well.
# pad_token_id is passed explicitly so generate() does not have to fall back to
# EOS on its own (which emits a warning on every call).
outputs = model.generate(
    inputs,
    attention_mask=attention_mask,
    max_length=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9,
    no_repeat_ngram_size=3,
    pad_token_id=tokenizer.eos_token_id
)

# Decode the single generated sequence, dropping special tokens such as EOS.
generated_poem = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\n Generated Poem:\n", generated_poem)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/13674 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,5.0062
200,4.9561
300,4.9339
400,4.897
500,4.9095
600,4.8433
700,4.9125
800,4.8421
900,4.8433
1000,4.8322


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 Generated Poem:
 A serene morning in the mountains and the wind is calm, And a sudden rain on the plains, And with a white breeze falling at high speed. A cloud of sky over the plains is high, And light is a cold, slow, dark night. The wind is heavy as a cloud, And the wind does not pass. A shadow passes, and a man takes his seat. He thinks of the rain on a white tree, And that he sees it, He tells the man in his
