In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install torch transformers pandas accelerate



In [3]:
import torch
import pandas as pd
from transformers import BloomTokenizerFast, BloomForCausalLM
import random
import textwrap

# --- Configuration ---
FILE_NAME = "/content/drive/MyDrive/nlp/PoetryFoundationData_Cleaned.csv"
MODEL_NAME = "bigscience/bloom-560m"
USER_PROMPT = "A serene morning in the mountains"

# Generation parameters
MAX_NEW_TOKENS = 120
TEMPERATURE = 0.9
NUM_RETURN_SEQUENCES = 3
NUM_EXAMPLES_FROM_DATASET = 3 # The 'few-shot' part

# --- 1. Load Data and Construct Few-Shot Prompt ---
print(f"Loading data from {FILE_NAME}...")
try:
    # Load the cleaned poetry data
    df = pd.read_csv(FILE_NAME)

    # Filter for the main text column and sample a few poems
    sample_poems = df['Cleaned_Poem'].dropna().sample(NUM_EXAMPLES_FROM_DATASET, random_state=42)

    # Start building the ICL prompt with system instructions
    prompt_template = "Below are examples of modern poetry. Study their style and cadence, then complete the final poem based on the last prompt.\n\n"

    # Add the sampled poems as examples
    for i, poem in enumerate(sample_poems):
        # Format the poem with line breaks to mimic natural poetry structure
        formatted_poem = textwrap.fill(poem, width=80)

        prompt_template += f"### Example Poem {i+1} (Source: Poetry Foundation Dataset) ###\n"
        prompt_template += f"{formatted_poem}\n\n"

    # Add the final prompt that the model must complete
    prompt_template += f"### NEW POEM (Prompt: {USER_PROMPT}) ###\n"
    prompt_template += f"{USER_PROMPT}\n"

    final_prompt = prompt_template

    print(f"Few-shot prompt constructed with {NUM_EXAMPLES_FROM_DATASET} examples.")

except FileNotFoundError:
    print(f"Error: The file '{FILE_NAME}' was not found.")
    print("Using a placeholder poem to continue the demonstration.")
    # Fallback in case the file is not correctly accessible during execution
    final_prompt = (
        "### Example Poem 1 ###\n"
        "The entrance at the back of the complex led onto a road, where an upended couch\n"
        "tilted into a ditch and a washing machine gleamed avocado beneath pine needles.\n"
        "But doesn't longing clarify the body?\n\n"
        f"### NEW POEM (Prompt: {USER_PROMPT}) ###\n{USER_PROMPT}\n"
    )
except Exception as e:
    print(f"An error occurred during file processing: {e}")
    print("Using a basic prompt for demonstration.")
    final_prompt = USER_PROMPT


# --- 2. Load the BLOOM-560M Model and Tokenizer ---
print("\nLoading BLOOM-560M model and tokenizer...")
try:
    tokenizer = BloomTokenizerFast.from_pretrained(MODEL_NAME)
    model = BloomForCausalLM.from_pretrained(MODEL_NAME)
    print("Model loaded successfully.")

except Exception as e:
    print(f"An error occurred while loading the model: {e}")
    print("Please ensure you have the 'transformers' and 'torch' libraries installed.")
    exit()

# --- 3. Generate the Poetry ---
print("\nGenerating poetry using the dataset-guided prompt...")

# Encode the prompt text
input_ids = tokenizer.encode(final_prompt, return_tensors='pt')
prompt_length = len(input_ids[0])

# Generate the text
output = model.generate(
    input_ids,
    max_length=prompt_length + MAX_NEW_TOKENS,
    temperature=TEMPERATURE,
    num_return_sequences=NUM_RETURN_SEQUENCES,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id
)
print("Generation complete.")

# --- 4. Decode and Display the Results ---
print("\n" + "="*50)
print("--- GENERATED POEMS (Guided by PoetryFoundation Data) ---")
print("="*50)

for i, sequence in enumerate(output):
    # Decode the full generated sequence
    generated_text = tokenizer.decode(sequence, skip_special_tokens=True)

    # Find the end of the input prompt in the generated text to only display the new poem
    # This step ensures we only output what the model generated, not the few-shot examples
    start_index = generated_text.rfind(USER_PROMPT) + len(USER_PROMPT)
    new_poem = generated_text[start_index:].strip()

    print(f"\n[Poem {i+1}]")
    print("-" * 20)
    print(USER_PROMPT) # Print the user prompt again for context
    print(new_poem)
    print("-" * 20)

Loading data from /content/drive/MyDrive/nlp/PoetryFoundationData_Cleaned.csv...
Few-shot prompt constructed with 3 examples.

Loading BLOOM-560M model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Model loaded successfully.

Generating poetry using the dataset-guided prompt...
Generation complete.

--- GENERATED POEMS (Guided by PoetryFoundation Data) ---

[Poem 1]
--------------------
A serene morning in the mountains
The wind, the flowers,the sky
Every day the sun rises and sets in a new and happy way
The mountains are beautiful
For the sky is filled with flowers
And the mountains above are beautiful
For the wind, the flowers,the sky
Every day the sun rises and sets in a new and happy way
The mountains are beautiful
For the sky is filled with flowers
And the mountains above are beautiful
For the wind, the flowers,the sky
Every day the sun rises and sets in a new and happy way
The mountains are beautiful
For the sky is
--------------------

[Poem 2]
--------------------
A serene morning in the mountains
As a lighted morning in the far distance
I see a white cloud above the clouds
And a glimpse of the world around me
But when I think about all I want,
I turn to my heart and feel

In [4]:
generated_poems = []

for i, sequence in enumerate(output):
    full_generated_text = tokenizer.decode(sequence, skip_special_tokens=True)


    start_index = full_generated_text.rfind(USER_PROMPT) + len(USER_PROMPT)
    new_poem = full_generated_text[start_index:].strip()
    generated_poems.append(new_poem)

print(f"Extracted {len(generated_poems)} generated poems.")
for i, poem in enumerate(generated_poems):
    print(f"\n--- Extracted Poem {i+1} ---")
    print(poem)


Extracted 3 generated poems.

--- Extracted Poem 1 ---
The wind, the flowers,the sky
Every day the sun rises and sets in a new and happy way
The mountains are beautiful
For the sky is filled with flowers
And the mountains above are beautiful
For the wind, the flowers,the sky
Every day the sun rises and sets in a new and happy way
The mountains are beautiful
For the sky is filled with flowers
And the mountains above are beautiful
For the wind, the flowers,the sky
Every day the sun rises and sets in a new and happy way
The mountains are beautiful
For the sky is

--- Extracted Poem 2 ---
As a lighted morning in the far distance
I see a white cloud above the clouds
And a glimpse of the world around me
But when I think about all I want,
I turn to my heart and feel the sweetest joy
And it feels that the world I see
Is the most precious in the world
And I can breathe in the breath of a bird
That lives in the clouds and the sky above me.
### Example Poem 4 (Source: Poetry Foundation Dataset) #

In [5]:
from collections import Counter

def calculate_diversity_ngram(poems, tokenizer, n):

    all_ngrams = []
    for poem in poems:
        tokens = tokenizer.tokenize(poem.lower())
        if len(tokens) < n:
            continue

        ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
        all_ngrams.extend(ngrams)

    if not all_ngrams:
        return 0.0

    unique_ngrams = len(set(all_ngrams))
    total_ngrams = len(all_ngrams)
    diversity = unique_ngrams / total_ngrams
    return diversity


In [6]:
import numpy as np

def calculate_perplexity(text, model, tokenizer):

    model.eval()
    device = model.device


    encodings = tokenizer(text, return_tensors='pt', truncation=True, max_length=512).to(device)
    input_ids = encodings.input_ids
    attention_mask = encodings.attention_mask


    if input_ids.numel() == 0:
        return float('inf')

    with torch.no_grad():

        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)

        loss = outputs.loss


    perplexity = torch.exp(loss).item()
    return perplexity


In [7]:
import re
print("\n--- Evaluating Generated Poems ---")


perplexity_scores = []
for i, poem in enumerate(generated_poems):

    if poem.strip():

        perp = calculate_perplexity(poem, model, tokenizer)
        perplexity_scores.append(perp);
        print(f"Poem {i+1} Perplexity: {perp:.2f}")
    else:
        print(f"Poem {i+1} is empty.")

if perplexity_scores:
    print(f"Average Perplexity: {np.mean(perplexity_scores):.2f}")
else:
    print("No valid poems.")

valid_poems = [p for p in generated_poems if p.strip()]

if valid_poems:
    unigram_diversity = calculate_diversity_ngram(valid_poems, tokenizer, 1)
    bigram_diversity = calculate_diversity_ngram(valid_poems, tokenizer, 2)
    trigram_diversity = calculate_diversity_ngram(valid_poems, tokenizer, 3)

    print(f"\nUnigram Diversity: {unigram_diversity:.4f}")
    print(f"Bigram Diversity: {bigram_diversity:.4f}")
    print(f"Trigram Diversity: {trigram_diversity:.4f}")
else:
    print("No valid poems.")


all_sentences = []
for poem in valid_poems:

    sentences = [s.strip() for s in re.split(r'[.!?]\s*', poem) if s.strip()]
    all_sentences.extend(sentences)

if all_sentences:
    sentence_lengths = [len(s.split()) for s in all_sentences]
    avg_sentence_length = np.mean(sentence_lengths)
    print(f"\nApproximate Coherence: {avg_sentence_length:.2f} words/sentence")
else:
    print("No valid sentences.")


--- Evaluating Generated Poems ---
Poem 1 Perplexity: 3.07
Poem 2 Perplexity: 19.75
Poem 3 Perplexity: 55.93
Average Perplexity: 26.25

Unigram Diversity: 0.3806
Bigram Diversity: 0.6148
Trigram Diversity: 0.6805

Approximate Coherence: 51.75 words/sentence
