In [1]:
import datasets
import pandas as pd

# Fetch the "trl-lib/tldr" dataset through the `datasets` library
dataset = datasets.load_dataset("trl-lib/tldr")

# Only the test split is used below; wrap it in a DataFrame so it can be sliced by row
df = pd.DataFrame(dataset["test"])

# Sanity check: report the dimensions, then show the leading rows
print(f"Dataset shape: {df.shape}")
print(df.head(5))

  from .autonotebook import tqdm as notebook_tqdm


Dataset shape: (6553, 2)
                                              prompt  \
0  SUBREDDIT: r/relationships\n\nTITLE: Me [19 F]...   
1  SUBREDDIT: r/Parenting\n\nTITLE: My 11 year ol...   
2  SUBREDDIT: r/relationships\n\nTITLE: The girl ...   
3  SUBREDDIT: r/tifu\n\nTITLE: TIFU by accidently...   
4  SUBREDDIT: r/relationships\n\nTITLE: I [32 M] ...   

                                          completion  
0   I really like this guy, but after having sex ...  
1   Sons good friend died and his funeral is toda...  
2   Girl I'm seeing didn't respond to my texts wh...  
3   Tried to stop an old lady falling, kicked her...  
4   Wife Cheats on me but I stuck around for kids...  


In [18]:
import os
from datetime import datetime

# Run configuration: the checkpoint to evaluate and the tag that labels this run's output file
MODEL_NAME = "Qwen/Qwen2-0.5B-Instruct"
KEY_WORD = "baseline"

# Batch size for processing
BATCH_SIZE = 8

# Instruction template; the {text} placeholder is filled in with str.format(text=...)
SYSTEM_PROMPT = "Summarize the following text within 20 characters: {text}"

# Make sure the output directory exists before anything is written into it
inference_dir = "eval/inference/"
os.makedirs(inference_dir, exist_ok=True)

# Output path: <dir><model name with "/" replaced by "_">_<tag>_<timestamp to the minute>.jsonl
current_date = datetime.now().strftime("%Y-%m-%d_%H-%M")
filename = inference_dir + "_".join([MODEL_NAME.replace('/', '_'), KEY_WORD, current_date]) + ".jsonl"

print(f"Results will be saved to: {filename}")

import torch
from transformers import pipeline, AutoTokenizer
import json
from tqdm import tqdm

# Build the text-generation pipeline for MODEL_NAME
# The tokenizer is loaded separately so that left-side padding can be requested explicitly.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left")

# Weights are requested in bfloat16; device placement is delegated to device_map="auto".
pipe = pipeline(
    task="text-generation",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Function to generate summary for a single text
def generate_summary(text, max_new_tokens=418):
    """Generate one sampled summary for a single piece of text.

    The text is substituted into ``SYSTEM_PROMPT`` and sent to ``pipe`` as a
    plain string (not as a list of chat messages).

    Args:
        text: The text to summarize; inserted at the ``{text}`` placeholder.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 418, the value this helper previously hard-coded.

    Returns:
        The generated continuation only, as a string.
    """
    prompt = SYSTEM_PROMPT.format(text=text)
    # For string prompts the pipeline prepends the prompt to `generated_text`
    # by default; return_full_text=False keeps only the newly generated part,
    # so the caller gets the summary rather than the post plus the summary.
    response = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        return_full_text=False,
    )
    return response[0]['generated_text']

# Run inference over the DataFrame in windows of BATCH_SIZE rows
for i in tqdm(range(0, len(df), BATCH_SIZE), desc="Processing batches"):
    batch_prompts = df['prompt'][i:i+BATCH_SIZE].tolist()

    # Each post becomes its own one-message chat: the filled-in template, sent with the "system" role
    batch = [
        [{"role": "system", "content": SYSTEM_PROMPT.format(text=post)}]
        for post in batch_prompts
    ]

    # Sample at most 50 new tokens for every chat in the window
    outputs = pipe(batch, max_new_tokens=50, do_sample=True, batch_size=BATCH_SIZE)

    # Keep the content of the last message of the first generation for each chat.
    # Note that "prompt" stores the chat message list that was sent, not the raw post text.
    results = [
        {"prompt": chat, "summary": generations[0]['generated_text'][-1]['content']}
        for chat, generations in zip(batch, outputs)
    ]

    # Append this window to the JSONL file straight away, one JSON object per line
    with open(filename, "a") as f:
        for result in results:
            print(json.dumps(result), file=f)

Results will be saved to: eval/inference/Qwen_Qwen2-0.5B-Instruct_baseline_2025-03-16_07-16.jsonl


Device set to use cuda:0
Processing batches: 100%|██████████| 820/820 [14:48<00:00,  1.08s/it]
