In [2]:
import argparse, re, os
from typing import List, Union, Iterable
from itertools import zip_longest
from compare_mt.rouge.rouge_scorer import RougeScorer
from nltk import sent_tokenize, word_tokenize
from sklearn.metrics import accuracy_score
from datasets import load_dataset
import numpy as np
from tqdm import tqdm
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)

####################
##### Model ID #####
####################

model_id = "/home/gpq253/tuned-llama-3.1-8b"

  from .autonotebook import tqdm as notebook_tqdm
INFO:datasets:PyTorch version 2.5.1 available.


In [3]:
####################
### Load Dataset ###
####################

eval_dataset = load_dataset("csv", data_files="./Datasets/val.csv", split="train") #, streaming=True

In [4]:
###########################
### Sample from Dataset ###
###########################

print(next(iter(eval_dataset))) 

{'text': "(Sep 25, 2016  1:28 PM CDT) Authorities say seven people have been injured in an apparent fight in Boston's Theater District, the AP reports. The Boston Globe reports that Bernard O'Rourke, police superintendent of the bureau of field services, said officers responded to a report of a fight about 2:15am Sunday in the district, where bars and restaurants cater to nightlife crowds and had just closed. Police say people were stabbed with knives or bottles. Four of the victims were transported to receive medical treatment, while the other three walked into hospitals on their own, a police spokesman tells the Globe. Another police spokesman on Sunday afternoon told the AP the injuries appear to be non-life-threatening; earlier reports had said one person was critically injured. O'Rourke has said a suspect has been IDed—and per RT.com, that suspect was one of the wounded. Police are said to be seeking a second suspect. Emerson College, which has facilities in the area, alerted stud

In [5]:
#prompt = f"Generate a single headlines for this news article and ensure it has at least one number in the headline: {eval_dataset["text"]}"
#Eval_dataset in format of huggin face dataset (datasets.arrow_dataset.Dataset)
print(type(eval_dataset))
print(eval_dataset)

<class 'datasets.arrow_dataset.Dataset'>
Dataset({
    features: ['text', 'summary', 'cloze', 'cloze_gt', 'cloze_annotation', 'need_reasoning'],
    num_rows: 2775
})


In [30]:
##################################################
### Split Dataset to Components for Evaluation ###
##################################################

text = eval_dataset.remove_columns(['summary','cloze','cloze_gt','cloze_annotation','need_reasoning'])
target = eval_dataset.remove_columns(['text','cloze','cloze_gt','cloze_annotation','need_reasoning'])
num_gt = eval_dataset.remove_columns(['text','summary','cloze','cloze_annotation','need_reasoning'])
num_type = eval_dataset.remove_columns(['text','summary','cloze','cloze_gt','cloze_annotation'])

In [7]:
print(next(iter(text)))
print(next(iter(target)))
print(next(iter(num_gt)))
print(next(iter(num_type)))

{'text': "(Sep 25, 2016  1:28 PM CDT) Authorities say seven people have been injured in an apparent fight in Boston's Theater District, the AP reports. The Boston Globe reports that Bernard O'Rourke, police superintendent of the bureau of field services, said officers responded to a report of a fight about 2:15am Sunday in the district, where bars and restaurants cater to nightlife crowds and had just closed. Police say people were stabbed with knives or bottles. Four of the victims were transported to receive medical treatment, while the other three walked into hospitals on their own, a police spokesman tells the Globe. Another police spokesman on Sunday afternoon told the AP the injuries appear to be non-life-threatening; earlier reports had said one person was critically injured. O'Rourke has said a suspect has been IDed—and per RT.com, that suspect was one of the wounded. Police are said to be seeking a second suspect. Emerson College, which has facilities in the area, alerted stud

In [8]:
############################################
### Output Individualized Outputs to CSV ###
############################################

target.to_csv('./Datasets/target.csv')
num_gt.to_csv('./Datasets/num_gt.csv')
num_type.to_csv('./Datasets/num_type.csv')

Creating CSV from Arrow format: 100%|████████████| 3/3 [00:00<00:00, 126.08ba/s]
Creating CSV from Arrow format: 100%|████████████| 3/3 [00:00<00:00, 670.41ba/s]
Creating CSV from Arrow format: 100%|███████████| 3/3 [00:00<00:00, 1492.10ba/s]


5565

In [16]:
######################
### Load Tokenizer ###
######################
tokenizer = AutoTokenizer.from_pretrained(
  model_id, 
  trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [31]:
model = AutoModelForCausalLM.from_pretrained(model_id)

prompt = f"Create a numeral infused headline for this news article"

def add_prompt(example):
    example['text'] = prompt + ': ' + example['text']
    return example

text2 = text.map(add_prompt)
text2_dict = text2.to_dict()
text2_l = []
for text in text2_dict['text']:
    text2_l.append(text)


print(len(text2_l))

inputs = tokenizer(text2_l, return_tensors = 'pt')
outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores = True)

Loading checkpoint shards: 100%|██████████████████| 4/4 [00:05<00:00,  1.28s/it]


2775


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"

torch.cuda.empty_cache()

pipe = pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device=device,
)

prompt = f"Create a numeral infused headline for each news article: {text}"

messages = [
    {"role": "system", "content": "You are a journalist tasked to provide headlines for news articles!"},
    {"role": "user", "content": prompt},
]

terminators = [
    pipe.tokenizer.eos_token_id,
    pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipe(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)

Loading checkpoint shards: 100%|██████████████████| 4/4 [00:00<00:00,  5.41it/s]
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [14]:
assistant_response = outputs[0]["generated_text"][-1]["content"]

print(assistant_response.strip('"'))

Here are 10 numeral-infused headlines for the news articles:

1. **1,000 Jobs at Risk as Tech Giant Announces Mass Layoffs**
2. **5 Cities Ranked as Most Livable in the World: 2023 Report**
3. **8.2 Magnitude Earthquake Hits Remote Island, No Injuries Reported**
4. **3,000-Year-Old Ancient City Discovered in the Desert**
5. **2.5 Billion People to Gain Access to Clean Water by 2030, UN Says**
6. **9/11 Memorial to Be Rebuilt in New York City, Officials Announce**
7. **4 Key Takeaways from the Latest Climate Change Report**
8. **6,000-Year-Old Mummified Cat Discovered in Egyptian Tomb**
9. **1 in 5 People Affected by Natural Disasters, Study Finds**
10. **7 Countries Sign Historic Trade Deal, Boosting Global Economy**

Let me know if you'd like me to generate more!


In [15]:
print(len(assistant_response))

774
