In [1]:
import argparse, re, os
from typing import List, Union, Iterable
from itertools import zip_longest
from compare_mt.rouge.rouge_scorer import RougeScorer
from nltk import sent_tokenize, word_tokenize
from transformers.pipelines.pt_utils import KeyDataset
from sklearn.metrics import accuracy_score
from datasets import load_dataset
import numpy as np
from tqdm import tqdm
from evaluate import evaluator
import torch
import csv
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)

####################
##### Model ID #####
####################

# Location of the fine-tuned checkpoint that is later passed to
# `pipeline(model=...)`. Set the MODEL_ID environment variable to evaluate a
# different checkpoint without editing the notebook; when it is unset the
# original local path is used, so existing runs behave as before.
model_id = os.environ.get("MODEL_ID", "/home/gpq253/tuned-llama-3.1-8b-v2")

  from .autonotebook import tqdm as notebook_tqdm
INFO:datasets:PyTorch version 2.5.1 available.


In [2]:
####################
### Load Dataset ###
####################

# Read the validation CSV through the generic "csv" loader.
# NOTE(review): split="train" is requested even though the file is val.csv —
# presumably because a lone CSV file is exposed under that split name; confirm.
# (A `streaming=True` variant was left disabled by the author.)
eval_dataset = load_dataset(
    "csv",
    data_files="./Datasets/val.csv",
    split="train",
)

In [3]:
###########################
### Sample from Dataset ###
###########################

# Pull just the first row off the dataset iterator and show it as a sanity
# check of what was loaded.
row_iterator = iter(eval_dataset)
first_row = next(row_iterator)
print(first_row)

{'text': "(Sep 25, 2016  1:28 PM CDT) Authorities say seven people have been injured in an apparent fight in Boston's Theater District, the AP reports. The Boston Globe reports that Bernard O'Rourke, police superintendent of the bureau of field services, said officers responded to a report of a fight about 2:15am Sunday in the district, where bars and restaurants cater to nightlife crowds and had just closed. Police say people were stabbed with knives or bottles. Four of the victims were transported to receive medical treatment, while the other three walked into hospitals on their own, a police spokesman tells the Globe. Another police spokesman on Sunday afternoon told the AP the injuries appear to be non-life-threatening; earlier reports had said one person was critically injured. O'Rourke has said a suspect has been IDed—and per RT.com, that suspect was one of the wounded. Police are said to be seeking a second suspect. Emerson College, which has facilities in the area, alerted stud

In [4]:
# eval_dataset is a Hugging Face dataset (datasets.arrow_dataset.Dataset):
# print its concrete type, then its summary (feature names and row count).
print(type(eval_dataset), eval_dataset, sep="\n")

<class 'datasets.arrow_dataset.Dataset'>
Dataset({
    features: ['text', 'summary', 'cloze', 'cloze_gt', 'cloze_annotation', 'need_reasoning'],
    num_rows: 2775
})


In [5]:
##################################################
### Split Dataset to Components for Evaluation ###
##################################################

# Columns that the drop-lists below are built from, in drop order.
_ALL_COLUMNS = ('text', 'summary', 'cloze', 'cloze_gt', 'cloze_annotation', 'need_reasoning')


def _keep_only(column):
    """Return `eval_dataset.remove_columns(...)` for every listed column except `column`."""
    columns_to_drop = [name for name in _ALL_COLUMNS if name != column]
    return eval_dataset.remove_columns(columns_to_drop)


# One single-column view per component used downstream:
text = _keep_only('text')                # articles iterated by the generation loop
target = _keep_only('summary')           # exported as target.csv
num_gt = _keep_only('cloze_gt')          # exported as num_gt.csv
num_type = _keep_only('need_reasoning')  # exported as num_type.csv

In [6]:
# Print the first row of each single-column view, in the order
# text, target, num_gt, num_type, to confirm the split worked.
for single_column_view in (text, target, num_gt, num_type):
    print(next(iter(single_column_view)))

{'text': "(Sep 25, 2016  1:28 PM CDT) Authorities say seven people have been injured in an apparent fight in Boston's Theater District, the AP reports. The Boston Globe reports that Bernard O'Rourke, police superintendent of the bureau of field services, said officers responded to a report of a fight about 2:15am Sunday in the district, where bars and restaurants cater to nightlife crowds and had just closed. Police say people were stabbed with knives or bottles. Four of the victims were transported to receive medical treatment, while the other three walked into hospitals on their own, a police spokesman tells the Globe. Another police spokesman on Sunday afternoon told the AP the injuries appear to be non-life-threatening; earlier reports had said one person was critically injured. O'Rourke has said a suspect has been IDed—and per RT.com, that suspect was one of the wounded. Police are said to be seeking a second suspect. Emerson College, which has facilities in the area, alerted stud

In [7]:
############################################
### Output Individualized Outputs to CSV ###
############################################

# Write the three single-column reference views to disk. The file names match
# the ones handed to numhg_eval.py as --tgt_path, --num_gt_path and
# --num_type_path at the end of the notebook. `text` is not exported here.
# NOTE(review): that command reads them from '../Datasets/' while they are
# written to './Datasets/' here — confirm the intended working directory.
target.to_csv('./Datasets/target.csv')
num_gt.to_csv('./Datasets/num_gt.csv')
# Deliberately the cell's last expression: its return value is what the
# notebook echoes below the progress bars (5565 in the saved run).
num_type.to_csv('./Datasets/num_type.csv')

Creating CSV from Arrow format: 100%|█████████████| 3/3 [00:00<00:00, 91.26ba/s]
Creating CSV from Arrow format: 100%|████████████| 3/3 [00:00<00:00, 768.00ba/s]
Creating CSV from Arrow format: 100%|███████████| 3/3 [00:00<00:00, 1491.04ba/s]


5565

In [8]:
# Peek at the raw text of the first article only: the loop body runs once and
# then stops, and prints nothing at all if the column is empty.
for first_article in text['text']:
    print(first_article)
    break

(Sep 25, 2016  1:28 PM CDT) Authorities say seven people have been injured in an apparent fight in Boston's Theater District, the AP reports. The Boston Globe reports that Bernard O'Rourke, police superintendent of the bureau of field services, said officers responded to a report of a fight about 2:15am Sunday in the district, where bars and restaurants cater to nightlife crowds and had just closed. Police say people were stabbed with knives or bottles. Four of the victims were transported to receive medical treatment, while the other three walked into hospitals on their own, a police spokesman tells the Globe. Another police spokesman on Sunday afternoon told the AP the injuries appear to be non-life-threatening; earlier reports had said one person was critically injured. O'Rourke has said a suspect has been IDed—and per RT.com, that suspect was one of the wounded. Police are said to be seeking a second suspect. Emerson College, which has facilities in the area, alerted students to th

In [9]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Clear PyTorch's CUDA cache before the model is loaded.
torch.cuda.empty_cache()

# Text-generation pipeline over the checkpoint at `model_id`, with the model
# requested in bfloat16 and placed on the device chosen above.
pipeline_options = dict(
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device=device,
)
pipe = pipeline("text-generation", **pipeline_options)

Loading checkpoint shards: 100%|██████████████████| 4/4 [00:00<00:00,  6.17it/s]


In [11]:
# Generate one headline per article in `text['text']` and collect the cleaned
# strings in `response`, in dataset order.

# Sampling is enabled below (do_sample=True), so seed the RNG once up front to
# make a Restart & Run All repeatable on the same machine and software stack.
GENERATION_SEED = 42
torch.manual_seed(GENERATION_SEED)

# NOTE(review): this system prompt describes a masked-headline completion task
# ('news' / 'masked_headline' prefixes) while the user prompt below asks for
# free headline generation. The text is left untouched because the checkpoint
# may have been fine-tuned with exactly this system prompt — confirm against
# the training setup before changing it.
SYSTEM_PROMPT = "You will be given a news article with the prefix 'news'. You will also be provided with an incomplete headline with the prefix 'masked_headline'. Based on the news content, please output the completed headline."

# Prompt suffix the author left disabled:
#   ". The headline should include at least one number (in numeral format, so 2 instead of two)"

# Generation stops on either the tokenizer's EOS token or the "<|eot_id|>"
# token. The list does not depend on the article, so it is built once here
# instead of once per iteration.
terminators = [
    pipe.tokenizer.eos_token_id,
    pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

response = []

for article in text['text']:
    torch.cuda.empty_cache()

    prompt = f"Generate a single headline for the following news article: {article}"

    # A fresh message list (and dicts) per article, so nothing is shared
    # between pipeline calls.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]

    outputs = pipe(
        messages,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=pipe.tokenizer.eos_token_id,
    )

    # The last message of the first returned conversation is read as the
    # model's reply.
    assistant_response = outputs[0]["generated_text"][-1]["content"]

    # Trim whitespace BEFORE removing wrapping double quotes: a reply such as
    # '"Headline"\n' would otherwise keep its quotes. The final strip removes
    # any padding that sat inside the quotes.
    response.append(assistant_response.strip().strip('"').strip())

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [13]:
# Number of generated headlines. The saved run prints 2775, which equals the
# dataset's num_rows, i.e. one prediction per article.
print(len(response))

2775


In [21]:
# Persist the generated headlines as a one-column CSV with a 'predict' header;
# this is the file passed to numhg_eval.py as --pre_path.
#
# newline='' lets the csv writer control line endings itself, as the csv module
# requires (otherwise rows can gain blank lines on some platforms and newlines
# embedded in a headline are not written faithfully). encoding='utf-8' is
# explicit because the text contains non-ASCII characters (e.g. the em dash in
# the sample article) and the platform default encoding may not be UTF-8.
with open('./Datasets/predict.csv', 'w', newline='', encoding='utf-8') as myfile:
    writer = csv.writer(myfile)
    writer.writerow(['predict'])
    # One row per prediction, in the same order as `response`.
    writer.writerows([headline] for headline in response)

In [None]:
python numhg_eval.py \
--tgt_path='../Datasets/target.csv' \
--pre_path='../Datasets/predict.csv' \
--num_gt_path='../Datasets/num_gt.csv' \
--num_type_path='../Datasets/num_type.csv'