In [3]:
import os
import re
import json
import pandas as pd
import torch
import datetime
# from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from vllm import LLM, SamplingParams
from thefuzz import process
from sklearn.metrics import precision_score, recall_score, f1_score
from utils.data_loader import load_dataset
from utils.clean_json import clean_incomplete_json
from utils.eval import fuzzy_match, evaluate_salience, evaluate_multiple_instances
from utils.postprocessing import save_outputs

In [4]:
os.environ['HF_HOME'] = '/work/pi_wenlongzhao_umass_edu/8/aranade/models/huggingface_cache'
download_dir = "/work/pi_wenlongzhao_umass_edu/8/aranade/models"
llama_model_path = "/datasets/ai/llama3/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f"
zephyr_model_path = "HuggingFaceH4/zephyr-7b-beta"

In [15]:
sampling_params = SamplingParams(
    n=1,
    temperature=0,
    max_tokens=300,
    stop=["</s>", "\n}"]
)

In [5]:
llm = LLM(
    model=zephyr_model_path, 
    tokenizer=zephyr_model_path,
    download_dir=download_dir,
)

INFO 03-18 02:34:28 __init__.py:207] Automatically detected platform cuda.
INFO 03-18 02:34:45 config.py:549] This model supports multiple tasks: {'classify', 'score', 'embed', 'reward', 'generate'}. Defaulting to 'generate'.
INFO 03-18 02:34:45 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='HuggingFaceH4/zephyr-7b-beta', speculative_config=None, tokenizer='HuggingFaceH4/zephyr-7b-beta', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir='/work/pi_wenlongzhao_umass_edu/8/aranade/models', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

INFO 03-18 02:35:22 weight_utils.py:270] Time spent downloading weights for HuggingFaceH4/zephyr-7b-beta: 34.643490 seconds


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]


INFO 03-18 02:35:57 model_runner.py:1115] Loading model weights took 13.4967 GB
INFO 03-18 02:36:02 worker.py:267] Memory profiling takes 4.62 seconds
INFO 03-18 02:36:02 worker.py:267] the current vLLM instance can use total_gpu_memory (44.34GiB) x gpu_memory_utilization (0.90) = 39.91GiB
INFO 03-18 02:36:02 worker.py:267] model weights take 13.50GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 3.38GiB; the rest of the memory reserved for KV Cache is 22.97GiB.
INFO 03-18 02:36:02 executor_base.py:111] # cuda blocks: 11758, # CPU blocks: 2048
INFO 03-18 02:36:02 executor_base.py:116] Maximum concurrency for 32768 tokens per request: 5.74x
INFO 03-18 02:36:04 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:18<00:00,  1.87it/s]

INFO 03-18 02:36:22 model_runner.py:1562] Graph capturing finished in 19 secs, took 0.26 GiB
INFO 03-18 02:36:22 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 25.66 seconds





## Article text and prompt

In [16]:
# title = 'Euro reaches ten month low against US dollar'

# article = '''The euro reached a ten-month low against the US dollar earlier today over concerns about Greece's debt crisis. The euro traded at US$1.3436 in the morning, 
# a level not seen since May of last year, although it finished the day back up at $1.3606. It was, however, up 0.6% against the British pound, at 90.76 pence. The pound also fell today, 
# reaching a trading level of $1.4936 after a loss of 0.4%.The spike lower this morning indicates market nervousness about the prospects of a Greek bailout - the message coming out of 
# Europe is still confused, commented an analysts for CMC Markets, Michael Hewson, as quoted by Agence France-Presse.
# '''

### llama3.1-8b-instruct model prompt

In [16]:
# system_prompt = "You are an expert news article analysis assistant."

few_shot_prompt = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert news article analysis assistant in entity recognition and reading comprehension.
<|eot_id|><|start_header_id|>user<|end_header_id|>
I need you to extract entities from an article and assign a salience score to each one. The salience score indicates how central the entity is to the article: use 1 for salient entities and 0 for entities mentioned only in passing.

Below are two examples:

Example 1:
---------------------
Article Title: Laws allowing same sex marriage in Washington, D.C. go into effect
Article Text: The United States capital of Washington, D.C. legalized same-sex marriage on Wednesday. Beginning at 6 A.M. local time (1100 UTC), couples began submitting marriage applications at local courthouses citywide. Washington D.C. becomes the seventh United States territory to legalize same sex marriage. The bill was ratified by Mayor Adrian Fenty last December. Due to city's territorial status as a federal district, the bill had to be reviewed by congress. The bill passed congressional review Tuesday night. The bill faced opposition from many family values activists, who tried to stop the bill from becoming law. Supreme Court Chief Justice John Roberts rejected a lawsuit to prevent the measure.
Final JSON Output:
{{
  "entities": [
    {{
      "entity_title": "United States",
      "entity_salience": "1"
    }},
    {{
      "entity_title": "Washington, D.C.",
      "entity_salience": "1"
    }},
    {{
      "entity_title": "same-sex marriage",
      "entity_salience": "0"
    }},
    {{
      "entity_title": "Adrian Fenty",
      "entity_salience": "0"
    }},
    {{
      "entity_title": "federal district",
      "entity_salience": "0"
    }},
    {{
      "entity_title": "family values",
      "entity_salience": "0"
    }},
    {{
      "entity_title": "John Roberts",
      "entity_salience": "0"
    }}
  ]
}}

User: Example 2:
---------------------
Article Title: New York Representative Eric Massa to retire
Article Text: New York Representative Eric Massa announced Wednesday that he would be stepping down as Congressman from New York's 29th congressional district. He cited health reasons for the sudden announcement. Massa is the latest in a string of United States Congresspeople to resign or not to seek reelection in 2010. He said that he had his third non-specific cancer recurrence in December 2009. He implied that his cancer is terminal, saying “I will now enter the final phase of my life at a more controlled pace.” He denied claims that his resignation is related to a sexual harassment accusation from a male aide. He said that this was untrue, although he admitted to using so called salty language.
Final JSON Output:Assistant: Based on the above, the final JSON output should be:
{{
  "entities": [
    {{
      "entity_title": "New York",
      "entity_salience": "1"
    }},
    {{
      "entity_title": "Eric Massa",
      "entity_salience": "0"
    }},
    {{
      "entity_title": "Congressman",
      "entity_salience": "1"
    }},
    {{
      "entity_title": "New York",
      "entity_salience": "1"
    }},
    {{
      "entity_title": "29th congressional district",
      "entity_salience": "0"
    }},
    {{
      "entity_title": "United States",
      "entity_salience": "1"
    }},
    {{
      "entity_title": "sexual harassment",
      "entity_salience": "0"
    }}
  ]
}}

Before producing the final JSON output, please think through your reasoning for each extracted entity. Identify each entity mentioned in the article and decide if it is central (salience 1) or only mentioned (salience 0) based solely on the article title and text. Do not output your internal reasoning.

Now, based on the article below, return only the final JSON answer (with no extra commentary or questions). Use the exact format below:

{{
  "entities": [
    {{
      "entity_title": "<entity_name>",
      "entity_salience": "<0 or 1>"
    }},
    ...
  ]
}}

---------------------
Article Title: {title}
Article Text: {article}
---------------------

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

### Zephyr-7b prompt

In [7]:
few_shot_zephyr_prompt = """
<|system|>
You are an expert news article analysis assistant in entity recognition and reading comprehension.
I need you to extract entities from an article and assign a salience score to each one. The salience score indicates how central the entity is to the article: use 1 for salient entities and 0 for entities mentioned only in passing.
Below are two examples:

Example 1:
---------------------
Article Title: Laws allowing same sex marriage in Washington, D.C. go into effect
Article Text: The United States capital of Washington, D.C. legalized same-sex marriage on Wednesday. Beginning at 6 A.M. local time (1100 UTC), couples began submitting marriage applications at local courthouses citywide. Washington D.C. becomes the seventh United States territory to legalize same sex marriage. The bill was ratified by Mayor Adrian Fenty last December. Due to city's territorial status as a federal district, the bill had to be reviewed by congress. The bill passed congressional review Tuesday night. The bill faced opposition from many family values activists, who tried to stop the bill from becoming law. Supreme Court Chief Justice John Roberts rejected a lawsuit to prevent the measure.
Final JSON Output:
{{
  "entities": [
    {{
      "entity_title": "United States",
      "entity_salience": "1"
    }},
    {{
      "entity_title": "Washington, D.C.",
      "entity_salience": "1"
    }},
    {{
      "entity_title": "same-sex marriage",
      "entity_salience": "0"
    }},
    {{
      "entity_title": "Adrian Fenty",
      "entity_salience": "0"
    }},
    {{
      "entity_title": "federal district",
      "entity_salience": "0"
    }},
    {{
      "entity_title": "family values",
      "entity_salience": "0"
    }},
    {{
      "entity_title": "John Roberts",
      "entity_salience": "0"
    }}
  ]
}}

User: Example 2:
---------------------
Article Title: New York Representative Eric Massa to retire
Article Text: New York Representative Eric Massa announced Wednesday that he would be stepping down as Congressman from New York's 29th congressional district. He cited health reasons for the sudden announcement. Massa is the latest in a string of United States Congresspeople to resign or not to seek reelection in 2010. He said that he had his third non-specific cancer recurrence in December 2009. He implied that his cancer is terminal, saying “I will now enter the final phase of my life at a more controlled pace.” He denied claims that his resignation is related to a sexual harassment accusation from a male aide. He said that this was untrue, although he admitted to using so called salty language.
Final JSON Output:Assistant: Based on the above, the final JSON output should be:
{{
  "entities": [
    {{
      "entity_title": "New York",
      "entity_salience": "1"
    }},
    {{
      "entity_title": "Eric Massa",
      "entity_salience": "0"
    }},
    {{
      "entity_title": "Congressman",
      "entity_salience": "1"
    }},
    {{
      "entity_title": "New York",
      "entity_salience": "1"
    }},
    {{
      "entity_title": "29th congressional district",
      "entity_salience": "0"
    }},
    {{
      "entity_title": "United States",
      "entity_salience": "1"
    }},
    {{
      "entity_title": "sexual harassment",
      "entity_salience": "0"
    }}
  ]
}}

Before producing the final JSON output, please think through your reasoning for each extracted entity. Identify each entity mentioned in the article and decide if it is central (salience 1) or only mentioned (salience 0) based solely on the article title and text. Do not output your internal reasoning.

Now, based on the article below, return only the final JSON answer (with no extra commentary or questions). Use the exact format below:

{{
  "entities": [
    {{
      "entity_title": "<entity_name>",
      "entity_salience": "<0 or 1>"
    }},
    ...
  ]
}}

</s>
<|user|>
Based on the article below, extract entities and assign salience scores. Output only the final JSON result. (Use the exact JSON format specified.)
---------------------
Article Title: {title}
Article Text: {article}
---------------------
</s>
<|assistant|>
"""

## Generate response

In [8]:
# messages = [
#     {"role": "system", "content": system_prompt},
#     {"role": "user", "content": few_shot_prompt},
# ]

# combined_prompt = f"{system_prompt}\n\n{few_shot_prompt}"
# combined_prompt
# few_shot_prompt
prompt = few_shot_zephyr_prompt.format(title=title, article=article)
prompt

'\n<|system|>\nYou are an expert news article analysis assistant in entity recognition and reading comprehension.\nI need you to extract entities from an article and assign a salience score to each one. The salience score indicates how central the entity is to the article: use 1 for salient entities and 0 for entities mentioned only in passing.\nBelow are two examples:\n\nExample 1:\n---------------------\nArticle Title: Laws allowing same sex marriage in Washington, D.C. go into effect\nArticle Text: The United States capital of Washington, D.C. legalized same-sex marriage on Wednesday. Beginning at 6 A.M. local time (1100 UTC), couples began submitting marriage applications at local courthouses citywide. Washington D.C. becomes the seventh United States territory to legalize same sex marriage. The bill was ratified by Mayor Adrian Fenty last December. Due to city\'s territorial status as a federal district, the bill had to be reviewed by congress. The bill passed congressional review

In [11]:
# Generate outputs using the vllm model
outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)

Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.78s/it, est. speed input: 349.03 toks/s, output: 33.61 toks/s]


In [13]:
outputs[0].outputs[0].text

'{\n  "entities": [\n    {\n      "entity_title": "euro",\n      "entity_salience": "1"\n    },\n    {\n      "entity_title": "US dollar",\n      "entity_salience": "1"\n    },\n    {\n      "entity_title": "Greece",\n      "entity_salience": "1"\n    },\n    {\n      "entity_title": "British pound",\n      "entity_salience": "0"\n    }\n  ]'

In [14]:
clean_incomplete_json(outputs[0].outputs[0].text)

{'entities': [{'entity_title': 'euro', 'entity_salience': '1'},
  {'entity_title': 'US dollar', 'entity_salience': '1'},
  {'entity_title': 'Greece', 'entity_salience': '1'},
  {'entity_title': 'British pound', 'entity_salience': '0'}]}

## Over entire dataset

In [17]:
file_path = "/project/pi_wenlongzhao_umass_edu/8/data/WN_salience_train.json"
df = load_dataset(file_path)
df

Unnamed: 0,text,title,entities
0,"The United States capital of Washington, D.C. ...","Laws allowing same sex marriage in Washington,...","[{'entity title': 'United States', 'entity sal..."
1,New York Representative Eric Massa announced W...,New York Representative Eric Massa to retire,"[{'entity title': 'New York', 'entity salience..."
2,Canadian airline Air Canada has said that it w...,Air Canada to lay off over a thousand machinists,"[{'entity title': 'Canadian', 'entity salience..."
3,Former Bosnian president Dr. Ejup Ganić was ar...,Former Bosnian president arrested in London,"[{'entity title': 'Bosnian', 'entity salience'..."
4,The euro reached a ten-month low against the U...,Euro reaches ten month low against US dollar,"[{'entity title': 'euro', 'entity salience': '..."
...,...,...,...
5383,Wildfires in California continued to spread on...,California wildfires continue to spread,"[{'entity title': 'California', 'entity salien..."
5384,American-based online retailer Amazon.com has ...,Amazon dips into memory hole to retrieve Orwel...,"[{'entity title': 'American', 'entity salience..."
5385,According to the United Nations' humanitarian ...,Thousands displaced after heavy flooding in Bu...,"[{'entity title': 'United Nations', 'entity sa..."
5386,English author and playwright Keith Waterhouse...,Author and playwright Keith Waterhouse dies at 80,"[{'entity title': 'English', 'entity salience'..."


In [18]:
articles = list(df['text'])
titles = list(df['title'])

In [19]:
len(articles), len(titles)

(5388, 5388)

In [20]:
combined_prompts = []
for title, article in zip(titles, articles):
    prompt = few_shot_zephyr_prompt.format(title=title, article=article)
    combined_prompts.append(prompt)

In [22]:
len(combined_prompts)

5388

In [23]:
outputs = llm.generate(prompts=combined_prompts, sampling_params=sampling_params)

Processed prompts:   0%|          | 0/5388 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts: 100%|██████████| 5388/5388 [1:10:11<00:00,  1.28it/s, est. speed input: 2069.48 toks/s, output: 357.90 toks/s]


In [27]:
results = []
for i, output in enumerate(outputs):
    generated_text = output.outputs[0].text
    response = clean_incomplete_json(generated_text)
    results.append({
        "iteration": i + 1,
        "generated_text": generated_text,
        "entities": response['entities']
    })

Skipping invalid entity: {
      "entity_title": "Star Wars "Celebration V" Convention",
      "entity_salience": "0"
    }
Skipping invalid entity: {
      "entity_title": "St Thomas" Hospital",
      "entity_salience": "0"
    }
Skipping invalid entity: {
      "entity_title": "Brussels" airport",
      "entity_salience": "0"
    }
Skipping invalid entity: {
      "entity_title": "Pathway Home veterans" psychiatric center",
      "entity_salience": "1"
    }
Error: No valid JSON-like content found in the response.
Error: No valid JSON-like content found in the response.
Error: No valid JSON-like content found in the response.
Error: No valid JSON-like content found in the response.
Skipping invalid entity: {
      "entity_title": "Workers" Youth League",
      "entity_salience": "0"
    }
Skipping invalid entity: {
      "entity_title": "David "Comedy Dave" Vitty",
      "entity_salience": "1"
    }
Skipping invalid entity: {
      "entity_title": "Aarons" 499",
      "entity_salienc

In [28]:
len(results)

5388

In [29]:
results[0]

{'iteration': 1,
 'generated_text': '{\n  "entities": [\n    {\n      "entity_title": "United States",\n      "entity_salience": "1"\n    },\n    {\n      "entity_title": "Washington, D.C.",\n      "entity_salience": "1"\n    },\n    {\n      "entity_title": "same-sex marriage",\n      "entity_salience": "0"\n    },\n    {\n      "entity_title": "Adrian Fenty",\n      "entity_salience": "0"\n    },\n    {\n      "entity_title": "federal district",\n      "entity_salience": "0"\n    },\n    {\n      "entity_title": "family values",\n      "entity_salience": "0"\n    },\n    {\n      "entity_title": "John Roberts",\n      "entity_salience": "0"\n    }\n  ]',
 'entities': [{'entity_title': 'United States', 'entity_salience': '1'},
  {'entity_title': 'Washington, D.C.', 'entity_salience': '1'},
  {'entity_title': 'same-sex marriage', 'entity_salience': '0'},
  {'entity_title': 'Adrian Fenty', 'entity_salience': '0'},
  {'entity_title': 'federal district', 'entity_salience': '0'},
  {'entit

In [30]:
ts = datetime.datetime.utcnow().isoformat()

# Save the result (final outputs) as a JSON file
save_outputs(results, f'outputs/{ts}.json')

# Evaluate metrics (assuming evaluate_multiple_instances returns a serializable structure)
metrics = evaluate_multiple_instances(df['entities'], results)
save_outputs(metrics, f'metrics/{ts}.json')

Weird data: {'0': 0, '1': 10}
Weird data: {'0': 0, '1': 10}
Weird data: {'0': 0, '1': 1}


Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '. ']


Weird data: {'1998': '1', '2007': '0'}
Weird data: {'1998': '1', '2007': '0'}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '€']


Instance-level errors (empty ground truth or model output): 8


## Check if any empty

In [32]:
def check_empty_lists(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    found_empty = False
    # Assuming the top-level object is a list of dictionaries.
    for i, item in enumerate(data):
        for key, value in item.items():
            if isinstance(value, list) and len(value) == 0:
                print(f"Empty list found in iteration {i+1} under key '{key}'.")
                found_empty = True

    if not found_empty:
        print("No empty lists found.")
    return found_empty

In [34]:
empty_exists = check_empty_lists("outputs/2025-03-18T03:49:22.363424.json")
empty_exists

Empty list found in iteration 906 under key 'entities'.
Empty list found in iteration 928 under key 'entities'.
Empty list found in iteration 950 under key 'entities'.
Empty list found in iteration 1005 under key 'entities'.
Empty list found in iteration 3454 under key 'entities'.
Empty list found in iteration 4681 under key 'entities'.
Empty list found in iteration 5027 under key 'entities'.
Empty list found in iteration 5269 under key 'entities'.


True