In [1]:
import os
import ast
import json
import copy
import pandas as pd
from utils.llm_configs import setup_llm, load_column_mapping
from utils.context import extract_surrounding_context
from utils.candidates import build_alias_kb, retrieve_candidates
from utils.prompts_utils import construct_pointwise_prompt, parse_llm_decision
from utils.io import safe_parse_candidates, save_intermediate_outputs
from utils.EL_eval import compute_metrics_from_pointwise_csv

2025-05-12 17:43:02.645686: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-12 17:43:02.655048: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747071782.667041  178234 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747071782.670539  178234 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-12 17:43:02.682677: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
def prepare_dataset(path: str, column_mapping: dict) -> pd.DataFrame:
    """Load the CSV at ``path`` and keep only the rows usable downstream.

    Args:
        path: Location of the CSV file to read. The ``gt_wiki_id`` column is
            parsed with the nullable ``Int64`` dtype.
        column_mapping: Old-name -> new-name mapping applied to the columns.

    Returns:
        The renamed frame without rows whose ``entity_title`` is missing and
        without rows whose ``wiki_ID`` equals ``-1``. Row labels of the
        surviving rows are the ones assigned by ``read_csv`` (no re-indexing).
    """
    raw = pd.read_csv(path, dtype={'gt_wiki_id': 'Int64'})
    renamed = raw.rename(columns=column_mapping)
    titled = renamed.dropna(subset=['entity_title'])
    has_wiki_id = titled['wiki_ID'] != -1
    return titled[has_wiki_id]

In [3]:
def extract_all_contexts(df: pd.DataFrame, n: int = 2) -> pd.DataFrame:
    """Attach a ``surrounding_context`` column to ``df``.

    For every row, the ``article_text``, ``offsets`` and ``entity_title``
    values are handed to ``extract_surrounding_context`` together with ``n``.

    Args:
        df: Frame providing those three columns; it is modified in place.
        n: Forwarded unchanged as the ``n`` keyword of
            ``extract_surrounding_context``.

    Returns:
        The same ``df`` object, now carrying the new column.
    """
    def _context_for(record):
        return extract_surrounding_context(
            record['article_text'],
            record['offsets'],
            record['entity_title'],
            n=n,
        )

    contexts = df.apply(_context_for, axis=1)
    df['surrounding_context'] = contexts
    return df

In [4]:
def assign_candidates(df: pd.DataFrame, alias_kb: dict) -> pd.DataFrame:
    """Look up candidates for every ``entity_title`` and record how many there are.

    Args:
        df: Frame with an ``entity_title`` column; it is modified in place.
        alias_kb: Passed as the second argument to ``retrieve_candidates``.

    Returns:
        The same ``df`` object with two new columns: ``candidates`` (whatever
        ``retrieve_candidates`` returned) and ``pre_pt_len_candidates`` (its
        length, or 0 when the value is falsy). Rows whose ``candidates`` value
        is missing are dropped; rows with an empty candidate list are kept.
    """
    def _lookup(title):
        return retrieve_candidates(title, alias_kb)

    def _count(cands):
        return 0 if not cands else len(cands)

    df['candidates'] = df['entity_title'].apply(_lookup)
    df['pre_pt_len_candidates'] = df['candidates'].apply(_count)
    # The count is taken before the drop so it exists for every looked-up row.
    df.dropna(subset=['candidates'], inplace=True)
    return df

In [5]:
def build_prompt_dataframe(df: pd.DataFrame, model: str) -> pd.DataFrame:
    """Expand ``df`` into one prompt record per (row, candidate) pair.

    Args:
        df: Frame whose ``candidates`` column holds an iterable of candidates
            for each row.
        model: Forwarded as the ``model`` keyword of
            ``construct_pointwise_prompt``.

    Returns:
        A frame with columns ``row_idx`` (the row's index label in ``df``),
        ``cand_idx`` (the candidate's position within that row) and
        ``messages`` (the value returned by ``construct_pointwise_prompt``).
    """
    records = [
        {
            "row_idx": label,
            "cand_idx": position,
            "messages": construct_pointwise_prompt(record, candidate, model=model),
        }
        for label, record in df.iterrows()
        for position, candidate in enumerate(record['candidates'])
    ]
    return pd.DataFrame(records)

In [6]:
def run_inference(llm, prompt_df: pd.DataFrame, model: str, sampling_params, batch_size: int = 5000, checkpoint_path: str = None):
    """Run the pointwise prompts through the LLM in batches, with optional resume.

    Args:
        llm: Object exposing ``chat(messages=..., sampling_params=...)``; each
            returned item must provide ``.outputs[0].text``.
        prompt_df: Frame with ``row_idx``, ``cand_idx`` and ``messages`` columns.
        model: Forwarded as the ``model`` keyword of ``parse_llm_decision``.
        sampling_params: Forwarded unchanged to ``llm.chat``.
        batch_size: Number of ``prompt_df`` rows considered per batch.
        checkpoint_path: Optional CSV file. If it already exists, its records
            are loaded and their ``(row_idx, cand_idx)`` pairs are not sent to
            the LLM again. Every finished batch is appended to it.

    Returns:
        A DataFrame with the records loaded from the checkpoint (if any)
        followed by the newly produced ones. New records have the columns
        ``row_idx``, ``cand_idx``, ``keep`` and ``llm_text``; those columns are
        present even when there is nothing to return.
    """
    output_columns = ["row_idx", "cand_idx", "keep", "llm_text"]
    all_outputs = []
    seen = set()

    if checkpoint_path and os.path.exists(checkpoint_path):
        existing_df = pd.read_csv(checkpoint_path)
        # Column-wise zip avoids building one Series per row just to read two values.
        seen = set(zip(existing_df['row_idx'], existing_df['cand_idx']))
        all_outputs.extend(existing_df.to_dict('records'))
        print(f"[Resume] Loaded {len(all_outputs)} previously completed records.")

    for chunk_start in range(0, len(prompt_df), batch_size):
        chunk = prompt_df.iloc[chunk_start:chunk_start + batch_size]
        keys = list(zip(chunk['row_idx'], chunk['cand_idx']))

        # Only prompts missing from the checkpoint are sent to the model. A
        # partially completed chunk must not be re-run in full: that would
        # waste inference and duplicate records in the output and checkpoint.
        pending_positions = [pos for pos, key in enumerate(keys) if key not in seen]
        if not pending_positions:
            continue

        pending = chunk.iloc[pending_positions]
        responses = llm.chat(messages=list(pending['messages']), sampling_params=sampling_params)

        batch_outputs = []
        for pos, resp in zip(pending_positions, responses):
            row_idx, cand_idx = keys[pos]
            txt = resp.outputs[0].text
            batch_outputs.append({
                "row_idx": row_idx,
                "cand_idx": cand_idx,
                "keep": parse_llm_decision(txt, model=model),
                "llm_text": txt
            })
        all_outputs.extend(batch_outputs)

        # Skip the write for an empty batch so a header-less, unreadable
        # checkpoint file is never created.
        if checkpoint_path and batch_outputs:
            pd.DataFrame(batch_outputs).to_csv(
                checkpoint_path,
                mode='a',
                # Write the header only when this call creates the file.
                header=not os.path.exists(checkpoint_path),
                index=False
            )
            print(f"[Checkpoint] Saved batch {chunk_start // batch_size + 1}")

    if not all_outputs:
        # Keep the schema stable so callers can still reference the columns.
        return pd.DataFrame(columns=output_columns)
    return pd.DataFrame(all_outputs)

In [7]:
def update_candidates(df: pd.DataFrame, output_df: pd.DataFrame) -> pd.DataFrame:
    """Annotate candidates with the LLM decisions and keep the relevant ones.

    Args:
        df: Frame with a ``candidates`` column of lists of dicts; it is
            modified in place (the ``candidates`` column itself is untouched
            because a deep copy is annotated).
        output_df: One record per judged candidate, with ``row_idx`` (index
            label in ``df``), ``cand_idx`` (position in the candidate list),
            ``keep`` and ``llm_text``.

    Returns:
        The same ``df`` object with ``candidates_after_pointwise`` (only the
        candidates whose ``relevant`` flag is exactly ``True``) and
        ``post_pt_len_candidates`` (how many were retained).
    """
    filtered_col = 'candidates_after_pointwise'
    df[filtered_col] = df['candidates'].apply(copy.deepcopy)

    for _, decision in output_df.iterrows():
        candidate = df.at[decision['row_idx'], filtered_col][decision['cand_idx']]
        candidate['llm_decision'] = decision['llm_text']
        candidate['relevant'] = decision['keep']

    def _keep_relevant(cands):
        if not isinstance(cands, list):
            return []
        # Candidates that were never judged carry no "relevant" key and are dropped.
        return [cand for cand in cands if cand.get("relevant") is True]

    df[filtered_col] = df[filtered_col].apply(_keep_relevant)
    df['post_pt_len_candidates'] = df[filtered_col].apply(len)
    return df

In [8]:
def main(
    model: str = "llama",
    sed_output_path: str = "/work/pi_wenlongzhao_umass_edu/8/696-detecting-salient-entities/results/sed_results/SED_output.csv",
    kb_path: str = "prep_kb/filtered_kb_4_22.json",
    n_rows: int = 100,
):
    """Run the pointwise candidate-filtering pipeline end to end and print metrics.

    Calling ``main()`` with no arguments reproduces the original hard-coded run.

    Args:
        model: Model key passed to ``setup_llm`` and the prompt helpers
            (``"llama"`` or ``"zephyr"``). It also names the checkpoint and
            result files.
        sed_output_path: CSV handed to ``prepare_dataset``.
        kb_path: JSON knowledge base handed to ``build_alias_kb``.
        n_rows: Number of leading dataset rows to process; ``None`` processes
            every row.
    """
    llm, sampling_params = setup_llm(model=model)
    column_mapping = load_column_mapping()

    sed_outputs = prepare_dataset(sed_output_path, column_mapping)

    # Decode the KB as UTF-8 explicitly instead of relying on the locale default.
    with open(kb_path, encoding="utf-8") as f:
        kb = json.load(f)
    alias_kb = build_alias_kb(kb)

    selected = sed_outputs if n_rows is None else sed_outputs.head(n_rows)
    # Work on a copy: the helpers below add columns / drop rows in place.
    sed_outputs_subset = selected.copy()
    sed_outputs_subset = extract_all_contexts(sed_outputs_subset, n=2)
    sed_outputs_subset = assign_candidates(sed_outputs_subset, alias_kb)

    prompt_df = build_prompt_dataframe(sed_outputs_subset, model=model)
    print('Length of prompts: ', len(prompt_df))

    # NOTE(review): the checkpoint is keyed only by model name, so a file left
    # by a run over different rows would be resumed here. Delete it whenever
    # the input data or `n_rows` changes.
    checkpoint_path = f"outputs/pointwise/checkpoints/{model}_checkpoint.csv"
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    output_df = run_inference(llm, prompt_df, model=model, sampling_params=sampling_params, checkpoint_path=checkpoint_path)

    # A resumed run may contain the same (row, candidate) pair more than once.
    output_df.drop_duplicates(subset=['row_idx', 'cand_idx'], inplace=True)
    sed_outputs_subset = update_candidates(sed_outputs_subset, output_df)

    final_path = f"outputs/pointwise/final/intermediate_results_{model}.csv"
    # Ensure the target directory exists so inference results are not lost at
    # save time (harmless if `save_intermediate_outputs` already does this).
    os.makedirs(os.path.dirname(final_path), exist_ok=True)
    save_intermediate_outputs(sed_outputs_subset, final_path)

    metrics = compute_metrics_from_pointwise_csv(final_path)
    print(metrics)

In [9]:
# Run the pipeline only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

INFO 05-12 17:43:09 __init__.py:207] Automatically detected platform cuda.
INFO 05-12 17:43:20 config.py:549] This model supports multiple tasks: {'score', 'reward', 'embed', 'generate', 'classify'}. Defaulting to 'generate'.
INFO 05-12 17:43:20 config.py:1555] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 05-12 17:43:20 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/datasets/ai/llama3/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f', speculative_config=None, tokenizer='/datasets/ai/llama3/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/5206a32e0bd3067aef1ce90f5528ade7d866253f', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir='/work/pi_wenlongzhao_umass_edu/8/aranade/models', load_format=auto, tensor_parallel_size=1, pipeli

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 05-12 17:43:24 model_runner.py:1115] Loading model weights took 14.9888 GB
INFO 05-12 17:43:25 worker.py:267] Memory profiling takes 0.59 seconds
INFO 05-12 17:43:25 worker.py:267] the current vLLM instance can use total_gpu_memory (44.40GiB) x gpu_memory_utilization (0.90) = 39.96GiB
INFO 05-12 17:43:25 worker.py:267] model weights take 14.99GiB; non_torch_memory takes 0.08GiB; PyTorch activation peak memory takes 1.19GiB; the rest of the memory reserved for KV Cache is 23.70GiB.
INFO 05-12 17:43:25 executor_base.py:111] # cuda blocks: 12136, # CPU blocks: 2048
INFO 05-12 17:43:25 executor_base.py:116] Maximum concurrency for 131072 tokens per request: 1.48x
INFO 05-12 17:43:26 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:17<00:00,  1.96it/s]

INFO 05-12 17:43:44 model_runner.py:1562] Graph capturing finished in 18 secs, took 0.26 GiB
INFO 05-12 17:43:44 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 20.22 seconds





Length of prompts:  6240
INFO 05-12 17:43:50 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


Processed prompts: 100%|██████████| 5000/5000 [06:01<00:00, 13.82it/s, est. speed input: 10875.75 toks/s, output: 716.37 toks/s]


[Checkpoint] Saved batch 1


Processed prompts: 100%|██████████| 1240/1240 [01:33<00:00, 13.28it/s, est. speed input: 10610.28 toks/s, output: 712.29 toks/s]


[Checkpoint] Saved batch 2
{'Recall@filter': 0.8837209302325582, 'Reduction ratio': 0.5243734219871514, 'Average retained candidates': 27.930232558139537}
