### Steps
* Choose 100 random entries from the gold+silver data
* get the raw papers for them
* extract the papers using the library
* store them in a new dataframe
* adapt the inf prompt to take in the paper context
* adapt the chatgpt fn to take an optional parm for the context
* generate labels for this data
* do evaluation

In [36]:
import pandas as pd
import datasets
aspects = [ 'actionability', 'grounding_specificity','verifiability', 'helpfulness']
random_state = 77
ds = datasets.load_dataset("boda/review_evaluation_human_annotation", "combined_main_aspects", split="full")

ds = ds.map(lambda x: {k: str(v) for k, v in x.items()})
ds = ds.to_pandas()


############# take the samples where we have majority labels for all aspects
filtered_rows = ds[
    (ds['actionability_label_type'].isin(['gold', 'silver'])) &
    (ds['grounding_specificity_label_type'].isin(['gold', 'silver'])) &
    (ds['verifiability_label_type'].isin(['gold', 'silver'])) &
    (ds['helpfulness_label_type'].isin(['gold', 'silver']))
]


# List of OpenReview-supported venues
openreview_venues = ['ICLR_2024', 'ICLR_2025', 'EMNLP_2023']
############# exclude rows with OpenReview-supported venues
filtered_rows = filtered_rows[~filtered_rows['venue'].isin(openreview_venues)]

############### take 100 random samples
sampled_ds = filtered_rows.sample(n=100, random_state=random_state)


In [37]:
print(ds['actionability'].iloc[0])

{'annotators': ['6740484e188a64793529ee77', '6686ebe474531e4a1975636f', 'boda'], 'labels': ['3', '3', '5']}


In [38]:

sampled_ds['venue'].value_counts()

venue
NIPS_2018    12
NIPS_2019    12
NIPS_2020    11
ICLR_2022    11
ARR_2022     10
NIPS_2022     8
NIPS_2021     8
NIPS_2017     7
ICLR_2023     7
NIPS_2016     6
ACL_2017      4
ICLR_2021     4
Name: count, dtype: int64

In [39]:
import os
import json
import pandas as pd
import requests

# PDF download function
def download_openreview_pdf(pdf_url: str, output_path: str):

    # pdf_url = f"https://openreview.net{pdf_url}"
    pdf_url = f"https://openreview.net/pdf?id={pdf_url}"
    response = requests.get(pdf_url, stream=True)
    if response.status_code == 200:
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                f.write(chunk)
        print(f"[+] Downloaded paper to {output_path}")
    else:
        print(f"[!] Failed to download PDF from {pdf_url}. Status code: {response.status_code}")

# Paths
base_path = "/home/abdelrahman.sadallah/mbzuai/review_rewrite/data/raw_papers"
json_paths = {
    "EMNLP_2023": os.path.join(base_path, "emnlp2023_papers.json"),
    "ICLR_2024": os.path.join(base_path, "iclr2024_papers.json"),
    "ICLR_2025": os.path.join(base_path, "iclr2025_papers.json"),
}



# Load all JSON files into a dictionary
paper_data = {}
for venue, path in json_paths.items():
    with open(path, 'r') as f:
        data = json.load(f)
        paper_data[venue] = {entry['id']: entry for entry in data}

In [40]:
# #################################################### Download Reviewer2 Data ####################################################

# import os
# import zipfile
# from huggingface_hub import list_repo_files, hf_hub_download

# # Configuration
# repo_id = "GitBag/Reviewer2_PGE_raw"
# target_dir = "/l/users/abdelrahman.sadallah/reviewer2_raw_data"
# os.makedirs(target_dir, exist_ok=True)

# # Step 1: List all .zip files in the dataset repo
# zip_files = [f for f in list_repo_files(repo_id=repo_id, repo_type="dataset") if f.endswith(".zip")]
# print(f"[INFO] Found {len(zip_files)} zip files.")

# # Step 2 & 3: Download and unzip each .zip file
# for zip_file in zip_files:
#     print(f"[INFO] Downloading {zip_file}...")
    
#     # Download ZIP
#     local_zip_path = hf_hub_download(
#         repo_id=repo_id,
#         filename=zip_file,
#         repo_type="dataset",
#         local_dir=target_dir,
#         local_dir_use_symlinks=False  # make a real copy
#     )

#     print(f"[INFO] Downloaded to {local_zip_path}")

#     # Step 4: Unzip
#     with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
#         extract_path = os.path.join(target_dir, os.path.splitext(os.path.basename(zip_file))[0])
#         os.makedirs(extract_path, exist_ok=True)
#         zip_ref.extractall(extract_path)
#         print(f"[INFO] Extracted to {extract_path}")


In [41]:
sampled_ds['paper_id'].unique()

array(['NIPS_2017_382', 'NIPS_2018_76', 'NIPS_2016_238', 'NIPS_2020_350',
       'ICLR_2023_4741', 'NIPS_2019_1350', 'NIPS_2022_2635',
       'NIPS_2019_772', 'NIPS_2018_857', 'ICLR_2022_445', 'NIPS_2018_134',
       'NIPS_2017_631', 'NIPS_2021_40', 'ACL_2017_494_review',
       'NIPS_2020_295', 'NIPS_2019_1089', 'NIPS_2016_395',
       'ICLR_2022_3058', 'NIPS_2019_1246', 'NIPS_2018_66',
       'ARR_2022_285_review', 'NIPS_2021_1917', 'NIPS_2018_15',
       'ACL_2017_31_review', 'ARR_2022_93_review', 'NIPS_2016_115',
       'NIPS_2021_2163', 'NIPS_2017_53', 'NIPS_2022_670', 'NIPS_2017_217',
       'ICLR_2022_234', 'NIPS_2022_2523', 'NIPS_2020_1274',
       'NIPS_2019_962', 'ICLR_2023_802', 'NIPS_2017_356', 'NIPS_2020_696',
       'ICLR_2022_562', 'NIPS_2022_948', 'ARR_2022_286_review',
       'NIPS_2019_1145', 'NIPS_2018_38', 'NIPS_2020_1371',
       'NIPS_2022_1572', 'ARR_2022_23_review', 'ARR_2022_162_review',
       'ARR_2022_64_review', 'ICLR_2023_1980', 'NIPS_2022_1913',
       'N

In [42]:
import os
import pandas as pd
from tqdm import tqdm

# Your base storage path
base_path = "/l/users/abdelrahman.sadallah/reviewer2_raw_data"

# List of OpenReview-supported venues
openreview_venues = ['ICLR_2024', 'ICLR_2025', 'EMNLP_2023']

# Define the function that constructs the file path
def get_paper_path(venue: str, paper_id: str) -> str:
    search_id = paper_id.replace('_review','')
    if not search_id.endswith("_paper"):
        search_id += "_paper"

    if venue in openreview_venues:
        return os.path.join(base_path, venue, f"{venue}_{paper_id}.pdf")
    else:
        # e.g., venue = NIPS_2017 -> prefix = NIPS
        prefix = venue.split('_')[0]
        cur_path = os.path.join(base_path, prefix, prefix, venue,f"{venue}_paper",  f"{search_id}.json")
        print(cur_path)
        return cur_path
# Define the check-and-download logic
def ensure_pdf_and_get_path(venue: str, paper_id: str) -> str:
    path = get_paper_path(venue, paper_id)
    if os.path.isfile(path):
        return path  # Already exists

    # Try to download for OpenReview venues
    if venue in openreview_venues:
        if venue in paper_data and paper_id in paper_data[venue]:
            paper_info = paper_data[venue][paper_id]
            pdf_url = paper_info.get('content', {}).get('pdf', {}).get('value', None)
            if pdf_url:
                os.makedirs(os.path.dirname(path), exist_ok=True)
                download_openreview_pdf(paper_id, path)
                return path if os.path.isfile(path) else ""
            else:
                print(f"[!] No PDF URL for {venue}/{paper_id}")
                return ""
        else:
            print(f"[!] Paper {paper_id} not found in paper_data[{venue}]")
            return ""
    else:
        # Not OpenReview, no download attempt
        print(f"[i] Skipping download for {venue}/{paper_id}; check local path.")
        return path if os.path.isfile(path) else ""

# ==== MAIN EXECUTION ====

# Assume `sampled_ds` and `paper_data` are already loaded
# sampled_ds = pd.read_csv(...) etc.

paper_paths = []

for _, row in tqdm(sampled_ds.iterrows(), total=len(sampled_ds)):
    venue = row['venue']
    paper_id = row['paper_id']
    path = ensure_pdf_and_get_path(venue, paper_id)
    paper_paths.append(path)

# Add the path column and save
sampled_ds['paper_path'] = paper_paths
sampled_ds.to_csv("/home/abdelrahman.sadallah/mbzuai/review_rewrite/data/context_experiment.csv", index=False)


100%|██████████| 100/100 [00:00<00:00, 10669.27it/s]

/l/users/abdelrahman.sadallah/reviewer2_raw_data/NIPS/NIPS/NIPS_2017/NIPS_2017_paper/NIPS_2017_382_paper.json
/l/users/abdelrahman.sadallah/reviewer2_raw_data/NIPS/NIPS/NIPS_2018/NIPS_2018_paper/NIPS_2018_76_paper.json
/l/users/abdelrahman.sadallah/reviewer2_raw_data/NIPS/NIPS/NIPS_2016/NIPS_2016_paper/NIPS_2016_238_paper.json
/l/users/abdelrahman.sadallah/reviewer2_raw_data/NIPS/NIPS/NIPS_2020/NIPS_2020_paper/NIPS_2020_350_paper.json
/l/users/abdelrahman.sadallah/reviewer2_raw_data/ICLR/ICLR/ICLR_2023/ICLR_2023_paper/ICLR_2023_4741_paper.json
/l/users/abdelrahman.sadallah/reviewer2_raw_data/NIPS/NIPS/NIPS_2019/NIPS_2019_paper/NIPS_2019_1350_paper.json
/l/users/abdelrahman.sadallah/reviewer2_raw_data/NIPS/NIPS/NIPS_2022/NIPS_2022_paper/NIPS_2022_2635_paper.json
/l/users/abdelrahman.sadallah/reviewer2_raw_data/NIPS/NIPS/NIPS_2019/NIPS_2019_paper/NIPS_2019_772_paper.json
/l/users/abdelrahman.sadallah/reviewer2_raw_data/NIPS/NIPS/NIPS_2018/NIPS_2018_paper/NIPS_2018_857_paper.json
/l/users




In [43]:
with open('/l/users/abdelrahman.sadallah/reviewer2_raw_data/ICLR/ICLR/ICLR_2023/ICLR_2023_paper/ICLR_2023_4659_paper.json', 'r') as f:
    data = json.load(f)

data

{'name': 'ICLR_2023_4659_pdf.pdf',
 'metadata': {'source': 'CRF',
  'title': None,
  'authors': [],
  'emails': [],
  'sections': [{'heading': '1 INTRODUCTION',
    'text': 'The object recognition problem remains in an unclear state. Despite compelling performance of state-of-the-art object recognition methods, several questions such as out of distribution generalization (Recht et al., 2019; Barbu et al., 2019; Shankar et al., 2020; Taori et al., 2020; Koh et al., 2020), “superhuman performance” (He et al., 2015; Geirhos et al., 2018), adversarial vulnerability (Goodfellow et al., 2014), and invariance to image transformations and distortions (Hendrycks & Dietterich, 2019) still persist. Raw performance on test sets has been the main indicator of the progress and the major feedback about the state of the field. Few test sets have been proposed for evaluating object recognition models. Some follow the footsteps of ImageNet (Recht et al., 2019). Some filter images based on failures of mo

In [44]:
import os
import json
import pandas as pd
from tqdm import tqdm

# Load the dataset
csv_path = "/home/abdelrahman.sadallah/mbzuai/review_rewrite/data/context_experiment.csv"
df = pd.read_csv(csv_path)

def extract_paper_text(json_path):
    try:
        with open(json_path, 'r') as f:
            data = json.load(f)
    except Exception as e:
        print(f"[!] Failed to load JSON {json_path}: {e}")
        return ""

    metadata = data.get("metadata", {})
    title = metadata.get("title", "")
    sections = metadata.get("sections", [])

    text_parts = [title.strip()] if isinstance(title, str) and title.strip() else []

    if isinstance(sections, list):
        for section in sections:
            heading = section.get("heading", "")
            section_text = section.get("text", "")
            if isinstance(heading, str) and heading.strip():
                text_parts.append(heading.strip())
            if isinstance(section_text, str) and section_text.strip():
                text_parts.append(section_text.strip())

    return "\n\n".join(text_parts)

# Process each row
paper_texts = []
word_counts = []

for paper_path in tqdm(df['paper_path']):
    if not isinstance(paper_path, str) or not paper_path.strip():
        paper_texts.append("")
        word_counts.append(0)
        continue

    json_path = os.path.splitext(paper_path)[0] + ".json"

    if not os.path.isfile(json_path):
        paper_texts.append("")
        word_counts.append(0)
        continue

    paper_text = extract_paper_text(json_path)
    paper_texts.append(paper_text)
    word_counts.append(len(paper_text.split()))

# Add the columns and save
df['paper_text'] = paper_texts
df['paper_word_count'] = word_counts

df.to_csv("/home/abdelrahman.sadallah/mbzuai/review_rewrite/data/context_experiment_with_paper_text.csv", index=False)

print(f"✅ Processed {len(df)} papers. Total with non-empty text: {(df['paper_word_count'] > 0).sum()}")


100%|██████████| 100/100 [00:00<00:00, 980.63it/s]

✅ Processed 100 papers. Total with non-empty text: 100





In [45]:
import os
import pandas as pd
from tqdm import tqdm

# Load the DataFrame with paper_path
df = pd.read_csv("/home/abdelrahman.sadallah/mbzuai/review_rewrite/data/context_experiment.csv")

# Check if file exists for each path
exist_flags = []
missing_paths = []

for path in tqdm(df['paper_path']):
    exists = os.path.isfile(path) if isinstance(path, str) and path.strip() else False
    exist_flags.append(exists)
    if not exists:
        missing_paths.append(path)


# Optional: Print missing file paths
print(f"[!] Missing {len(missing_paths)} files:")
for path in missing_paths:
    print(f" - {path}")


100%|██████████| 100/100 [00:00<00:00, 43695.22it/s]

[!] Missing 0 files:





In [46]:
df = pd.read_csv("/home/abdelrahman.sadallah/mbzuai/review_rewrite/data/context_experiment_with_paper_text.csv")

df['paper_word_count'].describe()

count      100.000000
mean      5242.910000
std       2009.879704
min       2369.000000
25%       4222.000000
50%       4874.500000
75%       5627.750000
max      18873.000000
Name: paper_word_count, dtype: float64

In [47]:
print(df['actionability'].iloc[0])

{'annotators': ['boda', '6740484e188a64793529ee77', '6686ebe474531e4a1975636f'], 'labels': ['4', '5', '5']}


In [48]:
from datasets import Dataset

df = df.astype(str)
# 
# Convert the pandas DataFrame to a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(df)

# Save the dataset to the Hugging Face format
hf_dataset.push_to_hub(
    repo_id="boda/review_evaluation_human_annotation",
    config_name="context_experiment_with_paper_text",
    split="full",
)
print("✅ Dataset saved to Hugging Face format.")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

✅ Dataset saved to Hugging Face format.


In [49]:
from datasets import Dataset
import datasets
hf_dataset = datasets.load_dataset(
    "boda/review_evaluation_human_annotation",
    "context_experiment_with_paper_text",
    split="full"
)

README.md:   0%|          | 0.00/9.62k [00:00<?, ?B/s]

full-00000-of-00001.parquet:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Generating full split:   0%|          | 0/100 [00:00<?, ? examples/s]

In [50]:


hf_dataset['actionability']

["{'annotators': ['boda', '6740484e188a64793529ee77', '6686ebe474531e4a1975636f'], 'labels': ['4', '5', '5']}",
 "{'annotators': ['boda', '6686ebe474531e4a1975636f', '6740484e188a64793529ee77'], 'labels': ['4', '4', '3']}",
 "{'annotators': ['6740484e188a64793529ee77', '6686ebe474531e4a1975636f', 'boda'], 'labels': ['5', '5', '5']}",
 "{'annotators': ['boda', '6740484e188a64793529ee77', '6686ebe474531e4a1975636f'], 'labels': ['5', '5', '5']}",
 "{'annotators': ['6740484e188a64793529ee77', '6686ebe474531e4a1975636f', 'boda'], 'labels': ['4', '4', '5']}",
 "{'annotators': ['6740484e188a64793529ee77', '6686ebe474531e4a1975636f', 'boda'], 'labels': ['5', '4', '5']}",
 "{'annotators': ['boda', '6740484e188a64793529ee77', '6686ebe474531e4a1975636f'], 'labels': ['5', '5', '5']}",
 "{'annotators': ['boda', '6740484e188a64793529ee77', '6686ebe474531e4a1975636f'], 'labels': ['5', '5', '5']}",
 "{'annotators': ['boda', '6686ebe474531e4a1975636f', '6740484e188a64793529ee77'], 'labels': ['5', '5', 