In [None]:
import math

def tau_closest_agents(agent, agents, d, tau) -> tuple[list[str], float]:
    """Return the tau agents closest to ``agent`` and the radius of that ball.

    agent: id of the agent at the center of the ball
    agents: list of candidate agents' id; ``agent`` is treated like any other
        candidate, so it is part of the result only if it appears in this list
    d: distance function, called as d(agent, other)
    tau: threshold number (usually number of agents in a cluster)

    Return:
        - list of the (at most) tau closest agents' id, nearest first; ties are
          kept in the order they appear in ``agents``
        - distance from ``agent`` to the furthest agent of that list
          (0.0 when the list is empty)
    """
    # Nothing to pick: avoid indexing into an empty selection below.
    if tau <= 0 or not agents:
        return [], 0.0

    # Evaluate d exactly once per candidate; sorted() is stable, so equal
    # distances keep their input order and the result is deterministic.
    by_distance = sorted(
        ((d(agent, other), other) for other in agents),
        key=lambda pair: pair[0],
    )
    nearest = by_distance[:tau]
    return [other for _, other in nearest], nearest[-1][0]


def SmallestAgentBall(agents, d, tau) -> list[str]:
    """Return the agents inside the smallest agent-centered ball of tau agents.

    Every agent is tried as a center; the center whose tau-closest agents have
    the smallest furthest distance wins (the first such center on ties).

    agents: list of agents' id
    d: distance function
    tau: threshold number (usually number of agents in a cluster)
    """
    if len(agents) > tau:
        # One candidate ball per center: (center, members, radius).
        balls = [
            (center, *tau_closest_agents(center, agents, d, tau))
            for center in agents
        ]
        smallest = min(balls, key=lambda ball: ball[2])
        return smallest[1]
    # Not more than tau agents left: they all form the ball (returned as a copy).
    return agents[:]


def GreedyCohesiveClustering(agents, d, k) -> list[list[str]]:
    """Greedily partition agents into cohesive clusters under metric d.

    The target cluster size is ceil(len(agents) / k). While agents remain, the
    cluster returned by SmallestAgentBall for that size is recorded and its
    members are removed from the remaining pool. Each cluster is a list of id.

    agents: list of agents' id (the list itself is not modified)
    d: distance function
    k: number of clusters to aim for; must be positive

    Return:
        list of clusters, in the order they were carved out. Because the
        cluster size is rounded up, this can be fewer than k clusters.

    Raise:
        ValueError: if k <= 0 (the cluster size would be undefined or
            non-positive).
    """
    if k <= 0:
        raise ValueError(f"k must be a positive number of clusters, got {k!r}")

    per_cluster = math.ceil(len(agents) / k)
    remaining = agents[:]  # work on a copy so the caller's list stays intact
    clusters = []  # each cluster is a list of id
    while remaining:
        cluster = SmallestAgentBall(remaining, d, per_cluster)
        clusters.append(cluster)
        for agent in cluster:
            remaining.remove(agent)
    return clusters


In [None]:
import os
# The ./data/... paths used in this notebook are relative, so show which
# directory they are resolved against.
working_dir = os.getcwd()
print("Current working directory:", working_dir)


In [None]:
# Report only whether the key is configured. Printing the value itself would
# store the secret in the notebook's saved output.
print("API KEY:", "set" if os.getenv("OPENAI_API_KEY") else "missing")


In [2]:
import pandas as pd
import os
import json
import fitz
from tqdm import tqdm

In [8]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    pdf_path: path of the PDF file to read

    Return:
        The page texts joined in page order, or "" if anything goes wrong
        while opening or reading the file (the error is printed, not raised,
        so a batch run keeps going).
    """
    try:
        # The context manager closes the document even if a page fails to
        # parse, so file handles are not leaked across a long batch.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"Error processing {pdf_path}: {str(e)}")
        return ""

In [16]:
# Load the ICLR metadata table (one row per paper).
df = pd.read_csv('./data/metadata/ICLR_2021_2024.csv')

# The unified JSON files go here; create the directory on first run.
os.makedirs('./data/unified_text/ICLR', exist_ok=True)

# Only 2024 is processed here; use range(2021, 2025) to rebuild every year.
years = [2024]
for year in years:
    year_df = df[df['year'] == year]
    year_data = []

    for _, row in tqdm(year_df.iterrows(), total=len(year_df)):
        paper_id = row['id']
        title = row['title']
        txt_path = f'./data/txts/ICLR_2021_2024/{title}.txt'
        pdf_path = f'./data/pdfs/ICLR_2021_2024/{title}.pdf'

        # Source priority: existing .txt file, then text pulled from the PDF,
        # otherwise an empty string plus a warning.
        if os.path.exists(txt_path):
            with open(txt_path, 'r', encoding='utf-8') as txt_file:
                paper_text = txt_file.read()
        elif os.path.exists(pdf_path):
            paper_text = extract_text_from_pdf(pdf_path)
        else:
            paper_text = ""
            print(f"Neither txt nor pdf found for paper {paper_id}")

        # Metadata columns plus the recovered text under the 'text' key.
        year_data.append({**row.to_dict(), 'text': paper_text})

    # One JSON file per year.
    output_path = f'./data/unified_text/ICLR/ICLR_{year}.json'
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(year_data, json_file, ensure_ascii=False, indent=2)


100%|██████████| 247/247 [00:28<00:00,  8.66it/s]
100%|██████████| 86/86 [00:00<00:00, 102.87it/s]


In [15]:
# Re-extract one ICLR 2024 paper straight from its PDF and overwrite the
# 'text' field of the matching entry in the unified JSON.
target_title = 'An Analytical Solution to Gauss-Newton Loss for Direct Image Alignment'
json_path = "./data/unified_text/ICLR/ICLR_2024.json"

text = extract_text_from_pdf('./data/pdfs/ICLR_2021_2024/An Analytical Solution to Gauss-Newton Loss for Direct Image Alignment.pdf')

with open(json_path, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

for entry in data:
    if entry['title'] != target_title:
        continue
    entry['text'] = text

# The file is rewritten in full, whether or not an entry matched.
with open(json_path, "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=2)


In [9]:
# Build the unified NeurIPS JSON files.
# Load the NeurIPS metadata table (one row per paper).
df = pd.read_csv('./data/metadata/neurips_2023_2024.csv')

# The unified JSON files go here; create the directory on first run.
os.makedirs('./data/unified_text/NeurIPS', exist_ok=True)

# Both available years are processed.
years = range(2023, 2025)
for year in years:
    year_df = df[df['year'] == year]
    year_data = []

    for _, row in tqdm(year_df.iterrows(), total=len(year_df)):
        paper_id = row['id']
        title = row['title']
        txt_path = f'./data/txts/neurips_2023_2024/{title}.txt'
        pdf_path = f'./data/pdfs/neurips_23_24/{title}.pdf'

        # Source priority: existing .txt file, then text pulled from the PDF,
        # otherwise an empty string plus a warning that names the missing PDF.
        if os.path.exists(txt_path):
            with open(txt_path, 'r', encoding='utf-8') as txt_file:
                paper_text = txt_file.read()
        elif os.path.exists(pdf_path):
            paper_text = extract_text_from_pdf(pdf_path)
        else:
            paper_text = ""
            print(pdf_path)
            print(f"Neither txt nor pdf found for paper {paper_id}")

        # Metadata columns plus the recovered text under the 'text' key.
        year_data.append({**row.to_dict(), 'text': paper_text})

    # One JSON file per year.
    output_path = f'./data/unified_text/NeurIPS/{year}.json'
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(year_data, json_file, ensure_ascii=False, indent=2)


100%|██████████| 77/77 [00:00<00:00, 2208.22it/s]
100%|██████████| 72/72 [00:00<00:00, 1984.57it/s]

./data/pdfs/neurips_23_24/The PRISM Alignment Dataset_ What Participatory, Representative and Individualised Human Feedback Reveals About the Subjective and Multicultural Alignment of Large Language Models.pdf
Neither txt nor pdf found for paper DFr5hteojx





In [17]:
# The PDF of this paper is stored under a truncated file name, so the generic
# unify loop could not find it. Extract it by hand and patch the JSON entry.
target_title = 'The PRISM Alignment Dataset_ What Participatory, Representative and Individualised Human Feedback Reveals About the Subjective and Multicultural Alignment of Large Language Models'
json_path = "./data/unified_text/NeurIPS/2024.json"

text = extract_text_from_pdf('./data/pdfs/neurips_23_24/The PRISM Alignment Dataset_ What Participatory, Representative and Individualised Human Feedback Reveals About the Subjective and Multicultural Alignment of Large L.pdf')

with open(json_path, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

for entry in data:
    if entry['title'] != target_title:
        continue
    entry['text'] = text
    print('success')  # confirms the title actually matched an entry

# The file is rewritten in full, whether or not an entry matched.
with open(json_path, "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=2)


success


In [26]:
import pandas as pd
# Sanity check: compare the number of ICLR 2024 metadata rows with the number
# of entries in the unified JSON, and list every entry whose text is empty.
metadata = pd.read_csv('./data/metadata/ICLR_2021_2024.csv')
df = metadata[metadata['year'] == 2024]
print(df.shape)

with open("./data/unified_text/ICLR/ICLR_2024.json", "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

count = 0         # entries seen in the JSON
miss_content = 0  # entries whose 'text' is the empty string
for count, entry in enumerate(data, start=1):
    if entry['text'] != '':
        continue
    miss_content += 1
    print(entry['title'])

print(count)
print(miss_content)

(86, 11)
86
0
