In [1]:
import plotly.express as px
from sklearn.decomposition import PCA
import os
from src import DATA_DIR, PROJECT_ROOT, get_paper_info
from src.models.graph import Graph
from src.utils.io import load_json
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# --- Locations -------------------------------------------------------------
# Embeddings (JSON) and the NeurIPS metadata table (CSV) live under DATA_DIR.
emb_path = os.path.join(DATA_DIR, 'emb', 'NeurIPS', '2024', '20250602_2037', 'emb.json')
data_path = os.path.join(DATA_DIR, 'metadata', 'neurips_2023_2024.csv')

# Directory of the clustering run; clusters.csv is read from it and the
# figures produced further down are written back into it.
output_path = os.path.join(PROJECT_ROOT, 'result', 'NeurIPS', '2024', '20250602_2158')
clustering = os.path.join(output_path, 'clusters.csv')

# --- Load ------------------------------------------------------------------
emb = load_json(emb_path)            # later used as a lookup via df['id'].map(emb)
df = pd.read_csv(data_path)          # paper metadata, one row per paper
cluster_df = pd.read_csv(clustering)  # one row per clustering method

In [10]:
# Keep only the 2024 papers. The explicit .copy() makes `df` an independent
# frame, so the column assignments below (and in later cells) write to a real
# DataFrame instead of a slice of the original, avoiding SettingWithCopyWarning
# and its ambiguous write semantics.
df = df[df['year'] == 2024].copy()

# Attach each paper's embedding by looking its id up in `emb`; ids that are
# absent from `emb` end up as NaN.
df['emb'] = df['id'].map(emb)

In [11]:
# Create a mapping for each method
method_names = cluster_df['method'].tolist()
# Every column other than 'method' is treated as a session. Selecting by name
# (rather than by position) keeps this correct even if 'method' is not the
# first column of the CSV.
session_names = cluster_df.columns.drop('method')

# Build: method -> id -> cluster name
method_to_id_to_cluster = {}

for _, row in cluster_df.iterrows():
    method = row['method']
    id_to_cluster = {}
    for session in session_names:
        cell = row[session]
        if pd.isna(cell):
            # Empty cell: this method put no paper into this session.
            continue
        for id_ in str(cell).split(','):
            # Tolerate "a, b" style lists and trailing commas: without the
            # strip, " b" would never match df['id'] and the paper would
            # silently fall into the "None" bucket below.
            id_ = id_.strip()
            if id_:
                id_to_cluster[id_] = session  # assign paper to the session name
    method_to_id_to_cluster[method] = id_to_cluster

# One column per method holding the session each paper was assigned to;
# papers that the method did not place anywhere get the string "None".
for method, id_to_cluster in method_to_id_to_cluster.items():
    df[method] = df['id'].map(id_to_cluster).fillna("None")  # or use np.nan

In [16]:
# Preview the first five rows (head() defaults to n=5).
# NOTE(review): the saved output of this cell includes the tsne_0 / tsne_1
# columns, which are only added by the t-SNE cell further down, so it was
# produced out of order; a fresh top-to-bottom run will not show them here.
df.head()

Unnamed: 0,authors,publisher,title,url,year,abstract,session,pdf_url,openreview_url,id,forum_content,emb,Baseline,GreedyCohesive,KMedoids,KMeans,tsne_0,tsne_1
0,"['Tianyu He', 'Darshil Doshi', 'Aritra Das', '...",NeurIPS,Learning to grok_ Emergence of in-context lear...,https://neurips.cc/virtual/2024/oral/97968,2024,Large language models can solve tasks that we...,Oral Session 1A: Neuroscience and Intepretability,https://openreview.net/pdf?id=aVh9KRZdRk,https://openreview.net/forum?id=aVh9KRZdRk,aVh9KRZdRk,[{'content': {'title': {'value': 'Paper Decisi...,"[0.025920594792352154, -0.006532610617907675, ...",Oral Session 1A: Neuroscience and Intepretability,Oral Session 1A: Neuroscience and Intepretability,Oral Session 1A: Neuroscience and Intepretability,Oral Session 1A: Neuroscience and Intepretability,-2.343757,-2.031617
1,"['Jin Zhang', 'Ze Liu', 'Defu Lian', 'Enhong C...",NeurIPS,Generalization Error Bounds for Two-stage Reco...,https://neurips.cc/virtual/2024/oral/97958,2024,Two-stage recommender systems play a crucial ...,Oral Session 1D: Learning Theory,https://openreview.net/pdf?id=m1a4CrRJR7,https://openreview.net/forum?id=m1a4CrRJR7,m1a4CrRJR7,[{'content': {'title': {'value': 'Paper Decisi...,"[0.0005160157651468916, 0.009975187560102265, ...",Oral Session 1D: Learning Theory,Oral Session 1D: Learning Theory,Oral Session 1D: Learning Theory,Oral Session 1B: Human-AI Interaction,-8.007804,3.073959
2,"['Aaron Defazio', 'Xingyu Yang', 'Ahmed Khaled...",NeurIPS,The Road Less Scheduled,https://neurips.cc/virtual/2024/oral/98003,2024,Existing learning rate schedules that do not ...,Oral Session 1C: Optimization and Learning Theory,https://openreview.net/pdf?id=0XeNkkENuI,https://openreview.net/forum?id=0XeNkkENuI,0XeNkkENuI,[{'content': {'title': {'value': 'Paper Decisi...,"[-0.0075764014618471265, -0.008110680956054817...",Oral Session 1C: Optimization and Learning Theory,Oral Session 6C: New Data,Oral Session 1C: Optimization and Learning Theory,Oral Session 1C: Optimization and Learning Theory,-8.981425,3.619922
3,"['Rohan Alur', 'Manish Raghavan', 'Devavrat Sh...",NeurIPS,Human Expertise in Algorithmic Prediction,https://neurips.cc/virtual/2024/oral/97946,2024,We introduce a novel framework for incorporat...,Oral Session 1B: Human-AI Interaction,https://openreview.net/pdf?id=wpGJ2AX6SZ,https://openreview.net/forum?id=wpGJ2AX6SZ,wpGJ2AX6SZ,[{'content': {'comment': {'value': 'Thank you ...,"[0.006127413471889062, -0.0129554309300147, -0...",Oral Session 1B: Human-AI Interaction,Oral Session 1D: Learning Theory,Oral Session 1D: Learning Theory,Oral Session 1B: Human-AI Interaction,-8.261276,2.920403
4,"['Shen Li', 'Yuyang Zhang', 'Zhaolin Ren', 'Cl...",NeurIPS,Enhancing Preference-based Linear Bandits via ...,https://neurips.cc/virtual/2024/oral/97969,2024,Interactive preference learning systems infer...,Oral Session 2A: Agents,https://openreview.net/pdf?id=aIPwlkdOut,https://openreview.net/forum?id=aIPwlkdOut,aIPwlkdOut,[{'content': {'title': {'value': 'Paper Decisi...,"[0.017408840711030996, -0.014277594156958388, ...",Oral Session 2A: Agents,Oral Session 1C: Optimization and Learning Theory,Oral Session 1D: Learning Theory,Oral Session 2A: Agents,-8.037498,2.53764


In [12]:
# Fail early, and name the offending papers, if any row has no embedding:
# otherwise np.vstack dies with an opaque shape-mismatch error on the NaN.
missing_emb_ids = df.loc[df['emb'].isna(), 'id'].tolist()
if missing_emb_ids:
    raise ValueError(
        f"{len(missing_emb_ids)} paper(s) have no embedding in emb.json: {missing_emb_ids}"
    )

# Stack the per-paper embedding vectors row-wise into one 2-D matrix
# (one row per paper, in the same order as df).
X = np.vstack(df['emb'].to_numpy())
print(X.shape)

(72, 3072)


In [13]:
import numpy as np
from scipy.spatial import ConvexHull
import plotly.graph_objects as go
import plotly.io as pio

# Project the embeddings onto their first two principal components.
# The previous version plotted columns 0 and 1 of X directly, i.e. two raw
# embedding dimensions, although the figures are titled and saved as "PCA".
# random_state is fixed because sklearn may pick the randomized SVD solver
# for a matrix of this shape.
pca_coords = PCA(n_components=2, random_state=42).fit_transform(X)
pca_df = df.assign(pca_0=pca_coords[:, 0], pca_1=pca_coords[:, 1])

# Shared colour palette; the t-SNE plotting cell below reuses this name.
custom_palette = px.colors.qualitative.Dark24 + px.colors.qualitative.Set3

# Loop-invariant: create the output folder once.
pca_dir = os.path.join(output_path, "PCA")
os.makedirs(pca_dir, exist_ok=True)

for method in method_names:
    # One scatter per clustering method, coloured by the session that method
    # assigned to each paper.
    fig = px.scatter(pca_df, x="pca_0", y="pca_1",
                     title=f"{method} Clustering PCA",
                     hover_name="id",
                     color=method,
                     color_discrete_sequence=custom_palette)

    fig.show()
    pio.write_html(fig, file=os.path.join(pca_dir, f"{method} Clustering PCA.html"), auto_open=False)

In [14]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from scipy.spatial import ConvexHull
import plotly.express as px
import plotly.graph_objects as go


# Step 2: embed X into two dimensions with t-SNE (fixed random_state)
tsne = TSNE(random_state=42, perplexity=30, n_components=2)
components = tsne.fit_transform(X)

# Step 3: keep both t-SNE coordinates as columns on df so the plotting
# cell can refer to them by name
df['tsne_0'], df['tsne_1'] = components[:, 0], components[:, 1]


In [15]:
# Step 4: plot the t-SNE projection once per clustering method, coloured by
# the session that method assigned to each paper.
# (No convex hulls are drawn here, despite what the earlier comment said.)
# Relies on `custom_palette` and `pio` from the PCA plotting cell above.

# Loop-invariant: create the output folder once.
tsne_dir = os.path.join(output_path, "t-SNE")
os.makedirs(tsne_dir, exist_ok=True)

for method in method_names:  # e.g. ["Baseline", "GreedyCohesive"]
    fig = px.scatter(
        df,
        x="tsne_0",
        y="tsne_1",
        color=method,
        hover_name="title",
        # Fixed: the title used to end with a stray comma ("... — {method},").
        title=f"t-SNE Clustering — {method}",
        color_discrete_sequence=custom_palette
    )
    pio.write_html(fig, file=os.path.join(tsne_dir, f"{method} Clustering t-SNE.html"), auto_open=False)
    fig.show()


Idea to evaluate:
- How much do the clusters' polytopes (convex hulls) overlap each other?
  - Do we also care about one polytope completely containing another?

## Plotting:

In [29]:
# Evaluation table stored in the clustering run's result directory.
eval_path = os.path.join(
    PROJECT_ROOT, 'result', 'NeurIPS', '2024', '20250602_2158', 'eval.csv'
)

# NOTE(review): this rebinds `df`, which held the per-paper table in the cells
# above; re-running those cells after this one will operate on the eval table.
df = pd.read_csv(eval_path)

In [32]:
import plotly.express as px

def _grouped_metric_bars(eval_df, metrics, title):
    """Reshape `metrics` to long form and draw them as bars grouped by model.

    Args:
        eval_df: frame with a 'model' column plus one column per metric.
        metrics: names of the metric columns to include.
        title: figure title.

    Returns:
        (long_df, fig): the melted frame with columns model / metric / value,
        and the dark-themed grouped bar figure built from it.
    """
    long_df = eval_df.melt(id_vars='model', value_vars=metrics,
                           var_name='metric', value_name='value')
    fig = px.bar(
        long_df,
        x='metric',
        y='value',
        color='model',
        barmode='group',
        title=title
    )
    fig.update_layout(template='plotly_dark')
    return long_df, fig


# --- Metric groups ---
intracluster_metrics = ['avg_compactness', 'avg_diameter', 'max_diameter', 'worst_avg_spread']
objective_metrics = ['k-means', 'k-medoids']

# --- Plot 1: Intra-cluster Metrics ---
df_intracluster, fig1 = _grouped_metric_bars(
    df, intracluster_metrics, "Intra-cluster Metrics Across Clustering Methods"
)
fig1.show()

# --- Plot 2: Objective Scores ---
df_objectives, fig2 = _grouped_metric_bars(
    df, objective_metrics, "Objective Scores (k-means / k-medoids)"
)
fig2.show()
