In [6]:
# category = "cs.LG"
# Matched as a substring (SQL LIKE '%...%') against the `categories` column.
category = "ICLR.cc/2025/Conference"
# Inclusive lower/upper bounds compared against the `date` column in the
# submissions query.
start_date = "2024-09-03"
end_date = "2025-03-06"

# SQLite file that gets attached under the name `submissions`.
database_file = "or_submissions.sqlite3"

In [7]:
import io
from pathlib import Path
import sqltables.sqlite3
import numpy as np
import json
import sklearn.decomposition
import networkx as nx
from tqdm.notebook import tqdm
import datetime

In [9]:
# Create the working database handle and attach the two SQLite files used
# below: the submissions database (path from `database_file`) and the
# "seen" bookkeeping database.
db = sqltables.sqlite3.Database()
for attach_stmt in (
    f"attach 'file:{database_file}' as submissions",
    "attach 'file:seen.sqlite3' as seen",
):
    db.execute(attach_stmt)

# if not "titles_seen" in list(db.tables):
    # db.create_table(name="seen.titles_seen", column_names = ["arxiv_id", "seen"])

In [10]:
# Fetch the submissions that have not been marked as seen yet: the left join
# against titles_seen keeps only rows whose `seen` value is null. Results are
# restricted to the configured category (substring match) and to the inclusive
# [start_date, end_date] window; `group by arxiv_id` leaves one row per id.
unseen_submissions = db.query("""
select submissions.* from submissions left join titles_seen using (arxiv_id) 
where seen is null and categories like ? and date >= ? and date <= ? group by arxiv_id
""", kind="table", parameters=[f"%{category}%", start_date, end_date])

In [11]:
# Index the unseen submission rows by arxiv_id so titles and other fields can
# be looked up directly from an id.
metadata = dict((row.arxiv_id, row) for row in unseen_submissions)

In [12]:
# Join the unseen submissions with the `specter` table and keep exactly one
# paper_info value per arxiv_id (the minimum, when several rows exist).
# NOTE(review): `_` presumably refers to the table this view is derived from
# (unseen_submissions) -- confirm against the sqltables documentation.
paper_embeddings = unseen_submissions.view("""
select specter.arxiv_id, min(specter.paper_info) as paper_info 
from _ join specter using (arxiv_id) 
group by specter.arxiv_id
""")

In [13]:
# Count the rows up front so the id list and the embedding matrix can be
# preallocated.
[[N]] = paper_embeddings.view("select count(*) from _")
# Peek at the first row only to learn the embedding dimensionality.
# Note: next() raises StopIteration if paper_embeddings has no rows.
proto_row = next(iter(paper_embeddings))
proto_data = json.loads(proto_row.paper_info)
# arxiv_ids[i] and embeddings[i] are filled in together by the next cell.
arxiv_ids = [None] * N
embeddings = np.zeros((N, len(proto_data["embedding"]["vector"])))
N

3704

In [14]:
# Fill the preallocated containers: row `position` of `embeddings` holds the
# vector decoded from the JSON paper_info of the paper whose id is stored at
# arxiv_ids[position].
for position, record in enumerate(paper_embeddings):
    arxiv_ids[position] = record.arxiv_id
    vector = json.loads(record.paper_info)["embedding"]["vector"]
    embeddings[position, :] = np.array(vector)

In [15]:
embeddings

array([[-0.0431635 ,  0.54731268, -0.3499555 , ..., -0.72399843,
         0.22688085, -0.33621722],
       [-0.32873726,  0.2366316 , -0.84385812, ...,  0.08882914,
        -0.40554482, -0.6494661 ],
       [ 0.21524039,  0.31171572, -0.00566469, ..., -0.48830628,
        -0.04354364,  0.21095093],
       ...,
       [ 0.11689794,  0.49772665, -0.25748965, ..., -0.11842493,
        -0.17098482, -0.10038326],
       [ 0.41851151,  0.63480234, -0.21399161, ..., -0.80589968,
        -0.99163562, -0.15439582],
       [ 0.21851477,  0.6472494 , -0.62800056, ..., -0.45193392,
        -0.23375729, -0.34989017]])

In [16]:
def index_to_title(index):
    """Return the title of the paper stored at position ``index``.

    ``index`` is a row number into ``arxiv_ids`` / ``embeddings``; the id found
    there is used to look up the submission row in ``metadata``.
    """
    paper_id = arxiv_ids[index]
    return metadata[paper_id].title

In [17]:
# Reduce the embeddings to 16 principal components; the reduced vectors are
# only used for the nearest-neighbour search below.
# random_state is pinned so that Restart & Run All reproduces the same
# projection: for inputs of this size scikit-learn's automatic solver choice
# can be the randomized SVD, which is otherwise seeded differently per run.
pca = sklearn.decomposition.PCA(n_components=16, random_state=0)
reduced_embeddings = pca.fit_transform(embeddings)

In [18]:
reduced_embeddings.nbytes / 2**10

463.0

In [19]:
%%time
# Brute-force k-nearest-neighbour search in the PCA-reduced space.
# nearest_neighbours maps each row index to the list of its k closest other
# rows (fewer if there are not k other rows), closest first.
k = 8
nearest_neighbours = {}
for i in range(reduced_embeddings.shape[0]):
    emb = reduced_embeddings[i, :]
    # Squared Euclidean distance from row i to every row (including itself).
    d2 = np.sum((emb[None, :] - reduced_embeddings)**2, axis=-1)
    order = np.argsort(d2)
    # Drop row i by index rather than assuming it sorts first: when another
    # paper has an identical embedding (distance 0 tie) the sort may place the
    # duplicate ahead of i, which previously kept i as its own neighbour and
    # produced a self-loop in the kNN graph.
    nearest_neighbours[i] = order[order != i][:k].tolist()

CPU times: user 816 ms, sys: 7.65 ms, total: 824 ms
Wall time: 826 ms


In [20]:
nearest_neighbours

{0: [1998, 1363, 2050, 208, 922, 1961, 2184, 245],
 1: [2352, 2565, 2145, 1834, 3128, 1900, 262, 234],
 2: [2955, 1195, 717, 889, 2546, 1658, 2684, 2250],
 3: [1725, 3220, 879, 1771, 891, 469, 3094, 1935],
 4: [1459, 1920, 1795, 3694, 1889, 394, 233, 2118],
 5: [1146, 2912, 3112, 1581, 67, 652, 1740, 420],
 6: [3546, 213, 2850, 2494, 3039, 2964, 2170, 3464],
 7: [1825, 1016, 1387, 2409, 2631, 325, 1446, 2063],
 8: [3490, 750, 1886, 3466, 1895, 383, 958, 3494],
 9: [3188, 128, 802, 2741, 3314, 3372, 2171, 240],
 10: [2346, 52, 736, 1337, 2504, 2602, 380, 2167],
 11: [3117, 571, 3534, 2754, 1091, 2052, 1657, 1882],
 12: [1458, 2775, 2553, 3093, 395, 1845, 2591, 1999],
 13: [946, 3622, 1563, 3118, 1813, 1430, 2957, 801],
 14: [331, 825, 984, 3468, 801, 2708, 1735, 353],
 15: [1200, 2412, 2441, 1439, 1258, 594, 2242, 3289],
 16: [1216, 3683, 3313, 1427, 1596, 995, 3308, 1491],
 17: [1554, 2756, 3370, 1753, 3175, 1149, 2742, 3608],
 18: [2201, 91, 1564, 2845, 2622, 2967, 2415, 3037],
 19: [

In [21]:
# Build a networkx graph from the neighbour lists: every index becomes a node
# and is connected to each index in its neighbour list.
nn_graph = nx.from_dict_of_lists(nearest_neighbours)

In [22]:
# Annotate every kNN edge with the Euclidean distance between its endpoints,
# measured in the full (un-reduced) embedding space. The same value is stored
# under both "distance" and "weight".
for u, v, attrs in nn_graph.edges(data=True):
    dist = np.linalg.norm(embeddings[u] - embeddings[v])
    attrs.update(distance=dist, weight=dist)

In [23]:
# distances = [nn_graph.edges[e]["distance"] for e in nn_graph.edges]
# dist_q = np.quantile([nn_graph.edges[e]["distance"] for e in nn_graph.edges], 1/k)
# for e in nn_graph.edges:
#     dist = nn_graph.edges[e]["distance"]
#     nn_graph.edges[e]["weight"] = np.exp(-dist**2/(2*dist_q**2))

In [24]:
np.histogram([nn_graph.edges[e]["weight"] for e in nn_graph.edges])
    

(array([  48,  442, 1942, 4527, 5868, 4715, 2524,  825,  172,   28]),
 array([ 4.70592025,  5.4927615 ,  6.27960275,  7.06644399,  7.85328524,
         8.64012649,  9.42696774, 10.21380899, 11.00065023, 11.78749148,
        12.57433273]))

In [25]:
# Minimum spanning tree of the kNN graph, using the "weight" edge attribute.
mst = nx.minimum_spanning_tree(nn_graph, weight="weight")
# Sanity check: number of connected components. The height computation below
# looks up a path length from every non-leaf to every leaf, so it presumably
# needs this to be 1 (a single tree).
len(list(nx.connected_components(mst)))

1

In [26]:
# Node degrees within the spanning tree.
degrees = nx.degree(mst)
# Leaves are the nodes with exactly one tree edge.
leaves = {n for n, d in degrees if d == 1}
# Display the leaf count and the fraction of all nodes that are leaves.
len(leaves), len(leaves) / len(mst)

(1690, 0.4562634989200864)

In [27]:
np.histogram([d for _, d in degrees])

(array([1690, 1051,  548,  238,   95,   49,   20,    9,    3,    1]),
 array([ 1. ,  1.9,  2.8,  3.7,  4.6,  5.5,  6.4,  7.3,  8.2,  9.1, 10. ]))

In [28]:
non_leaves = set(mst.nodes) - leaves

In [29]:
%%time
# All-pairs weighted path lengths in the spanning tree, materialised as a
# dict keyed by source node whose values map target node -> length.
# Only the entries for non-leaf sources and for the root are read below.
shortest_path_lengths = dict(nx.shortest_path_length(mst, weight="weight"))

CPU times: user 9.25 s, sys: 117 ms, total: 9.37 s
Wall time: 9.37 s


In [30]:
# Height of an internal (non-leaf) node = weighted tree distance to the
# closest leaf.
heights = {
    node: min(shortest_path_lengths[node][leaf] for leaf in leaves)
    for node in non_leaves
}

In [31]:
# Leaves sit at height 0 by definition.
heights.update(dict.fromkeys(leaves, 0))

In [32]:
len(heights)

3704

In [33]:
# Root candidates are the non-leaf nodes whose height (distance to the
# nearest leaf) equals the maximum over all nodes.
max_height = max(heights.values())
roots = [n for n in non_leaves if heights[n] == max_height]
roots, max_height

([1122], 30.104654975372902)

In [34]:
# For each root candidate, record the distance to its farthest leaf; this is
# used to pick one root when several candidates share the maximum height.
root_max_heights = {
    candidate: max(
        [dist for target, dist in shortest_path_lengths[candidate].items() if target in leaves]
    )
    for candidate in roots
}

In [35]:
root_max_heights

{1122: 544.4216992752062}

In [36]:
max_min_roots = sorted(roots, key=lambda node: root_max_heights[node])
max_min_roots

[1122]

In [37]:
root = max_min_roots[0]

In [38]:
# Tree path between every unordered pair of root candidates, keyed by the
# (smaller, larger) node pair. Empty when there is only one candidate.
sp = {
    (u, v): nx.shortest_path(mst, source=u, target=v)
    for u in roots
    for v in roots
    if u < v
}

In [39]:
{k: len(path) for k, path in sp.items()}

{}

In [40]:
[(n, index_to_title(n)) for n in roots]

[(1122,
  'On Designing General and Expressive Quantum Graph Neural Networks with Applications to MILP Instance Representation')]

In [41]:
mst[root]

AtlasView({656: {'distance': 7.652677227657469, 'weight': 7.652677227657469}, 2018: {'distance': 8.914593029877075, 'weight': 8.914593029877075}})

In [42]:
# Reading order: depth-first preorder traversal of the spanning tree starting
# at the chosen root. Positions in this list become `position` in the
# paper_ordering table.
nodes = list(nx.depth_first_search.dfs_preorder_nodes(mst, root))
nodes

[1122,
 656,
 3270,
 2172,
 556,
 641,
 2747,
 526,
 3130,
 947,
 809,
 161,
 3360,
 3123,
 2729,
 3073,
 3141,
 3455,
 1948,
 1951,
 3126,
 2778,
 3509,
 2186,
 1922,
 3186,
 786,
 121,
 2579,
 352,
 252,
 3461,
 210,
 1761,
 3286,
 237,
 3215,
 816,
 1224,
 622,
 3038,
 521,
 296,
 3263,
 1988,
 1071,
 439,
 76,
 3135,
 235,
 1855,
 357,
 1035,
 3092,
 1440,
 3220,
 2610,
 2357,
 1771,
 3499,
 2440,
 3350,
 3315,
 13,
 946,
 1430,
 2245,
 1563,
 2957,
 3118,
 133,
 405,
 3051,
 2241,
 1696,
 1367,
 894,
 1025,
 2268,
 2547,
 3622,
 3518,
 2404,
 3543,
 3590,
 59,
 255,
 2924,
 1711,
 2029,
 3311,
 1085,
 1588,
 476,
 1200,
 2412,
 2769,
 2351,
 570,
 1278,
 145,
 3347,
 3562,
 3111,
 2441,
 2181,
 1439,
 2632,
 2458,
 3150,
 1561,
 694,
 586,
 542,
 1445,
 1843,
 380,
 468,
 781,
 2110,
 559,
 1092,
 1356,
 3331,
 3201,
 850,
 543,
 929,
 1136,
 1868,
 590,
 2965,
 2383,
 1957,
 774,
 1942,
 3203,
 1382,
 3021,
 360,
 1666,
 2663,
 3559,
 3616,
 952,
 3363,
 2557,
 911,
 1800,
 762,


In [43]:
[(heights[n], index_to_title(n)) for n in nodes[:20]]

[(30.104654975372902,
  'On Designing General and Expressive Quantum Graph Neural Networks with Applications to MILP Instance Representation'),
 (22.451977747715432,
  'Learning Efficient Positional Encodings with Graph Neural Networks'),
 (15.860370916137363,
  'Holographic Node Representations: Pre-training Task-Agnostic Node Embeddings'),
 (12.764971251664512,
  'KAA: Kolmogorov-Arnold Attention for Enhancing Attentive Graph Neural Networks'),
 (5.711944161920898, 'Edge Prompt Tuning for Graph Neural Networks'),
 (0,
  'HG-Adapter: Improving Pre-Trained Heterogeneous Graph Neural Networks with Dual Adapters'),
 (6.974092798251593,
  'Node-Time Conditional Prompt Learning in Dynamic Graphs'),
 (0,
  'TGB-Seq Benchmark: Challenging Temporal GNNs with Complex Sequential Dynamics'),
 (8.279342846075522,
  'N-ForGOT: Towards Not-forgetting and Generalization of Open Temporal Graph Learning'),
 (0,
  'Expand and Compress: Exploring Tuning Principles for Continual Spatio-Temporal Graph For

In [44]:
# differences[i] is the full-space Euclidean distance between the paper at
# position i of the reading order and the one just before it; the first entry
# has no predecessor and stays 0.
differences = np.zeros([len(nodes)])
for i in range(1, len(nodes)):
    current_vec = embeddings[nodes[i]]
    previous_vec = embeddings[nodes[i - 1]]
    differences[i] = np.linalg.norm(current_vec - previous_vec)

In [45]:
np.min(differences[1:])

4.705920249127938

In [46]:
root_distances = shortest_path_lengths[root]

In [47]:
# Rebuild the ordering table from scratch on every run.
if "paper_ordering" in list(db.tables):
    db.drop_table("paper_ordering")
paper_ordering = db.create_table(
    name="paper_ordering",
    column_names=["arxiv_id", "position", "difference", "depth", "height"],
)
# One row per paper in reading order:
#   difference -> distance to the previous paper in the order
#   depth      -> weighted tree distance from the root
#   height     -> max_height minus the node's height (so the root gets 0)
ordering_rows = [
    (
        arxiv_ids[node_id],
        position,
        differences[position],
        root_distances[node_id],
        max_height - heights[node_id],
    )
    for position, node_id in enumerate(nodes)
]
paper_ordering.insert(ordering_rows)
paper_ordering

|arxiv\_id|position|difference|depth|height|
|-|-|-|-|-|
|\'IQi8JOqLuv\'|0|0\.0|0|0\.0|
|\'AWg2tkbydO\'|1|7\.652677227657469|7\.652677227657469|7\.65267722765747|
|\'tGYFikNONB\'|2|6\.59160683157807|14\.24428405923554|14\.24428405923554|
|\'atXCzVSXTJ\'|3|6\.95913819698351|21\.20342225621905|17\.33968372370839|
|\'92vMaHotTM\'|4|7\.0530270897436145|28\.256449345962665|24\.392710813452005|
|\'AEglX9CHFN\'|5|5\.711944161920898|33\.96839350788356|30\.104654975372902|
|\'kVlfYvIqaK\'|6|8\.305180951463095|34\.86349968430078|23\.13056217712131|
|\'8e2LirwiJT\'|7|6\.974092798251593|41\.83759248255237|30\.104654975372902|
|\'rLlDt2FQvz\'|8|8\.992488204258063|42\.89613443672793|21\.82531212929738|
|\'FRzCIlkM7I\'|9|8\.279342846075522|51\.17547728280345|30\.104654975372902|
|\'D756s2YQ6b\'|10|9\.764342256123411|35\.08603834750984|17\.563121811904825|
|\'2jf5x5XoYk\'|11|6\.8664408144029005|41\.95247916191274|22\.358451020077567|
|\'uTqnyF0JNR\'|12|7\.746203955295334|49\.69868311720808|30\.104654975372902|
|\'rEQqBZIz49\'|13|9\.17068916385615|42\.02078777731948|22\.35204386715998|
|\'kJ5H7oGT2M\'|14|6\.262714883410703|48\.28350266073018|16\.38737602596396|
|\'qFw2RFJS5g\'|15|6\.568475069939378|54\.85197773066956|22\.95585109590334|
|...|...|...|...|...|


In [48]:
# Read the `difference` column back from the table. This rebinds
# `differences` (a numpy array until now) to a plain tuple of the stored
# values, and fails to unpack if the table has no rows.
[differences] = zip(*db.query("select difference from paper_ordering"))

In [49]:
# Thresholds at the 75th and 90th percentile of the step distances.
diff_quantiles = np.quantile(differences, [0.75, 0.9])


def diff_color(diff):
    """Map a step distance to a grey CSS colour string.

    The distance is bucketed by how many of the ``diff_quantiles`` thresholds
    lie below it; each bucket darkens the grey, from 100% (white) for the
    lowest bucket down to 50% for the highest.
    """
    bucket = np.searchsorted(diff_quantiles, diff)
    lightness = 1 - 0.5*(bucket / len(diff_quantiles))
    percent = lightness*100
    return f"rgb({percent}%, {percent}%, {percent}%)"

In [50]:
diff_color(10)

'rgb(75.0%, 75.0%, 75.0%)'

In [51]:
def quote_html(text):
    """Escape ``text`` for inclusion in HTML text and quoted attribute values.

    Replaces ``&``, ``<`` and ``>`` as well as both quote characters. The
    quotes matter because the result is interpolated into single-quoted
    attributes (``href='...'``, ``id='...'``, ``onclick='...'``); an unescaped
    quote in a URL or id would terminate the attribute early.

    ``&`` is replaced first so the entities produced by the later
    replacements are not escaped a second time.
    """
    return (
        text.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
        .replace("'", "&#x27;")
    )

In [52]:
# Slashes in the category would create nested directories, so replace them
# before using the category as part of the directory name.
q_category = category.replace("/", "_")
# Output directory for the generated HTML pages, named after the category and
# date window; created if missing.
output_dir = Path(f"ArXiv_{q_category}_{start_date}_{end_date}")
output_dir.mkdir(parents=True, exist_ok=True)

In [54]:
# Shared page header: minimal styling plus a toggle() helper that shows or
# hides the detail block whose element id is passed in.
preamble = """
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<style>
body { margin: 20px; }
.date { width: 8ex; display: inline-block; }
</style>
<script>
function toggle(arxiv) {
  let elt = document.getElementById(arxiv);
  console.log(elt, elt.style.display);
  if(elt.style.display == "block") {
    elt.style.display = "none";
  } else {
    elt.style.display = "block";
  }
}
</script>
</head>
<body>
"""
# Every ordered paper joined with its submission record, in reading order.
rows = list(db.query("""
select * from paper_ordering join submissions using (arxiv_id) group by arxiv_id order by position 
"""))
batch_size = 256
# Number of pages to write: ceiling division, so an exact multiple of
# batch_size no longer yields a trailing empty page. At least one page is
# always written so arxiv_0.html exists even when there are no rows.
batch_count = max(1, -(-len(rows) // batch_size))
for batch in range(batch_count):
    batch_rows = rows[(batch*batch_size):((batch+1)*batch_size)]
    buf = io.StringIO()
    buf.write(preamble)
    for row in batch_rows:
        # Everything interpolated into the markup goes through quote_html.
        arxiv = quote_html(row.arxiv_id)
        title = quote_html(row.title)
        authors = quote_html(row.authors)
        date = quote_html(row.date)
        date_short = quote_html(datetime.datetime.fromisoformat(row.date).strftime("%b %y"))
        body_q = quote_html(row.abstract)
        url = quote_html(row.url)
        # The separator line above each title is shaded by how far this paper
        # is from the previous one in the ordering.
        color = diff_color(row.difference)
        buf.write(f"<div>")
        # Clickable title row; clicking toggles the detail block below.
        buf.write(f"""
                  <div style='margin-top: 5px; border-top: 1px solid {color}' onclick='toggle(\"{arxiv}\")'>
                  <span class='date'>{date_short}</span><span class='title'>{title}</span>
                  </div>\n""")
        # Detail block, hidden until toggled.
        buf.write(f"<div id='{arxiv}' style='display: none; margin-left: 20px'>")
        buf.write(f"<div>Date: {date}</div>")
        buf.write(f"<div>Authors: {authors}</div>")
        buf.write(f"<div style='padding-top: 10px; width: 80ex'>{body_q}</div><div><a href='{url}'>{arxiv}</a></div>\n")
        buf.write("</div>")
        buf.write("</div>")

    # Navigation: only link to pages that are actually generated (no
    # arxiv_-1.html on the first page, no dangling "Next" on the last).
    if batch > 0:
        buf.write(f"""
    <div><a href="arxiv_{batch-1}.html">Prev ({batch-1})</a></div>""")
    if batch + 1 < batch_count:
        buf.write(f"""
    <div><a href="arxiv_{batch+1}.html">Next ({batch+1})</a></div>""")
    buf.write("""
    </body>
    </html>
    """)
    # The page declares charset utf-8, so write it as UTF-8 regardless of the
    # platform's default encoding.
    with open(output_dir / f"arxiv_{batch}.html", "w", encoding="utf-8") as f:
        f.write(buf.getvalue())

'Feb 25'

In [51]:
buf

<_io.StringIO at 0x34a1d2b90>

In [52]:
db.query("select count(*) from paper_ordering")

|count\(\*\)|
|-|
|3044|


In [53]:
# if "seen.titles_seen" not in db.tables:
#     db.create_table(name="seen.titles_seen", column_names=["arxiv_id", "seen"], column_types={"seen": "int"})

In [54]:
titles_seen = db.open_table("seen.titles_seen")

In [53]:
# Manual switch: flip to `if True:` to record every paper currently in
# paper_ordering as seen (seen = 1), which excludes it from the
# unseen_submissions query on later runs. Left disabled by default.
# if True:
if False:
    db.execute("""
    insert into titles_seen select arxiv_id, 1 from paper_ordering
    """)

In [54]:
titles_seen.view("select count(distinct arxiv_id) from _")

|count\(distinct arxiv\_id\)|
|-|
|0|


In [54]:
# titles_seen1 = titles_seen
# titles_seen_data = list(titles_seen)

In [55]:
# import json
# with open("titles_seen.json", "w+") as f:
#     json.dump(list(titles_seen), f)