In [1]:
category = "cs.LG"
start_date = "2024-01-01"
end_date = "2024-03-25"

In [6]:
import io
from pathlib import Path
import sqltables.sqlite3
import numpy as np
import json
import sklearn.decomposition
import networkx as nx
from tqdm.notebook import tqdm

In [10]:
db = sqltables.sqlite3.Database()
db.execute("attach 'file:submissions.sqlite3' as submissions")
db.execute("attach 'file:seen.sqlite3' as seen")

if not "titles_seen" in list(db.tables):
    db.create_table(name="seen.titles_seen", column_names = ["arxiv_id", "seen"])

In [11]:
unseen_submissions = db.query("""
select submissions.* from submissions left join titles_seen using (arxiv_id) 
where seen is null and categories like ? and date >= ? and date <= ? group by arxiv_id
""", kind="table", parameters=[f"%{category}%", start_date, end_date])

In [12]:
metadata = {
    row.arxiv_id: row for row in unseen_submissions
}

In [13]:
paper_embeddings = unseen_submissions.view("""
select specter.arxiv_id, min(specter.paper_info) as paper_info 
from _ join specter using (arxiv_id) 
group by specter.arxiv_id
""")

In [14]:
[[N]] = paper_embeddings.view("select count(*) from _")
proto_row = next(iter(paper_embeddings))
proto_data = json.loads(proto_row.paper_info)
arxiv_ids = [None] * N
embeddings = np.zeros((N, len(proto_data["embedding"]["vector"])))
N

7069

In [15]:
for i, row in enumerate(paper_embeddings):
    arxiv_ids[i] = row.arxiv_id
    data = json.loads(row.paper_info)
    embedding = np.array(data["embedding"]["vector"])
    embeddings[i, :] = embedding

In [16]:
embeddings

array([[-1.12351441,  0.26094788, -0.88533551, ..., -0.68056774,
        -0.09879048,  1.18099356],
       [-0.042441  ,  1.11172628,  0.12678108, ...,  0.55670911,
         0.96004272,  1.05475974],
       [ 0.38378209, -0.32668093,  0.0048793 , ...,  0.69098097,
        -1.19854558,  1.23891664],
       ...,
       [-0.58571446,  0.7200219 , -0.63186663, ...,  0.13034166,
        -0.24664292,  0.99787116],
       [-0.57143569,  0.55115372, -0.2706224 , ..., -0.35181341,
        -0.79355806,  1.34393823],
       [-0.24784255,  0.6424095 , -0.94965935, ..., -0.70770955,
        -0.05074714,  1.08887219]])

In [17]:
def index_to_title(index):
    return metadata[arxiv_ids[index]].title

In [18]:
pca = sklearn.decomposition.PCA(16)
reduced_embeddings = pca.fit_transform(embeddings)

In [19]:
reduced_embeddings.nbytes / 2**10

883.625

In [20]:
%%time
k = 8
nearest_neighbours = {}
for i in range(reduced_embeddings.shape[0]):
    emb = reduced_embeddings[i, :]
    d2 = np.sum((emb[None, :] - reduced_embeddings)**2, axis=-1)
    nearest_neighbours[i] = np.argsort(d2)[1:(k+1)].tolist()

CPU times: user 3.1 s, sys: 32.4 ms, total: 3.13 s
Wall time: 3.14 s


In [21]:
nearest_neighbours

{0: [6365, 736, 2040, 5417, 4470, 5563, 6123, 6545],
 1: [908, 1734, 2832, 1279, 6659, 6023, 6960, 5571],
 2: [2538, 242, 3212, 5532, 4874, 5953, 517, 2358],
 3: [1462, 2328, 5551, 1758, 1564, 174, 1476, 6622],
 4: [4011, 3778, 3692, 3550, 6167, 6949, 4649, 5696],
 5: [6662, 2818, 5291, 3195, 2941, 2421, 3012, 3213],
 6: [5839, 5055, 5502, 1269, 3514, 3377, 4142, 774],
 7: [6669, 1450, 3077, 6332, 2467, 2824, 5715, 6047],
 8: [6799, 474, 6592, 265, 5694, 4793, 473, 5384],
 9: [97, 3218, 5823, 6136, 1577, 5636, 470, 319],
 10: [844, 404, 2631, 6816, 5244, 6863, 986, 5844],
 11: [1163, 2739, 1971, 229, 2375, 1586, 2129, 3919],
 12: [4352, 3729, 1448, 4782, 3352, 895, 4528, 1167],
 13: [4364, 4541, 282, 2454, 4004, 4555, 1973, 4300],
 14: [542, 2825, 1851, 6800, 6631, 251, 1677, 1923],
 15: [258, 375, 870, 991, 549, 313, 426, 5006],
 16: [1899, 6421, 1504, 5000, 5924, 1518, 1391, 6068],
 17: [1118, 6933, 5861, 876, 6321, 4246, 5643, 4588],
 18: [408, 1894, 1, 5245, 5671, 2224, 4365, 269],

In [22]:
nn_graph = nx.from_dict_of_lists(nearest_neighbours)

In [23]:
for u, v in nn_graph.edges:
    dist = np.linalg.norm(embeddings[u] - embeddings[v])
    nn_graph.edges[(u, v)]["distance"] = dist
    nn_graph.edges[(u, v)]["weight"] = dist

In [24]:
# distances = [nn_graph.edges[e]["distance"] for e in nn_graph.edges]
# dist_q = np.quantile([nn_graph.edges[e]["distance"] for e in nn_graph.edges], 1/k)
# for e in nn_graph.edges:
#     dist = nn_graph.edges[e]["distance"]
#     nn_graph.edges[e]["weight"] = np.exp(-dist**2/(2*dist_q**2))

In [25]:
np.histogram([nn_graph.edges[e]["weight"] for e in nn_graph.edges])
    

(array([   45,   761,  5282, 12213, 12577,  6805,  2300,   507,    59,
            2]),
 array([ 5.57917585,  7.27403388,  8.96889191, 10.66374995, 12.35860798,
        14.05346601, 15.74832404, 17.44318207, 19.1380401 , 20.83289814,
        22.52775617]))

In [26]:
mst = nx.minimum_spanning_tree(nn_graph, weight="weight")
len(list(nx.connected_components(mst)))

1

In [27]:
degrees = nx.degree(mst)
leaves = {n for n, d in degrees if d == 1}
len(leaves), len(leaves) / len(mst)

(3421, 0.4839439807610695)

In [28]:
np.histogram([d for _, d in degrees])

(array([5276, 1389,  206,  148,   29,   19,    0,    0,    1,    1]),
 array([ 1. ,  2.6,  4.2,  5.8,  7.4,  9. , 10.6, 12.2, 13.8, 15.4, 17. ]))

In [29]:
non_leaves = set(mst.nodes) - leaves

In [30]:
%%time
shortest_path_lengths = dict(nx.shortest_path_length(mst, weight="weight"))

CPU times: user 34.4 s, sys: 311 ms, total: 34.7 s
Wall time: 34.7 s


In [31]:
heights = {}
for node in non_leaves:
    lengths = shortest_path_lengths[node]
    h = min(lengths[leaf] for leaf in leaves)
    heights[node] = h

In [32]:
for n in leaves:
    heights[n] = 0

In [33]:
len(heights)

7069

In [34]:
max_height = max(heights.values())
roots = [n for n in non_leaves if heights[n] == max_height]
roots, max_height

([429], 56.019965408605586)

In [35]:
root_max_heights = {}
for node in roots:
    path_lengths = shortest_path_lengths[node]
    root_max_heights[node] = max([l for n, l in path_lengths.items() if n in leaves])

In [36]:
root_max_heights

{429: 705.6890839348234}

In [37]:
max_min_roots = sorted(roots, key=lambda node: root_max_heights[node])
max_min_roots

[429]

In [38]:
root = max_min_roots[0]

In [39]:
sp = {}
for u in roots:
    for v in roots:
        if u < v:
            sp[(u, v)] = nx.shortest_path(mst, source=u, target=v)

In [40]:
{k: len(path) for k, path in sp.items()}

{}

In [41]:
[(n, index_to_title(n)) for n in roots]

[(429,
  'SPT: Spectral Transformer for Red Giant Stars Age and Mass Estimation')]

In [42]:
mst[root]

AtlasView({4748: {'distance': 12.318845439685287, 'weight': 12.318845439685287}, 3721: {'distance': 13.106310458616756, 'weight': 13.106310458616756}})

In [43]:
nodes = list(nx.depth_first_search.dfs_preorder_nodes(mst, root))
nodes

[429,
 4748,
 651,
 6876,
 4264,
 6566,
 3721,
 3703,
 3932,
 5368,
 5995,
 2909,
 1591,
 348,
 878,
 1784,
 1677,
 5234,
 1008,
 2654,
 3066,
 6972,
 2576,
 2254,
 2018,
 2572,
 2806,
 5447,
 2653,
 2509,
 4333,
 6695,
 5420,
 3797,
 6194,
 5122,
 780,
 5164,
 1745,
 4541,
 1162,
 1766,
 2013,
 1404,
 1352,
 4992,
 369,
 602,
 4447,
 1973,
 1145,
 282,
 4555,
 3200,
 1055,
 1197,
 3612,
 3894,
 4016,
 3499,
 3273,
 2277,
 4867,
 824,
 1415,
 6288,
 2181,
 4975,
 5089,
 1510,
 4550,
 2338,
 3267,
 6060,
 6227,
 6356,
 508,
 3683,
 3622,
 3700,
 4548,
 7006,
 841,
 1043,
 5129,
 927,
 3939,
 6900,
 354,
 7024,
 5879,
 4905,
 1136,
 1881,
 261,
 6502,
 6512,
 3562,
 1644,
 5612,
 4425,
 74,
 5312,
 1062,
 2527,
 3757,
 3476,
 2455,
 4462,
 3052,
 3540,
 4383,
 988,
 6586,
 5256,
 3484,
 1254,
 5541,
 2463,
 390,
 2700,
 4681,
 6263,
 2138,
 5178,
 2801,
 2450,
 6903,
 5721,
 1326,
 2566,
 812,
 2332,
 1112,
 5665,
 1330,
 3308,
 6423,
 5694,
 1051,
 480,
 4718,
 1000,
 1882,
 5342,
 5754

In [44]:
[(heights[n], index_to_title(n)) for n in nodes[:20]]

[(56.019965408605586,
  'SPT: Spectral Transformer for Red Giant Stars Age and Mass Estimation'),
 (47.81375786746608,
  'Exoplanets Prediction in Multi-Planetary Systems and Determining the\n  Correlation Between the Parameters of Planets and Host Stars Using Artificial\n  Intelligence'),
 (35.28350097692183,
  'Inferring Stellar Parameters from Iodine-Imprinted Keck/HIRES Spectra\n  with Machine Learning'),
 (21.819193961784084,
  'RG-CAT: Detection Pipeline and Catalogue of Radio Galaxies in the EMU\n  Pilot Survey'),
 (12.038770829548994,
  'Classification of compact radio sources in the Galactic plane with\n  supervised machine learning'),
 (0,
  'Light Curve Classification with DistClassiPy: a new distance-based\n  classifier'),
 (42.91365494998883,
  'DBNets: A publicly available deep learning tool to measure the masses of\n  young planets in dusty protoplanetary discs'),
 (30.891991198537546,
  'Short-Period Variables in TESS Full-Frame Image Light Curves Identified\n  via Conv

In [45]:
differences = np.zeros([len(nodes)])
for i, node_id in enumerate(nodes):
    if i == 0:
        continue
    prev_node_id = nodes[i-1]
    differences[i] = np.linalg.norm(embeddings[node_id] - embeddings[prev_node_id])

In [46]:
np.min(differences[1:])

5.579175850882429

In [47]:
root_distances = shortest_path_lengths[root]

In [49]:
if "paper_ordering" in list(db.tables):
    db.drop_table("paper_ordering")
paper_ordering = db.create_table(
    name="paper_ordering", 
    column_names=["arxiv_id", "position", "difference", "depth", "height"])
paper_ordering.insert([(arxiv_ids[node_id], i, differences[i], root_distances[node_id], max_height - heights[node_id]) for i, node_id in enumerate(nodes)])
paper_ordering

|arxiv\_id|position|difference|depth|height|
|-|-|-|-|-|
|\'2401\.04900v1\'|0|0\.0|0|0\.0|
|\'2402\.17898v1\'|1|12\.318845439685287|12\.318845439685287|8\.206207541139506|
|\'2401\.06839v1\'|2|12\.530256890544253|24\.84910233022954|20\.736464431683757|
|\'2403\.14235v1\'|3|13\.464307015137747|38\.31340934536729|34\.200771446821506|
|\'2402\.15232v1\'|4|9\.78042313223509|48\.093832477602376|43\.98119457905659|
|\'2403\.12120v1\'|5|12\.038770829548994|60\.13260330715137|56\.019965408605586|
|\'2402\.12448v1\'|6|13\.641795645401789|13\.106310458616756|13\.106310458616754|
|\'2402\.12369v1\'|7|12\.02166375145128|25\.127974210068036|25\.12797421006804|
|\'2402\.13673v1\'|8|9\.421729421998384|34\.54970363206642|34\.549703632066425|
|\'2403\.03427v1\'|9|7\.948508981567371|42\.49821261363379|42\.49821261363379|
|\'2403\.07507v1\'|10|13\.521752794971794|56\.019965408605586|56\.019965408605586|
|\'2402\.07492v1\'|11|13\.694382498230791|35\.12686423795543|32\.401676661885446|
|\'2401\.15632v1\'|12|11\.478816335926417|46\.60568057388185|43\.880492997811864|
|\'2401\.03336v1\'|13|11\.78178304415258|58\.38746361803443|32\.67694964409639|
|\'2401\.08763v1\'|14|11\.689578147546458|70\.07704176558089|44\.366527791642845|
|\'2401\.17029v1\'|15|11\.65343761696274|81\.73047938254362|56\.019965408605586|
|...|...|...|...|...|


In [50]:
[differences] = zip(*db.query("select difference from paper_ordering"))

In [51]:
diff_quantiles = np.quantile(differences, [0.75, 0.9])
def diff_color(diff):
    i = np.searchsorted(diff_quantiles, diff)
    p = 1 - 0.5*(i / len(diff_quantiles))
    return f"rgb({p*100}%, {p*100}%, {p*100}%)"

In [52]:
diff_color(10)

'rgb(100.0%, 100.0%, 100.0%)'

In [53]:
def quote_html(text):
    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

In [54]:
output_dir = Path(f"ArXiv_{category}_{start_date}_{end_date}")
output_dir.mkdir(parents=True, exist_ok=True)

In [55]:
preamble = """
<!doctype html>
<meta charset="utf-8">
<style>
body { margin: 20px; }
</style>
<script>
function toggle(arxiv) {
  let elt = document.getElementById(arxiv);
  console.log(elt, elt.style.display);
  if(elt.style.display == "block") {
    elt.style.display = "none";
  } else {
    elt.style.display = "block";
  }
}
</script>
"""
rows = list(db.query("""
select * from paper_ordering join submissions using (arxiv_id) group by arxiv_id order by position 
"""))
batch_size = 256
batch_count = len(rows) // batch_size
for batch in range(batch_count+1):
    batch_rows = rows[(batch*batch_size):((batch+1)*batch_size)]
    buf = io.StringIO()
    buf.write(preamble)
    for row in batch_rows:
        arxiv = quote_html(row.arxiv_id)
        title = quote_html(row.title)
        authors = quote_html(row.authors)
        date = quote_html(row.date)
        body_q = quote_html(row.abstract)
        url = quote_html(row.url)
        diff = row.difference
        color = diff_color(diff)
        depth = row.depth
        height = row.height
    #     buf.write(f"<div style='display: list-item; margin-left: {(height-1)*10}px'>")
        buf.write(f"<div>")
        buf.write(f"<div style='margin-top: 5px; border-top: 1px solid {color}' onclick='toggle(\"{arxiv}\")'>{title}</div>\n")
        buf.write(f"<div id='{arxiv}' style='display: none; margin-left: 20px'>")
        buf.write(f"<div>Date: {date}</div>")
        buf.write(f"<div>Authors: {authors}</div>")
        buf.write(f"<div style='padding-top: 10px; width: 80ex'>{body_q}</div><div><a href='{url}'>{arxiv}</a></div>\n")
        buf.write("</div>")
        buf.write("</div>")

    buf.write(f"""
    <div><a href="arxiv_{batch-1}.html">Prev ({batch-1})</a></div>
    <div><a href="arxiv_{batch+1}.html">Next ({batch+1})</a></div>
    """)
    with open(output_dir / f"arxiv_{batch}.html", "w+") as f:
        f.write(buf.getvalue())

In [56]:
buf

<_io.StringIO at 0x38693a440>

In [57]:
db.query("select count(*) from paper_ordering")

|count\(\*\)|
|-|
|7069|


In [50]:
# if "seen.titles_seen" not in db.tables:
#     db.create_table(name="seen.titles_seen", column_names=["arxiv_id", "seen"], column_types={"seen": "int"})

In [51]:
titles_seen = db.open_table("seen.titles_seen")

In [52]:
# if True:
if False:
    db.execute("""
    insert into titles_seen select arxiv_id, 1 from paper_ordering
    """)

In [53]:
titles_seen.view("select count(distinct arxiv_id) from _")

|count\(distinct arxiv\_id\)|
|-|
|26611|


In [54]:
# titles_seen1 = titles_seen
# titles_seen_data = list(titles_seen)

In [55]:
# import json
# with open("titles_seen.json", "w+") as f:
#     json.dump(list(titles_seen), f)