In [1]:
category = "cs.LG"
start_date = "2023-02-01"
end_date = "2023-02-17"

In [2]:
import io
from pathlib import Path
import sqltables.sqlite3
import numpy as np
import json
import sklearn.decomposition
import networkx as nx
from tqdm.notebook import tqdm

In [3]:
db = sqltables.sqlite3.Database(uri=True)
db.execute("attach 'file:submissions.sqlite3?mode=ro' as submissions")
db.execute("attach 'file:seen.sqlite3' as seen")

In [4]:
unseen_submissions = db.query("""
select submissions.* from submissions left join titles_seen using (arxiv_id) 
where seen is null and categories like ? and date >= ? and date <= ? group by arxiv_id
""", kind="table", parameters=[f"%{category}%", start_date, end_date])

In [5]:
metadata = {
    row.arxiv_id: row for row in unseen_submissions
}

In [6]:
paper_embeddings = unseen_submissions.view("""
select specter.arxiv_id, min(specter.paper_info) as paper_info 
from _ join specter using (arxiv_id) 
group by specter.arxiv_id
""")

In [7]:
[[N]] = paper_embeddings.view("select count(*) from _")
proto_row = next(iter(paper_embeddings))
proto_data = json.loads(proto_row.paper_info)
arxiv_ids = [None] * N
embeddings = np.zeros((N, len(proto_data["embedding"]["vector"])))
N

1463

In [8]:
for i, row in enumerate(paper_embeddings):
    arxiv_ids[i] = row.arxiv_id
    data = json.loads(row.paper_info)
    embedding = np.array(data["embedding"]["vector"])
    embeddings[i, :] = embedding

In [9]:
embeddings

array([[-0.61412579,  0.11344957, -0.31292248, ...,  0.65503466,
        -0.34713092,  1.49091172],
       [ 0.48673815,  0.05675318,  0.24558103, ...,  0.56417137,
        -0.48406518,  0.06357094],
       [-0.41016117,  0.2506786 ,  0.05882282, ..., -0.92686689,
         0.31443235,  0.31271809],
       ...,
       [-0.55873054,  0.5450049 ,  0.47874066, ..., -0.65527314,
         0.14366326,  1.55479228],
       [-0.03991125, -0.03911826, -0.32076925, ..., -0.16867685,
        -0.11890399,  0.82671374],
       [-0.05277987,  0.2314726 , -0.61351967, ..., -0.80053371,
        -1.44407678,  0.74225414]])

In [10]:
def index_to_title(index):
    return metadata[arxiv_ids[index]].title

In [11]:
pca = sklearn.decomposition.PCA(16)
reduced_embeddings = pca.fit_transform(embeddings)

In [12]:
reduced_embeddings.nbytes / 2**10

182.875

In [13]:
%%time
k = 8
nearest_neighbours = {}
for i in range(reduced_embeddings.shape[0]):
    emb = reduced_embeddings[i, :]
    d2 = np.sum((emb[None, :] - reduced_embeddings)**2, axis=-1)
    nearest_neighbours[i] = np.argsort(d2)[1:(k+1)].tolist()

CPU times: user 789 ms, sys: 795 ms, total: 1.58 s
Wall time: 282 ms


In [14]:
nearest_neighbours

{0: [31, 30, 124, 187, 677, 523, 809, 989],
 1: [276, 1102, 278, 1031, 285, 1364, 108, 1319],
 2: [792, 95, 1282, 1460, 1262, 866, 32, 797],
 3: [887, 490, 1445, 1196, 1094, 1105, 466, 1191],
 4: [972, 1046, 391, 292, 497, 1280, 514, 738],
 5: [593, 459, 883, 650, 1135, 318, 55, 1197],
 6: [565, 1097, 169, 191, 936, 861, 978, 802],
 7: [136, 890, 671, 1328, 93, 131, 505, 185],
 8: [149, 426, 436, 448, 237, 1195, 547, 1052],
 9: [198, 1170, 697, 775, 1372, 379, 855, 199],
 10: [138, 845, 1011, 19, 1063, 1241, 530, 52],
 11: [776, 919, 460, 550, 85, 524, 255, 1408],
 12: [1367, 164, 894, 1037, 30, 809, 1233, 1106],
 13: [1181, 430, 801, 541, 1304, 177, 323, 1256],
 14: [740, 1421, 66, 1056, 1243, 1240, 1360, 252],
 15: [474, 1283, 160, 1073, 124, 640, 523, 54],
 16: [713, 1093, 753, 1326, 838, 286, 955, 1153],
 17: [535, 1261, 410, 875, 239, 302, 1057, 316],
 18: [906, 1149, 1168, 1233, 1200, 120, 434, 540],
 19: [596, 1063, 1241, 933, 854, 137, 258, 420],
 20: [404, 1111, 163, 1052, 548

In [15]:
nn_graph = nx.from_dict_of_lists(nearest_neighbours)

In [16]:
for u, v in nn_graph.edges:
    dist = np.linalg.norm(embeddings[u] - embeddings[v])
    nn_graph.edges[(u, v)]["distance"] = dist
    nn_graph.edges[(u, v)]["weight"] = dist

In [17]:
# distances = [nn_graph.edges[e]["distance"] for e in nn_graph.edges]
# dist_q = np.quantile([nn_graph.edges[e]["distance"] for e in nn_graph.edges], 1/k)
# for e in nn_graph.edges:
#     dist = nn_graph.edges[e]["distance"]
#     nn_graph.edges[e]["weight"] = np.exp(-dist**2/(2*dist_q**2))

In [18]:
np.histogram([nn_graph.edges[e]["weight"] for e in nn_graph.edges])
    

(array([  10,   68,  503, 1653, 2497, 1968, 1091,  359,   68,    6]),
 array([ 6.07448447,  7.61411515,  9.15374584, 10.69337652, 12.23300721,
        13.7726379 , 15.31226858, 16.85189927, 18.39152995, 19.93116064,
        21.47079132]))

In [19]:
mst = nx.minimum_spanning_tree(nn_graph, weight="weight")
len(list(nx.connected_components(mst)))

1

In [20]:
degrees = nx.degree(mst)
leaves = {n for n, d in degrees if d == 1}
len(leaves), len(leaves) / len(mst)

(711, 0.4859876965140123)

In [21]:
np.histogram([d for _, d in degrees])

(array([711, 378, 204,  89,  41,  16,  15,   2,   4,   3]),
 array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.]))

In [22]:
non_leaves = set(mst.nodes) - leaves

In [23]:
%%time
shortest_path_lengths = dict(nx.shortest_path_length(mst, weight="weight"))

CPU times: user 1.97 s, sys: 17.6 ms, total: 1.99 s
Wall time: 1.99 s


In [24]:
heights = {}
for node in non_leaves:
    lengths = shortest_path_lengths[node]
    h = min(lengths[leaf] for leaf in leaves)
    heights[node] = h

In [25]:
for n in leaves:
    heights[n] = 0

In [26]:
len(heights)

1463

In [27]:
max_height = max(heights.values())
roots = [n for n in non_leaves if heights[n] == max_height]
roots, max_height

([173], 39.672584800340346)

In [28]:
root_max_heights = {}
for node in roots:
    path_lengths = shortest_path_lengths[node]
    root_max_heights[node] = max([l for n, l in path_lengths.items() if n in leaves])

In [29]:
root_max_heights

{173: 556.1908105009913}

In [30]:
max_min_roots = sorted(roots, key=lambda node: root_max_heights[node])
max_min_roots

[173]

In [31]:
root = max_min_roots[0]

In [32]:
sp = {}
for u in roots:
    for v in roots:
        if u < v:
            sp[(u, v)] = nx.shortest_path(mst, source=u, target=v)

In [33]:
{k: len(path) for k, path in sp.items()}

{}

In [34]:
[(n, index_to_title(n)) for n in roots]

[(173,
  'Computational Discovery of Microstructured Composites with Optimal\n  Strength-Toughness Trade-Offs')]

In [35]:
mst[root]

AtlasView({958: {'distance': 12.37684227217444, 'weight': 12.37684227217444}, 1266: {'distance': 12.632752565322892, 'weight': 12.632752565322892}})

In [36]:
nodes = list(nx.depth_first_search.dfs_preorder_nodes(mst, root))
nodes

[173,
 958,
 1379,
 1381,
 1266,
 597,
 937,
 1070,
 849,
 1213,
 311,
 755,
 960,
 1283,
 95,
 1357,
 1090,
 1203,
 1164,
 681,
 1007,
 844,
 158,
 430,
 37,
 801,
 1097,
 565,
 719,
 387,
 538,
 503,
 691,
 651,
 874,
 994,
 731,
 733,
 899,
 327,
 544,
 903,
 623,
 1116,
 1310,
 504,
 189,
 1405,
 763,
 615,
 625,
 1055,
 246,
 33,
 408,
 521,
 529,
 384,
 332,
 1029,
 616,
 1254,
 208,
 115,
 244,
 231,
 695,
 1216,
 72,
 23,
 598,
 1333,
 768,
 306,
 444,
 1251,
 1369,
 438,
 1375,
 91,
 1343,
 1279,
 800,
 646,
 1301,
 560,
 1156,
 1175,
 936,
 1398,
 644,
 1328,
 7,
 1128,
 890,
 349,
 136,
 895,
 881,
 191,
 1373,
 71,
 300,
 1042,
 83,
 740,
 204,
 409,
 1250,
 612,
 125,
 1155,
 383,
 445,
 1145,
 569,
 707,
 129,
 611,
 837,
 600,
 1418,
 183,
 920,
 753,
 1142,
 788,
 46,
 1064,
 674,
 584,
 945,
 649,
 152,
 237,
 111,
 1154,
 568,
 481,
 614,
 454,
 365,
 196,
 662,
 678,
 422,
 861,
 998,
 885,
 369,
 1178,
 243,
 1362,
 541,
 850,
 680,
 685,
 1222,
 1031,
 664,
 726,
 

In [37]:
[(heights[n], index_to_title(n)) for n in nodes[:20]]

[(39.672584800340346,
  'Computational Discovery of Microstructured Composites with Optimal\n  Strength-Toughness Trade-Offs'),
 (27.29574252816591,
  'Discovering Sparse Hysteresis Models: A Data-driven Study for\n  Piezoelectric Materials and Perspectives on Magnetic Hysteresis'),
 (13.43931365234593,
  'A Meta-Learning Approach to Population-Based Modelling of Structures'),
 (0,
  'On the Detection and Quantification of Nonlinearity via Statistics of\n  the Gradients of a Black-Box Model'),
 (33.58305847078519,
  'Scalable Bayesian optimization with high-dimensional outputs using\n  randomized prior networks'),
 (21.975659553057547,
  'IB-UQ: Information bottleneck based uncertainty quantification for\n  neural function regression and neural operator learning'),
 (10.660250735558177,
  'Monte Carlo Neural Operator for Learning PDEs via Probabilistic\n  Representation'),
 (16.115687811688435, 'Physics informed WNO'),
 (15.808028441645167,
  'New directions in the applications of roug

In [38]:
differences = np.zeros([len(nodes)])
for i, node_id in enumerate(nodes):
    if i == 0:
        continue
    prev_node_id = nodes[i-1]
    differences[i] = np.linalg.norm(embeddings[node_id] - embeddings[prev_node_id])

In [39]:
np.min(differences[1:])

6.074484468764066

In [40]:
root_distances = shortest_path_lengths[root]

In [41]:
if "paper_ordering" in db.tables:
    db.drop_table("paper_ordering")
paper_ordering = db.create_table(
    name="paper_ordering", 
    column_names=["arxiv_id", "position", "difference", "depth", "height"])
paper_ordering.insert([(arxiv_ids[node_id], i, differences[i], root_distances[node_id], max_height - heights[node_id]) for i, node_id in enumerate(nodes)])
paper_ordering

|arxiv\_id|position|difference|depth|height|
|-|-|-|-|-|
|\'2302\.01078v1\'|0|0\.0|0|0\.0|
|\'2302\.05313v2\'|1|12\.37684227217444|12\.37684227217444|12\.376842272174436|
|\'2302\.07980v1\'|2|13\.856428875819978|26\.233271147994415|26\.233271147994415|
|\'2302\.07986v1\'|3|13\.43931365234593|39\.672584800340346|39\.672584800340346|
|\'2302\.07260v1\'|4|13\.97305327998921|12\.632752565322892|6\.089526329555156|
|\'2302\.03271v1\'|5|11\.607398917727643|24\.240151483050536|17\.6969252472828|
|\'2302\.05104v1\'|6|11\.315408817499367|35\.5555603005499|29\.012334064782166|
|\'2302\.05925v1\'|7|9\.774238681535834|45\.329798982085734|23\.55689698865191|
|\'2302\.04586v1\'|8|12\.288710327208502|57\.61850930929424|23\.86455635869518|
|\'2302\.06839v1\'|9|15\.808028441645167|73\.4265377509394|39\.672584800340346|
|\'2302\.01746v1\'|10|19\.315126592557963|61\.445486793774165|39\.672584800340346|
|\'2302\.04107v1\'|11|18\.414448761350805|45\.862757643493254|28\.174827809125418|
|\'2302\.05322v1\'|12|10\.963677240124527|56\.82643488361778|25\.229269716630398|
|\'2302\.07384v1\'|13|11\.447999844890381|68\.27443472850815|17\.205254228575868|
|\'2302\.00709v3\'|14|10\.452091434890018|78\.72652616339818|27\.657345663465886|
|\'2302\.07862v1\'|15|10\.001601433118203|88\.72812759651637|29\.262417074036154|
|...|...|...|...|...|


In [42]:
[differences] = zip(*db.query("select difference from paper_ordering"))

In [43]:
diff_quantiles = np.quantile(differences, [0.75, 0.9])
def diff_color(diff):
    i = np.searchsorted(diff_quantiles, diff)
    p = 1 - 0.5*(i / len(diff_quantiles))
    return f"rgb({p*100}%, {p*100}%, {p*100}%)"

In [44]:
diff_color(10)

'rgb(100.0%, 100.0%, 100.0%)'

In [45]:
def quote_html(text):
    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

In [46]:
output_dir = Path(f"ArXiv_{category}_{start_date}_{end_date}")
output_dir.mkdir(parents=True, exist_ok=True)

In [47]:
preamble = """
<!doctype html>
<meta charset="utf-8">
<style>
body { margin: 20px; }
</style>
<script>
function toggle(arxiv) {
  let elt = document.getElementById(arxiv);
  console.log(elt, elt.style.display);
  if(elt.style.display == "block") {
    elt.style.display = "none";
  } else {
    elt.style.display = "block";
  }
}
</script>
"""
rows = list(db.query("""
select * from paper_ordering join submissions using (arxiv_id) group by arxiv_id order by position 
"""))
batch_size = 256
batch_count = len(rows) // batch_size
for batch in range(batch_count+1):
    batch_rows = rows[(batch*batch_size):((batch+1)*batch_size)]
    buf = io.StringIO()
    buf.write(preamble)
    for row in batch_rows:
        arxiv = quote_html(row.arxiv_id)
        title = quote_html(row.title)
        authors = quote_html(row.authors)
        date = quote_html(row.date)
        body_q = quote_html(row.abstract)
        url = quote_html(row.url)
        diff = row.difference
        color = diff_color(diff)
        depth = row.depth
        height = row.height
    #     buf.write(f"<div style='display: list-item; margin-left: {(height-1)*10}px'>")
        buf.write(f"<div>")
        buf.write(f"<div style='margin-top: 5px; border-top: 1px solid {color}' onclick='toggle(\"{arxiv}\")'>{title}</div>\n")
        buf.write(f"<div id='{arxiv}' style='display: none; margin-left: 20px'>")
        buf.write(f"<div>Date: {date}</div>")
        buf.write(f"<div>Authors: {authors}</div>")
        buf.write(f"<div style='padding-top: 10px; width: 80ex'>{body_q}</div><div><a href='{url}'>{arxiv}</a></div>\n")
        buf.write("</div>")
        buf.write("</div>")

    buf.write(f"""
    <div><a href="arxiv_{batch-1}.html">Prev ({batch-1})</a></div>
    <div><a href="arxiv_{batch+1}.html">Next ({batch+1})</a></div>
    """)
    with open(output_dir / f"arxiv_{batch}.html", "w+") as f:
        f.write(buf.getvalue())

In [48]:
buf

<_io.StringIO at 0x16a59f040>

In [49]:
db.query("select count(*) from paper_ordering")

|count\(\*\)|
|-|
|1463|


In [50]:
# if "seen.titles_seen" not in db.tables:
#     db.create_table(name="seen.titles_seen", column_names=["arxiv_id", "seen"], column_types={"seen": "int"})

In [51]:
titles_seen = db.open_table("seen.titles_seen")

In [52]:
# if True:
if False:
    db.execute("""
    insert into titles_seen select arxiv_id, 1 from paper_ordering
    """)

In [53]:
titles_seen.view("select count(distinct arxiv_id) from _")

|count\(distinct arxiv\_id\)|
|-|
|26611|


In [54]:
# titles_seen1 = titles_seen
# titles_seen_data = list(titles_seen)

In [55]:
# import json
# with open("titles_seen.json", "w+") as f:
#     json.dump(list(titles_seen), f)