In [237]:
import io
import sqltables.sqlite3
import numpy as np
import json
import sklearn.decomposition
import networkx as nx
from tqdm.notebook import tqdm

In [238]:
db = sqltables.sqlite3.Database(uri=True)
db.execute("attach 'file:submissions.sqlite3?mode=ro' as submissions")
db.execute("attach 'file:seen.sqlite3' as seen")

In [239]:
unseen_submissions = db.query("""
select submissions.* from submissions left join titles_seen using (arxiv_id) 
where seen is null
""")

In [240]:
metadata = {
    row.arxiv_id: row for row in unseen_submissions
}

In [241]:
paper_embeddings = unseen_submissions.view("""
select specter.arxiv_id, min(specter.paper_info) as paper_info 
from _ join specter using (arxiv_id) 
group by specter.arxiv_id
""")

In [242]:
[[N]] = paper_embeddings.view("select count(*) from _")
proto_row = next(iter(paper_embeddings))
proto_data = json.loads(proto_row.paper_info)
arxiv_ids = [None] * N
embeddings = np.zeros((N, len(proto_data["embedding"]["vector"])))
N

3690

In [243]:
for i, row in enumerate(paper_embeddings):
    arxiv_ids[i] = row.arxiv_id
    data = json.loads(row.paper_info)
    embedding = np.array(data["embedding"]["vector"])
    embeddings[i, :] = embedding

In [244]:
embeddings

array([[ 0.44301039,  0.43903628, -0.18109168, ..., -0.15402897,
         0.37721691,  0.80294162],
       [-0.38479692,  0.20737025, -0.14306906, ..., -0.29810682,
         0.51174051,  0.25418675],
       [-0.09693825,  0.35710678,  0.47641823, ...,  0.02871818,
        -0.60405248,  1.75118232],
       ...,
       [-0.38301799,  0.19988455, -0.11922613, ...,  0.15642451,
        -0.18388492,  0.89522731],
       [-0.64569211,  0.24136072, -0.23196328, ...,  0.58576339,
        -0.15982753,  0.12587707],
       [-0.27212369,  0.83795547, -0.46376166, ..., -0.45739731,
        -0.44925448,  1.36751485]])

In [245]:
def index_to_title(index):
    return metadata[arxiv_ids[index]].title

In [246]:
pca = sklearn.decomposition.PCA(16)
reduced_embeddings = pca.fit_transform(embeddings)

In [247]:
reduced_embeddings.nbytes / 2**10

461.25

In [248]:
%%time
k = 8
nearest_neighbours = {}
for i in range(reduced_embeddings.shape[0]):
    emb = reduced_embeddings[i, :]
    d2 = np.sum((emb[None, :] - reduced_embeddings)**2, axis=-1)
    nearest_neighbours[i] = np.argsort(d2)[1:(k+1)].tolist()

CPU times: user 1.56 s, sys: 861 ms, total: 2.42 s
Wall time: 1.18 s


In [249]:
nearest_neighbours

{0: [2117, 1817, 1527, 2069, 2703, 569, 3241, 757],
 1: [2067, 1468, 1329, 2946, 882, 2885, 1498, 1221],
 2: [1973, 2824, 2439, 2786, 2000, 2401, 2208, 1042],
 3: [1483, 2225, 1350, 3567, 3285, 877, 2502, 3162],
 4: [2641, 908, 1024, 312, 1026, 3533, 1768, 406],
 5: [1404, 626, 2467, 697, 172, 2385, 657, 1448],
 6: [2194, 2356, 3619, 3354, 2490, 896, 3122, 1650],
 7: [243, 174, 1555, 317, 2576, 647, 203, 195],
 8: [3435, 1574, 959, 960, 1189, 1689, 2438, 3282],
 9: [2555, 1937, 2633, 1782, 2510, 1485, 113, 2631],
 10: [67, 3590, 2577, 954, 1841, 2522, 3441, 3623],
 11: [1086, 2607, 3474, 42, 3268, 3473, 138, 2027],
 12: [3208, 1358, 3320, 1138, 74, 2861, 1087, 1576],
 13: [2626, 622, 1200, 3395, 3156, 489, 2367, 819],
 14: [2866, 3410, 2849, 3008, 238, 2097, 2418, 1880],
 15: [1289, 1121, 2507, 2813, 1318, 1546, 2903, 3221],
 16: [1545, 2525, 1363, 1726, 2861, 1236, 1845, 229],
 17: [3026, 2115, 2699, 1966, 1433, 2022, 45, 1392],
 18: [3077, 330, 2289, 2591, 1061, 2770, 3363, 1178],
 1

In [250]:
nn_graph = nx.from_dict_of_lists(nearest_neighbours)

In [251]:
for u, v in nn_graph.edges:
    dist = np.linalg.norm(embeddings[u] - embeddings[v])
    nn_graph.edges[(u, v)]["distance"] = dist
    nn_graph.edges[(u, v)]["weight"] = dist

In [252]:
# distances = [nn_graph.edges[e]["distance"] for e in nn_graph.edges]
# dist_q = np.quantile([nn_graph.edges[e]["distance"] for e in nn_graph.edges], 1/k)
# for e in nn_graph.edges:
#     dist = nn_graph.edges[e]["distance"]
#     nn_graph.edges[e]["weight"] = np.exp(-dist**2/(2*dist_q**2))

In [253]:
np.histogram([nn_graph.edges[e]["weight"] for e in nn_graph.edges])
    

(array([   1,    8,  210, 1780, 5783, 6862, 4316, 1668,  342,   45]),
 array([ 3.71981448,  5.4546608 ,  7.18950713,  8.92435345, 10.65919978,
        12.39404611, 14.12889243, 15.86373876, 17.59858508, 19.33343141,
        21.06827773]))

In [254]:
mst = nx.minimum_spanning_tree(nn_graph, weight="weight")
len(list(nx.connected_components(mst)))

1

In [255]:
degrees = nx.degree(mst)
leaves = {n for n, d in degrees if d == 1}
len(leaves), len(leaves) / len(mst)

(1769, 0.47940379403794037)

In [256]:
np.histogram([d for _, d in degrees])

(array([2786,  455,  242,  110,   41,   42,    8,    3,    1,    2]),
 array([ 1. ,  2.2,  3.4,  4.6,  5.8,  7. ,  8.2,  9.4, 10.6, 11.8, 13. ]))

In [257]:
non_leaves = set(mst.nodes) - leaves

In [258]:
%%time
shortest_path_lengths = dict(nx.shortest_path_length(mst, weight="weight"))

CPU times: user 15.5 s, sys: 562 ms, total: 16 s
Wall time: 18.1 s


In [259]:
heights = {}
for node in non_leaves:
    lengths = shortest_path_lengths[node]
    h = min(lengths[leaf] for leaf in leaves)
    heights[node] = h

In [260]:
for n in leaves:
    heights[n] = 0

In [261]:
len(heights)

3690

In [262]:
max_height = max(heights.values())
roots = [n for n in non_leaves if heights[n] == max_height]
roots, max_height

([2587], 46.46369824602794)

In [263]:
root_max_heights = {}
for node in roots:
    path_lengths = shortest_path_lengths[node]
    root_max_heights[node] = max([l for n, l in path_lengths.items() if n in leaves])

In [264]:
root_max_heights

{2587: 607.1977273093254}

In [265]:
max_min_roots = sorted(roots, key=lambda node: root_max_heights[node])
max_min_roots

[2587]

In [266]:
root = max_min_roots[0]

In [267]:
sp = {}
for u in roots:
    for v in roots:
        if u < v:
            sp[(u, v)] = nx.shortest_path(mst, source=u, target=v)

In [268]:
{k: len(path) for k, path in sp.items()}

{}

In [269]:
[(n, index_to_title(n)) for n in roots]

[(2587, 'Bias-Aware Face Mask Detection Dataset')]

In [270]:
mst[root]

AtlasView({2217: {'distance': 11.256197309925138, 'weight': 11.256197309925138}, 291: {'distance': 11.797994704036102, 'weight': 11.797994704036102}})

In [271]:
nodes = list(nx.depth_first_search.dfs_preorder_nodes(mst, root))
nodes

[2587,
 2217,
 1151,
 134,
 3239,
 3207,
 1347,
 583,
 3413,
 3573,
 2193,
 3485,
 2066,
 3390,
 1823,
 3183,
 959,
 960,
 2374,
 3340,
 1267,
 778,
 780,
 2577,
 1684,
 1213,
 848,
 1246,
 1798,
 2126,
 1263,
 3065,
 1903,
 3344,
 1579,
 1560,
 2816,
 2613,
 768,
 53,
 1599,
 766,
 3636,
 3486,
 360,
 1941,
 1206,
 72,
 1163,
 288,
 1274,
 767,
 2581,
 2129,
 1224,
 2246,
 2724,
 3432,
 82,
 826,
 3023,
 2744,
 1603,
 2276,
 439,
 696,
 1771,
 1671,
 3460,
 2307,
 612,
 2634,
 939,
 2290,
 506,
 871,
 2332,
 734,
 2240,
 1179,
 2660,
 3249,
 191,
 771,
 1346,
 3421,
 2752,
 597,
 1796,
 3279,
 632,
 695,
 3303,
 2953,
 1636,
 2853,
 336,
 1917,
 3237,
 3260,
 1954,
 155,
 3683,
 2036,
 169,
 3589,
 3475,
 3619,
 3447,
 2767,
 967,
 1051,
 2053,
 2228,
 2701,
 2894,
 2187,
 3467,
 739,
 1528,
 2650,
 167,
 3063,
 2308,
 3046,
 1946,
 3463,
 1133,
 2925,
 3048,
 181,
 2659,
 2543,
 3478,
 1736,
 892,
 2720,
 2484,
 1890,
 755,
 783,
 187,
 785,
 1870,
 112,
 2284,
 1098,
 712,
 1787,
 1

In [272]:
[(heights[n], index_to_title(n)) for n in nodes[:20]]

[(46.46369824602794, 'Bias-Aware Face Mask Detection Dataset'),
 (35.207500936102804,
  'Addressing Bias in Face Detectors using Decentralised Data collection with incentives'),
 (23.640877550663873,
  'On the Importance of Architectures and Hyperparameters for Fairness in Face Recognition'),
 (12.129462168488951,
  'A Differentiable Distance Approximation for Fairer Image Classification'),
 (0,
  'Debiasing Methods for Fairer Neural Models in Vision and Language Research: A Survey'),
 (35.4790507748799, 'Stay Home Safe with Starving Federated Data'),
 (23.865213243803872,
  'Analyzing the Robustness of Decentralized Horizontal and Vertical Federated Learning Architectures in a Non-IID Scenario'),
 (28.145395207743935,
  'Feature Reconstruction Attacks and Countermeasures of DNN training in Vertical Federated Learning'),
 (18.302221297231362, 'Differentially Private Vertical Federated Learning'),
 (8.872205018618534,
  'Quantifying the Impact of Label Noise on Federated Learning'),
 (1

In [273]:
differences = np.zeros([len(nodes)])
for i, node_id in enumerate(nodes):
    if i == 0:
        continue
    prev_node_id = nodes[i-1]
    differences[i] = np.linalg.norm(embeddings[node_id] - embeddings[prev_node_id])

In [274]:
np.min(differences[1:])

3.7198144769583075

In [275]:
root_distances = shortest_path_lengths[root]

In [276]:
if "paper_ordering" in db.tables:
    db.drop_table("paper_ordering")
paper_ordering = db.create_table(
    name="paper_ordering", 
    column_names=["arxiv_id", "position", "difference", "depth", "height"])
paper_ordering.insert([(arxiv_ids[node_id], i, differences[i], root_distances[node_id], max_height - heights[node_id]) for i, node_id in enumerate(nodes)])
paper_ordering

|arxiv\_id|position|difference|depth|height|
|-|-|-|-|-|
|\'2211\.01207\'|0|0\.0|0|0\.0|
|\'2210\.16024\'|1|11\.256197309925138|11\.256197309925138|11\.256197309925135|
|\'2210\.09943\'|2|11\.566623385438932|22\.822820695364072|22\.822820695364065|
|\'2210\.04369\'|3|11\.51141538217492|34\.33423607753899|34\.33423607753899|
|\'2211\.05617\'|4|12\.129462168488951|46\.46369824602794|46\.46369824602794|
|\'2211\.05410\'|5|16\.771687583200453|34\.66099391958011|10\.984647471148037|
|\'2210\.11061\'|6|11\.655058956535928|46\.31605287611603|22\.598485002224066|
|\'2210\.06771\'|7|11\.137794728499374|57\.453847604615405|18\.318303038284004|
|\'2211\.06782\'|8|9\.843173910512569|67\.29702151512797|28\.161476948796576|
|\'2211\.07816\'|9|9\.43001627861283|76\.72703779374079|37\.5914932274094|
|\'2210\.15865\'|10|8\.004893400281242|84\.73193119402204|29\.58659982712816|
|\'2211\.07364\'|11|8\.73614690467898|93\.46807809870101|25\.47551869428799|
|\'2210\.15120\'|12|10\.389579409640575|103\.85765750834159|35\.86509810392857|
|\'2211\.06614\'|13|10\.598600142099372|114\.45625765044096|46\.46369824602794|
|\'2210\.13686\'|14|14\.453192928033134|115\.53636832651098|24\.186387285759167|
|\'2211\.05239\'|15|11\.524079215556432|127\.06044754206741|34\.21235552925393|
|...|...|...|...|...|


In [277]:
[differences] = zip(*db.query("select difference from paper_ordering"))

In [278]:
diff_quantiles = np.quantile(differences, [0.75, 0.9])
def diff_color(diff):
    i = np.searchsorted(diff_quantiles, diff)
    p = 1 - 0.5*(i / len(diff_quantiles))
    return f"rgb({p*100}%, {p*100}%, {p*100}%)"

In [279]:
diff_color(10)

'rgb(100.0%, 100.0%, 100.0%)'

In [280]:
def quote_html(text):
    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

In [281]:
preamble = """
<!doctype html>
<style>
body { margin: 20px; }
</style>
<script>
function toggle(arxiv) {
  let elt = document.getElementById(arxiv);
  console.log(elt, elt.style.display);
  if(elt.style.display == "block") {
    elt.style.display = "none";
  } else {
    elt.style.display = "block";
  }
}
</script>
"""
rows = list(db.query("""
select * from paper_ordering join submissions using (arxiv_id) order by position
"""))
batch_size = 256
batch_count = len(rows) // batch_size
for batch in range(batch_count):
    batch_rows = rows[(batch*batch_size):((batch+1)*batch_size)]
    buf = io.StringIO()
    buf.write(preamble)
    for row in batch_rows:
        arxiv = quote_html(row.arxiv_id)
        title = quote_html(row.title)
        authors = quote_html(row.authors)
        date = quote_html(row.date)
        body_q = quote_html(row.abstract)
        url = quote_html(row.url)
        diff = row.difference
        color = diff_color(diff)
        depth = row.depth
        height = row.height
    #     buf.write(f"<div style='display: list-item; margin-left: {(height-1)*10}px'>")
        buf.write(f"<div>")
        buf.write(f"<div style='border-top: 1px solid {color}' onclick='toggle(\"{arxiv}\")'>{title}</div>\n")
        buf.write(f"<div id='{arxiv}' style='display: none; margin-left: 20px'>")
        buf.write(f"<div>Date: {date}</div>")
        buf.write(f"<div>Authors: {authors}</div>")
        buf.write(f"<div style='padding-top: 10px; width: 80ex'>{body_q}</div><div><a href='{url}'>{arxiv}</a></div>\n")
        buf.write("</div>")
        buf.write("</div>")

    buf.write(f"""
    <div><a href="arxiv_{batch-1}.html">Prev ({batch-1})</a></div>
    <div><a href="arxiv_{batch+1}.html">Next ({batch+1})</a></div>
    """)
    with open(f"output/arxiv_{batch}.html", "w+") as f:
        f.write(buf.getvalue())

In [282]:
db.query("select count(*) from paper_ordering")

|count\(\*\)|
|-|
|3690|


In [283]:
if "seen.titles_seen" not in db.tables:
    db.create_table(name="seen.titles_seen", column_names=["arxiv_id", "seen"], column_types={"seen": "int"})

OperationalError: table titles_seen already exists

In [284]:
titles_seen = db.open_table("seen.titles_seen")

In [285]:
db.execute("""
insert into titles_seen select arxiv_id, 1 from paper_ordering
""")

In [286]:
titles_seen.view("select count(distinct arxiv_id) from _")

|count\(distinct arxiv\_id\)|
|-|
|20613|


In [177]:
# titles_seen1 = titles_seen
# titles_seen_data = list(titles_seen)

In [172]:
# import json
# with open("titles_seen.json", "w+") as f:
#     json.dump(list(titles_seen), f)