## Init & paper id

In [51]:
import os
from urllib.parse import quote

import pandas as pd
import requests
from tqdm import tqdm

# Location of the library index and the Semantic Scholar title-match endpoint template.
index_dir='library_index.csv'
match_api='https://api.semanticscholar.org/graph/v1/paper/search/match?query={title}'

# Build a lookup from lower-cased acronym to paper title out of the index CSV.
index_csv=pd.read_csv(index_dir)
index_dict=index_csv.to_dict(orient='records')
index={row['acronym'].lower(): row['title'] for row in index_dict}

# `api_key` is not defined anywhere in this notebook, so on a fresh kernel the
# original cell raised NameError. Keep a key that is already present in the
# kernel namespace, otherwise read it from the S2_API_KEY environment variable
# so the secret never has to be written into the notebook.
api_key=globals().get('api_key') or os.environ.get('S2_API_KEY')
# Without a key the header is omitted entirely and requests go out unauthenticated.
headers={'x-api-key': api_key} if api_key else {}

In [10]:
# Ask the title-match endpoint for every library entry; the raw JSON reply
# (match or error payload) is stored under the entry's acronym.
papers={}
for acronym in tqdm(index):
    # Percent-encode the title: characters such as '&', '#' or '+' would
    # otherwise be read as URL syntax and corrupt the query.
    query=quote(str(index[acronym]),safe='')
    papers[acronym]=requests.get(match_api.format(title=query), headers=headers).json()

100%|██████████| 256/256 [02:45<00:00,  1.55it/s]


In [11]:
# Display the index table loaded from the library CSV.
index_csv

Unnamed: 0,acronym,title
0,gpt,Language Models are Few-Shot Learners
1,ttt,Learning to (Learn at Test Time): RNNs with Ex...
2,xlstm,xLSTM: Extended Long Short-Term Memory
3,griffin,Griffin: Mixing Gated Linear Recurrences with ...
4,hyena,Hyena Hierarchy: Towards Larger Convolutional ...
...,...,...
251,selfcondembdiffu,Self-conditioned Embedding Diffusion for Text ...
252,analogbits,Analog Bits: Generating Discrete Data using Di...
253,reparamdiscdiffu,A Reparameterized Discrete Diffusion Model for...
254,derandddm,Fast Sampling via De-randomization for Discret...


In [27]:
# Lower-case the acronym column in place so that it can be compared against
# the lower-cased acronym keys used by `index` and `papers`.
index_csv['acronym']=index_csv['acronym'].str.lower()

In [32]:
# Copy the matched Semantic Scholar paper id of every entry into the `id`
# column of index_csv; entries whose reply holds no match are collected in
# `unmatched` and reported.
unmatched=[]

for acronym in papers:
    try:
        s2_id=papers[acronym]['data'][0]['paperId']
    except (KeyError,IndexError,TypeError):
        # No 'data' key, an empty result list, or a malformed first entry.
        # Only these lookup failures are treated as "unmatched"; anything
        # else (including a keyboard interrupt) is allowed to propagate.
        unmatched.append(acronym)
        print('Error:',acronym)
        continue
    index_csv.loc[index_csv['acronym']==acronym,'id']=s2_id

print(len(unmatched))

Error: ntk
Error: longnet
Error: feedbackmem
Error: srt
Error: kangpt
5


In [17]:

# Second attempt for the entries that had no usable match in the first pass.
# NOTE(review): unlike the first pass, this request is sent without the
# API-key headers — confirm that is intended.
for acronym in tqdm(unmatched):
    # Percent-encode the title so characters like '&', '#' or '+' cannot
    # break the query string.
    query=quote(str(index[acronym]),safe='')
    papers[acronym]=requests.get(match_api.format(title=query)).json()

100%|██████████| 8/8 [00:04<00:00,  1.75it/s]


## Construct metadata & references

In [86]:
import json

# Metadata cache: acronym -> {'title', 'id', 'detail'}. An existing cache file
# is loaded first so that only missing or failed entries are fetched again.
meta={}
meta_dir='library_meta.json'
if os.path.exists(meta_dir):
    with open(meta_dir,'r') as f:
        meta=json.load(f)

paper_detail='https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields=year,authors,tldr,venue,abstract,citationCount,influentialCitationCount,references,embedding.specter_v2,openAccessPdf'

for i in tqdm(index_csv.index):
    acronym=index_csv.loc[i,'acronym']
    title=index_csv.loc[i,'title']
    cached=meta.get(acronym,{})
    # Skip entries that already hold a detail record, unless that record is
    # an API error payload (recognised by its 'message' key).
    if 'detail' in cached and 'message' not in cached['detail']:
        continue
    paper_id=index_csv.loc[i,'id']
    if pd.isna(paper_id):
        # No Semantic Scholar match for this row: store a placeholder with an
        # explicit null id. The original stored whatever `paper_id` was left
        # over from an earlier row (or raised NameError on the first row).
        meta[acronym]={'title':title,'id':None,'detail':{}}
        continue
    detail=requests.get(paper_detail.format(paper_id=paper_id)).json()
    if 'message' in detail:
        # Error replies are reported and not cached, so a later run retries them.
        print('Error:',acronym,detail['message'])
        continue
    meta[acronym]={'title':title,'id':paper_id,'detail':detail}

with open(meta_dir,'w') as f:
    json.dump(meta,f)
        

# Reference cache: acronym -> raw reply of the references endpoint, or [] for
# papers without a Semantic Scholar id. An existing cache file is loaded first.
references={}
ref_dir='library_ref.json'
if os.path.exists(ref_dir):
    with open(ref_dir,'r') as f:
        references=json.load(f)

# The references endpoint is paginated; without an explicit `limit` only the
# first page is returned and longer reference lists are silently cut off.
# limit=1000 requests the largest page the API documents; papers with even
# more references are still truncated at that size.
references_detail='https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references?fields=contextsWithIntent,intents,isInfluential,title,influentialCitationCount&limit=1000'

for i in tqdm(index_csv.index):
    acronym=index_csv.loc[i,'acronym']
    s2_id=index_csv.loc[i,'id']
    # Already cached and not an error payload: nothing to do.
    if acronym in references and 'message' not in references[acronym]:
        continue
    if pd.isna(s2_id):
        references[acronym]=[]
        continue
    ret=requests.get(references_detail.format(paper_id=s2_id)).json()
    if 'message' in ret:
        # Error replies are reported and not cached, so a later run retries them.
        print('Error:',acronym,ret['message'])
        continue
    references[acronym]=ret


with open(ref_dir,'w') as f:
    json.dump(references,f)


100%|██████████| 260/260 [00:00<00:00, 76495.44it/s]
 87%|████████▋ | 225/260 [00:00<00:00, 1153.05it/s]

100%|██████████| 260/260 [00:01<00:00, 179.17it/s] 


## Metadata to tree

In [88]:
# Reload the index table and the two JSON caches from disk.
# NOTE(review): `index` was the acronym -> title dict earlier in the notebook;
# from this cell on it is the index DataFrame.
# NOTE(review): later code reads index['id'], so the CSV on disk presumably
# already carries an `id` column — confirm.
index=pd.read_csv('./library_index.csv')

with open('./library_meta.json','r') as meta_file:
    meta=json.load(meta_file)

with open('./library_ref.json','r') as ref_file:
    refs=json.load(ref_file)

In [94]:
# Spot-check one cached metadata record.
meta['transformer']

{'title': 'Attention is All you Need',
 'id': '204e3073870fae3d05bcbc2f6a8e263d9b72e776',
 'detail': {'paperId': '204e3073870fae3d05bcbc2f6a8e263d9b72e776',
  'abstract': 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-

In [113]:
from dataclasses import dataclass, field, asdict
from typing import List, Dict


# Root directory under which per-paper code lives (base/<acronym>/<acronym>_edu.py).
LIBRARY_DIR = './'

@dataclass
class PaperObject:
    """One library paper, serialisable to and from ``<acronym>.json``.

    ``code`` is always derived in ``__post_init__``: it holds the text of
    ``base/<acronym>/<acronym>_edu.py`` under ``LIBRARY_DIR`` when that file
    exists and ``None`` otherwise. Any value passed for ``code`` is overwritten.
    """
    title: str
    acronym: str
    s2id: str
    abstract: str
    venue: str
    year: int
    tldr: str
    # embedding: list
    citationCount: int
    influentialCitationCount: int
    seed_ids: List[str]
    code: str = None

    def __post_init__(self):
        """Load the paper's code file, if present, into ``self.code``."""
        code_dir=os.path.join(LIBRARY_DIR,'base',self.acronym,self.acronym+'_edu.py')
        if os.path.exists(code_dir):
            # Context manager so the handle is closed right away instead of
            # being left for the garbage collector.
            with open(code_dir,'r') as f:
                self.code=f.read()
        else:
            self.code=None

    def to_dict(self) -> Dict:
        """Return all fields as a plain dict."""
        return asdict(self)
    
    @property
    def type(self) -> str:
        """Name of the concrete class."""
        return self.__class__.__name__

    @classmethod
    def from_dict(cls, dict: Dict):
        """Build an instance from a dict whose keys are the field names."""
        return cls(**dict)
    
    @classmethod
    def load(cls, tree_dir: str, id:str):
        """Read ``<tree_dir>/<id>.json`` and build an instance from it."""
        with open(os.path.join(tree_dir,id+'.json'),'r') as f:
            return cls.from_dict(json.load(f))

    def save(self,tree_dir: str):
        """Write the fields to ``<tree_dir>/<acronym>.json``, creating the directory if needed."""
        os.makedirs(tree_dir, exist_ok=True)
        with open(os.path.join(tree_dir,self.acronym+'.json'),'w') as f:
            json.dump(self.to_dict(),f,indent=4)

    def to_desc(self) -> str:
        """Render a markdown summary; colons in the text fields are replaced by spaces."""
        title=self.title.replace(':',' ')
        abstract=self.abstract.replace(':',' ')
        tldr=self.tldr.replace(':',' ')
        venue=self.venue.replace(':',' ')
        mdtext=f'# {title}\nS2 ID {self.s2id}\n\nTL;DR {tldr}\n\n{abstract}\n\nPublished at {venue} in {self.year}\n\nCited {self.citationCount} times\n\nImpactful citations: {self.influentialCitationCount}'
        return mdtext


# Map each known Semantic Scholar id to its acronym once, instead of scanning
# the index frame for every single reference. The first row wins for a
# duplicated id, which is what the `.values[0]` lookup used to return.
id_to_acronym={}
for known_id,known_acronym in zip(index['id'],index['acronym']):
    if pd.notna(known_id):
        id_to_acronym.setdefault(known_id,known_acronym)


def _or_na(value):
    """Return 'N/A' in place of a missing (None) value, else the value itself."""
    return 'N/A' if value is None else value


# Write one JSON file per paper that has reference data. A paper's seed_ids
# are the acronyms of library papers it cites with a 'methodology' intent.
for acronym in refs:
    if refs[acronym]==[]:
        # Placeholder entry: no reference data was stored for this paper.
        continue
    seed_ids=[]
    for ref in refs[acronym]['data']:
        if 'methodology' not in ref['intents']:
            continue
        cited_id=ref['citedPaper']['paperId']
        if cited_id in id_to_acronym:
            seed_ids.append(id_to_acronym[cited_id])
    detail=meta[acronym]['detail']
    # tldr is either missing or an object whose 'text' may itself be missing.
    tldr=detail['tldr']
    tldr='N/A' if tldr is None else _or_na(tldr['text'])
    paper=PaperObject(
        title=meta[acronym]['title'],
        acronym=acronym,
        s2id=meta[acronym]['id'],
        abstract=_or_na(detail['abstract']),
        venue=_or_na(detail['venue']),
        # A missing year is stored as the string 'N/A', as before.
        year=_or_na(detail['year']),
        tldr=tldr,
        citationCount=detail['citationCount'],
        influentialCitationCount=detail['influentialCitationCount'],
        seed_ids=seed_ids,
    )
    paper.save('./tree')


## Load tree and viz

In [93]:
import networkx as nx




{'transformer': [],
 'gpt': ['transformer'],
 'gpt2': ['transformer', 'bert', 'gpt'],
 'gpt3': [],
 'bert': ['transformer'],
 'ttt': ['mamba'],
 'xlstm': [],
 'griffin': ['gateloop',
  'retnet',
  'rwkv4',
  'resurrectrnn',
  'hyena',
  'flashconv',
  's5',
  's4',
  'lssl',
  'aft',
  'roformer',
  'hippo',
  'gpt3',
  'longformer',
  'mqa'],
 'hyena': ['flashconv',
  'gssm',
  'flashattn',
  's4',
  'aft',
  'gpt3',
  'compressivetransformer',
  'butterfly'],
 'm2': [],
 'spikegpt': ['aft',
  'performer',
  'gpt3',
  'synthesizer',
  'reformer',
  'transformer',
  'bert',
  'gpt2'],
 'mamba2': [],
 's4': ['hippo', 'lkconv', 'sparsetransformer', 'lighdynconv', 'transformer'],
 'hippo': ['butterfly'],
 'lssl': ['hippo', 'transformer'],
 'httyh': ['s4d', 'dssm', 's4', 'hippo'],
 's4d': ['dssm', 's4'],
 'mamba': [],
 'samba': ['based',
  'selfextend',
  'gla',
  'mamba',
  'retnet',
  'pi',
  'seqboat',
  'landmarkattn',
  'gqa',
  'mega',
  's4d',
  'flashattn',
  'alibi',
  'longformer