In [38]:
import json
import os
from urllib.parse import quote

import pandas as pd
import requests
from tqdm import tqdm

# CSV listing the library papers; the cells below read its `acronym` and `title` columns.
index_dir='library_index.csv'


# Request headers for the Semantic Scholar API. The key is taken from the
# S2_API_KEY environment variable so a real credential never has to be saved
# in the notebook; the previous literal value is kept as the fallback.
headers = {'x-api-key': os.environ.get('S2_API_KEY', 'api_key')}


## Get paper IDs

In [39]:

# Title-match endpoint; `{title}` is filled in with a paper title per request.
match_api='https://api.semanticscholar.org/graph/v1/paper/search/match?query={title}'

# Load the library index and build a lookup: lower-cased acronym -> title.
index_csv=pd.read_csv(index_dir)
index_dict=index_csv.to_dict(orient='records')
index={row['acronym'].lower():row['title'] for row in index_dict}


In [19]:
# Query the title-match endpoint once per library entry and keep the raw JSON
# response, keyed by acronym.
papers={}
for acronym in tqdm(index):
    # Percent-encode the title before placing it in the query string; otherwise
    # characters such as '&', '#' or '+' inside a title would change or
    # truncate the query that the server receives.
    url=match_api.format(title=quote(index[acronym],safe=''))
    papers[acronym]=requests.get(url, headers=headers).json()

100%|██████████| 300/300 [02:55<00:00,  1.71it/s]


In [20]:
# Lower-case the acronym column so it compares equal to the lower-cased keys
# used by `index` and `papers`.
index_csv['acronym']=index_csv['acronym'].str.lower()

In [23]:
# Copy the matched Semantic Scholar paper id of every entry into the `id`
# column of index_csv; acronyms whose response has no usable match are
# collected in `unmatched`.
unmatched=[]

for acronym,match in papers.items():
    try:
        paper_id=match['data'][0]['paperId']
    except (KeyError,IndexError,TypeError):
        # No 'data' key, an empty result list, or a response that is not a dict.
        # Only these lookup failures mean "unmatched"; any other error (for
        # example in the DataFrame assignment below) is no longer swallowed.
        unmatched.append(acronym)
        print('Error:',acronym)
        continue
    index_csv.loc[index_csv['acronym']==acronym,'id']=paper_id

print(len(unmatched))

Error: ntk
Error: longnet
Error: feedbackmem
Error: srt
Error: kangpt
Error: s4pp
6


In [22]:

# Retry the title match for the entries that had no usable result.
for acronym in tqdm(unmatched):
    # Same request shape as the first pass: percent-encoded title and the
    # shared API headers (the original retry sent neither).
    url=match_api.format(title=quote(index[acronym],safe=''))
    papers[acronym]=requests.get(url, headers=headers).json()

100%|██████████| 6/6 [00:02<00:00,  2.15it/s]


In [26]:
# index_csv.to_csv('library_index.csv',index=False)
# Print the acronym of every row whose acronym has at least one row with a
# missing `id`. The null mask is computed once; an acronym that occurs on
# several rows is still printed once per row.
acronyms_without_id=set(index_csv.loc[index_csv['id'].isnull(),'acronym'])
for acronym in index_csv['acronym']:
    if acronym in acronyms_without_id:
        print(acronym)

ntk
feedbackmem
srt
kangpt
s4pp


## Construct metadata & references

In [48]:
# Build (or resume) the per-paper metadata cache, keyed by acronym. Each value
# is {'title', 'id', 'detail'} where 'detail' is the raw paper-detail response,
# or {} for entries that have no Semantic Scholar id.
meta={}
meta_dir='library_meta.json'
if os.path.exists(meta_dir):
    with open(meta_dir,'r') as f:
        meta=json.load(f)

paper_detail='https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields=year,authors,tldr,venue,abstract,citationCount,influentialCitationCount,references,embedding.specter_v2,openAccessPdf'

try:
    for i in tqdm(index_csv.index):
        acronym=index_csv.loc[i,'acronym']
        title=index_csv.loc[i,'title']
        cached=meta.get(acronym,{})
        # Skip entries that already hold a detail record without an error message.
        if 'detail' in cached and 'message' not in cached['detail']:
            continue
        paper_id=index_csv.loc[i,'id']
        if pd.isna(paper_id):
            # No Semantic Scholar id for this entry: store an explicit null id.
            # The original code reused `paper_id` from the previous row here
            # (or raised NameError when the first row had no id).
            meta[acronym]={'title':title,'id':None,'detail':{}}
            continue
        detail=requests.get(paper_detail.format(paper_id=paper_id),headers=headers).json()
        if 'message' in detail:
            print('Error:',acronym,detail['message'])
            continue
        meta[acronym]={'title':title,'id':paper_id,'detail':detail}
finally:
    # Persist whatever has been collected even if a request raised part-way
    # through, so a re-run resumes instead of starting over.
    with open(meta_dir,'w') as f:
        json.dump(meta,f)
        

# Build (or resume) the per-paper reference cache, keyed by acronym. Each value
# is the raw response of the references endpoint, or [] for entries that have
# no Semantic Scholar id.
references={}
ref_dir='library_ref.json'
if os.path.exists(ref_dir):
    with open(ref_dir,'r') as f:
        references=json.load(f)

# An explicit limit is sent (the same page size the citations query in this
# notebook uses); without it the endpoint returns only its default page and
# longer reference lists are silently truncated.
references_detail='https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references?fields=contextsWithIntent,intents,isInfluential,title,influentialCitationCount&limit=1000'

try:
    for i in tqdm(index_csv.index):
        acronym=index_csv.loc[i,'acronym']
        paper_id=index_csv.loc[i,'id']
        # Skip entries already cached without an error message.
        if acronym in references and 'message' not in references[acronym]:
            continue
        if pd.isna(paper_id):
            references[acronym]=[]
            continue
        ret=requests.get(references_detail.format(paper_id=paper_id),headers=headers).json()
        if 'message' in ret:
            print('Error:',acronym,ret['message'])
            continue
        references[acronym]=ret
finally:
    # Persist whatever has been collected even if a request raised part-way
    # through, so a re-run resumes instead of starting over.
    with open(ref_dir,'w') as f:
        json.dump(references,f)


100%|██████████| 301/301 [00:00<00:00, 6236.07it/s]


Error: hopfield Forbidden


100%|██████████| 301/301 [00:00<00:00, 6248.01it/s]


{'message': 'Forbidden'}
Error: hopfield Forbidden


## Metadata to tree

In [46]:
# Reload the three artifacts from disk; note that `index` is rebound here from
# the acronym->title dict to the index DataFrame.
index=pd.read_csv('./library_index.csv')
with open('./library_meta.json','r') as meta_fh, open('./library_ref.json','r') as ref_fh:
    meta=json.load(meta_fh)
    refs=json.load(ref_fh)
print(len(index),len(meta),len(refs))

301 300 300


In [31]:
# Spot-check one cached entry to see the stored metadata layout.
meta['hydra']

{'title': 'Hydra: Bidirectional State Space Models Through Generalized Matrix Mixers',
 'id': 'ea507df05bb5fe32cd8af80602708713c9bd2ba2',
 'detail': {'paperId': 'ea507df05bb5fe32cd8af80602708713c9bd2ba2',
  'abstract': 'A wide array of sequence models are built on a framework modeled after Transformers, comprising alternating sequence mixer and channel mixer layers. This paper studies a unifying matrix mixer view of sequence mixers that can be conceptualized as a linear map on the input sequence. This framework encompasses a broad range of well-known sequence models, including the self-attention of Transformers as well as recent strong alternatives such as structured state space models (SSMs), and allows understanding downstream characteristics such as efficiency and expressivity through properties of their structured matrix class. We identify a key axis of matrix parameterizations termed sequence alignment, which increases the flexibility and performance of matrix mixers, providing in

In [32]:
from dataclasses import dataclass, field, asdict
from typing import List, Dict


# Root directory; LibraryReference.__post_init__ looks for code files under
# LIBRARY_DIR/base/<acronym>/<acronym>_edu.py.
LIBRARY_DIR = './'

# Short aliases for the os.path helpers used by the classes below.
pjoin=os.path.join
pexists=os.path.exists

@dataclass
class NodeObject:
    """Base record for one library entry, stored on disk as ``<acronym>.json``.

    Attributes:
        acronym: Short identifier of the entry; also the JSON file stem.
        title: Title of the entry.
        seed_ids: List of strings naming related entries (the tree-building
            cell fills it with acronyms of library papers).
    """
    acronym: str
    title: str
    seed_ids: List[str]

    def to_dict(self) -> Dict:
        """Return all dataclass fields as a plain dictionary."""
        plain=asdict(self)
        return plain

    @classmethod
    def from_dict(cls, dict: Dict):
        """Create an instance, passing the mapping's items as keyword arguments."""
        kwargs=dict
        return cls(**kwargs)

    @classmethod
    def load(cls, save_dir: str, acronym:str):
        """Read ``<save_dir>/<acronym>.json`` and build an instance from it."""
        json_path=pjoin(save_dir,acronym+'.json')
        with open(json_path,'r') as fh:
            payload=json.load(fh)
        return cls.from_dict(payload)

    def save(self,save_dir: str):
        """Write this object to ``<save_dir>/<acronym>.json``, creating the directory if needed."""
        os.makedirs(save_dir, exist_ok=True)
        json_path=pjoin(save_dir,self.acronym+'.json')
        with open(json_path,'w') as fh:
            json.dump(self.to_dict(),fh,indent=4)

    def to_desc(self) -> str:
        """Return a textual description; must be provided by subclasses."""
        raise NotImplementedError

@dataclass
class LibraryReference(NodeObject):
    """A library entry with bibliographic metadata and, optionally, source code.

    Every field defaults to None. `code` is always (re)computed in
    `__post_init__` from the file system, so a value passed in is overwritten.
    """
    s2id: str = None
    abstract: str = None
    authors: List[str] = None
    venue: str = None
    year: int = None
    tldr: str = None
    # embedding: list
    citationCount: int = None
    influentialCitationCount: int = None
    code: str = None
    description: str = None
    url: str = None

    def __post_init__(self):
        """Load `<LIBRARY_DIR>/base/<acronym>/<acronym>_edu.py` into `code` if it exists, else set None."""
        code_dir=pjoin(LIBRARY_DIR,'base',self.acronym,self.acronym+'_edu.py')
        if pexists(code_dir):
            # Context manager so the file handle is closed deterministically.
            with open(code_dir,'r') as f:
                self.code=f.read()
        else:
            self.code=None

    @property
    def type(self) -> str:
        """'ReferenceWithCode' when a code file was loaded, otherwise 'Reference'."""
        if self.code is not None:
            return 'ReferenceWithCode'
        else:
            return 'Reference'

    def to_desc(self) -> str:
        """Render the entry as markdown-style text.

        Only truthy fields are included. Colons are replaced by spaces in the
        free-text fields, and line breaks are inserted after commas (TL;DR) or
        periods (abstract, description).
        """
        title=self.title.replace(':',' ')
        mdtext=f'# {title}'
        if self.s2id:
            mdtext+=f'\n* S2 ID {self.s2id} *'
        if self.authors:
            authors=', '.join(self.authors)
            mdtext+=f'\n* Authors: {authors} *'
        if self.tldr:
            tldr=self.tldr.replace(':',' ').replace(',',',\n')
            mdtext+=f'\n\n* TL;DR {tldr} *'
        if self.abstract:
            abstract=self.abstract.replace(':',' ').replace('.','.\n')
            mdtext+=f'\n\n## Abstract\n{abstract}'
        if self.venue:
            venue=self.venue.replace(':',' ')
            mdtext+=f'\n\n* Published at {venue} in {self.year} *'
            # Manually entered references can have a venue but no citation
            # statistics; skip those lines instead of printing "Cited None times".
            if self.citationCount is not None:
                mdtext+=f'\n* Cited {self.citationCount} times *'
            if self.influentialCitationCount is not None:
                mdtext+=f'\n* Impactful citations {self.influentialCitationCount} *'
        if self.description:
            description=self.description.replace(':',' ').replace('.','.\n')
            mdtext+=f'\n\n## Description\n{description}'
        return mdtext


# Hand-written records keyed by acronym. The tree-building loop below uses
# them for entries whose cached reference list is empty; each record's keys
# are passed to LibraryReference as keyword arguments.
manual_input={
    'srt': {
        'title': 'Self Reasoning Tokens',
        'authors': ['Felipe Sens Bonetto'],
        'year': 2024,
        'url': 'https://github.com/lucidrains/self-reasoning-tokens-pytorch',
        'description': "The project \"Reasoning Tokens\" by Felipe Sens Bonetto aims to enhance the reasoning abilities of language models like GPT by teaching them to plan ahead in a self-supervised way. The core idea is to introduce \"reasoning tokens,\" where for each token predicted, an additional token is generated that duplicates the input and doesn't receive a gradient from the next token but from future tokens. This approach encourages the model to pre-cache information useful for future predictions. Initial experiments showed a significant reduction in loss, indicating improved performance. The project plans to explore this method further, especially in fine-tuned instruction-following models, potentially replacing the need for step-by-step explanations during training. The ultimate goal is to create models that can reason internally, improving their performance and reducing the need for manually crafted training data.",
        'seed_ids': ['gpt3']
    },
    'ntk': {
        'title': 'NTK-Aware Scaled RoPE',
        'authors': ['bloc97'],
        'year': 2023,
        'url': 'https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have',
        'description': "The \"NTK-Aware Scaled RoPE\" project aims to extend the context size of LLaMA models beyond 8k tokens without fine-tuning and with minimal perplexity degradation. Traditional methods like RoPE interpolation often fail to distinguish between closely positioned tokens, leading to performance issues. By applying Neural Tangent Kernel (NTK) theory, this new method uses a nonlinear interpolation scheme that changes the RoPE's base rather than its scale, allowing for accurate distinction of token positions. This approach enables the LLaMA 7B model to handle longer contexts (up to 12k tokens) with minimal perplexity degradation, without fine-tuning. Initial tests show promising results, suggesting that further fine-tuning could enhance performance even more. The method provides a new way to extend the context window size efficiently, potentially benefiting tasks like long document summarization. The author encourages further experimentation and innovation in this area.",
        'seed_ids': ['roformer']
    },
    'feedbackmem': {
        'title': 'Addressing Some Limitations of Transformers with Feedback Memory',
        'authors': ['Angela Fan', 'Thibaut Lavril', 'Edouard Grave', 'Armand Joulin', 'Sainbayar Sukhbaatar'],
        'venue': 'arXiv',
        'year': 2020,
        'abstract': "Transformers have been successfully applied to sequential, auto-regressive tasks despite being feedforward networks. Unlike recurrent neural networks, Transformers use attention to capture temporal relations while processing input tokens in parallel. While this parallelization makes them computationally efficient, it restricts the model from fully exploiting the sequential nature of the input. The representation at a given layer can only access representations from lower layers, rather than the higher level representations already available. In this work, we propose the Feedback Transformer architecture that exposes all previous representations to all future representations, meaning the lowest representation of the current timestep is formed from the highest-level abstract representation of the past. We demonstrate on a variety of benchmarks in language modeling, machine translation, and reinforcement learning that the increased representation capacity can create small, shallow models with much stronger performance than comparable Transformers.",
        'tldr': 'Transformers have shortcomings - limited memory and limited state update - but Feedback Memory is a straightforward way to resolve these. ',
        'seed_ids': ['transformer','bert']
    },
    'kangpt': {
        'title': 'Generative Pre-trained Transformers (GPTs) using Kolmogorov-Arnold Networks (KANs) for language modeling',
        'authors': ['Aditya N Ganesh'],
        'year': 2024,
        'url': 'https://adityang.github.io/kan-gpt/',
        'description': "Kolmogorov-Arnold Networks (KANs) are promising alternatives of Multi-Layer Perceptrons (MLPs). KANs have strong mathematical foundations just like MLPs: MLPs are based on the universal approximation theorem, while KANs are based on Kolmogorov-Arnold representation theorem. KANs and MLPs are dual: KANs have activation functions on edges, while MLPs have activation functions on nodes. This simple change makes KANs better (sometimes much better!) than MLPs in terms of both model accuracy and interpretability. ",
        'seed_ids': ['gpt3','transformer']
    },
    's4pp': {
        'title': 'S4++: Elevating Long Sequence Modeling with State Memory Reply',
        'authors': ['Biqing Qi', 'Junqi Gao', 'Dong Li', 'Kaiyan Zhang', 'Jianxing Liu', 'Ligang Wu', 'Bowen Zhou'],
        'venue': 'ICLR 2024 Withdrawn Submission',
        'year': 2024,
        'url': 'https://openreview.net/forum?id=bdnw4qjfH9',
        'abstract': "Recently, state space models (SSMs) have shown significant performance advantages in modeling long sequences. However, in spite of their promising performance, there still exist limitations. 1. Non-Stable-States (NSS): Significant state variance discrepancies arise among discrete sampling steps, occasionally resulting in divergence. 2. Dependency Bias: The unidirectional state space dependency in SSM impedes the effective modeling of intricate dependencies. In this paper, we conduct theoretical analysis of SSM from the even-triggered control (ETC) theory perspective and first propose the presence of NSS Phenomenon. Our findings indicate that NSS primarily results from the sampling steps, and the integration of multi-state inputs into the current state significantly contributes to the mitigation of NSS. Building upon these theoretical analyses and findings, we propose a simple, yet effective, theoretically grounded State Memory Reply (SMR) mechanism that leverages learnable memories to incorporate multi-state information into the current state. This enables the precise modeling of finer state dependencies within the SSM, resulting in the introduction of S4+. Furthermore, we integrate the complex dependency bias into S4+ via interactive cross attentions mechanism, resulting in the development of S4++. Our extensive experiments in autoregressive language modeling and benchmarking against the Long Range Arena demonstrate superior performance in most post-processing tasks.",
        'seed_ids': ['s4']
    }
}



# Write one LibraryReference JSON file per cached entry into ./tree.
# Entries with an empty reference list come from `manual_input`; all others
# are assembled from the cached Semantic Scholar metadata, with `seed_ids` set
# to the library papers they cite with a 'methodology' intent.

# paper id -> acronym, built once instead of scanning the id column for every
# reference. The first row wins for a repeated id, matching `.values[0]`.
id_to_acronym={}
for known_id,known_acronym in zip(index['id'],index['acronym']):
    if pd.notna(known_id):
        id_to_acronym.setdefault(known_id,known_acronym)

for acronym in refs:
    if refs[acronym]==[]:
        if acronym not in manual_input:
            # Previously an unhandled KeyError that aborted the whole loop.
            print('Error:',acronym,'has no references and no manual_input entry')
            continue
        # Copy so the shared manual_input record is not mutated.
        obj=dict(manual_input[acronym])
        obj['acronym']=acronym
        proj=LibraryReference.from_dict(obj)
        proj.save('./tree')
        continue
    if not meta.get(acronym,{}).get('detail'):
        # References were cached but the metadata request failed or is missing.
        print('Error:',acronym,'has no metadata detail')
        continue
    refdata=refs[acronym]['data']
    seed_ids=[]
    for ref in refdata:
        if 'methodology' in ref['intents']:
            cited_id=ref['citedPaper']['paperId']
            if cited_id in id_to_acronym:
                seed_ids.append(id_to_acronym[cited_id])
    detail=meta[acronym]['detail']
    title=meta[acronym]['title']
    s2id=meta[acronym]['id']
    abstract=detail['abstract']
    authors=[author['name'] for author in detail['authors']]
    if abstract is None:
        abstract='N/A'
    venue=detail['venue']
    if venue is None:
        venue='arXiv'
    year=detail['year']
    if year is None:
        year='N/A'
    tldr=detail['tldr']
    if tldr is None or tldr['text'] is None:
        tldr='N/A'
    else:
        tldr=tldr['text']
    # The embedding vector was extracted here before but never used
    # (LibraryReference has no embedding field), so that lookup was dropped.
    citationCount=detail['citationCount']
    influentialCitationCount=detail['influentialCitationCount']
    paper=LibraryReference(title=title,acronym=acronym,seed_ids=seed_ids,s2id=s2id,abstract=abstract,authors=authors,venue=venue,year=year,tldr=tldr,citationCount=citationCount,influentialCitationCount=influentialCitationCount)
    paper.save('./tree')


## Build 1-hop impactful citations

In [45]:
# Output directory and URL templates for the citation crawl.
dir_1hoc='./expanded_tree'
get_cite='https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations?fields=intents,contextsWithIntent,isInfluential,title&offset={offset}&limit=1000'
paper_detail='https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields=year,authors,tldr,venue,abstract,citationCount,influentialCitationCount,references,embedding.specter_v2,openAccessPdf'

# Reload the index table and the metadata cache from disk.
index=pd.read_csv('./library_index.csv')
with open('./library_meta.json','r') as meta_fh:
    meta=json.load(meta_fh)
print(len(index),len(meta))

301 300


In [36]:
import time

# For every library paper, collect the ids of citing papers that are flagged
# influential and cite it with a 'methodology' intent. Results are cached in
# cite_1hoc.json so the crawl can be resumed.
cite_1hoc_dir=dir_1hoc+'/cite_1hoc.json'
detail_1hoc_dir=dir_1hoc+'/detail_1hoc.json'
os.makedirs(dir_1hoc,exist_ok=True)

cite_1hoc={}
if os.path.exists(cite_1hoc_dir):
    with open(cite_1hoc_dir,'r') as f:
        cite_1hoc=json.load(f)

for acronym in tqdm(index['acronym']):
    paper_id=index.loc[index['acronym']==acronym,'id'].values[0]
    if pd.isna(paper_id):
        cite_1hoc[acronym]=[]
        continue
    if acronym in cite_1hoc and 'message' not in cite_1hoc[acronym]:
        # print('Already done:',acronym,len(cite_1hoc[acronym]))
        continue
    citecount=meta.get(acronym,{}).get('detail',{}).get('citationCount')
    if citecount is None:
        # Metadata missing for this paper (e.g. its detail request failed).
        print('Error:',acronym,'has no citationCount in meta')
        continue
    influential=[]
    seen=set()
    failed=False
    maxoffset=min(citecount,9001)
    for offset in range(0,maxoffset,1000):
        # The last page is shifted back so that offset+limit stays below 10000;
        # the overlap with the previous page is removed by `seen`.
        if offset+1000>=10000:
            offset=8999
        print(acronym,offset,offset+1000)
        cites=requests.get(get_cite.format(paper_id=paper_id,offset=offset),headers=headers).json()
        if 'message' in cites:
            print('Error:',paper_id,cites['message'])
            failed=True
            time.sleep(0.1)
            continue
        if 'data' not in cites:
            print('Error:',cites)
            # A bare `raise` here had no active exception to re-raise.
            raise RuntimeError('Unexpected citations response for '+str(paper_id))
        for c in cites['data']:
            paperid=c['citingPaper']['paperId']
            # Citing papers without an id cannot be looked up later; skip them.
            if paperid is None or paperid in seen:
                continue
            if 'methodology' in c['intents'] and c['isInfluential']:
                seen.add(paperid)
                influential.append(paperid)
        time.sleep(0.1)
    if failed:
        # Do not cache a partial list: it would look complete on the next run
        # and the failed pages would never be retried.
        print('Incomplete',acronym,len(influential),citecount)
        continue
    cite_1hoc[acronym]=influential
    print('Done',acronym,len(cite_1hoc[acronym]),citecount)
    # Write to a temporary file and swap it in, so an interrupted run cannot
    # leave a truncated cache behind.
    tmp_path=cite_1hoc_dir+'.tmp'
    with open(tmp_path,'w') as f:
        json.dump(cite_1hoc,f)
    os.replace(tmp_path,cite_1hoc_dir)

100%|██████████| 300/300 [00:00<00:00, 5802.67it/s]


In [44]:
# Fetch (or resume fetching) the detail record of every citing paper collected
# in cite_1hoc, cached by paper id in detail_1hoc.json.
detail_1hoc={}
if os.path.exists(detail_1hoc_dir):
    with open(detail_1hoc_dir,'r') as f:
        detail_1hoc=json.load(f)

for idx,acronym in enumerate(tqdm(cite_1hoc)):
    print('Progress:',idx+1,'/',len(cite_1hoc),acronym) 
    fetched=0
    for c in cite_1hoc[acronym]:
        if c is None:
            # No paper id to look up.
            continue
        if c in detail_1hoc and 'message' not in detail_1hoc[c]:
            continue
        detail=requests.get(paper_detail.format(paper_id=c),headers=headers).json()
        # Pause after every request, including failed ones, so an error
        # response is not followed by an immediate burst of further requests.
        time.sleep(0.1)
        if 'message' in detail:
            print('Error:',c,detail['message'])
            continue
        detail_1hoc[c]=detail
        fetched+=1
    print('Done',acronym,len(detail_1hoc))
    if fetched or not os.path.exists(detail_1hoc_dir):
        # Rewrite the cache only when it changed, and do so via a temporary
        # file that is swapped in afterwards: an interrupted write of this
        # large file otherwise leaves truncated JSON that fails to load.
        tmp_path=detail_1hoc_dir+'.tmp'
        with open(tmp_path,'w') as f:
            json.dump(detail_1hoc,f)
        os.replace(tmp_path,detail_1hoc_dir)

JSONDecodeError: Expecting value: line 1 column 54594676 (char 54594675)