# Setup

In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
CS_CLASSES = [
    'cs.' + cat for cat in [
        'AI', 'AR', 'CC', 'CE', 'CG', 'CL', 'CR', 'CV', 'CY', 'DB',
        'DC', 'DL', 'DM', 'DS', 'ET', 'FL', 'GL', 'GR', 'GT', 'HC',
        'IR', 'IT', 'LG', 'LO', 'MA', 'MM', 'MS', 'NA', 'NE', 'NI',
        'OH', 'OS', 'PF', 'PL', 'RO', 'SC', 'SD', 'SE', 'SI', 'SY',
    ]
]
DATE = "2024-05-21" #Must be of format YYYY-MM-DD
TOPIC = "Natural Language Processing, Generative AI, Computer Vision"

# Gathering papers through ArXiV API

In [3]:
#arxiv api
import requests
import xml.etree.ElementTree as ET

def get_arxiv_papers(date):
    papers = []    
    for cls in CS_CLASSES:
        query = f"http://export.arxiv.org/api/query?search_query=cat:'{cls}'&sortBy=submittedDate&sortOrder=descending"
        params = {
                "start": 0,
                "max_results": 1000
        }
        response = requests.get(query, params=params)
        if response.status_code == 200:
            root = ET.fromstring(response.content)
            entries = root.findall('{http://www.w3.org/2005/Atom}entry')
            for entry in entries:
                paper = {
                    'id': entry.find('{http://www.w3.org/2005/Atom}id').text,
                    'title': entry.find('{http://www.w3.org/2005/Atom}title').text,
                    'summary': entry.find('{http://www.w3.org/2005/Atom}summary').text,
                    'authors': [author.find('{http://www.w3.org/2005/Atom}name').text for author in entry.findall('{http://www.w3.org/2005/Atom}author')],
                    'published': entry.find('{http://www.w3.org/2005/Atom}published').text
                }
                if date in paper["published"]:
                    papers.append(paper)
    return papers

In [4]:
papers = get_arxiv_papers(DATE)
print(len(papers))

416


# Postprocessing papers

In [5]:
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [6]:
model = ChatAnthropic(model="claude-3-haiku-20240307")
prompt = """You are being given the abstract of a paper. Is this paper about {topic}, or not?
Answer "YES" if the abstract is related to {topic}, and "NO" otherwise.
<abstract> {abstract} </abstract>
Answer:"""

## Testing

In [9]:
parser = StrOutputParser()
prompt_template = ChatPromptTemplate.from_messages(
    [("system", "You are an helpful AI assistant"), ("user", prompt)]
)
chain = prompt_template | model | parser
abstracts_list = [paper["summary"] for paper in papers[:5]]
for abstract in abstracts_list:
    print(abstract)
    print(chain.invoke({"abstract":abstract, "topic":TOPIC}))

  Searching through chemical space is an exceptionally challenging problem
because the number of possible molecules grows combinatorially with the number
of atoms. Large, autoregressive models trained on databases of chemical
compounds have yielded powerful generators, but we still lack robust strategies
for generating molecules with desired properties. This molecular search problem
closely resembles the "alignment" problem for large language models, though for
many chemical tasks we have a specific and easily evaluable reward function.
Here, we introduce an algorithm called energy rank alignment (ERA) that
leverages an explicit reward function to produce a gradient-based objective
that we use to optimize autoregressive policies. We show theoretically that
this algorithm is closely related to proximal policy optimization (PPO) and
direct preference optimization (DPO), but has a minimizer that converges to an
ideal Gibbs-Boltzmann distribution with the reward playing the role of an
ener

## Adding Papers

In [None]:
import time

selected_papers = []
for i, paper in enumerate(papers):
    abstract = paper["summary"]
    if i%30 == 0:
        time.sleep(20) #time.sleep is there to avoid hitting anthropic rate limits
    if chain.invoke({"abstract":abstract, "topic":TOPIC}) == "YES":
        print(paper["title"])
        selected_papers.append(paper)

# Finalizing Results

In [12]:
import pandas as pd

df = pd.DataFrame(selected_papers)
df = df.drop_duplicates(subset="title") #duplicates can sometimes be found
print(f"{len(df)} papers extracted for date {DATE} and topics: {TOPIC}")
df.to_csv(f"extracted_papers.csv", index=False)

115 papers extracted for date 2024-05-21 and topics: Natural Language Processing, Generative AI, Computer Vision


In [13]:
import json

info_dict = {"date": DATE, "topic": TOPIC}
with open("parameters.json", "w") as outfile: 
    json.dump(info_dict, outfile)