# Setup

In [1]:
from dotenv import load_dotenv
# Load settings from a local .env file before anything else runs.
# NOTE(review): presumably this provides the Anthropic API key needed by the
# ChatAnthropic model created later in the notebook — confirm against your .env.
load_dotenv()

True

In [12]:
# arXiv category identifiers queried by this notebook, one per Computer Science
# sub-field, each written as "cs.<SUFFIX>" (for example "cs.AI").
CS_CLASSES = [
    f"cs.{suffix}"
    for suffix in (
        "AI AR CC CE CG CL CR CV CY DB "
        "DC DL DM DS ET FL GL GR GT HC "
        "IR IT LG LO MA MM MS NA NE NI "
        "OH OS PF PL RO SC SD SE SI SY"
    ).split()
]
# Publication day to extract. Must be of format YYYY-MM-DD.
DATE = "2024-06-03"
# Comma-separated topics that the model is asked to match abstracts against.
TOPIC = "Natural Language Processing, Generative AI, Computer Vision"

# Gathering papers through the arXiv API

In [13]:
#arxiv api
import requests
import xml.etree.ElementTree as ET

def get_arxiv_papers(date, max_results=300, timeout=30):
    """Collect arXiv papers from every category in CS_CLASSES published on `date`.

    For each category, the newest `max_results` submissions are requested from
    the arXiv export API (sorted by submission date, descending). An entry is
    kept when `date` occurs in its `published` timestamp and its title has not
    already been collected from another category.

    Only the newest `max_results` entries per category are inspected, so a
    `date` older than that window yields no papers for that category.

    Args:
        date: Day to keep, as a "YYYY-MM-DD" string.
        max_results: Number of most recent entries requested per category.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the keys 'id', 'title', 'summary', 'authors'
        (list of author names), 'published' and 'author_count'.

    Categories whose request fails, returns a non-200 status, or returns
    unparsable XML are skipped with a warning instead of aborting the run.
    """
    import warnings  # local import so the cell does not depend on a separate import cell

    atom = '{http://www.w3.org/2005/Atom}'
    papers = []
    seen_titles = set()  # set membership keeps cross-category de-duplication O(1)
    for cls in CS_CLASSES:
        query = f"http://export.arxiv.org/api/query?search_query=cat:'{cls}'&sortBy=submittedDate&sortOrder=descending"
        params = {
            "start": 0,
            "max_results": max_results,
        }
        try:
            # Without a timeout a stalled connection would block the notebook forever.
            response = requests.get(query, params=params, timeout=timeout)
        except requests.RequestException as exc:
            warnings.warn(f"arXiv request for {cls} failed: {exc}")
            continue
        if response.status_code != 200:
            warnings.warn(f"arXiv request for {cls} returned HTTP {response.status_code}")
            continue
        try:
            root = ET.fromstring(response.content)
        except ET.ParseError as exc:
            warnings.warn(f"arXiv response for {cls} could not be parsed: {exc}")
            continue

        for entry in root.findall(f'{atom}entry'):
            # findtext falls back to '' so an entry with a missing field cannot raise.
            published = entry.findtext(f'{atom}published', default='')
            if date not in published:
                continue
            title = entry.findtext(f'{atom}title', default='')
            if title in seen_titles:
                continue
            authors = [
                author.findtext(f'{atom}name', default='')
                for author in entry.findall(f'{atom}author')
            ]
            papers.append({
                'id': entry.findtext(f'{atom}id', default=''),
                'title': title,
                'summary': entry.findtext(f'{atom}summary', default=''),
                'authors': authors,
                'published': published,
                'author_count': len(authors),
            })
            seen_titles.add(title)
    return papers

In [14]:
# Fetch the papers whose `published` timestamp contains DATE, across all CS_CLASSES.
# get_arxiv_papers only looks at the newest entries of each category, so a count of 0
# can simply mean DATE falls outside that window.
papers = get_arxiv_papers(DATE)
# Show how many papers were kept for DATE.
print(len(papers))

0


# Postprocessing papers

In [5]:
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [6]:
# Chat model used to classify abstracts; the model id is pinned explicitly.
model = ChatAnthropic(model="claude-3-haiku-20240307")
# User-message template with two placeholders, {topic} and {abstract}.
# It asks the model to answer "YES" or "NO" depending on whether the abstract
# relates to the topic.
prompt = """You are being given the abstract of a paper. Is this paper about {topic}, or not?
Answer "YES" if the abstract is related to {topic}, and "NO" otherwise.
<abstract> {abstract} </abstract>
Answer:"""

## Testing

In [7]:
# Assemble the classification chain: chat prompt -> model -> plain string output.
parser = StrOutputParser()
prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are an helpful AI assistant"),
    ("user", prompt),
])
chain = prompt_template | model | parser

# Smoke test: print the first five abstracts, each followed by the chain's answer.
abstracts_list = [paper["summary"] for paper in papers[:5]]
for abstract in abstracts_list:
    print(abstract)
    print(chain.invoke({"topic": TOPIC, "abstract": abstract}))

  Recent work has provided indirect evidence that pretraining language models
on code improves the ability of models to track state changes of discourse
entities expressed in natural language. In this work, we systematically test
this claim by comparing pairs of language models on their entity tracking
performance. Critically, the pairs consist of base models and models trained on
top of these base models with additional code data. We extend this analysis to
additionally examine the effect of math training, another highly structured
data type, and alignment tuning, an important step for enhancing the usability
of models. We find clear evidence that models additionally trained on large
amounts of code outperform the base models. On the other hand, we find no
consistent benefit of additional math training or alignment tuning across
various model families.

YES
  Recurrent neural networks (RNNs) notoriously struggle to learn long-term
memories, primarily due to vanishing and exploding gra

## Adding Papers

In [8]:
import time

# Throttling to avoid hitting Anthropic rate limits: pause PAUSE_SECONDS after
# every REQUESTS_PER_PAUSE classification requests.
REQUESTS_PER_PAUSE = 100
PAUSE_SECONDS = 20

# Keep every paper whose abstract the model judges to be about TOPIC.
selected_papers = []
for i, paper in enumerate(papers):
    # Skip i == 0: no request has been sent by this loop yet, so there is nothing to wait for.
    if i and i % REQUESTS_PER_PAUSE == 0:
        time.sleep(PAUSE_SECONDS)
    abstract = paper["summary"]
    answer = chain.invoke({"abstract": abstract, "topic": TOPIC})
    # Normalise before comparing: replies such as "YES.", "Yes" or "YES\n" are still
    # positive answers, and an exact == "YES" check would silently drop them.
    if answer.strip().upper().startswith("YES"):
        selected_papers.append(paper)
print(len(selected_papers))

111


# Finalizing Results

In [9]:
import pandas as pd

# One row per selected paper; columns come from the keys of the paper dicts.
df = pd.DataFrame(selected_papers)
# When nothing was selected the frame has no columns at all, and
# drop_duplicates(subset="title") would raise KeyError, so only de-duplicate
# when the "title" column is present.
if "title" in df.columns:
    df = df.drop_duplicates(subset="title") #duplicates can sometimes be found
print(f"{len(df)} papers extracted for date {DATE} and topics: {TOPIC}")

111 papers extracted for date 2024-05-31 and topics: Natural Language Processing, Generative AI, Computer Vision


In [10]:
# Write the selected papers to disk; index=False leaves the DataFrame index out of the file.
# (The file name is a plain string: the previous f-string prefix had no placeholders.)
df.to_csv("extracted_papers.csv", index=False)

In [11]:
import json

# Record the DATE and TOPIC used for this run in parameters.json.
info_dict = dict(date=DATE, topic=TOPIC)
with open("parameters.json", mode="w") as outfile:
    outfile.write(json.dumps(info_dict))