In [33]:
# Install the arxiv package if not already installed
# !pip install arxiv

import arxiv
import pandas as pd
from datetime import datetime


Query:

Start with "ai safety" OR "ai alignment"

("ai safety" OR "ai alignment" OR "value alignment" OR "inner alignment" OR "outer alignment" 
 OR "ai control" OR "specification gaming" OR "reward hacking" OR deception OR "scalable oversight")
ANDNOT ("sequence alignment" OR "image alignment" OR "camera alignment" OR "protein alignment")

(cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR cat:stat.ML)
AND (
  "emergent misalignment"
  OR (misalignment AND (backdoor OR backdoored OR persona OR "alignment faking"))
  OR "evaluation awareness" OR "situational awareness" OR "test awareness"
  OR (deception AND (probe OR probes OR monitoring OR "lie detector" OR sandbagging OR "strategic"))
  OR (steganography AND ("chain of thought" OR reasoning))
  OR ("chain of thought" AND (faithful OR faithfulness OR monitor OR monitoring OR "reasoning model"))
  OR "mechanistic interpretability" OR "sparse autoencoder" OR SAEBench OR transcoder
  OR "representation engineering" OR "steering vector" OR "activation steering"
  OR "control evaluation" OR "Control Arena" OR "Ctrl-Z" OR "D-REX" OR "agent control"
  OR (debate AND ("scalable oversight" OR "prover verifier" OR "doubly efficient"))
  OR self-replication OR RepliBench
)

"ai safety" OR "ai alignment" OR "superalignment" OR "mechanistic interpretability" OR "scalable oversight" OR "preference alignment" OR "human preferences" OR "RLHF"

In [50]:
import time
import logging
import arxiv

# Named logger for this notebook's own progress messages.
log = logging.getLogger("arxiv-fetch")
# Configure the root logger at INFO so messages from `log` (and from any other
# library logger that propagates to the root) are shown in the cell output.
logging.basicConfig(level=logging.INFO)

def search_arxiv_large(
    query: str,
    limit: int = 10_000,                 # total results you want to collect
    page_size: int = 500,                 # arxiv.Client page size (<= 2000)
    delay_seconds: float = 3.0,           # arXiv ToU: ≥3s between requests
    retries: int = 5,                     # per-page retry inside the client
    sort_by: arxiv.SortCriterion = arxiv.SortCriterion.SubmittedDate,
    sort_order: arxiv.SortOrder = arxiv.SortOrder.Descending,
) -> list:
    """
    Collect up to `limit` arXiv results for `query`, resuming after transient errors.

    Behaviour:
      - Streams records with Client.results(search, offset=...) (v2 API).
      - On UnexpectedEmptyPageError / HTTPError, sleeps with a growing backoff and
        restarts the stream at the current offset.
      - Gives up (returning what was collected so far) after 8 failures in a row;
        the failure streak and backoff reset as soon as any record is received.
      - De-dups on the last path segment of `entry_id` (this includes the version
        suffix, e.g. "2510.20223v1").

    Args:
        query: arXiv API search query string.
        limit: Maximum number of unique records to return; <= 0 returns [].
        page_size: Page size passed to arxiv.Client.
        delay_seconds: Delay between requests passed to arxiv.Client; also the
            initial backoff used by this function after a failed stream.
        retries: Per-page retry count passed to arxiv.Client (num_retries).
        sort_by: Sort criterion passed to arxiv.Search.
        sort_order: Sort order passed to arxiv.Search.

    Returns:
        list[dict] with keys: title, authors, published, updated, summary,
        primary_category, categories, arxiv_id, pdf_url, doi.
    """
    max_consecutive_failures = 8   # stop retrying after this many failures in a row
    max_backoff_seconds = 30.0     # cap for the exponential backoff

    client = arxiv.Client(
        page_size=page_size,
        delay_seconds=delay_seconds,
        num_retries=retries,
    )

    # max_results=None → "all available" (library will paginate internally).
    search = arxiv.Search(
        query=query,
        max_results=None,
        sort_by=sort_by,
        sort_order=sort_order,
    )

    results, seen = [], set()
    offset = 0
    consecutive_failures = 0
    backoff = delay_seconds

    def add_result(r):
        """Append `r` as a flat dict unless its arXiv ID was already seen."""
        arxiv_id = r.entry_id.split("/")[-1]
        if arxiv_id in seen:
            return False
        seen.add(arxiv_id)
        results.append({
            "title": r.title,
            "authors": ", ".join(a.name for a in r.authors),
            "published": r.published.strftime("%Y-%m-%d"),
            "updated": r.updated.strftime("%Y-%m-%d"),
            "summary": (r.summary or "").replace("\n", " ").strip(),
            "primary_category": r.primary_category,
            "categories": ", ".join(r.categories or []),
            "arxiv_id": arxiv_id,
            "pdf_url": r.pdf_url,
            "doi": r.doi or "N/A",
        })
        return True

    log.info(f"Starting fetch (limit={limit}, page_size={page_size})")

    while len(results) < limit:
        try:
            # Stream from current offset to the end (or until we hit our limit)
            for r in client.results(search, offset=offset):
                added = add_result(r)
                offset += 1  # offset is count of yielded records so far
                # A record arrived, so earlier failures are no longer "consecutive".
                consecutive_failures = 0
                backoff = delay_seconds
                if added and len(results) % 100 == 0:
                    log.info(f"✓ Collected {len(results)} (offset={offset})")
                if len(results) >= limit:
                    break

        except (arxiv.UnexpectedEmptyPageError, arxiv.HTTPError) as e:
            consecutive_failures += 1
            # Check the give-up condition first so we never sleep just to quit.
            if consecutive_failures >= max_consecutive_failures:
                log.error("Too many consecutive failures; returning partial results.")
                break
            log.warning(f"{type(e).__name__} at offset={offset}. "
                        f"Retrying after {backoff:.1f}s (failure #{consecutive_failures})…")
            time.sleep(backoff)
            # Exponential backoff, capped
            backoff = min(backoff * 1.5, max_backoff_seconds)
            # Loop will restart generator from the same offset

        else:
            # The stream finished without raising: either the limit was hit
            # (inner `break`) or the result set is exhausted.
            if len(results) >= limit:
                log.info(f"Reached limit of {limit} results.")
            else:
                log.info("Reached end of result set.")
            break

    log.info(f"Done. Collected {len(results)} results.")
    return results


In [52]:
# Example 1: phrase search restricted to cs.* categories and a 2025 submission window
# query = "ai alignment"
query = 'cat:cs.* AND (all:"ai safety" OR all:"ai alignment") AND submittedDate:[202501010000 TO 202512312359]'
# query = 'cat:cs.* AND (all:"ai alignment") AND submittedDate:[202501010000 TO 202512312359]'
results = search_arxiv_large(query, limit=1000)

# Display a short preview only; the full list stays available in `results`.
PREVIEW_COUNT = 10     # number of papers to print
SUMMARY_CHARS = 200    # characters of each abstract to show

print(f"Found {len(results)} papers for query: '{query}'\n")
for i, paper in enumerate(results[:PREVIEW_COUNT], 1):
    summary = paper['summary']
    # Only mark the summary as truncated when something was actually cut off.
    suffix = "..." if len(summary) > SUMMARY_CHARS else ""
    print(f"{i}. {paper['title']}")
    print(f"   Authors: {paper['authors']}")
    print(f"   Published: {paper['published']}")
    print(f"   arXiv ID: {paper['arxiv_id']}")
    print(f"   URL: {paper['pdf_url']}")
    print(f"   Summary: {summary[:SUMMARY_CHARS]}{suffix}")
    print()

if len(results) > PREVIEW_COUNT:
    print(f"... and {len(results) - PREVIEW_COUNT} more papers not shown.")


INFO:arxiv-fetch:Starting fetch (limit=1000, page_size=500)
INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acs.%2A+AND+%28all%3A%22ai+safety%22+OR+all%3A%22ai+alignment%22%29+AND+submittedDate%3A%5B202501010000+TO+202512312359%5D&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=500
INFO:arxiv:Got first page: 25 of 368 total results
INFO:arxiv:Sleeping: 2.983518 seconds
INFO:arxiv:Requesting page (first: False, try: 0): https://export.arxiv.org/api/query?search_query=cat%3Acs.%2A+AND+%28all%3A%22ai+safety%22+OR+all%3A%22ai+alignment%22%29+AND+submittedDate%3A%5B202501010000+TO+202512312359%5D&id_list=&sortBy=submittedDate&sortOrder=descending&start=25&max_results=500
INFO:arxiv-fetch:✓ Collected 100 (offset=100)
INFO:arxiv-fetch:✓ Collected 200 (offset=200)
INFO:arxiv-fetch:✓ Collected 300 (offset=300)
INFO:arxiv-fetch:Reached end of result set.
INFO:arxiv-fetch:Done. Collected 368 results.


Found 368 papers for query: 'cat:cs.* AND (all:"ai safety" OR all:"ai alignment") AND submittedDate:[202501010000 TO 202512312359]'

1. Beyond Text: Multimodal Jailbreaking of Vision-Language and Audio Models through Perceptually Simple Transformations
   Authors: Divyanshu Kumar, Shreyas Jena, Nitin Aravind Birur, Tanay Baswa, Sahil Agarwal, Prashanth Harshangi
   Published: 2025-10-23
   arXiv ID: 2510.20223v1
   URL: http://arxiv.org/pdf/2510.20223v1
   Summary: Multimodal large language models (MLLMs) have achieved remarkable progress, yet remain critically vulnerable to adversarial attacks that exploit weaknesses in cross-modal processing. We present a syst...

2. LLMs can hide text in other text of the same length.ipynb
   Authors: Antonio Norelli, Michael Bronstein
   Published: 2025-10-22
   arXiv ID: 2510.20075v1
   URL: http://arxiv.org/pdf/2510.20075v1
   Summary: A meaningful text can be hidden inside another, completely different yet still coherent and plausible, text of t

In [24]:
# Example 2: tabular view of the collected papers (one row per result dict)
df = pd.DataFrame(data=results)
df


Unnamed: 0,title,authors,published,updated,summary,primary_category,categories,arxiv_id,pdf_url,doi
0,Towards General Modality Translation with Cont...,"Nimrod Berman, Omkar Joglekar, Eitan Kosman, D...",2025-10-23,2025-10-23,Recent advances in generative modeling have po...,cs.CV,"cs.CV, cs.AI, cs.LG",2510.20819v1,http://arxiv.org/pdf/2510.20819v1,
1,Generative Reasoning Recommendation via LLMs,"Minjie Hong, Zetong Zhou, Zirun Guo, Ziang Zha...",2025-10-23,2025-10-23,Despite their remarkable reasoning capabilitie...,cs.IR,cs.IR,2510.20815v1,http://arxiv.org/pdf/2510.20815v1,
2,"Real Deep Research for AI, Robotics and Beyond","Xueyan Zou, Jianglong Ye, Hao Zhang, Xiaoyu Xi...",2025-10-23,2025-10-23,With the rapid growth of research in AI and ro...,cs.AI,"cs.AI, cs.CL, cs.CV, cs.LG",2510.20809v1,http://arxiv.org/pdf/2510.20809v1,
3,Bilevel Analysis of Cost and Emissions Externa...,"Aron Brenner, Rahman Khorramfar, Nathan Engelm...",2025-10-23,2025-10-23,"Data centers are emerging as large, flexible e...",eess.SY,"eess.SY, cs.SY",2510.20805v1,http://arxiv.org/pdf/2510.20805v1,
4,AI-Enabled Digital Twins for Next-Generation N...,"John Sengendo, Fabrizio Granelli",2025-10-23,2025-10-23,As 5G and future 6G mobile networks become inc...,cs.NI,cs.NI,2510.20796v1,http://arxiv.org/pdf/2510.20796v1,
...,...,...,...,...,...,...,...,...,...,...
95,Design Optimization and Global Impact Assessme...,"Zhiyuan Fan, Bolun Xu",2025-10-23,2025-10-23,The dual challenge of decarbonizing the econom...,eess.SY,"eess.SY, cs.SY",2510.20135v1,http://arxiv.org/pdf/2510.20135v1,
96,SAID: Empowering Large Language Models with Se...,"Yulong Chen, Yadong Liu, Jiawen Zhang, Mu Li, ...",2025-10-23,2025-10-23,"Large Language Models (LLMs), despite advances...",cs.CR,"cs.CR, cs.AI",2510.20129v1,http://arxiv.org/pdf/2510.20129v1,
97,"""Learning Together"": AI-Mediated Support for P...","Yao Li, Jingyi Xie, Ya-Fang Ling, He Zhang, Ge...",2025-10-23,2025-10-23,Family learning takes place in everyday routin...,cs.HC,cs.HC,2510.20123v1,http://arxiv.org/pdf/2510.20123v1,
98,"There is No ""apple"" in Timeseries: Rethinking ...","Arian Prabowo, Flora D. Salim",2025-10-23,2025-10-23,Timeseries foundation models (TSFMs) have mult...,cs.LG,cs.LG,2510.20119v1,http://arxiv.org/pdf/2510.20119v1,


In [26]:
# Example 4: Save results to CSV
def save_results_to_csv(results, filename='arxiv_results.csv'):
    """Save search results to a CSV file, creating missing parent directories.

    Args:
        results: Records to write; passed straight to ``pd.DataFrame`` (in this
            notebook, the list of dicts returned by ``search_arxiv_large``).
        filename: Destination handed to ``DataFrame.to_csv``. When it is a
            ``str`` or ``os.PathLike`` path, its parent directory is created
            first so a target such as ``results/out.csv`` does not fail on a
            fresh checkout. Other objects (e.g. open buffers) are passed
            through untouched.

    Returns:
        None. Prints a confirmation line once the file has been written.
    """
    import os  # local import so this cell does not depend on another cell's imports

    if isinstance(filename, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(filename))
        if parent:  # empty for a bare file name → current directory, nothing to create
            os.makedirs(parent, exist_ok=True)

    df = pd.DataFrame(results)
    df.to_csv(filename, index=False)
    print(f"Results saved to {filename}")

# Write the collected papers to disk (comment this line out to skip saving).
save_results_to_csv(results, filename='results/arxiv_search_results.csv')


Results saved to results/arxiv_search_results.csv
