In [28]:
import os
import requests
from dotenv import load_dotenv

# Load secrets from the repository-level .env file so the key is never hardcoded.
load_dotenv("../.env")

# Read the key once, then fail fast with a clear message if it is missing.
api_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
assert api_key, "SEMANTIC_SCHOLAR_API_KEY not found in .env"

# Base URL of the Semantic Scholar Academic Graph API. Every request cell below
# builds its endpoint from this constant, so it is defined here to keep the
# notebook runnable on a fresh kernel (Restart & Run All).
BASE_URL = "https://api.semanticscholar.org/graph/v1"

In [29]:
# Search Semantic Scholar for a single best match on the paper title.
title_to_search = "The Berkeley FrameNet Project"
params = {
    "query": title_to_search,
    "fields": "title,openAccessPdf,url,externalIds,publicationDate",
    "limit": 1,
}
headers = {'x-api-key': api_key}
# requests has no default timeout, so set one to avoid hanging the kernel.
r = requests.get(f"{BASE_URL}/paper/search", params=params, headers=headers, timeout=30)
# Surface HTTP errors (bad key, rate limit, ...) here rather than as a confusing
# KeyError when the body is parsed in a later cell.
r.raise_for_status()

In [30]:
from pprint import pprint
# Decode the JSON body of the search response `r` and pretty-print it for inspection.
# `obj` keeps the raw decoded response; later cells read the hits from obj['data'].
obj = r.json()
pprint(obj)

{'data': [{'externalIds': {'ACL': 'P98-1013',
                           'CorpusId': 2505531,
                           'DBLP': 'conf/acl/BakerFL98',
                           'DOI': '10.3115/980845.980860',
                           'MAG': '2115792525'},
           'openAccessPdf': {'license': None,
                             'status': 'BRONZE',
                             'url': 'http://dl.acm.org/ft_gateway.cfm?id=980860&type=pdf'},
           'paperId': '547f23597f9ec8a93f66cedaa6fbfb73960426b1',
           'publicationDate': '1998-08-10',
           'title': 'The Berkeley FrameNet Project',
           'url': 'https://www.semanticscholar.org/paper/547f23597f9ec8a93f66cedaa6fbfb73960426b1'}],
 'next': 1,
 'offset': 0,
 'total': 80}


In [31]:
# `data` is the list of search hits from the decoded response.
data = obj['data']
# Fail with a readable message instead of a bare IndexError when the search
# returned no hits.
assert data, f"No Semantic Scholar results for {title_to_search!r}"
print("Data:")
pprint(data)

# With limit=1 the first (and only) hit is the record we work with.
record = data[0]
print("Record:")
pprint(record)

Data:
[{'externalIds': {'ACL': 'P98-1013',
                  'CorpusId': 2505531,
                  'DBLP': 'conf/acl/BakerFL98',
                  'DOI': '10.3115/980845.980860',
                  'MAG': '2115792525'},
  'openAccessPdf': {'license': None,
                    'status': 'BRONZE',
                    'url': 'http://dl.acm.org/ft_gateway.cfm?id=980860&type=pdf'},
  'paperId': '547f23597f9ec8a93f66cedaa6fbfb73960426b1',
  'publicationDate': '1998-08-10',
  'title': 'The Berkeley FrameNet Project',
  'url': 'https://www.semanticscholar.org/paper/547f23597f9ec8a93f66cedaa6fbfb73960426b1'}]
Record:
{'externalIds': {'ACL': 'P98-1013',
                 'CorpusId': 2505531,
                 'DBLP': 'conf/acl/BakerFL98',
                 'DOI': '10.3115/980845.980860',
                 'MAG': '2115792525'},
 'openAccessPdf': {'license': None,
                   'status': 'BRONZE',
                   'url': 'http://dl.acm.org/ft_gateway.cfm?id=980860&type=pdf'},
 'paperId': '547f2

In [None]:
# NOTE(review): disabled experiment. It fetches the openAccessPdf URL through a
# requests.Session that sends browser-like headers. If re-enabled, note that the
# earlier cell binds `data = obj['data']` (a list), so `data['data'][0]` below
# would need to become `data[0]` (or `obj['data'][0]`).
# pdf_url = data['data'][0]['openAccessPdf']['url']
# print(f"PDF URL: {pdf_url}")

# # Mock browser headers
# session = requests.Session()
# session.headers.update({
#     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
#     "Accept": "application/pdf,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
#     "Accept-Language": "en-US,en;q=0.9",
#     "Accept-Encoding": "gzip, deflate, br",
#     "DNT": "1",
#     "Connection": "keep-alive",
#     "Upgrade-Insecure-Requests": "1"
# })

# pdf_response = session.get(pdf_url, allow_redirects=True, timeout=30)
# print(f"Final URL: {pdf_response.url}")
# print(f"Status: {pdf_response.status_code}")

PDF URL: http://dl.acm.org/ft_gateway.cfm?id=980860&type=pdf


In [13]:
# Get detailed paper info to find alternative PDF sources.
# Read the hit from `obj` (the raw search response): `data` is bound to the list
# obj['data'] by the extraction cell, so data['data'][0] raises TypeError when
# the notebook is run top to bottom.
paper_id = obj['data'][0]['paperId']
print(f"Paper ID: {paper_id}")

detail_url = f"{BASE_URL}/paper/{paper_id}"
params = {
    "fields": "title,openAccessPdf,externalIds,url,publicationVenue,isOpenAccess"
}
headers = {'x-api-key': api_key}
# Bound the wait and surface HTTP errors before parsing the body.
r_detail = requests.get(detail_url, params=params, headers=headers, timeout=30)
r_detail.raise_for_status()
paper_details = r_detail.json()

print("\nPaper details:")
pprint(paper_details)

Paper ID: 547f23597f9ec8a93f66cedaa6fbfb73960426b1

Paper details:
{'externalIds': {'ACL': 'P98-1013',
                 'CorpusId': 2505531,
                 'DBLP': 'conf/acl/BakerFL98',
                 'DOI': '10.3115/980845.980860',
                 'MAG': '2115792525'},
 'isOpenAccess': True,
 'openAccessPdf': {'license': None,
                   'status': 'BRONZE',
                   'url': 'http://dl.acm.org/ft_gateway.cfm?id=980860&type=pdf'},
 'paperId': '547f23597f9ec8a93f66cedaa6fbfb73960426b1',
 'publicationVenue': {'alternate_names': ['Annu Meet Assoc Comput Linguistics',
                                          'Meeting of the Association for '
                                          'Computational Linguistics',
                                          'ACL',
                                          'Meet Assoc Comput Linguistics'],
                      'id': '1e33b3be-b2ab-46e9-96e8-d4eb4bad6e44',
                      'name': 'Annual Meeting of the Association for

In [23]:
# ACL papers are available on ACL Anthology!
# Derive the ACL ID from the paper details so this cell does not depend on a
# variable left over from an earlier kernel session.
acl_id = (paper_details.get('externalIds') or {}).get('ACL')
if not acl_id:
    raise ValueError("Paper has no ACL ID in externalIds; cannot build an ACL Anthology URL")

# ACL Anthology PDF URL pattern
acl_pdf_url = f"https://aclanthology.org/{acl_id}.pdf"
print(f"ACL Anthology PDF URL: {acl_pdf_url}")

# Try downloading directly with requests (no Cloudflare on ACL Anthology!)
print("\nAttempting direct download from ACL Anthology...")
response = requests.get(acl_pdf_url, timeout=30)
print(f"Status: {response.status_code}")
print(f"Content-Type: {response.headers.get('Content-Type')}")

# Accept the body only when the server reports success AND a PDF content type,
# so an HTML error page is never saved under a .pdf name.
if response.status_code == 200 and 'pdf' in response.headers.get('Content-Type', '').lower():
    pdf_data = response.content
    print(f"✓ PDF downloaded successfully! Size: {len(pdf_data)} bytes")

    # Save to file
    pdf_filename = f"{acl_id}.pdf"
    with open(pdf_filename, 'wb') as f:
        f.write(pdf_data)
    print(f"✓ Saved to: {pdf_filename}")
else:
    print(f"✗ Failed with status {response.status_code}")
    if response.status_code == 404:
        print("ACL Anthology URL not found. Trying alternative sources...")
        # Try the DOI resolver (only the candidate URL is reported here; no
        # request is made to it in this cell).
        doi = paper_details['externalIds'].get('DOI')
        if doi:
            doi_url = f"https://doi.org/{doi}"
            print(f"\nTrying DOI: {doi_url}")
    else:
        print(f"Response preview: {response.text[:500]}")

ACL Anthology PDF URL: https://aclanthology.org/P98-1013.pdf

Attempting direct download from ACL Anthology...
Status: 200
Content-Type: application/pdf
✓ PDF downloaded successfully! Size: 459289 bytes
✓ Saved to: P98-1013.pdf


In [17]:
import pymupdf  # PyMuPDF

# Open the PDF saved by the download cell. The context manager closes the
# document even if text extraction raises.
pdf_path = f"{acl_id}.pdf"
with pymupdf.open(pdf_path) as doc:
    # Get page count while the document is still open
    num_pages = len(doc)
    # Extract text from all pages; join once instead of growing a string per page
    full_text = "".join(page.get_text() for page in doc)

print(f"✓ Extracted text from {num_pages} pages")
print(f"Total characters: {len(full_text)}")
print(f"\nFirst 500 characters:\n{full_text[:500]}")

# Optionally save to text file
text_filename = f"{acl_id}.txt"
with open(text_filename, 'w', encoding='utf-8') as f:
    f.write(full_text)
print(f"\n✓ Saved text to: {text_filename}")

✓ Extracted text from 5 pages
Total characters: 19628

First 500 characters:
The Berkeley FrameNet Project 
Collin F. Baker and Charles J. Fillmore and John B. Lowe 
{collinb, fillmore, jblowe}@icsi.berkeley.edu 
International Computer Science Institute 
1947 Center St. Suite 600 
Berkeley, Calif., 94704 
Abstract 
FrameNet 
is 
a 
three-year 
NSF-supported 
project in corpus-based computational lexicog- 
raphy, now in its second year (NSF IRI-9618838, 
"Tools for Lexicon Building"). The project's 
key features are (a) a commitment to corpus 
evidence for semantic and sy

✓ Saved text to: P98-1013.txt


## Pipeline

1. Get the paper title from the acl200 dataset.
2. Search Semantic Scholar for that title to get the paper's ACL ID.
3. Use the ACL ID to download the PDF from the ACL Anthology.
4. Convert the PDF to text.

In [None]:
# Example from RefSeer

title_to_search = "Image Denoising Using Scale Mixtures of Gaussians in the Wavelet Domain"
params = {"query": title_to_search, "fields": "title,openAccessPdf,url", "limit": 1}
headers = {"x-api-key": api_key}
# Bound the wait and surface HTTP errors before parsing the body.
r = requests.get(f"{BASE_URL}/paper/search", params=params, headers=headers, timeout=30)
r.raise_for_status()
# Note: here `data` holds the full decoded response (a dict with a 'data' key),
# not the list of hits.
data = r.json()
pprint(data)

{'data': [{'openAccessPdf': {'license': None,
                             'status': 'GREEN',
                             'url': 'http://www.cns.nyu.edu/pub/lcv/portilla03-reprint.pdf'},
           'paperId': '85791491919e1f740f0e882366046acbe56fb14c',
           'title': 'Image denoising using scale mixtures of Gaussians in the '
                    'wavelet domain',
           'url': 'https://www.semanticscholar.org/paper/85791491919e1f740f0e882366046acbe56fb14c'}],
 'next': 1,
 'offset': 0,
 'total': 53}
