# ACL Anthology Data Ingestion

ACL Anthology has papers from major NLP conferences (ACL, EMNLP, NAACL, etc.).

They provide structured metadata which is great for ground truth.

**Data source:** https://aclanthology.org/

We can either:
1. Use their bulk download (XML/BibTeX)
2. Scrape via their website
3. Use existing datasets (like the one on Hugging Face)

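Option 3 would be the easiest, but this notebook ends up exploring the other two instead: a scraping helper (option 2) and then the bulk BibTeX download (option 1), which is what actually produces the dataset.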

In [None]:
# install dependencies
!pip install pandas requests beautifulsoup4 lxml pyarrow -q

In [None]:
import os
import json
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from google.colab import drive

# Mount Google Drive at /content/drive (needs a Colab runtime).
# NOTE(review): every path used in the cells visible here is relative
# ('data/raw', 'data/processed'), so nothing is read from or written to the
# mounted drive as written - confirm whether outputs should live on Drive.
drive.mount('/content/drive')

In [None]:
# make sure the output folders exist (no-op when they are already there)
for folder in ('data/raw', 'data/processed'):
    os.makedirs(folder, exist_ok=True)

## Approach 1: Download the bulk dump

ACL Anthology provides a bulk dump of all their papers. The cell below points at the gzipped BibTeX version (with abstracts) rather than XML. Let's download and parse it.

In [None]:
# Location of the anthology bulk dump.
# NOTE: despite the variable name, this URL is the gzipped BibTeX dump
# (with abstracts), not an XML file - BibTeX is easier to parse.
# The file is pretty big (original note: ~200MB+).
# Nothing is fetched in this cell; it only announces the intended source.
XML_URL = "https://aclanthology.org/anthology+abstracts.bib.gz"

print(
    "Note: This might take a while depending on connection speed...",
    f"Downloading from {XML_URL}",
    sep="\n",
)

Actually, let me try a different approach first - fetching each venue's event page from the website and extracting the paper listings.

In [None]:
# Venue slugs to target, as they appear in the anthology's event URLs.
# We focus on the major venues from recent years.
VENUES = [
    'acl',    # Association for Computational Linguistics
    'emnlp',  # Empirical Methods in NLP
    'naacl',  # North American Chapter of ACL
    'eacl',   # European Chapter of ACL
    'conll',  # Conference on Computational Natural Language Learning
    'tacl',   # Transactions of ACL
]

# target years (2020 through 2024), kept as strings for URL building
YEARS = [str(y) for y in range(2020, 2025)]

print(f"Targeting {len(VENUES)} venues across {len(YEARS)} years")

In [None]:
def _parse_paper_element(elem, venue_label, year_number):
    """
    Turn one paper element from an event page into a paper dict.

    Returns None when the element has no <strong> title tag.
    """
    title_tag = elem.find('strong')
    if not title_tag:
        return None

    # the paper ID is the link target with the surrounding slashes removed
    link_tag = elem.find('a', class_='align-middle')
    paper_id = link_tag['href'].strip('/') if link_tag else None

    # author names are the link texts inside the author span
    authors_tag = elem.find('span', class_='d-block')
    authors = []
    if authors_tag:
        authors = [a.get_text().strip() for a in authors_tag.find_all('a')]

    return {
        'paper_id': paper_id,
        'title': title_tag.get_text().strip(),
        'authors': authors,
        'venue': venue_label,
        'year': year_number,
        'url': f"https://aclanthology.org/{paper_id}" if paper_id else None
    }


def fetch_acl_papers_by_venue_year(venue, year):
    """
    Fetch papers from ACL anthology for a specific venue and year.

    Downloads the event page https://aclanthology.org/events/{venue}-{year}/
    and extracts the paper listings from its HTML.

    Args:
        venue: venue slug used in the event URL (e.g. 'acl').
        year: year used in the event URL (e.g. '2023'); must be convertible
            to int.

    Returns:
        A list of dicts with the keys 'paper_id', 'title', 'authors',
        'venue' (upper-cased slug), 'year' (int) and 'url'. The list is empty
        when the page does not answer with status 200. If an error occurs,
        it is printed and whatever was collected so far is returned.
    """
    papers = []

    # ACL anthology URL pattern
    # example: https://aclanthology.org/events/acl-2023/
    url = f"https://aclanthology.org/events/{venue}-{year}/"

    print(f"Fetching from {url}...")

    try:
        # converted once up front: a bad year now produces one clear error
        # message instead of silently discarding every paper on the page
        year_number = int(year)
        venue_label = venue.upper()

        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"  Failed to fetch {venue}-{year}: status {response.status_code}")
            return papers

        soup = BeautifulSoup(response.content, 'html.parser')

        # find all paper entries
        paper_elements = soup.find_all('p', class_='d-sm-flex align-items-stretch')

        print(f"  Found {len(paper_elements)} papers")

        skipped = 0
        for elem in paper_elements:
            try:
                paper = _parse_paper_element(elem, venue_label, year_number)
            except Exception as exc:
                # deliberate best-effort: one malformed entry must not abort
                # the page, but it should not vanish without a trace either
                skipped += 1
                if skipped == 1:
                    print(f"  Skipping malformed entry: {exc!r}")
                continue

            if paper is not None:
                papers.append(paper)

        if skipped:
            print(f"  Skipped {skipped} malformed entries")

        time.sleep(0.5)  # be nice to their server

    except Exception as e:
        print(f"  Error fetching {venue}-{year}: {str(e)}")

    return papers

Actually, web scraping might be unreliable. Let me try their BibTeX dump instead - more stable.

In [None]:
# download the full anthology BibTeX file
!wget https://aclanthology.org/anthology.bib.gz -O data/raw/acl_anthology.bib.gz -q
!gunzip -f data/raw/acl_anthology.bib.gz

print("Downloaded and extracted ACL Anthology BibTeX file")

In [None]:
# location of the extracted BibTeX dump
bibtex_file = 'data/raw/acl_anthology.bib'

if not os.path.exists(bibtex_file):
    print("File not found!")
else:
    # report the size in megabytes
    file_size = os.path.getsize(bibtex_file) / 1024 ** 2
    print(f"File size: {file_size:.2f} MB")

    # peek at the first 20 lines (empty lines are printed past end of file)
    with open(bibtex_file, 'r', encoding='utf-8') as f:
        print(*(f.readline().rstrip() for _ in range(20)), sep="\n")

In [None]:
# parse BibTeX entries
def _bibtex_balance(raw):
    """
    Scan raw field text and return (brace_depth, inside_quotes).

    A backslash escapes the character after it, and double quotes only count
    as delimiters outside braces.
    """
    # fast path: without braces or backslashes only the quote count matters
    if '\\' not in raw and '{' not in raw and '}' not in raw:
        return 0, raw.count('"') % 2 == 1

    depth = 0
    inside_quotes = False
    escaped = False
    for ch in raw:
        if escaped:
            escaped = False
        elif ch == '\\':
            escaped = True
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
        elif ch == '"' and depth == 0:
            inside_quotes = not inside_quotes
    return depth, inside_quotes


def _bibtex_value_complete(raw):
    """Return True when the value text has no open brace or open quote."""
    depth, inside_quotes = _bibtex_balance(raw)
    return depth <= 0 and not inside_quotes


def _clean_bibtex_value(raw):
    """Strip the trailing comma and one pair of outer delimiters from a value."""
    value = raw.strip().rstrip(',').rstrip()
    depth, _ = _bibtex_balance(value)
    if depth < 0 and value.endswith('}'):
        # the entry's own closing brace shared the line with its last field
        value = value[:-1].rstrip().rstrip(',').rstrip()
    # remove exactly one matching outer pair so nested braces stay balanced
    if len(value) >= 2 and (value[0], value[-1]) in (('"', '"'), ('{', '}')):
        value = value[1:-1]
    return value


def _split_entry_header(line):
    """Return (entry_type, paper_id) for an '@type{key,' line, or None."""
    entry_type, brace, key = line[1:].partition('{')
    if not brace:
        return None
    return entry_type.strip().lower(), key.rstrip(',').strip()


def _starts_new_entry(line):
    """Return True when the line looks exactly like '@type{key,'."""
    if not (line.startswith('@') and line.endswith(',')):
        return False
    header = _split_entry_header(line)
    if header is None:
        return False
    entry_type, key = header
    return entry_type.isalpha() and bool(key) and not any(c.isspace() for c in key)


def parse_bibtex_file(filepath):
    """
    Parse BibTeX file and extract paper metadata.
    This is a simple parser - there are libraries but let's do it manually for control.

    Field values that span several lines (for example wrapped author lists)
    are joined with single spaces before being stored.

    Args:
        filepath: path of the BibTeX file; read as UTF-8, undecodable bytes
            are ignored.

    Returns:
        A list with one dict per entry that has a 'title' field. Each dict
        holds 'paper_id', 'entry_type' (lower-cased) and every parsed field
        as a string with its outer quotes/braces removed.
    """
    papers = []

    def store(entry):
        # keep only entries with a title; report progress once per 5000 papers
        if entry and 'title' in entry:
            papers.append(entry)
            if len(papers) % 5000 == 0:
                print(f"Parsed {len(papers)} papers...")

    current_entry = {}
    in_entry = False
    pending_name = None    # field whose value is still open across lines
    pending_chunks = []

    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip()

            # continuation of a multi-line field value
            if pending_name is not None:
                if _starts_new_entry(line):
                    # the value was never closed: keep what we have and
                    # recover at the next entry instead of swallowing it
                    partial = _clean_bibtex_value(' '.join(pending_chunks))
                    current_entry[pending_name] = partial.lstrip('"{')
                    pending_name = None
                else:
                    if line:
                        pending_chunks.append(line)
                    joined = ' '.join(pending_chunks)
                    if _bibtex_value_complete(joined):
                        current_entry[pending_name] = _clean_bibtex_value(joined)
                        pending_name = None
                    continue

            # start of an entry
            if line.startswith('@'):
                # an entry that never saw its closing brace is stored here
                store(current_entry)
                # always reset, so a malformed header cannot leave the
                # previous entry live and get it stored a second time
                current_entry = {}
                in_entry = False

                header = _split_entry_header(line)
                if header is not None:
                    entry_type, paper_id = header
                    current_entry = {
                        'paper_id': paper_id,
                        'entry_type': entry_type
                    }
                    in_entry = True

            # end of entry
            elif line.startswith('}'):
                store(current_entry)
                current_entry = {}
                in_entry = False

            # field within entry
            elif in_entry and '=' in line:
                field_name, _, raw_value = line.partition('=')
                field_name = field_name.strip()
                raw_value = raw_value.strip()

                if _bibtex_value_complete(raw_value):
                    current_entry[field_name] = _clean_bibtex_value(raw_value)
                else:
                    pending_name = field_name
                    pending_chunks = [raw_value]

    # flush a value and an entry that were still open at end of file
    if pending_name is not None:
        partial = _clean_bibtex_value(' '.join(pending_chunks))
        current_entry[pending_name] = partial.lstrip('"{')
    store(current_entry)

    return papers

In [None]:
# Run the parser over the extracted dump; `acl_papers` (a list of raw entry
# dicts) is what the following cells inspect and normalize.
print("Parsing BibTeX file... this will take a few minutes")
acl_papers = parse_bibtex_file(bibtex_file)
print(f"\nTotal papers parsed: {len(acl_papers)}")

In [None]:
# list the fields of the first parsed entry, cutting long values at 100 chars
if acl_papers:
    sample_paper = acl_papers[0]
    print("Sample paper fields:")
    for key, value in sample_paper.items():
        if len(str(value)) > 100:
            print(f"  {key}: {str(value)[:100]}...")
        else:
            print(f"  {key}: {value}")

## Clean and normalize the data

In [None]:
def normalize_acl_paper(paper):
    """
    Convert raw BibTeX entry to our standard schema.

    Args:
        paper: dict of raw BibTeX fields (string values).

    Returns:
        A dict with the keys 'paper_id', 'title', 'authors' (list of names),
        'abstract', 'venue', 'year' (int, or None when missing or not a
        number), 'url', 'doi' and 'pages'. Missing text fields become ''.
        The venue is 'booktitle', else 'journal', else 'ACL Anthology'.
    """
    # BibTeX separates names with the word "and". Collapse whitespace first so
    # separators that wrap across lines ("and\n   Next, Name") are recognised,
    # and drop empty pieces so a blank author field yields no authors.
    author_field = ' '.join((paper.get('author') or '').split())
    authors = [name.strip() for name in author_field.split(' and ') if name.strip()]

    # extract venue info
    venue = paper.get('booktitle', paper.get('journal', 'ACL Anthology'))

    # extract year; anything missing or non-numeric is recorded as None
    try:
        year = int(paper['year'])
    except (KeyError, TypeError, ValueError):
        year = None

    normalized = {
        'paper_id': paper.get('paper_id', ''),
        'title': paper.get('title', ''),
        'authors': authors,
        'abstract': paper.get('abstract', ''),  # might not be present
        'venue': venue,
        'year': year,
        'url': paper.get('url', ''),
        'doi': paper.get('doi', ''),
        'pages': paper.get('pages', ''),
    }

    return normalized

In [None]:
# run every parsed entry through the normalizer
print("Normalizing paper data...")
normalized_papers = list(map(normalize_acl_paper, acl_papers))
print(f"Normalized {len(normalized_papers)} papers")

In [None]:
# build a DataFrame from the normalized records and show its outline
df = pd.DataFrame(normalized_papers)

print(
    "DataFrame info:",
    f"Shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
df.head()

In [None]:
# filter papers - keep titled entries from 2015 onwards and count how many
# of them carry an abstract
# NOTE: no venue filter is applied in this cell

print(f"Papers before filtering: {len(df)}")

# remove entries without titles
df = df[df['title'].str.len() > 0]
print(f"After removing empty titles: {len(df)}")

# filter by year; rows with a missing year compare as False and drop out here
df_recent = df[df['year'] >= 2015].copy()  # last ~10 years
# missing years turn the whole column into floats (2020.0); none are left
# after the filter, so store plain integers again
df_recent['year'] = df_recent['year'].astype(int)
print(f"Papers from 2015 onwards: {len(df_recent)}")

# check abstract availability
with_abstract = df_recent[df_recent['abstract'].str.len() > 0]
print(f"Papers with abstracts: {len(with_abstract)}")

In [None]:
# quick distribution overview: papers per year and the ten most common venues
print("Papers per year:", df_recent['year'].value_counts().sort_index(), sep="\n")

print("\nTop venues:", df_recent['venue'].value_counts().head(10), sep="\n")

## Save the data

In [None]:
# save full dataset
json_path = 'data/raw/acl_anthology_papers.json'
parquet_path = 'data/raw/acl_anthology_papers.parquet'

# drop repeated paper IDs before writing, so the files on disk do not keep
# duplicates (the duplicate check further down only runs after this cell)
rows_before = len(df_recent)
df_recent = df_recent.drop_duplicates(subset=['paper_id'], keep='first')
if len(df_recent) < rows_before:
    print(f"Dropped {rows_before - len(df_recent)} duplicate paper IDs before saving")

# save to JSON
df_recent.to_json(json_path, orient='records', indent=2, force_ascii=False)
print(f"Saved {len(df_recent)} papers to {json_path}")

# save to parquet
df_recent.to_parquet(parquet_path, index=False)
print(f"Saved to {parquet_path}")

# file sizes
json_size = os.path.getsize(json_path) / (1024 * 1024)
parquet_size = os.path.getsize(parquet_path) / (1024 * 1024)
print(f"\nJSON size: {json_size:.2f} MB")
print(f"Parquet size: {parquet_size:.2f} MB")

## Quality checks

In [None]:
# null counts per column
print("Missing values:")
print(df_recent.isna().sum())

# rows whose paper_id occurs more than once (every occurrence is counted)
print("\nDuplicate paper IDs:")
duplicates = df_recent.loc[df_recent.duplicated(subset=['paper_id'], keep=False)]
print(f"Found {len(duplicates)} duplicates")

if not duplicates.empty:
    # keep the first occurrence of each paper_id
    print("Removing duplicates...")
    df_recent = df_recent.drop_duplicates(subset=['paper_id'], keep='first')
    print(f"New shape: {df_recent.shape}")

In [None]:
# show some sample papers
print("Sample ACL papers:\n")
# reset the index so the numbering runs 1..5; after filtering, the original
# index labels are arbitrary row numbers from the unfiltered frame
for idx, row in df_recent.head(5).reset_index(drop=True).iterrows():
    print(f"{idx+1}. {row['title']}")
    print(f"   Venue: {row['venue']} ({row['year']})")
    # show at most two authors, with an ellipsis when there are more
    print(f"   Authors: {', '.join(row['authors'][:2])}{'...' if len(row['authors']) > 2 else ''}")
    if row['abstract']:
        # only mark the abstract as cut off when it really was truncated
        print(f"   Abstract: {row['abstract'][:120]}{'...' if len(row['abstract']) > 120 else ''}")
    print()

---
## Summary

✅ Downloaded ACL Anthology BibTeX dump  
✅ Parsed ~70K+ conference papers  
✅ Filtered to recent papers (2015+)  
✅ Normalized to standard schema  
✅ Saved to JSON and Parquet  

**Note:** Not all papers have abstracts in the BibTeX dump. We might need to scrape abstracts separately if needed for the project.

**Next:** S2ORC ingestion