# S2ORC (Semantic Scholar Open Research Corpus) Ingestion

S2ORC is MASSIVE - it has millions of papers with full text and citation graphs.

**Important:** We can't download the entire corpus (it's 100+ GB). We'll work with a subset.

Options:
1. Download specific slices (recommended when full text is needed)
2. Use Semantic Scholar API for targeted queries
3. Download sample datasets

Let's go with option 2 (API) since it's most practical for our use case.

In [None]:
!pip install pandas requests pyarrow -q

In [None]:
import os
import json
import pandas as pd
import requests
import time
from google.colab import drive

# Mount Google Drive at /content/drive.
# NOTE(review): every path used later in this notebook is relative ('data/raw/...'),
# so outputs end up on Drive only if the working directory is inside the mount - confirm.
drive.mount('/content/drive')

In [None]:
# Make sure both output folders exist; an already-existing folder is not an error.
for folder in ('data/raw', 'data/processed'):
    os.makedirs(folder, exist_ok=True)

## Using Semantic Scholar API

Semantic Scholar offers a public API that is free to use (subject to rate limits).

Docs: https://api.semanticscholar.org/api-docs/

In [None]:
# Base URL of the Semantic Scholar Graph API; the search helper builds its URL from it.
S2_API_BASE = "https://api.semanticscholar.org/graph/v1"

# Smoke-test the search endpoint with one small query before fetching in bulk.
test_url = f"{S2_API_BASE}/paper/search?query=natural+language+processing&limit=5"

try:
    # Without a timeout, requests waits indefinitely if the service never answers.
    response = requests.get(test_url, timeout=10)
except requests.RequestException as e:
    # Connection problems, DNS failures and timeouts end up here instead of
    # aborting the cell with a traceback.
    print(f"Request failed: {e}")
else:
    if response.status_code == 200:
        data = response.json()
        print(f"API works! Found {data.get('total', 0)} papers for test query")
        print("\nSample result:")
        # `data` may be missing, null or empty; only print a sample when there is one.
        if data.get('data'):
            print(json.dumps(data['data'][0], indent=2))
    else:
        print(f"API error: {response.status_code}")

In [None]:
def search_semantic_scholar(query, limit=100, fields=None, offset=0, max_retries=3):
    """
    Search Semantic Scholar for papers matching a query.

    This is a best-effort helper: on any failure it prints a message and
    returns an empty list instead of raising.

    Args:
        query: search string
        limit: max results for this request; values above 100 are capped at
            100, and a value <= 0 returns [] without calling the API
        fields: list of fields to return (e.g., ['title', 'abstract', 'authors']);
            defaults to the metadata fields used by this notebook
        offset: index of the first result to return, for paging through
            results in steps of `limit` (default 0 = first page)
        max_retries: how many times to retry after an HTTP 429 (rate limit)
            response, waiting 1s, 2s, 4s, ... between attempts

    Returns:
        list of paper dictionaries (empty on error or when nothing matched)
    """
    if limit <= 0:
        return []

    if fields is None:
        fields = ['paperId', 'title', 'abstract', 'authors', 'year',
                  'venue', 'citationCount', 'referenceCount', 'fieldsOfStudy',
                  'publicationDate', 'journal', 'externalIds']

    url = f"{S2_API_BASE}/paper/search"
    params = {
        'query': query,
        'offset': offset,
        'limit': min(limit, 100),
        'fields': ','.join(fields),
    }

    for attempt in range(max_retries + 1):
        try:
            response = requests.get(url, params=params, timeout=10)
        except requests.RequestException as e:
            # Network-level failure (connection error, timeout, ...).
            print(f"Request failed: {e}")
            return []

        # Rate limited: back off and try again rather than silently
        # returning no papers for this query.
        if response.status_code == 429 and attempt < max_retries:
            wait_seconds = 2 ** attempt
            print(f"Rate limited (429); retrying in {wait_seconds}s...")
            time.sleep(wait_seconds)
            continue

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            return []

        try:
            data = response.json()
        except ValueError as e:
            # Body was not valid JSON.
            print(f"Request failed: {e}")
            return []

        # `data` can be absent or null when there are no matches.
        return data.get('data') or []

    return []

In [None]:
# Search queries covering a range of NLP-related topics.
# Each entry is sent as one separate search request; using several topics
# gives a more diverse set of papers than a single query would.
# Order matters downstream: results are collected in this order.

SEARCH_QUERIES = [
    'natural language processing',
    'transformers bert gpt',
    'machine translation',
    'sentiment analysis',
    'named entity recognition',
    'question answering',
    'text summarization',
    'language models',
    'information extraction',
    'semantic parsing',
    'dialogue systems',
    'text generation',
]

print(f"Will search for {len(SEARCH_QUERIES)} different topics")

In [None]:
# Run one search per topic and pool the results.
# The API is rate limited, so pause for a second after every request.

papers_per_query = 100  # max allowed per request
all_papers = []

print("Fetching papers from Semantic Scholar...\n")

total_queries = len(SEARCH_QUERIES)
for position, query in enumerate(SEARCH_QUERIES, start=1):
    # Progress line stays open (end=' ') so the result count lands on the same line.
    print(f"{position}/{total_queries}: Searching for '{query}'...", end=' ')

    batch = search_semantic_scholar(query, limit=papers_per_query)
    all_papers += batch

    print(f"got {len(batch)} papers")

    # be nice to their API
    time.sleep(1)

print(f"\nTotal papers fetched: {len(all_papers)}")

In [None]:
# The same paper can be returned by several searches; keep only the first
# occurrence of each paperId (records without a paperId are dropped).
first_seen = {}
for paper in all_papers:
    paper_id = paper.get('paperId')
    if paper_id:
        # setdefault leaves an existing entry untouched, so the first hit wins;
        # dict insertion order preserves the original ordering.
        first_seen.setdefault(paper_id, paper)

unique_ids = set(first_seen)
unique_papers = list(first_seen.values())

print(f"Unique papers: {len(unique_papers)}")
print(f"Duplicates removed: {len(all_papers) - len(unique_papers)}")

In [None]:
# Pretty-print the first de-duplicated record to inspect the raw API structure.
if unique_papers:
    sample = unique_papers[0]
    print("Sample paper structure:", json.dumps(sample, indent=2), sep="\n")

## Normalize to our standard schema

In [None]:
def normalize_s2orc_paper(paper):
    """
    Convert one S2 API paper record to our standard schema.

    A field may be missing from the record or present with a null value;
    both cases are mapped to the same default ('' for text, 0 for counts,
    [] / {} for collections). `year` is passed through unchanged, so it is
    None when unknown.

    Args:
        paper: dict for a single paper as returned by the search endpoint

    Returns:
        dict with keys paper_id, title, authors, abstract, venue, year,
        publication_date, citation_count, reference_count,
        fields_of_study, external_ids
    """
    # `dict.get(key, default)` only covers a missing key; `or default` also
    # covers a key that is present with a null (None) value.

    # Author names only; entries without a usable name are skipped.
    authors = [
        a['name'] for a in (paper.get('authors') or [])
        if a and a.get('name')
    ]

    # Prefer the venue string; fall back to the journal name when it is empty.
    venue = paper.get('venue') or ''
    if not venue:
        journal = paper.get('journal') or {}
        venue = journal.get('name') or ''

    return {
        'paper_id': paper.get('paperId') or '',
        'title': paper.get('title') or '',
        'authors': authors,
        'abstract': paper.get('abstract') or '',
        'venue': venue,
        'year': paper.get('year'),
        'publication_date': paper.get('publicationDate') or '',
        'citation_count': paper.get('citationCount') or 0,
        'reference_count': paper.get('referenceCount') or 0,
        'fields_of_study': paper.get('fieldsOfStudy') or [],
        'external_ids': paper.get('externalIds') or {},
    }

In [None]:
# Map every de-duplicated record onto the standard schema.
normalized_papers = list(map(normalize_s2orc_paper, unique_papers))

# Keep only records that have both a non-empty title and a non-empty abstract.
filtered_papers = [
    record for record in normalized_papers
    if all((record['title'], record['abstract']))
]

kept_count = len(filtered_papers)
print(f"Papers after filtering: {kept_count}")
print(f"Removed {len(normalized_papers) - kept_count} papers without title/abstract")

In [None]:
# Build a DataFrame with one row per filtered paper and show its layout.
df = pd.DataFrame(filtered_papers)

print("DataFrame info:", f"Shape: {df.shape}", sep="\n")
print(f"\nColumns: {df.columns.tolist()}")
print("\nData types:", df.dtypes, sep="\n")

In [None]:
# Descriptive statistics: publication years, citation counts, venues.
print("Papers per year:")
year_counts = df['year'].value_counts().sort_index()
print(year_counts.tail(10))  # the ten latest years present in the data

print("\nCitation statistics:")
citation_summary = df['citation_count'].describe()
print(citation_summary)

print("\nTop venues:")
# Ignore rows whose venue is the empty string before counting.
has_venue = df['venue'] != ''
print(df.loc[has_venue, 'venue'].value_counts().head(10))

In [None]:
# Count how often each field of study occurs across all papers.
from collections import Counter

# Flatten the per-paper lists into one list, skipping empty entries.
all_fields = [
    field
    for fields in df['fields_of_study']
    if fields
    for field in fields
]

field_counts = Counter(all_fields)
print("Top fields of study:")
for name, occurrences in field_counts.most_common(15):
    print(f"  {name}: {occurrences}")

## Save the data

In [None]:
# Write the DataFrame to disk in two formats: JSON records and Parquet.
json_path = 'data/raw/s2orc_papers.json'
parquet_path = 'data/raw/s2orc_papers.parquet'

df.to_json(json_path, orient='records', indent=2, force_ascii=False)
print(f"Saved {len(df)} papers to {json_path}")

df.to_parquet(parquet_path, index=False)
print(f"Saved to {parquet_path}")

# Report both file sizes in MB (1 MB = 1024 * 1024 bytes here) and
# how many times larger the JSON file is than the Parquet file.
bytes_per_mb = 1024 * 1024
json_size = os.path.getsize(json_path) / bytes_per_mb
parquet_size = os.path.getsize(parquet_path) / bytes_per_mb
print(f"\nJSON size: {json_size:.2f} MB")
print(f"Parquet size: {parquet_size:.2f} MB")
print(f"Compression: {json_size/parquet_size:.2f}x")

## Quality checks

In [None]:
# Per-column count of null values.
print("Missing values:")
print(df.isna().sum())

# Character length of every abstract, stored as a new column.
print("\nAbstract length stats:")
df['abstract_length'] = df['abstract'].str.len()
print(df['abstract_length'].describe())

# Flag abstracts shorter than 100 characters.
is_short = df['abstract_length'] < 100
short_abstracts = df.loc[is_short]
print(f"\nPapers with abstracts < 100 chars: {len(short_abstracts)}")

In [None]:
# Print a short, human-readable summary of the first five papers.
print("Sample S2ORC papers:\n")
for idx, row in df.head(5).iterrows():
    # Show at most two authors; the suffix signals that more were omitted.
    more_authors = '...' if len(row['authors']) > 2 else ''
    summary_lines = [
        f"{idx+1}. {row['title']}",
        f"   Year: {row['year']} | Venue: {row['venue']}",
        f"   Citations: {row['citation_count']} | References: {row['reference_count']}",
        f"   Authors: {', '.join(row['authors'][:2])}{more_authors}",
        f"   Fields: {', '.join(row['fields_of_study'][:3])}",
        f"   Abstract: {row['abstract'][:150]}...",
    ]
    print("\n".join(summary_lines))
    print()

## Optional: Fetch additional batches

If we need more papers, we can use pagination or different queries.

In [None]:
# example: fetch highly cited papers
def get_highly_cited_papers(min_citations=100, limit=100, query='natural language processing'):
    """
    Fetch papers for a query and keep only the highly cited ones.

    The filter is applied client-side to a single search request, so at most
    `limit` papers are inspected and the result can be much smaller.

    NOTE(review): the original note here said the S2 API does not support
    filtering by citation count; newer API versions may offer a
    `minCitationCount` search parameter - confirm against the API docs.

    Args:
        min_citations: minimum citation count a paper needs to be kept
        limit: number of search results to fetch before filtering
        query: search string (defaults to the NLP query used by this notebook)

    Returns:
        list of raw paper dictionaries with citationCount >= min_citations
    """
    papers = search_semantic_scholar(query, limit=limit)

    # citationCount can be present but null; treat that as 0 instead of
    # letting the comparison raise TypeError.
    return [
        p for p in papers
        if (p.get('citationCount') or 0) >= min_citations
    ]

# uncomment to fetch
# influential_papers = get_highly_cited_papers(min_citations=50)
# print(f"Found {len(influential_papers)} highly cited papers")

---
## Summary

✅ Used Semantic Scholar API to fetch papers  
✅ Searched multiple NLP-related topics  
✅ Collected up to ~1,200 papers with metadata (12 queries × 100 results, before de-duplication and filtering)  
✅ Included citation counts and field classifications  
✅ Normalized to standard schema  
✅ Saved to JSON and Parquet  

**Note:** We used the API instead of a bulk download due to size constraints. For production, consider:
- Downloading specific S2ORC slices if full-text needed
- Using S2 bulk metadata (much smaller than full corpus)
- Implementing pagination for more comprehensive coverage

**Next:** Combine all three datasets into unified format