Data Ingestion, Preprocessing & Embeddings

This notebook covers:
1. Data ingestion from ArXiv, ACL, S2ORC
2. Preprocessing (cleaning, tokenization, etc)
3. Embedding generation

In [1]:
# install dependencies first
!pip install pandas arxiv requests beautifulsoup4 lxml pyarrow -q
!pip install spacy nltk tqdm langdetect -q
!pip install gensim sentence-transformers torch scikit-learn -q
!python -m spacy download en_core_web_sm

  Preparing metadata (setup.py) ... [?25l[?25hdone
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m153.4 MB/s[0m eta

In [2]:
import os
import re
import json
import time
import pickle
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from datetime import datetime
from collections import Counter
from tqdm import tqdm

import arxiv
import requests
from bs4 import BeautifulSoup

import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from langdetect import detect, LangDetectException

from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
import torch

from google.colab import drive

print("imports done")

imports done


In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# setup directories
os.makedirs('data/raw', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)
os.makedirs('data/embeddings', exist_ok=True)

# nltk stuff - need all these for proper tokenization
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # needed for sentence tokenization in newer nltk

nlp = spacy.load('en_core_web_sm')
stop_words = set(stopwords.words('english'))

print("setup complete")

setup complete


---
## Part 1: Data Ingestion

### ArXiv

In [5]:
# arxiv ingestion with deduplication
# papers can appear in multiple categories, so we track seen IDs

def fetch_arxiv(categories, max_per_cat=5000):
    client = arxiv.Client()
    papers = []
    seen_ids = set()  # deduplicate across categories

    for cat in categories:
        print(f"fetching {cat}...", end=' ')

        search = arxiv.Search(
            query=f'cat:{cat}',
            max_results=max_per_cat,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        cnt = 0
        skipped = 0
        for r in client.results(search):
            pid = r.entry_id.split('/')[-1]
            if pid in seen_ids:
                skipped += 1
                continue
            seen_ids.add(pid)

            papers.append({
                'paper_id': pid,
                'title': r.title,
                'authors': [a.name for a in r.authors],
                'abstract': r.summary.replace('\n', ' '),
                'categories': r.categories,
                'venue': 'arXiv',
                'year': r.published.year,
                'published': r.published.isoformat(),
                'pdf_url': r.pdf_url
            })
            cnt += 1
        print(f"got {cnt} new, {skipped} duplicates skipped")

    return papers

In [6]:
# fetch papers - using NLP and ML related categories
# 5000 per category for comprehensive coverage
cats = ['cs.CL', 'cs.LG', 'stat.ML', 'cs.AI', 'cs.IR']
arxiv_papers = fetch_arxiv(cats, max_per_cat=5000)
print(f"\ntotal: {len(arxiv_papers)}")

fetching cs.CL... got 5000 new, 0 duplicates skipped
fetching cs.LG... got 4533 new, 467 duplicates skipped
fetching stat.ML... got 4365 new, 635 duplicates skipped
fetching cs.AI... got 2435 new, 2565 duplicates skipped
fetching cs.IR... got 4650 new, 350 duplicates skipped

total: 20983


In [7]:
# quick check
arxiv_df = pd.DataFrame(arxiv_papers)
arxiv_df.head(2)

Unnamed: 0,paper_id,title,authors,abstract,categories,venue,year,published,pdf_url
0,2512.02010v1,Four Over Six: More Accurate NVFP4 Quantizatio...,"[Jack Cook, Junxian Guo, Guangxuan Xiao, Yujun...","As large language models have grown larger, lo...","[cs.CL, cs.LG]",arXiv,2025,2025-12-01T18:59:45+00:00,https://arxiv.org/pdf/2512.02010v1
1,2512.02008v1,The Art of Scaling Test-Time Compute for Large...,"[Aradhye Agarwal, Ayan Sengupta, Tanmoy Chakra...",Test-time scaling (TTS) -- the dynamic allocat...,[cs.CL],arXiv,2025,2025-12-01T18:59:28+00:00,https://arxiv.org/pdf/2512.02008v1


In [8]:
# check for any issues
print("shape:", arxiv_df.shape)
print("\nmissing values:")
print(arxiv_df.isnull().sum())
print("\nyear distribution:")
print(arxiv_df['year'].value_counts().head())

shape: (20983, 9)

missing values:
paper_id      0
title         0
authors       0
abstract      0
categories    0
venue         0
year          0
published     0
pdf_url       0
dtype: int64

year distribution:
year
2025    19811
2024     1172
Name: count, dtype: int64


### ACL Anthology

In [9]:
# download ACL bibtex dump
# this is like 100MB+ so takes a minute
print("downloading ACL data...")
!wget https://aclanthology.org/anthology.bib.gz -O data/raw/acl.bib.gz -q
!gunzip -f data/raw/acl.bib.gz
print("done")

downloading ACL data...
done


In [10]:
# check file size
import os
size_mb = os.path.getsize('data/raw/acl.bib') / (1024*1024)
print(f"file size: {size_mb:.1f} MB")

file size: 77.2 MB


In [11]:
# parse bibtex - this is kinda slow but works
# tried pybtex but it kept crashing on malformed entries

def parse_bib(fpath):
    papers = []
    current = {}

    with open(fpath, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip()

            if line.startswith('@'):
                # save previous entry
                if current and 'title' in current:
                    papers.append(current)
                # start new entry
                parts = line[1:].split('{')
                if len(parts) == 2:
                    current = {'paper_id': parts[1].rstrip(',')}

            elif line.startswith('}'):
                if current and 'title' in current:
                    papers.append(current)
                current = {}

            elif '=' in line and current:
                # parse field
                parts = line.split('=', 1)
                if len(parts) == 2:
                    field = parts[0].strip()
                    val = parts[1].strip().strip(',').strip('{}').strip('"')
                    current[field] = val

    return papers

In [12]:
%%time
# this takes a few minutes
print("parsing bibtex...")
acl_raw = parse_bib('data/raw/acl.bib')
print(f"parsed {len(acl_raw)} entries")

parsing bibtex...
parsed 118461 entries
CPU times: user 1.4 s, sys: 115 ms, total: 1.51 s
Wall time: 1.51 s


In [13]:
# look at one entry
acl_raw[100]

{'paper_id': 'melis-etal-2025-modular',
 'title': 'A Modular Taxonomy for Hate Speech Definitions and Its Impact on Zero-Shot {LLM} Classification Performance',
 'author': 'Melis, Matteo  and',
 'editor': 'Calabrese, Agostina  and',
 'booktitle': 'Proceedings of the The 9th Workshop on Online Abuse and Harms (WOAH)',
 'month': 'aug',
 'year': '2025',
 'address': 'Vienna, Austria',
 'publisher': 'Association for Computational Linguistics',
 'url': 'https://aclanthology.org/2025.woah-1.45/',
 'pages': '490--521',
 'ISBN': '979-8-89176-105-6'}

In [14]:
# normalize to our schema
acl_normalized = []

for p in acl_raw:
    # parse authors - they use "and" as separator
    authors = []
    if 'author' in p:
        authors = [a.strip() for a in p['author'].split(' and ')]

    # get year
    year = None
    if 'year' in p:
        try:
            year = int(p['year'])
        except:
            pass  # some have weird year formats

    acl_normalized.append({
        'paper_id': p.get('paper_id', ''),
        'title': p.get('title', ''),
        'authors': authors,
        'abstract': p.get('abstract', ''),
        'venue': p.get('booktitle', p.get('journal', 'ACL')),
        'year': year,
        'url': p.get('url', '')
    })

acl_df = pd.DataFrame(acl_normalized)
print(f"total entries: {len(acl_df)}")

total entries: 118461


In [15]:
# check data quality
print("missing abstracts:", (acl_df['abstract'].str.len() == 0).sum())
print("missing titles:", (acl_df['title'].str.len() == 0).sum())

# NOTE: ACL bibtex doesn't include abstracts - this is a known limitation
# The bibtex format only has metadata (title, authors, venue, year)
# For abstracts, we'd need to scrape individual paper pages
# For now, we'll keep ACL papers but they'll be filtered out later
# when we require abstracts for preprocessing

# filter to papers with titles from 2000 onwards
acl_df = acl_df[(acl_df['title'].str.len() > 0) & (acl_df['year'] >= 2000)]
print(f"\nafter filtering (2000+): {len(acl_df)}")
print("NOTE: ACL papers lack abstracts in bibtex - will be filtered during cleaning")

missing abstracts: 118461
missing titles: 0

after filtering (2000+): 108888
NOTE: ACL papers lack abstracts in bibtex - will be filtered during cleaning


In [16]:
# year distribution
acl_df['year'].value_counts().sort_index()

Unnamed: 0_level_0,count
year,Unnamed: 1_level_1
2000,1254
2001,814
2002,1216
2003,1186
2004,1924
2005,1261
2006,2174
2007,1601
2008,2232
2009,2233


### S2ORC (Semantic Scholar)

In [17]:
# semantic scholar API with retry logic
# rate limited so we need exponential backoff
import random

def search_s2(query, limit=100, max_retries=5):
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {
        'query': query,
        'limit': min(limit, 100),  # API max is 100
        'fields': 'paperId,title,abstract,authors,year,venue,citationCount,fieldsOfStudy'
    }

    for attempt in range(max_retries):
        try:
            r = requests.get(url, params=params, timeout=15)
            if r.status_code == 200:
                return r.json().get('data', [])
            elif r.status_code == 429:
                # exponential backoff with jitter
                wait = (2 ** attempt) + random.uniform(1, 3)
                print(f"rate limited, waiting {wait:.1f}s...", end=' ')
                time.sleep(wait)
            else:
                print(f"error: {r.status_code}")
                return []
        except Exception as e:
            print(f"request failed: {e}")
            time.sleep(2)

    print("max retries exceeded")
    return []

In [18]:
# test it first
test = search_s2("natural language processing", 5)
print(f"got {len(test)} results")
if test:
    print(test[0].keys())

got 5 results
dict_keys(['paperId', 'title', 'venue', 'year', 'citationCount', 'openAccessPdf', 'fieldsOfStudy', 'authors', 'abstract'])


In [19]:
# fetch papers for different NLP topics
# using longer delays to avoid rate limiting
queries = [
    'natural language processing',
    'transformers bert',
    'machine translation',
    'sentiment analysis',
    'named entity recognition',
    'question answering',
    'text classification',
    'language model',
    'word embeddings',
    'neural machine translation',
    'text summarization',
    'information extraction',
    'semantic similarity',
    'text generation',
    'dialogue systems',
    'speech recognition',
    'knowledge graphs',
    'relation extraction',
    'coreference resolution',
    'dependency parsing'
]

all_s2 = []
for i, q in enumerate(queries):
    print(f"[{i+1}/{len(queries)}] {q}...", end=' ')
    papers = search_s2(q, 100)
    all_s2.extend(papers)
    print(f"{len(papers)}")
    # longer delay between queries to avoid rate limiting
    time.sleep(3 + random.uniform(0, 2))

print(f"\ntotal (with dups): {len(all_s2)}")

[1/20] natural language processing... rate limited, waiting 3.8s... 100
[2/20] transformers bert... 100
[3/20] machine translation... rate limited, waiting 2.2s... rate limited, waiting 3.7s... rate limited, waiting 6.8s... rate limited, waiting 10.9s... rate limited, waiting 18.7s... max retries exceeded
0
[4/20] sentiment analysis... 100
[5/20] named entity recognition... rate limited, waiting 2.8s... 100
[6/20] question answering... rate limited, waiting 3.8s... 100
[7/20] text classification... 100
[8/20] language model... rate limited, waiting 2.9s... rate limited, waiting 4.4s... rate limited, waiting 6.6s... rate limited, waiting 9.8s... rate limited, waiting 17.0s... max retries exceeded
0
[9/20] word embeddings... rate limited, waiting 2.9s... 100
[10/20] neural machine translation... 100
[11/20] text summarization... rate limited, waiting 2.8s... rate limited, waiting 4.0s... 100
[12/20] information extraction... 100
[13/20] semantic similarity... 100
[14/20] text generation.

In [20]:
# remove duplicates and normalize
seen_ids = set()
s2_normalized = []

for p in all_s2:
    pid = p.get('paperId')
    if not pid or pid in seen_ids:
        continue
    seen_ids.add(pid)

    # get author names
    authors = [a.get('name', '') for a in p.get('authors', [])]

    s2_normalized.append({
        'paper_id': pid,
        'title': p.get('title', ''),
        'authors': authors,
        'abstract': p.get('abstract', ''),
        'venue': p.get('venue', ''),
        'year': p.get('year'),
        'citation_count': p.get('citationCount', 0),
        'categories': p.get('fieldsOfStudy', [])
    })

s2_df = pd.DataFrame(s2_normalized)
print(f"unique papers: {len(s2_df)}")

unique papers: 1782


In [21]:
# filter out papers without abstracts
s2_df = s2_df[(s2_df['title'].str.len() > 0) & (s2_df['abstract'].str.len() > 0)]
print(f"with abstracts: {len(s2_df)}")

with abstracts: 1553


### Combine all sources

In [22]:
# normalize everything to same schema
def normalize_df(df, source):
    result = pd.DataFrame({
        'paper_id': source + '_' + df['paper_id'].astype(str).str.replace('/', '_'),
        'title': df['title'],
        'authors': df['authors'],
        'abstract': df['abstract'].fillna(''),
        'venue': df['venue'].fillna(''),
        'year': df['year'],
        'categories': df.get('categories', [[]]*len(df)),
        'source': source
    })
    return result

arxiv_norm = normalize_df(arxiv_df, 'arxiv')
acl_norm = normalize_df(acl_df, 'acl')
s2_norm = normalize_df(s2_df, 's2orc')

print(f"arxiv: {len(arxiv_norm)}")
print(f"acl: {len(acl_norm)}")
print(f"s2orc: {len(s2_norm)}")

arxiv: 20983
acl: 108888
s2orc: 1553


In [23]:
# combine
combined = pd.concat([arxiv_norm, acl_norm, s2_norm], ignore_index=True)
print(f"combined: {len(combined)}")

combined: 131424


### Data Validation

In [24]:
# let's check data quality
print("=" * 50)
print("DATA VALIDATION")
print("=" * 50)

# missing abstracts - this is important for our pipeline
no_abstract = combined['abstract'].str.len() == 0
print(f"\nmissing abstracts: {no_abstract.sum()} ({no_abstract.sum()/len(combined)*100:.1f}%)")

# breakdown by source
for src in combined['source'].unique():
    mask = combined['source'] == src
    missing = (combined[mask]['abstract'].str.len() == 0).sum()
    total = mask.sum()
    print(f"  {src}: {missing}/{total} ({missing/total*100:.1f}%)")

DATA VALIDATION

missing abstracts: 108888 (82.9%)
  arxiv: 0/20983 (0.0%)
  acl: 108888/108888 (100.0%)
  s2orc: 0/1553 (0.0%)


In [25]:
# other checks
print(f"missing titles: {(combined['title'].str.len() == 0).sum()}")
print(f"missing authors: {(combined['authors'].apply(len) == 0).sum()}")

# check for duplicates
dups = combined.duplicated(subset=['title'], keep=False)
print(f"\nduplicate titles: {dups.sum()}")

# show some examples
if dups.sum() > 0:
    print("\nexample duplicates:")
    dup_titles = combined[dups].groupby('title')['source'].apply(list).head(3)
    for t, srcs in dup_titles.items():
        print(f"  '{t[:50]}...' -> {srcs}")

missing titles: 0
missing authors: 2832

duplicate titles: 1999

example duplicates:
  'A Compact Architecture for Dialogue Management Bas...' -> ['acl', 'acl']
  'A Comparison of Independent and Joint Fine-tuning ...' -> ['arxiv', 'acl']
  'A Context-Dependent Gated Module for Incorporating...' -> ['acl', 's2orc']


In [26]:
# year distribution
print("\nyear range:", combined['year'].min(), "-", combined['year'].max())

# invalid years?
bad_years = (combined['year'].isna()) | (combined['year'] < 1990) | (combined['year'] > 2025)
print(f"invalid years: {bad_years.sum()}")


year range: 1995 - 2025
invalid years: 0


### Clean and deduplicate

In [27]:
# filter out bad entries
print("cleaning data...")
print(f"before: {len(combined)}")

# check abstract availability by source before filtering
print("\nabstract availability by source:")
for src in combined['source'].unique():
    mask = combined['source'] == src
    has_abstract = (combined[mask]['abstract'].str.len() >= 50).sum()
    total = mask.sum()
    print(f"  {src}: {has_abstract}/{total} ({has_abstract/total*100:.1f}%)")

clean = combined[
    (combined['title'].str.len() > 10) &
    (combined['abstract'].str.len() >= 50) &  # need decent abstracts
    (combined['authors'].apply(len) > 0) &
    (combined['year'] >= 1990) &
    (combined['year'] <= 2025)
].copy()

print(f"\nafter filtering: {len(clean)}")

cleaning data...
before: 131424

abstract availability by source:
  arxiv: 20983/20983 (100.0%)
  acl: 0/108888 (0.0%)
  s2orc: 1550/1553 (99.8%)

after filtering: 22530


In [28]:
# deduplicate - prefer ACL > S2ORC > ArXiv
# (ACL is more authoritative for NLP papers)
priority = {'acl': 1, 's2orc': 2, 'arxiv': 3}
clean['_prio'] = clean['source'].map(priority)
clean = clean.sort_values('_prio').drop_duplicates(subset=['title'], keep='first')
clean = clean.drop('_prio', axis=1)

print(f"after dedup: {len(clean)}")

after dedup: 22527


In [29]:
# add some useful columns
clean['title_len'] = clean['title'].str.len()
clean['abstract_len'] = clean['abstract'].str.len()
clean['n_authors'] = clean['authors'].apply(len)

clean.describe()

Unnamed: 0,year,title_len,abstract_len,n_authors
count,22527.0,22527.0,22527.0,22527.0
mean,2024.624451,80.544813,1290.827185,5.013672
std,1.398666,23.076086,326.125973,5.174346
min,1995.0,12.0,83.0,1.0
25%,2025.0,65.0,1068.0,3.0
50%,2025.0,80.0,1287.0,4.0
75%,2025.0,95.0,1524.0,6.0
max,2025.0,223.0,3898.0,338.0


In [30]:
# source distribution
print("\nfinal source distribution:")
print(clean['source'].value_counts())


final source distribution:
source
arxiv    20978
s2orc     1549
Name: count, dtype: int64


In [31]:
# save
clean.to_parquet('data/raw/complete_dataset.parquet', index=False)
print(f"saved {len(clean)} papers to data/raw/complete_dataset.parquet")
print(f"file size: {os.path.getsize('data/raw/complete_dataset.parquet')/(1024*1024):.2f} MB")

saved 22527 papers to data/raw/complete_dataset.parquet
file size: 17.91 MB


---
## Part 2: Preprocessing

In [32]:
# preprocessing functions - optimized to use single spacy pass

def clean_text(text):
    if not isinstance(text, str):
        return ""
    # urls
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # emails
    text = re.sub(r'\S+@\S+', '', text)
    # whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def remove_special(text):
    # keep alphanumeric and basic punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,!?-]', '', text)
    text = re.sub(r'([.,!?-])\1+', r'\1', text)  # remove repeated punct
    return text

def get_language(text):
    try:
        return detect(text)
    except:
        return 'unknown'

def get_sentences(text):
    try:
        sents = sent_tokenize(text)
        return sents if len(sents) > 0 else [text]
    except Exception as e:
        # fallback: split on periods
        sents = [s.strip() for s in text.split('.') if len(s.strip()) > 10]
        return sents if sents else [text]

def process_with_spacy(text, do_stopwords=True, do_lemma=True):
    """Single spacy pass for tokenization and lemmatization"""
    doc = nlp(text)

    if do_lemma:
        if do_stopwords:
            tokens = [tok.lemma_ for tok in doc if tok.text.lower() not in stop_words]
        else:
            tokens = [tok.lemma_ for tok in doc]
    else:
        if do_stopwords:
            tokens = [tok.text for tok in doc if tok.text.lower() not in stop_words]
        else:
            tokens = [tok.text for tok in doc]

    # filter junk tokens
    tokens = [t for t in tokens if len(t) > 2 and t.isalnum()]
    return tokens

In [33]:
# test preprocessing on one example
test_text = "We propose a novel approach for NLP using transformers. This method achieves state-of-the-art results. Visit https://example.com for more info!"

cleaned = clean_text(test_text)
print("cleaned:", cleaned)

cleaned = remove_special(cleaned)
print("no special:", cleaned)

sents = get_sentences(cleaned)
print("sentences:", sents)

tokens = process_with_spacy(cleaned.lower())
print("processed tokens:", tokens)

cleaned: We propose a novel approach for NLP using transformers. This method achieves state-of-the-art results. Visit for more info!
no special: We propose a novel approach for NLP using transformers. This method achieves state-of-the-art results. Visit for more info!
sentences: ['We propose a novel approach for NLP using transformers.', 'This method achieves state-of-the-art results.', 'Visit for more info!']
processed tokens: ['propose', 'novel', 'approach', 'nlp', 'use', 'transformer', 'method', 'achieve', 'state', 'art', 'result', 'visit', 'info']


In [34]:
# full pipeline - optimized with single spacy pass
def preprocess(text, do_lowercase=True, do_stopwords=True, do_lemma=True):
    if not isinstance(text, str) or len(text) == 0:
        return {
            'cleaned': '',
            'lang': 'unknown',
            'sentences': [],
            'n_sents': 0,
            'tokens': [],
            'processed': ''
        }

    # detect language first (before cleaning)
    lang = get_language(text)

    # clean
    text = clean_text(text)
    text = remove_special(text)

    # sentences - do this before lowercasing
    sents = get_sentences(text)

    if do_lowercase:
        text = text.lower()

    cleaned = text

    # tokenize + lemmatize in single pass
    tokens = process_with_spacy(text, do_stopwords, do_lemma)

    return {
        'cleaned': cleaned,
        'lang': lang,
        'sentences': sents,
        'n_sents': len(sents),
        'tokens': tokens,
        'processed': ' '.join(tokens)
    }

In [35]:
# load data
df = pd.read_parquet('data/raw/complete_dataset.parquet')
print(f"loaded {len(df)} papers")

# filter to English papers only (based on title heuristic)
# we'll do proper language detection during preprocessing

# use ALL papers for best embedding quality
sample = df.copy()
print(f"using all {len(sample)} papers for processing")

loaded 22527 papers
using all 22527 papers for processing


In [36]:
%%time
# process all papers
print(f"preprocessing {len(sample)} papers...")

results = []
for idx, row in tqdm(sample.iterrows(), total=len(sample)):
    res = preprocess(row['abstract'])

    results.append({
        'paper_id': row['paper_id'],
        'title': row['title'],
        'authors': row['authors'],
        'original_abstract': row['abstract'],
        'cleaned_text': res['cleaned'],
        'language': res['lang'],
        'sentences': res['sentences'],
        'n_sentences': res['n_sents'],
        'tokens': res['tokens'],
        'processed_text': res['processed'],
        'n_tokens': len(res['tokens']),
        'source': row['source'],
        'year': row['year'],
        'venue': row['venue']
    })

processed = pd.DataFrame(results)
print("done!")

preprocessing 22527 papers...


100%|██████████| 22527/22527 [13:44<00:00, 27.33it/s]

done!
CPU times: user 13min 43s, sys: 2.64 s, total: 13min 46s
Wall time: 13min 44s





In [37]:
# check results
print("\ntoken stats:")
print(f"  mean: {processed['n_tokens'].mean():.1f}")
print(f"  min: {processed['n_tokens'].min()}")
print(f"  max: {processed['n_tokens'].max()}")

print(f"\nsentence stats:")
print(f"  mean: {processed['n_sentences'].mean():.1f}")
print(f"  min: {processed['n_sentences'].min()}")
print(f"  max: {processed['n_sentences'].max()}")

print(f"\nlanguages:")
lang_counts = processed['language'].value_counts()
print(lang_counts)

# filter to English only for downstream tasks
non_english = processed[processed['language'] != 'en']
if len(non_english) > 0:
    print(f"\nfiltering out {len(non_english)} non-English papers")
    processed = processed[processed['language'] == 'en'].copy()
    print(f"remaining: {len(processed)} English papers")


token stats:
  mean: 121.0
  min: 7
  max: 353

sentence stats:
  mean: 7.5
  min: 1
  max: 28

languages:
language
en    22522
id        2
fr        2
ca        1
Name: count, dtype: int64

filtering out 5 non-English papers
remaining: 22522 English papers


In [38]:
# build vocabulary
all_tokens = []
for toks in processed['tokens']:
    all_tokens.extend(toks)

vocab = Counter(all_tokens)
print(f"vocabulary size: {len(vocab)}")
print(f"total tokens: {len(all_tokens)}")

print("\ntop 20 words:")
for word, cnt in vocab.most_common(20):
    print(f"  {word}: {cnt}")

vocabulary size: 48425
total tokens: 2724644

top 20 words:
  model: 46802
  method: 18327
  base: 17417
  llm: 16418
  use: 16311
  datum: 16127
  language: 14591
  task: 14282
  propose: 13725
  performance: 13184
  framework: 12604
  approach: 12443
  dataset: 12319
  system: 11888
  large: 11797
  result: 11002
  demonstrate: 10716
  learning: 10363
  show: 10206
  across: 9671


In [39]:
# save
processed.to_parquet('data/processed/cleaned_papers.parquet', index=False)

vocab_data = {
    'vocab_size': len(vocab),
    'total_tokens': len(all_tokens),
    'vocabulary': sorted(list(vocab.keys())),
    'frequencies': dict(vocab.most_common(1000))
}
with open('data/processed/vocabulary.json', 'w') as f:
    json.dump(vocab_data, f, indent=2)

print(f"saved to data/processed/cleaned_papers.parquet")
print(f"file size: {os.path.getsize('data/processed/cleaned_papers.parquet')/(1024*1024):.2f} MB")

saved to data/processed/cleaned_papers.parquet
file size: 64.07 MB


---
## Part 3: Embeddings

In [40]:
# load preprocessed data
embed_df = pd.read_parquet('data/processed/cleaned_papers.parquet')

abstracts = embed_df['processed_text'].tolist()
titles = embed_df['title'].tolist()
paper_ids = embed_df['paper_id'].tolist()

print(f"loaded {len(embed_df)} papers for embeddings")

loaded 22522 papers for embeddings


### Word2Vec (baseline)

In [41]:
# train word2vec
# using processed text (already tokenized)
tokenized = [text.split() for text in abstracts if text]

print(f"training word2vec on {len(tokenized)} documents...")
w2v = Word2Vec(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=10
)
print(f"vocab size: {len(w2v.wv)}")

training word2vec on 22522 documents...
vocab size: 28727


In [42]:
# test it
if 'model' in w2v.wv:
    print("similar to 'model':")
    for word, score in w2v.wv.most_similar('model', topn=5):
        print(f"  {word}: {score:.3f}")

similar to 'model':
  llm: 0.548
  approach: 0.487
  method: 0.481
  mllm: 0.480
  slm: 0.470


In [43]:
# get document embeddings (average of word vectors)
def get_doc_embedding(text, model):
    words = text.split()
    vecs = [model.wv[w] for w in words if w in model.wv]
    if vecs:
        return np.mean(vecs, axis=0)
    return np.zeros(model.wv.vector_size)

w2v_embeddings = np.array([get_doc_embedding(t, w2v) for t in abstracts])
print(f"w2v embeddings shape: {w2v_embeddings.shape}")

w2v embeddings shape: (22522, 100)


### SBERT

In [44]:
# load SBERT model
# all-MiniLM-L6-v2 is a good balance of speed/quality
print("loading SBERT...")
sbert = SentenceTransformer('all-MiniLM-L6-v2')
print(f"embedding dim: {sbert.get_sentence_embedding_dimension()}")

loading SBERT...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

embedding dim: 384


In [45]:
%%time
# generate embeddings for abstracts
print("generating SBERT embeddings for abstracts...")
sbert_abstract = sbert.encode(
    abstracts,
    show_progress_bar=True,
    batch_size=32
)
print(f"shape: {sbert_abstract.shape}")

generating SBERT embeddings for abstracts...


Batches:   0%|          | 0/704 [00:00<?, ?it/s]

shape: (22522, 384)
CPU times: user 36.2 s, sys: 1.17 s, total: 37.3 s
Wall time: 16.2 s


In [46]:
%%time
# and for titles
print("generating SBERT embeddings for titles...")
sbert_title = sbert.encode(
    titles,
    show_progress_bar=True,
    batch_size=32
)
print(f"shape: {sbert_title.shape}")

generating SBERT embeddings for titles...


Batches:   0%|          | 0/704 [00:00<?, ?it/s]

shape: (22522, 384)
CPU times: user 8.28 s, sys: 851 ms, total: 9.13 s
Wall time: 5.02 s


### SciBERT (for scientific text)

In [47]:
# specter is trained on scientific papers
# should work better for our domain
print("loading SciBERT (specter)...")
scibert = SentenceTransformer('allenai-specter')
print(f"embedding dim: {scibert.get_sentence_embedding_dimension()}")

loading SciBERT (specter)...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/331 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

embedding dim: 768


In [48]:
%%time
# this is slower - larger model
print("generating SciBERT embeddings...")
scibert_embeddings = scibert.encode(
    abstracts,
    show_progress_bar=True,
    batch_size=16  # smaller batch for memory
)
print(f"shape: {scibert_embeddings.shape}")

generating SciBERT embeddings...


Batches:   0%|          | 0/1408 [00:00<?, ?it/s]

shape: (22522, 768)
CPU times: user 1min 15s, sys: 1.94 s, total: 1min 17s
Wall time: 54.8 s


### Compare embeddings

In [49]:
from sklearn.metrics.pairwise import cosine_similarity

# compare on first 10 papers
n = min(10, len(abstracts))

print("Embedding comparison (avg pairwise similarity):")
print("="*50)

# word2vec
w2v_sim = cosine_similarity(w2v_embeddings[:n])
w2v_avg = np.mean(w2v_sim[np.triu_indices_from(w2v_sim, k=1)])
print(f"Word2Vec (dim={w2v_embeddings.shape[1]}): {w2v_avg:.3f}")

# sbert
sbert_sim = cosine_similarity(sbert_abstract[:n])
sbert_avg = np.mean(sbert_sim[np.triu_indices_from(sbert_sim, k=1)])
print(f"SBERT (dim={sbert_abstract.shape[1]}): {sbert_avg:.3f}")

# scibert
sci_sim = cosine_similarity(scibert_embeddings[:n])
sci_avg = np.mean(sci_sim[np.triu_indices_from(sci_sim, k=1)])
print(f"SciBERT (dim={scibert_embeddings.shape[1]}): {sci_avg:.3f}")

Embedding comparison (avg pairwise similarity):
Word2Vec (dim=100): 0.679
SBERT (dim=384): 0.593
SciBERT (dim=768): 0.867


In [50]:
# quick similarity test
# find most similar paper to first one
query_idx = 0
print(f"Query: {titles[query_idx][:80]}...")
print()

# using SBERT
sims = cosine_similarity([sbert_abstract[query_idx]], sbert_abstract)[0]
top_idx = np.argsort(sims)[::-1][1:4]  # skip self

print("Most similar (SBERT):")
for i in top_idx:
    print(f"  [{sims[i]:.3f}] {titles[i][:70]}...")

Query: Joint Transition-based Dependency Parsing and Disfluency Detection for Automatic...

Most similar (SBERT):
  [0.747] CRF Autoencoder for Unsupervised Dependency Parsing...
  [0.716] Unsupervised Neural Dependency Parsing...
  [0.715] An Improved Neural Network Model for Joint POS Tagging and Dependency ...


### Save embeddings

In [51]:
# save all embeddings
print("saving embeddings...")

np.save('data/embeddings/word2vec_embeddings.npy', w2v_embeddings)
np.save('data/embeddings/sbert_abstract_embeddings.npy', sbert_abstract)
np.save('data/embeddings/sbert_title_embeddings.npy', sbert_title)
np.save('data/embeddings/scibert_embeddings.npy', scibert_embeddings)

print(f"  word2vec: {w2v_embeddings.shape}")
print(f"  sbert_abstract: {sbert_abstract.shape}")
print(f"  sbert_title: {sbert_title.shape}")
print(f"  scibert: {scibert_embeddings.shape}")

saving embeddings...
  word2vec: (22522, 100)
  sbert_abstract: (22522, 384)
  sbert_title: (22522, 384)
  scibert: (22522, 768)


In [52]:
# save paper index for later retrieval
paper_index = {
    'paper_ids': paper_ids,
    'titles': titles,
    'n_papers': len(paper_ids),
    'methods': ['word2vec', 'sbert_abstract', 'sbert_title', 'scibert'],
    'dims': {
        'word2vec': w2v_embeddings.shape[1],
        'sbert': sbert_abstract.shape[1],
        'scibert': scibert_embeddings.shape[1]
    }
}

with open('data/embeddings/paper_index.pkl', 'wb') as f:
    pickle.dump(paper_index, f)

print(f"saved paper index ({len(paper_ids)} papers)")

saved paper index (22522 papers)


In [53]:
# also save metadata
meta = {
    'created': datetime.now().isoformat(),
    'n_papers': len(paper_ids),
    'embeddings': {
        'word2vec': {
            'file': 'word2vec_embeddings.npy',
            'shape': list(w2v_embeddings.shape),
            'model': 'gensim Word2Vec'
        },
        'sbert_abstract': {
            'file': 'sbert_abstract_embeddings.npy',
            'shape': list(sbert_abstract.shape),
            'model': 'all-MiniLM-L6-v2'
        },
        'sbert_title': {
            'file': 'sbert_title_embeddings.npy',
            'shape': list(sbert_title.shape),
            'model': 'all-MiniLM-L6-v2'
        },
        'scibert': {
            'file': 'scibert_embeddings.npy',
            'shape': list(scibert_embeddings.shape),
            'model': 'allenai-specter'
        }
    }
}

with open('data/embeddings/metadata.json', 'w') as f:
    json.dump(meta, f, indent=2)

print("saved metadata")

saved metadata


---
## Summary

In [54]:
print("="*60)
print("DAY 1 PIPELINE COMPLETE")
print("="*60)

print("\nData Ingestion:")
print(f"  ArXiv: {len(arxiv_df)} papers")
print(f"  ACL: {len(acl_df)} papers (metadata only, no abstracts)")
print(f"  S2ORC: {len(s2_df)} papers")
print(f"  Combined raw: {len(combined)} papers")
print(f"  After cleaning: {len(clean)} papers")

print("\nPreprocessing:")
print(f"  Processed: {len(processed)} papers")
print(f"  Vocab size: {len(vocab)}")
print(f"  Avg tokens/paper: {processed['n_tokens'].mean():.1f}")
print(f"  Avg sentences/paper: {processed['n_sentences'].mean():.1f}")

print("\nEmbeddings:")
print(f"  Word2Vec: {w2v_embeddings.shape}")
print(f"  SBERT abstract: {sbert_abstract.shape}")
print(f"  SBERT title: {sbert_title.shape}")
print(f"  SciBERT: {scibert_embeddings.shape}")

print("\nOutput files:")
files = [
    'data/raw/complete_dataset.parquet',
    'data/processed/cleaned_papers.parquet',
    'data/processed/vocabulary.json',
    'data/embeddings/word2vec_embeddings.npy',
    'data/embeddings/sbert_abstract_embeddings.npy',
    'data/embeddings/sbert_title_embeddings.npy',
    'data/embeddings/scibert_embeddings.npy',
    'data/embeddings/paper_index.pkl',
    'data/embeddings/metadata.json'
]
total_size = 0
for f in files:
    if os.path.exists(f):
        sz = os.path.getsize(f) / (1024*1024)
        total_size += sz
        print(f"  {f} ({sz:.2f} MB)")

print(f"\nTotal size: {total_size:.2f} MB")
print("="*60)

DAY 1 PIPELINE COMPLETE

Data Ingestion:
  ArXiv: 20983 papers
  ACL: 108888 papers (metadata only, no abstracts)
  S2ORC: 1553 papers
  Combined raw: 131424 papers
  After cleaning: 22527 papers

Preprocessing:
  Processed: 22522 papers
  Vocab size: 48425
  Avg tokens/paper: 121.0
  Avg sentences/paper: 7.5

Embeddings:
  Word2Vec: (22522, 100)
  SBERT abstract: (22522, 384)
  SBERT title: (22522, 384)
  SciBERT: (22522, 768)

Output files:
  data/raw/complete_dataset.parquet (17.91 MB)
  data/processed/cleaned_papers.parquet (64.07 MB)
  data/processed/vocabulary.json (0.74 MB)
  data/embeddings/word2vec_embeddings.npy (8.59 MB)
  data/embeddings/sbert_abstract_embeddings.npy (32.99 MB)
  data/embeddings/sbert_title_embeddings.npy (32.99 MB)
  data/embeddings/scibert_embeddings.npy (65.98 MB)
  data/embeddings/paper_index.pkl (2.29 MB)
  data/embeddings/metadata.json (0.00 MB)

Total size: 225.57 MB


In [55]:
# copy to drive for persistence
import shutil

drive_path = '/content/drive/MyDrive/scholarly-topic-navigator/data'
try:
    shutil.copytree('data/', drive_path, dirs_exist_ok=True)
    print(f"copied to {drive_path}")
except Exception as e:
    print(f"drive copy failed: {e}")
    print("you can manually copy the data folder to your drive")

copied to /content/drive/MyDrive/scholarly-topic-navigator/data
