Data Ingestion, Preprocessing & Embeddings

This notebook covers:
1. Data ingestion from ArXiv, ACL, S2ORC
2. Preprocessing (cleaning, tokenization, etc)
3. Embedding generation

In [1]:
# install dependencies first
!pip install pandas arxiv requests beautifulsoup4 lxml pyarrow -q
!pip install spacy nltk tqdm langdetect -q
!pip install gensim sentence-transformers torch scikit-learn -q
!python -m spacy download en_core_web_sm

  Preparing metadata (setup.py) ... [?25l[?25hdone
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m96.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m147.8 MB/s[0m eta

In [2]:
import os
import re
import json
import time
import pickle
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from datetime import datetime
from collections import Counter
from tqdm import tqdm

import arxiv
import requests
from bs4 import BeautifulSoup

import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from langdetect import detect, LangDetectException

from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
import torch

from google.colab import drive

print("imports done")

imports done


In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# setup directories
os.makedirs('data/raw', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)
os.makedirs('data/embeddings', exist_ok=True)

# nltk stuff - need all these for proper tokenization
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # needed for sentence tokenization in newer nltk

nlp = spacy.load('en_core_web_sm')
stop_words = set(stopwords.words('english'))

print("setup complete")

---
## Part 1: Data Ingestion

### ArXiv

In [5]:
# arxiv ingestion
# tried using the bulk download first but it was too slow
# API works better for our use case

def fetch_arxiv(categories, max_per_cat=5000):
    client = arxiv.Client()
    papers = []

    for cat in categories:
        print(f"fetching {cat}...", end=' ')

        search = arxiv.Search(
            query=f'cat:{cat}',
            max_results=max_per_cat,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        cnt = 0
        for r in client.results(search):
            papers.append({
                'paper_id': r.entry_id.split('/')[-1],
                'title': r.title,
                'authors': [a.name for a in r.authors],
                'abstract': r.summary.replace('\n', ' '),
                'categories': r.categories,
                'venue': 'arXiv',
                'year': r.published.year,
                'published': r.published.isoformat(),
                'pdf_url': r.pdf_url
            })
            cnt += 1
        print(f"got {cnt}")

    return papers

In [6]:
# fetch papers - using NLP and ML related categories
# 5000 per category for comprehensive coverage
cats = ['cs.CL', 'cs.LG', 'stat.ML', 'cs.AI', 'cs.IR']
arxiv_papers = fetch_arxiv(cats, max_per_cat=5000)
print(f"\ntotal: {len(arxiv_papers)}")

fetching cs.CL... got 5000
fetching cs.LG... got 5000
fetching stat.ML... got 5000
fetching cs.AI... got 5000
fetching cs.IR... got 5000

total: 25000


In [7]:
# quick check
arxiv_df = pd.DataFrame(arxiv_papers)
arxiv_df.head(2)

Unnamed: 0,paper_id,title,authors,abstract,categories,venue,year,published,pdf_url
0,2512.02010v1,Four Over Six: More Accurate NVFP4 Quantizatio...,"[Jack Cook, Junxian Guo, Guangxuan Xiao, Yujun...","As large language models have grown larger, lo...","[cs.CL, cs.LG]",arXiv,2025,2025-12-01T18:59:45+00:00,https://arxiv.org/pdf/2512.02010v1
1,2512.02008v1,The Art of Scaling Test-Time Compute for Large...,"[Aradhye Agarwal, Ayan Sengupta, Tanmoy Chakra...",Test-time scaling (TTS) -- the dynamic allocat...,[cs.CL],arXiv,2025,2025-12-01T18:59:28+00:00,https://arxiv.org/pdf/2512.02008v1


In [8]:
# check for any issues
print("shape:", arxiv_df.shape)
print("\nmissing values:")
print(arxiv_df.isnull().sum())
print("\nyear distribution:")
print(arxiv_df['year'].value_counts().head())

shape: (25000, 9)

missing values:
paper_id      0
title         0
authors       0
abstract      0
categories    0
venue         0
year          0
published     0
pdf_url       0
dtype: int64

year distribution:
year
2025    23828
2024     1172
Name: count, dtype: int64


### ACL Anthology

In [9]:
# download ACL bibtex dump
# this is like 100MB+ so takes a minute
print("downloading ACL data...")
!wget https://aclanthology.org/anthology.bib.gz -O data/raw/acl.bib.gz -q
!gunzip -f data/raw/acl.bib.gz
print("done")

downloading ACL data...
done


In [10]:
# check file size
import os
size_mb = os.path.getsize('data/raw/acl.bib') / (1024*1024)
print(f"file size: {size_mb:.1f} MB")

file size: 77.2 MB


In [11]:
# parse bibtex - this is kinda slow but works
# tried pybtex but it kept crashing on malformed entries

def parse_bib(fpath):
    papers = []
    current = {}

    with open(fpath, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip()

            if line.startswith('@'):
                # save previous entry
                if current and 'title' in current:
                    papers.append(current)
                # start new entry
                parts = line[1:].split('{')
                if len(parts) == 2:
                    current = {'paper_id': parts[1].rstrip(',')}

            elif line.startswith('}'):
                if current and 'title' in current:
                    papers.append(current)
                current = {}

            elif '=' in line and current:
                # parse field
                parts = line.split('=', 1)
                if len(parts) == 2:
                    field = parts[0].strip()
                    val = parts[1].strip().strip(',').strip('{}').strip('"')
                    current[field] = val

    return papers

In [12]:
%%time
# this takes a few minutes
print("parsing bibtex...")
acl_raw = parse_bib('data/raw/acl.bib')
print(f"parsed {len(acl_raw)} entries")

parsing bibtex...
parsed 118461 entries
CPU times: user 1.41 s, sys: 105 ms, total: 1.51 s
Wall time: 1.51 s


In [13]:
# look at one entry
acl_raw[100]

{'paper_id': 'melis-etal-2025-modular',
 'title': 'A Modular Taxonomy for Hate Speech Definitions and Its Impact on Zero-Shot {LLM} Classification Performance',
 'author': 'Melis, Matteo  and',
 'editor': 'Calabrese, Agostina  and',
 'booktitle': 'Proceedings of the The 9th Workshop on Online Abuse and Harms (WOAH)',
 'month': 'aug',
 'year': '2025',
 'address': 'Vienna, Austria',
 'publisher': 'Association for Computational Linguistics',
 'url': 'https://aclanthology.org/2025.woah-1.45/',
 'pages': '490--521',
 'ISBN': '979-8-89176-105-6'}

In [14]:
# normalize to our schema
acl_normalized = []

for p in acl_raw:
    # parse authors - they use "and" as separator
    authors = []
    if 'author' in p:
        authors = [a.strip() for a in p['author'].split(' and ')]

    # get year
    year = None
    if 'year' in p:
        try:
            year = int(p['year'])
        except:
            pass  # some have weird year formats

    acl_normalized.append({
        'paper_id': p.get('paper_id', ''),
        'title': p.get('title', ''),
        'authors': authors,
        'abstract': p.get('abstract', ''),
        'venue': p.get('booktitle', p.get('journal', 'ACL')),
        'year': year,
        'url': p.get('url', '')
    })

acl_df = pd.DataFrame(acl_normalized)
print(f"total entries: {len(acl_df)}")

total entries: 118461


In [15]:
# check data quality
print("missing abstracts:", (acl_df['abstract'].str.len() == 0).sum())
print("missing titles:", (acl_df['title'].str.len() == 0).sum())

# keep all papers from 2000 onwards for max coverage
acl_df = acl_df[(acl_df['title'].str.len() > 0) & (acl_df['year'] >= 2000)]
print(f"\nafter filtering (2000+): {len(acl_df)}")

missing abstracts: 118461
missing titles: 0

after filtering (2000+): 108888


In [16]:
# year distribution
acl_df['year'].value_counts().sort_index()

Unnamed: 0_level_0,count
year,Unnamed: 1_level_1
2000,1254
2001,814
2002,1216
2003,1186
2004,1924
2005,1261
2006,2174
2007,1601
2008,2232
2009,2233


### S2ORC (Semantic Scholar)

In [17]:
# semantic scholar API
# rate limited so we need to be careful

def search_s2(query, limit=100):
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {
        'query': query,
        'limit': min(limit, 100),  # API max is 100
        'fields': 'paperId,title,abstract,authors,year,venue,citationCount,fieldsOfStudy'
    }

    try:
        r = requests.get(url, params=params, timeout=10)
        if r.status_code == 200:
            return r.json().get('data', [])
        else:
            print(f"error: {r.status_code}")
            return []
    except Exception as e:
        print(f"request failed: {e}")
        return []

In [18]:
# test it first
test = search_s2("natural language processing", 5)
print(f"got {len(test)} results")
if test:
    print(test[0].keys())

error: 429
got 0 results


In [19]:
# fetch papers for different NLP topics
# expanded query list for more coverage
queries = [
    'natural language processing',
    'transformers bert',
    'machine translation',
    'sentiment analysis',
    'named entity recognition',
    'question answering',
    'text classification',
    'language model',
    'word embeddings',
    'neural machine translation',
    'text summarization',
    'information extraction',
    'semantic similarity',
    'text generation',
    'dialogue systems',
    'speech recognition',
    'knowledge graphs',
    'relation extraction',
    'coreference resolution',
    'dependency parsing'
]

all_s2 = []
for q in queries:
    print(f"{q}...", end=' ')
    papers = search_s2(q, 100)  # 100 is API max per query
    all_s2.extend(papers)
    print(f"{len(papers)}")
    time.sleep(1.5)  # slightly longer delay to avoid rate limit

print(f"\ntotal (with dups): {len(all_s2)}")

natural language processing... error: 429
0
transformers bert... error: 429
0
machine translation... error: 429
0
sentiment analysis... error: 429
0
named entity recognition... error: 429
0
question answering... error: 429
0
text classification... 100
language model... error: 429
0
word embeddings... error: 429
0
neural machine translation... error: 429
0
text summarization... error: 429
0
information extraction... 100
semantic similarity... error: 429
0
text generation... 100
dialogue systems... 100
speech recognition... error: 429
0
knowledge graphs... error: 429
0
relation extraction... error: 429
0
coreference resolution... 100
dependency parsing... error: 429
0

total (with dups): 500


In [20]:
# remove duplicates and normalize
seen_ids = set()
s2_normalized = []

for p in all_s2:
    pid = p.get('paperId')
    if not pid or pid in seen_ids:
        continue
    seen_ids.add(pid)

    # get author names
    authors = [a.get('name', '') for a in p.get('authors', [])]

    s2_normalized.append({
        'paper_id': pid,
        'title': p.get('title', ''),
        'authors': authors,
        'abstract': p.get('abstract', ''),
        'venue': p.get('venue', ''),
        'year': p.get('year'),
        'citation_count': p.get('citationCount', 0),
        'categories': p.get('fieldsOfStudy', [])
    })

s2_df = pd.DataFrame(s2_normalized)
print(f"unique papers: {len(s2_df)}")

unique papers: 500


In [21]:
# filter out papers without abstracts
s2_df = s2_df[(s2_df['title'].str.len() > 0) & (s2_df['abstract'].str.len() > 0)]
print(f"with abstracts: {len(s2_df)}")

with abstracts: 444


### Combine all sources

In [22]:
# normalize everything to same schema
def normalize_df(df, source):
    result = pd.DataFrame({
        'paper_id': source + '_' + df['paper_id'].astype(str).str.replace('/', '_'),
        'title': df['title'],
        'authors': df['authors'],
        'abstract': df['abstract'].fillna(''),
        'venue': df['venue'].fillna(''),
        'year': df['year'],
        'categories': df.get('categories', [[]]*len(df)),
        'source': source
    })
    return result

arxiv_norm = normalize_df(arxiv_df, 'arxiv')
acl_norm = normalize_df(acl_df, 'acl')
s2_norm = normalize_df(s2_df, 's2orc')

print(f"arxiv: {len(arxiv_norm)}")
print(f"acl: {len(acl_norm)}")
print(f"s2orc: {len(s2_norm)}")

arxiv: 25000
acl: 108888
s2orc: 444


In [23]:
# combine
combined = pd.concat([arxiv_norm, acl_norm, s2_norm], ignore_index=True)
print(f"combined: {len(combined)}")

combined: 134332


### Data Validation

In [24]:
# let's check data quality
print("=" * 50)
print("DATA VALIDATION")
print("=" * 50)

# missing abstracts - this is important for our pipeline
no_abstract = combined['abstract'].str.len() == 0
print(f"\nmissing abstracts: {no_abstract.sum()} ({no_abstract.sum()/len(combined)*100:.1f}%)")

# breakdown by source
for src in combined['source'].unique():
    mask = combined['source'] == src
    missing = (combined[mask]['abstract'].str.len() == 0).sum()
    total = mask.sum()
    print(f"  {src}: {missing}/{total} ({missing/total*100:.1f}%)")

DATA VALIDATION

missing abstracts: 108888 (81.1%)
  arxiv: 0/25000 (0.0%)
  acl: 108888/108888 (100.0%)
  s2orc: 0/444 (0.0%)


In [25]:
# other checks
print(f"missing titles: {(combined['title'].str.len() == 0).sum()}")
print(f"missing authors: {(combined['authors'].apply(len) == 0).sum()}")

# check for duplicates
dups = combined.duplicated(subset=['title'], keep=False)
print(f"\nduplicate titles: {dups.sum()}")

# show some examples
if dups.sum() > 0:
    print("\nexample duplicates:")
    dup_titles = combined[dups].groupby('title')['source'].apply(list).head(3)
    for t, srcs in dup_titles.items():
        print(f"  '{t[:50]}...' -> {srcs}")

missing titles: 0
missing authors: 2832

duplicate titles: 8969

example duplicates:
  '"When Data is Scarce, Prompt Smarter"... Approache...' -> ['arxiv', 'arxiv']
  '$A^3$: Attention-Aware Accurate KV Cache Fusion fo...' -> ['arxiv', 'arxiv']
  '$L_1$-norm Regularized Indefinite Kernel Logistic ...' -> ['arxiv', 'arxiv']


In [26]:
# year distribution
print("\nyear range:", combined['year'].min(), "-", combined['year'].max())

# invalid years?
bad_years = (combined['year'].isna()) | (combined['year'] < 1990) | (combined['year'] > 2025)
print(f"invalid years: {bad_years.sum()}")


year range: 2000 - 2025
invalid years: 0


### Clean and deduplicate

In [27]:
# filter out bad entries
print("cleaning data...")
print(f"before: {len(combined)}")

clean = combined[
    (combined['title'].str.len() > 10) &
    (combined['abstract'].str.len() >= 50) &  # need decent abstracts
    (combined['authors'].apply(len) > 0) &
    (combined['year'] >= 1990) &
    (combined['year'] <= 2025)
].copy()

print(f"after filtering: {len(clean)}")

cleaning data...
before: 134332
after filtering: 25439


In [28]:
# deduplicate - prefer ACL > S2ORC > ArXiv
# (ACL is more authoritative for NLP papers)
priority = {'acl': 1, 's2orc': 2, 'arxiv': 3}
clean['_prio'] = clean['source'].map(priority)
clean = clean.sort_values('_prio').drop_duplicates(subset=['title'], keep='first')
clean = clean.drop('_prio', axis=1)

print(f"after dedup: {len(clean)}")

after dedup: 21422


In [29]:
# add some useful columns
clean['title_len'] = clean['title'].str.len()
clean['abstract_len'] = clean['abstract'].str.len()
clean['n_authors'] = clean['authors'].apply(len)

clean.describe()

Unnamed: 0,year,title_len,abstract_len,n_authors
count,21422.0,21422.0,21422.0,21422.0
mean,2024.858323,80.908319,1298.841798,5.010877
std,0.722019,23.052828,320.2461,5.210066
min,2005.0,12.0,93.0,1.0
25%,2025.0,65.0,1080.0,3.0
50%,2025.0,81.0,1297.0,4.0
75%,2025.0,95.0,1530.0,6.0
max,2025.0,223.0,2298.0,338.0


In [30]:
# source distribution
print("\nfinal source distribution:")
print(clean['source'].value_counts())


final source distribution:
source
arxiv    20980
s2orc      442
Name: count, dtype: int64


In [31]:
# save
clean.to_parquet('data/raw/complete_dataset.parquet', index=False)
print(f"saved {len(clean)} papers to data/raw/complete_dataset.parquet")
print(f"file size: {os.path.getsize('data/raw/complete_dataset.parquet')/(1024*1024):.2f} MB")

saved 21422 papers to data/raw/complete_dataset.parquet
file size: 17.15 MB


---
## Part 2: Preprocessing

In [32]:
# preprocessing functions

def clean_text(text):
    if not isinstance(text, str):
        return ""
    # urls
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # emails
    text = re.sub(r'\S+@\S+', '', text)
    # whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def remove_special(text):
    # keep alphanumeric and basic punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,!?-]', '', text)
    text = re.sub(r'([.,!?-])\1+', r'\1', text)  # remove repeated punct
    return text

def get_language(text):
    try:
        return detect(text)
    except:
        return 'unknown'

def get_sentences(text):
    try:
        return sent_tokenize(text)
    except:
        return [text]

def tokenize(text):
    doc = nlp(text)
    return [tok.text for tok in doc]

def remove_stops(tokens):
    return [t for t in tokens if t.lower() not in stop_words]

def lemmatize(tokens):
    text = ' '.join(tokens)
    doc = nlp(text)
    return [tok.lemma_ for tok in doc]

In [33]:
# test on one example
test_text = "We propose a novel approach for NLP using transformers. Visit https://example.com for more info!"

cleaned = clean_text(test_text)
print("cleaned:", cleaned)

cleaned = remove_special(cleaned)
print("no special:", cleaned)

tokens = tokenize(cleaned.lower())
print("tokens:", tokens)

tokens = remove_stops(tokens)
print("no stops:", tokens)

tokens = lemmatize(tokens)
print("lemmas:", tokens)

cleaned: We propose a novel approach for NLP using transformers. Visit for more info!
no special: We propose a novel approach for NLP using transformers. Visit for more info!
tokens: ['we', 'propose', 'a', 'novel', 'approach', 'for', 'nlp', 'using', 'transformers', '.', 'visit', 'for', 'more', 'info', '!']
no stops: ['propose', 'novel', 'approach', 'nlp', 'using', 'transformers', '.', 'visit', 'info', '!']
lemmas: ['propose', 'novel', 'approach', 'nlp', 'use', 'transformer', '.', 'visit', 'info', '!']


In [34]:
# full pipeline
def preprocess(text, do_lowercase=True, do_stopwords=True, do_lemma=True):
    if not isinstance(text, str) or len(text) == 0:
        return {
            'cleaned': '',
            'lang': 'unknown',
            'sentences': [],
            'n_sents': 0,
            'tokens': [],
            'processed': ''
        }

    # detect language first (before cleaning)
    lang = get_language(text)

    # clean
    text = clean_text(text)
    text = remove_special(text)

    # sentences
    sents = get_sentences(text)

    if do_lowercase:
        text = text.lower()

    cleaned = text

    # tokenize
    tokens = tokenize(text)

    if do_stopwords:
        tokens = remove_stops(tokens)

    if do_lemma:
        tokens = lemmatize(tokens)

    # filter junk tokens
    tokens = [t for t in tokens if len(t) > 2 and t.isalnum()]

    return {
        'cleaned': cleaned,
        'lang': lang,
        'sentences': sents,
        'n_sents': len(sents),
        'tokens': tokens,
        'processed': ' '.join(tokens)
    }

In [35]:
# load data
df = pd.read_parquet('data/raw/complete_dataset.parquet')
print(f"loaded {len(df)} papers")

# use all papers for better embedding quality
# with 6k+ papers, embeddings will be much more useful
SAMPLE_SIZE = 10000  # or use len(df) for everything
if len(df) > SAMPLE_SIZE:
    sample = df.sample(n=SAMPLE_SIZE, random_state=42).copy()
else:
    sample = df.copy()
print(f"using {len(sample)} papers")

loaded 21422 papers
using 10000 papers


In [36]:
%%time
# process all papers
print(f"preprocessing {len(sample)} papers...")

results = []
for idx, row in tqdm(sample.iterrows(), total=len(sample)):
    res = preprocess(row['abstract'])

    results.append({
        'paper_id': row['paper_id'],
        'title': row['title'],
        'authors': row['authors'],
        'original_abstract': row['abstract'],
        'cleaned_text': res['cleaned'],
        'language': res['lang'],
        'sentences': res['sentences'],
        'n_sentences': res['n_sents'],
        'tokens': res['tokens'],
        'processed_text': res['processed'],
        'n_tokens': len(res['tokens']),
        'source': row['source'],
        'year': row['year'],
        'venue': row['venue']
    })

processed = pd.DataFrame(results)
print("done!")

preprocessing 10000 papers...


100%|██████████| 10000/10000 [09:51<00:00, 16.92it/s]

done!
CPU times: user 9min 49s, sys: 2.34 s, total: 9min 52s
Wall time: 9min 51s





In [37]:
# check results
print("\ntoken stats:")
print(f"  mean: {processed['n_tokens'].mean():.1f}")
print(f"  min: {processed['n_tokens'].min()}")
print(f"  max: {processed['n_tokens'].max()}")

print(f"\nsentence stats:")
print(f"  mean: {processed['n_sentences'].mean():.1f}")

print(f"\nlanguages:")
print(processed['language'].value_counts())


token stats:
  mean: 122.1
  min: 12
  max: 199

sentence stats:
  mean: 1.0

languages:
language
en    9999
fr       1
Name: count, dtype: int64


In [38]:
# build vocabulary
all_tokens = []
for toks in processed['tokens']:
    all_tokens.extend(toks)

vocab = Counter(all_tokens)
print(f"vocabulary size: {len(vocab)}")
print(f"total tokens: {len(all_tokens)}")

print("\ntop 20 words:")
for word, cnt in vocab.most_common(20):
    print(f"  {word}: {cnt}")

vocabulary size: 30383
total tokens: 1220534

top 20 words:
  model: 20691
  method: 8199
  base: 7636
  llm: 7490
  datum: 7249
  use: 7153
  language: 6199
  propose: 6097
  task: 6042
  performance: 5913
  framework: 5775
  approach: 5550
  dataset: 5393
  large: 5263
  system: 5178
  result: 4907
  demonstrate: 4852
  show: 4549
  across: 4425
  introduce: 4321


In [39]:
# save
processed.to_parquet('data/processed/cleaned_papers.parquet', index=False)

vocab_data = {
    'vocab_size': len(vocab),
    'total_tokens': len(all_tokens),
    'vocabulary': sorted(list(vocab.keys())),
    'frequencies': dict(vocab.most_common(1000))
}
with open('data/processed/vocabulary.json', 'w') as f:
    json.dump(vocab_data, f, indent=2)

print(f"saved to data/processed/cleaned_papers.parquet")
print(f"file size: {os.path.getsize('data/processed/cleaned_papers.parquet')/(1024*1024):.2f} MB")

saved to data/processed/cleaned_papers.parquet
file size: 28.95 MB


---
## Part 3: Embeddings

In [40]:
# load preprocessed data
embed_df = pd.read_parquet('data/processed/cleaned_papers.parquet')

abstracts = embed_df['processed_text'].tolist()
titles = embed_df['title'].tolist()
paper_ids = embed_df['paper_id'].tolist()

print(f"loaded {len(embed_df)} papers for embeddings")

loaded 10000 papers for embeddings


### Word2Vec (baseline)

In [41]:
# train word2vec
# using processed text (already tokenized)
tokenized = [text.split() for text in abstracts if text]

print(f"training word2vec on {len(tokenized)} documents...")
w2v = Word2Vec(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=10
)
print(f"vocab size: {len(w2v.wv)}")

training word2vec on 10000 documents...
vocab size: 18269


In [42]:
# test it
if 'model' in w2v.wv:
    print("similar to 'model':")
    for word, score in w2v.wv.most_similar('model', topn=5):
        print(f"  {word}: {score:.3f}")

similar to 'model':
  llm: 0.483
  classifier: 0.455
  slm: 0.436
  pre: 0.405
  transformer: 0.396


In [43]:
# get document embeddings (average of word vectors)
def get_doc_embedding(text, model):
    words = text.split()
    vecs = [model.wv[w] for w in words if w in model.wv]
    if vecs:
        return np.mean(vecs, axis=0)
    return np.zeros(model.wv.vector_size)

w2v_embeddings = np.array([get_doc_embedding(t, w2v) for t in abstracts])
print(f"w2v embeddings shape: {w2v_embeddings.shape}")

w2v embeddings shape: (10000, 100)


### SBERT

In [44]:
# load SBERT model
# all-MiniLM-L6-v2 is a good balance of speed/quality
print("loading SBERT...")
sbert = SentenceTransformer('all-MiniLM-L6-v2')
print(f"embedding dim: {sbert.get_sentence_embedding_dimension()}")

loading SBERT...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

embedding dim: 384


In [45]:
%%time
# generate embeddings for abstracts
print("generating SBERT embeddings for abstracts...")
sbert_abstract = sbert.encode(
    abstracts,
    show_progress_bar=True,
    batch_size=32
)
print(f"shape: {sbert_abstract.shape}")

generating SBERT embeddings for abstracts...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

shape: (10000, 384)
CPU times: user 16.2 s, sys: 618 ms, total: 16.8 s
Wall time: 7.6 s


In [46]:
%%time
# and for titles
print("generating SBERT embeddings for titles...")
sbert_title = sbert.encode(
    titles,
    show_progress_bar=True,
    batch_size=32
)
print(f"shape: {sbert_title.shape}")

generating SBERT embeddings for titles...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

shape: (10000, 384)
CPU times: user 3.75 s, sys: 337 ms, total: 4.09 s
Wall time: 2.25 s


### SciBERT (for scientific text)

In [47]:
# specter is trained on scientific papers
# should work better for our domain
print("loading SciBERT (specter)...")
scibert = SentenceTransformer('allenai-specter')
print(f"embedding dim: {scibert.get_sentence_embedding_dimension()}")

loading SciBERT (specter)...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/331 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

embedding dim: 768


In [48]:
%%time
# this is slower - larger model
print("generating SciBERT embeddings...")
scibert_embeddings = scibert.encode(
    abstracts,
    show_progress_bar=True,
    batch_size=16  # smaller batch for memory
)
print(f"shape: {scibert_embeddings.shape}")

generating SciBERT embeddings...


Batches:   0%|          | 0/625 [00:00<?, ?it/s]

shape: (10000, 768)
CPU times: user 33.6 s, sys: 894 ms, total: 34.5 s
Wall time: 24.2 s


### Compare embeddings

In [49]:
from sklearn.metrics.pairwise import cosine_similarity

# compare on first 10 papers
n = min(10, len(abstracts))

print("Embedding comparison (avg pairwise similarity):")
print("="*50)

# word2vec
w2v_sim = cosine_similarity(w2v_embeddings[:n])
w2v_avg = np.mean(w2v_sim[np.triu_indices_from(w2v_sim, k=1)])
print(f"Word2Vec (dim={w2v_embeddings.shape[1]}): {w2v_avg:.3f}")

# sbert
sbert_sim = cosine_similarity(sbert_abstract[:n])
sbert_avg = np.mean(sbert_sim[np.triu_indices_from(sbert_sim, k=1)])
print(f"SBERT (dim={sbert_abstract.shape[1]}): {sbert_avg:.3f}")

# scibert
sci_sim = cosine_similarity(scibert_embeddings[:n])
sci_avg = np.mean(sci_sim[np.triu_indices_from(sci_sim, k=1)])
print(f"SciBERT (dim={scibert_embeddings.shape[1]}): {sci_avg:.3f}")

Embedding comparison (avg pairwise similarity):
Word2Vec (dim=100): 0.471
SBERT (dim=384): 0.356
SciBERT (dim=768): 0.744


In [50]:
# quick similarity test
# find most similar paper to first one
query_idx = 0
print(f"Query: {titles[query_idx][:80]}...")
print()

# using SBERT
sims = cosine_similarity([sbert_abstract[query_idx]], sbert_abstract)[0]
top_idx = np.argsort(sims)[::-1][1:4]  # skip self

print("Most similar (SBERT):")
for i in top_idx:
    print(f"  [{sims[i]:.3f}] {titles[i][:70]}...")

Query: scipy.spatial.transform: Differentiable Framework-Agnostic 3D Transformations in...

Most similar (SBERT):
  [0.621] Context-aware Learned Mesh-based Simulation via Trajectory-Level Meta-...
  [0.590] $\mathcal{E}_0$: Enhancing Generalization and Fine-Grained Control in ...
  [0.588] URDF-Anything: Constructing Articulated Objects with 3D Multimodal Lan...


### Save embeddings

In [51]:
# save all embeddings
print("saving embeddings...")

np.save('data/embeddings/word2vec_embeddings.npy', w2v_embeddings)
np.save('data/embeddings/sbert_abstract_embeddings.npy', sbert_abstract)
np.save('data/embeddings/sbert_title_embeddings.npy', sbert_title)
np.save('data/embeddings/scibert_embeddings.npy', scibert_embeddings)

print(f"  word2vec: {w2v_embeddings.shape}")
print(f"  sbert_abstract: {sbert_abstract.shape}")
print(f"  sbert_title: {sbert_title.shape}")
print(f"  scibert: {scibert_embeddings.shape}")

saving embeddings...
  word2vec: (10000, 100)
  sbert_abstract: (10000, 384)
  sbert_title: (10000, 384)
  scibert: (10000, 768)


In [52]:
# save paper index for later retrieval
paper_index = {
    'paper_ids': paper_ids,
    'titles': titles,
    'n_papers': len(paper_ids),
    'methods': ['word2vec', 'sbert_abstract', 'sbert_title', 'scibert'],
    'dims': {
        'word2vec': w2v_embeddings.shape[1],
        'sbert': sbert_abstract.shape[1],
        'scibert': scibert_embeddings.shape[1]
    }
}

with open('data/embeddings/paper_index.pkl', 'wb') as f:
    pickle.dump(paper_index, f)

print(f"saved paper index ({len(paper_ids)} papers)")

saved paper index (10000 papers)


In [53]:
# also save metadata
meta = {
    'created': datetime.now().isoformat(),
    'n_papers': len(paper_ids),
    'embeddings': {
        'word2vec': {
            'file': 'word2vec_embeddings.npy',
            'shape': list(w2v_embeddings.shape),
            'model': 'gensim Word2Vec'
        },
        'sbert_abstract': {
            'file': 'sbert_abstract_embeddings.npy',
            'shape': list(sbert_abstract.shape),
            'model': 'all-MiniLM-L6-v2'
        },
        'sbert_title': {
            'file': 'sbert_title_embeddings.npy',
            'shape': list(sbert_title.shape),
            'model': 'all-MiniLM-L6-v2'
        },
        'scibert': {
            'file': 'scibert_embeddings.npy',
            'shape': list(scibert_embeddings.shape),
            'model': 'allenai-specter'
        }
    }
}

with open('data/embeddings/metadata.json', 'w') as f:
    json.dump(meta, f, indent=2)

print("saved metadata")

saved metadata


---
## Summary

In [54]:
print("="*60)
print("DAY 1 PIPELINE COMPLETE")
print("="*60)

print("\nData Ingestion:")
print(f"  ArXiv: {len(arxiv_df)} papers")
print(f"  ACL: {len(acl_df)} papers")
print(f"  S2ORC: {len(s2_df)} papers")
print(f"  Final dataset: {len(clean)} papers")

print("\nPreprocessing:")
print(f"  Processed: {len(processed)} papers")
print(f"  Vocab size: {len(vocab)}")
print(f"  Avg tokens/paper: {processed['n_tokens'].mean():.1f}")

print("\nEmbeddings:")
print(f"  Word2Vec: {w2v_embeddings.shape}")
print(f"  SBERT abstract: {sbert_abstract.shape}")
print(f"  SBERT title: {sbert_title.shape}")
print(f"  SciBERT: {scibert_embeddings.shape}")

print("\nOutput files:")
files = [
    'data/raw/complete_dataset.parquet',
    'data/processed/cleaned_papers.parquet',
    'data/processed/vocabulary.json',
    'data/embeddings/word2vec_embeddings.npy',
    'data/embeddings/sbert_abstract_embeddings.npy',
    'data/embeddings/sbert_title_embeddings.npy',
    'data/embeddings/scibert_embeddings.npy',
    'data/embeddings/paper_index.pkl'
]
total_size = 0
for f in files:
    if os.path.exists(f):
        sz = os.path.getsize(f) / (1024*1024)
        total_size += sz
        print(f"  {f} ({sz:.2f} MB)")

print(f"\nTotal size: {total_size:.2f} MB")
print("="*60)

DAY 1 PIPELINE COMPLETE

Data Ingestion:
  ArXiv: 25000 papers
  ACL: 108888 papers
  S2ORC: 444 papers
  Final dataset: 21422 papers

Preprocessing:
  Processed: 10000 papers
  Vocab size: 30383
  Avg tokens/paper: 122.1

Embeddings:
  Word2Vec: (10000, 100)
  SBERT abstract: (10000, 384)
  SBERT title: (10000, 384)
  SciBERT: (10000, 768)

Output files:
  data/raw/complete_dataset.parquet (17.15 MB)
  data/processed/cleaned_papers.parquet (28.95 MB)
  data/processed/vocabulary.json (0.47 MB)
  data/embeddings/word2vec_embeddings.npy (3.81 MB)
  data/embeddings/sbert_abstract_embeddings.npy (14.65 MB)
  data/embeddings/sbert_title_embeddings.npy (14.65 MB)
  data/embeddings/scibert_embeddings.npy (29.30 MB)
  data/embeddings/paper_index.pkl (1.00 MB)

Total size: 109.99 MB


In [55]:
# copy to drive for persistence
# uncomment if you want to save to drive

# !cp -r data/ /content/drive/MyDrive/scholarly-topic-navigator/
# print("copied to drive")