# Complete Pipeline: Data Ingestion → Preprocessing → Embeddings

**Day 1 Deliverables - Full Implementation**

This notebook implements:

## ✅ Part 1: Data Ingestion & Normalization
- ArXiv, ACL Anthology, S2ORC ingestion
- Metadata normalization (paper_id, title, authors, abstract, venue, year, categories)
- **Dataset validation** (missing abstracts, duplicates, invalid fields)
- Output: `data/raw/complete_dataset.parquet` (relative to the notebook's working directory)

## ✅ Part 2: Full Preprocessing Pipeline
- Text cleaning, lowercasing, regex fixes
- Tokenization, lemmatization, stopword removal
- **Language detection**
- **Sentence segmentation**
- Output: `data/processed/cleaned_papers.parquet` (relative to the notebook's working directory)

## ✅ Part 3: Embeddings Pipeline
- Compare Word2Vec, SBERT, and SciBERT on sample
- Implement SBERT embedding generator
- Generate abstract/title embeddings
- Output: `data/embeddings/*.npy`, `data/embeddings/paper_index.pkl` (relative to the notebook's working directory)

**⏱️ Total Runtime:** ~4-5 hours  
**💾 Storage Required:** ~1-2GB

## Setup & Installation

In [None]:
# install all required packages
# One-time environment setup; each line shells out and needs network access.
# NOTE(review): no versions are pinned, so a later run may resolve different
# releases than the ones this notebook was developed against — consider pinning.
# data ingestion + parquet I/O
!pip install pandas arxiv requests beautifulsoup4 lxml pyarrow -q
# text preprocessing
!pip install spacy nltk tqdm langdetect -q
# embedding models + similarity metrics
!pip install gensim sentence-transformers torch scikit-learn -q
# English spaCy pipeline; loaded further down with spacy.load('en_core_web_sm')
!python -m spacy download en_core_web_sm

In [None]:
# imports
import os
import re
import json
import time
import pickle
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from datetime import datetime
from collections import Counter
from tqdm import tqdm

# data ingestion
import arxiv
import requests
from bs4 import BeautifulSoup

# NLP
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from langdetect import detect, LangDetectException

# embeddings
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
import torch

# colab
from google.colab import drive

print("‚úì All imports successful!")

In [None]:
# mount drive
# Mounts Google Drive at /content/drive (requires a Colab runtime).
# NOTE(review): the cells visible in this notebook write to the relative
# `data/` directory and never reference /content/drive — confirm whether the
# mount is still needed or whether outputs were meant to be stored on Drive.
drive.mount('/content/drive')

In [None]:
# make sure every output folder the pipeline writes to exists
for folder in ('data/raw', 'data/processed', 'data/embeddings'):
    os.makedirs(folder, exist_ok=True)

print("‚úì Directory structure created")

In [None]:
# download NLTK data
# `sent_tokenize` looks up the 'punkt_tab' resource on newer NLTK releases and
# 'punkt' on older ones. Both are requested so sentence segmentation works on
# either; otherwise the lookup fails and segment_sentences() silently falls
# back to returning the whole abstract as a single sentence.
# NOTE(review): on an NLTK release that does not know 'punkt_tab' the download
# is expected to report failure rather than raise — confirm on the target version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# load spacy
# `nlp` and `stop_words` are module-level globals used by the preprocessing helpers.
nlp = spacy.load('en_core_web_sm')
stop_words = set(stopwords.words('english'))

print("‚úì NLP models loaded")

---
# Part 1: Data Ingestion & Normalization

## 1.1 ArXiv Ingestion

In [None]:
def fetch_arxiv_papers(categories, max_results_per_category=300):
    """Fetch paper metadata from the arXiv API for several categories.

    Args:
        categories: iterable of arXiv category codes (e.g. ``'cs.CL'``); one
            search is issued per category.
        max_results_per_category: maximum number of results requested for
            each category search.

    Returns:
        list[dict]: one record per distinct paper with the keys ``paper_id``,
        ``title``, ``authors``, ``abstract``, ``categories``, ``venue``,
        ``year``, ``published`` and ``pdf_url``.

    A paper that is cross-listed in several of the requested categories is
    returned by each of those searches; it is kept only the first time it is
    seen, so the result contains no repeated ``paper_id``.
    """
    client = arxiv.Client()
    papers = []
    seen_ids = set()  # ids already collected from an earlier category

    for cat in categories:
        print(f"Fetching {cat}...", end=' ')
        search = arxiv.Search(
            query=f'cat:{cat}',
            max_results=max_results_per_category,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        count = 0  # papers newly added for this category
        for result in client.results(search):
            # entry_id is a URL; its last path segment is the arXiv identifier
            paper_id = result.entry_id.split('/')[-1]
            if paper_id in seen_ids:
                continue
            seen_ids.add(paper_id)

            papers.append({
                'paper_id': paper_id,
                'title': result.title,
                'authors': [author.name for author in result.authors],
                # abstracts arrive hard-wrapped; flatten them to a single line
                'abstract': result.summary.replace('\n', ' '),
                'categories': result.categories,
                'venue': 'arXiv',
                'year': result.published.year,
                'published': result.published.isoformat(),
                'pdf_url': result.pdf_url
            })
            count += 1

        print(f"{count} papers")

    return papers

# run the arXiv ingestion for the three target categories and tabulate it
print("ArXiv Ingestion:\n")
arxiv_categories = ['cs.CL', 'cs.LG', 'stat.ML']
arxiv_papers = fetch_arxiv_papers(arxiv_categories, max_results_per_category=300)
arxiv_df = pd.DataFrame(arxiv_papers)
print(f"\n‚úì ArXiv: {len(arxiv_df)} papers")

## 1.2 ACL Anthology Ingestion

In [None]:
# download ACL
print("Downloading ACL Anthology...")
!wget https://aclanthology.org/anthology.bib.gz -O data/raw/acl.bib.gz -q
!gunzip -f data/raw/acl.bib.gz
print("‚úì Downloaded")

In [None]:
def parse_bibtex(filepath):
    """Parse a BibTeX file into a list of ``{field: value}`` dicts.

    This is a lightweight line-oriented reader, not a full BibTeX grammar.
    It expects each entry to start on a line beginning with ``@`` and each
    ``field = value`` pair to start on its own line.

    Args:
        filepath: path of the ``.bib`` file. It is read as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        list[dict]: one dict per entry that has a ``title`` field. Every dict
        has ``paper_id`` (the citation key) plus the entry's fields as
        strings, with the outer quotes/braces and trailing comma removed.

    Values that continue over several lines (typically ``author`` lists and
    abstracts) are joined with single spaces instead of being cut off after
    the first line. An entry that is still open when the file ends is kept.
    """

    def _value_complete(raw):
        """Return True once the closing delimiter of a field value was read."""
        body = raw.strip().rstrip(',').rstrip()
        if body.startswith('"'):
            return len(body) > 1 and body.endswith('"')
        if body.startswith('{'):
            return body.count('{') <= body.count('}')
        # bare number or macro, e.g. `year = 2020` or `month = jul`
        return True

    def _clean(raw):
        """Strip the trailing comma and the outer braces/quotes of a value."""
        return raw.strip().strip(',').strip('{}').strip('"').strip()

    papers = []
    current = {}
    open_field = None   # field whose value has not been closed yet
    open_chunks = []    # stripped lines collected for that value so far

    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip()

            # Continuation of a multi-line value. A new entry header always
            # ends it, so one malformed value cannot swallow the whole file.
            if open_field is not None and not line.startswith('@'):
                open_chunks.append(line)
                joined = ' '.join(chunk for chunk in open_chunks if chunk)
                if _value_complete(joined):
                    current[open_field] = _clean(joined)
                    open_field, open_chunks = None, []
                continue

            if line.startswith('@'):
                if open_field is not None:
                    # value never closed: keep what was read instead of losing it
                    current[open_field] = _clean(' '.join(open_chunks))
                    open_field, open_chunks = None, []
                if current and 'title' in current:
                    papers.append(current)
                # always start from a clean slate so a header that cannot be
                # parsed does not leave the previous entry in place
                current = {}
                parts = line[1:].split('{')
                if len(parts) == 2:
                    current = {'paper_id': parts[1].rstrip(',')}

            elif line.startswith('}'):
                if current and 'title' in current:
                    papers.append(current)
                current = {}

            elif '=' in line and current:
                field, _, raw = line.partition('=')
                field = field.strip()
                if _value_complete(raw):
                    current[field] = _clean(raw)
                else:
                    open_field, open_chunks = field, [raw.strip()]

    # the file ended while a value and/or an entry was still open
    if open_field is not None:
        current[open_field] = _clean(' '.join(open_chunks))
    if current and 'title' in current:
        papers.append(current)

    return papers

# parse and normalize
print("\nParsing ACL BibTeX...")
acl_papers = parse_bibtex('data/raw/acl.bib')

acl_normalized = []
for p in acl_papers:
    # BibTeX separates authors with the word "and". Accept any whitespace
    # around it (author lists are usually wrapped across lines), tolerate a
    # dangling trailing "and", and drop empty fragments.
    authors = [
        name.strip()
        for name in re.split(r'\s+and(?:\s+|$)', p.get('author', ''))
        if name.strip()
    ]
    try:
        year = int(p.get('year', 0))
    except (TypeError, ValueError):
        # non-numeric year: recorded as missing, removed by the 2015+ filter
        year = None

    acl_normalized.append({
        'paper_id': p.get('paper_id', ''),
        'title': p.get('title', ''),
        'authors': authors,
        'abstract': p.get('abstract', ''),
        # proceedings papers carry `booktitle`, journal papers `journal`
        'venue': p.get('booktitle', p.get('journal', 'ACL')),
        'year': year,
        'url': p.get('url', '')
    })

# Explicit columns keep the filter below working when nothing was parsed
# (an empty record list would otherwise yield a frame without columns).
acl_columns = ['paper_id', 'title', 'authors', 'abstract', 'venue', 'year', 'url']
acl_df = pd.DataFrame(acl_normalized, columns=acl_columns)
# to_numeric turns missing years into NaN, which compares False against 2015
acl_years = pd.to_numeric(acl_df['year'], errors='coerce')
acl_df = acl_df[(acl_df['title'].str.len() > 0) & (acl_years >= 2015)]
print(f"‚úì ACL: {len(acl_df)} papers (2015+)")

## 1.3 S2ORC Ingestion

In [None]:
def search_s2(query, limit=100):
    """Run one keyword search against the Semantic Scholar Graph API.

    Args:
        query: free-text search string.
        limit: number of results wanted; capped at 100 before the request
            is sent.

    Returns:
        list[dict]: the ``data`` array of the JSON response. An empty list is
        returned — after printing a short warning — when the request fails,
        the server answers with a non-200 status, or the body is not the
        expected JSON object, so a failed query is distinguishable in the
        notebook output from a query that genuinely matched nothing.
    """
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {
        'query': query,
        'limit': min(limit, 100),
        'fields': 'paperId,title,abstract,authors,year,venue,citationCount,fieldsOfStudy'
    }

    try:
        r = requests.get(url, params=params, timeout=10)
    except requests.RequestException as exc:
        print(f"[warn] Semantic Scholar request failed: {exc}")
        return []

    if r.status_code != 200:
        print(f"[warn] Semantic Scholar returned HTTP {r.status_code}")
        return []

    try:
        payload = r.json()
    except ValueError:
        print("[warn] Semantic Scholar response was not valid JSON")
        return []

    if not isinstance(payload, dict):
        return []
    # `data` may be absent or null when there are no matches
    return payload.get('data') or []

# fetch
print("\nS2ORC Ingestion:")
queries = ['natural language processing', 'transformers bert', 'machine translation',
           'sentiment analysis', 'named entity recognition', 'question answering']

all_s2 = []
for q in queries:
    print(f"  {q}...", end=' ')
    papers = search_s2(q, 100)
    all_s2.extend(papers)
    print(f"{len(papers)} papers")
    time.sleep(1)  # pause between consecutive API calls

# dedupe and normalize
# The same paper can match several queries; keep the first occurrence per paperId.
# Fields may come back as explicit nulls, in which case `.get(key, default)`
# returns None instead of the default — hence the `or` fallbacks.
seen = set()
s2_normalized = []
for p in all_s2:
    pid = p.get('paperId')
    if pid and pid not in seen:
        seen.add(pid)
        authors = [(a.get('name') or '') for a in (p.get('authors') or [])]
        s2_normalized.append({
            'paper_id': pid,
            'title': p.get('title') or '',
            'authors': authors,
            'abstract': p.get('abstract') or '',
            'venue': p.get('venue') or '',
            'year': p.get('year'),
            'citation_count': p.get('citationCount') or 0,
            'categories': p.get('fieldsOfStudy') or []
        })

# Explicit columns keep the filter below working when every query came back
# empty (an empty record list would otherwise yield a frame without columns).
s2_columns = ['paper_id', 'title', 'authors', 'abstract', 'venue', 'year',
              'citation_count', 'categories']
s2_df = pd.DataFrame(s2_normalized, columns=s2_columns)
s2_df = s2_df[(s2_df['title'].str.len() > 0) & (s2_df['abstract'].str.len() > 0)]
print(f"\n‚úì S2ORC: {len(s2_df)} papers")

## 1.4 Normalization to Unified Schema

In [None]:
# normalize to unified schema
def normalize_to_schema(df, source):
    """Project a source-specific frame onto the shared column layout.

    Args:
        df: frame with at least ``paper_id``, ``title``, ``authors``,
            ``abstract``, ``venue`` and ``year``; ``categories`` is optional.
        source: short source tag; stored in the ``source`` column and
            prefixed to every ``paper_id``.

    Returns:
        A new DataFrame with the columns ``paper_id``, ``title``, ``authors``,
        ``abstract``, ``venue``, ``year``, ``categories`` and ``source``.
    """
    # ids become "<source>_<id>", with any '/' in the id replaced by '_'
    safe_ids = df['paper_id'].astype(str).str.replace('/', '_')

    # sources without a categories column get one empty list per row
    if 'categories' in df:
        categories = df['categories']
    else:
        categories = [[]] * len(df)

    columns = {
        'paper_id': source + '_' + safe_ids,
        'title': df['title'],
        'authors': df['authors'],
        'abstract': df['abstract'].fillna(''),
        'venue': df['venue'].fillna(''),
        'year': df['year'],
        'categories': categories,
        'source': source,
    }
    return pd.DataFrame(columns)

print("\nNormalizing datasets...")
# bring each source onto the shared schema, tagging rows with their origin
arxiv_norm, acl_norm, s2_norm = (
    normalize_to_schema(frame, tag)
    for frame, tag in ((arxiv_df, 'arxiv'), (acl_df, 'acl'), (s2_df, 's2orc'))
)

# stack the three sources into one frame with a fresh 0..n-1 index
normalized_frames = [arxiv_norm, acl_norm, s2_norm]
combined_df = pd.concat(normalized_frames, ignore_index=True)
print(f"‚úì Combined: {len(combined_df)} papers")

## 1.5 Dataset Validation

**Comprehensive validation checks for data quality.**

In [None]:
print("\n" + "="*70)
print("DATASET VALIDATION REPORT")
print("="*70)

# Latest publication year treated as valid. Taken from the clock instead of
# being hard-coded, so newly submitted papers are not reported as invalid
# once the calendar moves past a fixed cutoff.
max_valid_year = datetime.now().year
# Numeric view of the year column; missing or unparseable values become NaN.
years = pd.to_numeric(combined_df['year'], errors='coerce')

# 1. Missing abstracts
missing_abstracts = combined_df['abstract'].str.len() == 0
print(f"\n1. Missing Abstracts:")
print(f"   Count: {missing_abstracts.sum()} papers")
print(f"   Percentage: {missing_abstracts.sum()/len(combined_df)*100:.1f}%")
print(f"   By source:")
for src in combined_df['source'].unique():
    src_missing = combined_df[combined_df['source']==src]['abstract'].str.len() == 0
    print(f"     {src}: {src_missing.sum()} ({src_missing.sum()/len(combined_df[combined_df['source']==src])*100:.1f}%)")

# 2. Missing titles
missing_titles = combined_df['title'].str.len() == 0
print(f"\n2. Missing Titles: {missing_titles.sum()} papers")

# 3. Missing authors
missing_authors = combined_df['authors'].apply(len) == 0
print(f"\n3. Missing Authors:")
print(f"   Count: {missing_authors.sum()} papers")
print(f"   Percentage: {missing_authors.sum()/len(combined_df)*100:.1f}%")

# 4. Invalid years
invalid_years = years.isna() | (years < 1990) | (years > max_valid_year)
print(f"\n4. Invalid Years: {invalid_years.sum()} papers")
if invalid_years.sum() > 0:
    print(f"   Year range: {years[invalid_years].min()} - {years[invalid_years].max()}")

# 5. Duplicate titles
# keep=False flags every member of a duplicate group, not only the repeats
duplicates = combined_df.duplicated(subset=['title'], keep=False)
# A title can occur more than twice, so count distinct titles rather than
# halving the number of flagged rows.
duplicated_title_count = combined_df.loc[duplicates, 'title'].nunique()
print(f"\n5. Duplicate Titles:")
print(f"   Total duplicate entries: {duplicates.sum()} papers")
print(f"   Unique duplicated titles: {duplicated_title_count} titles")
if duplicates.sum() > 0:
    print(f"   Example duplicates:")
    dup_titles = combined_df[duplicates].groupby('title')['source'].apply(list).head(3)
    for title, sources in dup_titles.items():
        print(f"     '{title[:60]}...' in sources: {sources}")

# 6. Short abstracts
short_abstracts = (combined_df['abstract'].str.len() > 0) & (combined_df['abstract'].str.len() < 50)
print(f"\n6. Short Abstracts (<50 chars): {short_abstracts.sum()} papers")

# 7. Missing venue
missing_venue = combined_df['venue'].str.len() == 0
print(f"\n7. Missing Venue: {missing_venue.sum()} papers ({missing_venue.sum()/len(combined_df)*100:.1f}%)")

# 8. Missing categories
missing_categories = combined_df['categories'].apply(lambda x: len(x) if isinstance(x, list) else 0) == 0
print(f"\n8. Missing Categories: {missing_categories.sum()} papers ({missing_categories.sum()/len(combined_df)*100:.1f}%)")

print("\n" + "="*70)
print("VALIDATION SUMMARY")
print("="*70)
# A paper can fail several checks at once. Combine the per-check masks with OR
# so each paper is counted once; summing the individual counts would
# double-count and could make the "clean" figure negative.
# (Missing venue/categories are reported above but not treated as issues here.)
any_issue = (missing_abstracts | missing_titles |
             missing_authors | invalid_years |
             duplicates | short_abstracts)
total_issues = int(any_issue.sum())
print(f"Total papers with issues: {total_issues}")
print(f"Clean papers: {len(combined_df) - total_issues}")
print("="*70)

## 1.6 Data Cleaning & Deduplication

In [None]:
print("\nCleaning dataset based on validation...")

# Accept publication years from 1990 up to the current calendar year. The
# upper bound comes from the clock so recent papers are not dropped once the
# calendar moves past a hard-coded cutoff.
max_valid_year = datetime.now().year
# Numeric view of the year column; missing values become NaN and fail both
# comparisons below.
years = pd.to_numeric(combined_df['year'], errors='coerce')

# filter out invalid entries
cleaned_df = combined_df[
    (combined_df['title'].str.len() > 10) &
    (combined_df['abstract'].str.len() >= 50) &
    (combined_df['authors'].apply(len) > 0) &
    (years >= 1990) &
    (years <= max_valid_year)
].copy()

# remember the size after filtering so the dedup step can report its own effect
n_after_filter = len(cleaned_df)
print(f"  After filtering: {n_after_filter} papers")
print(f"  Removed: {len(combined_df) - n_after_filter} papers")

# deduplicate (priority: acl > s2orc > arxiv)
# Sorting by priority and keeping the first row per title keeps the copy from
# the preferred source. A stable sort keeps the outcome deterministic when two
# rows from the same source share a title.
source_priority = {'acl': 1, 's2orc': 2, 'arxiv': 3}
cleaned_df['_priority'] = cleaned_df['source'].map(source_priority)
cleaned_df = (
    cleaned_df
    .sort_values('_priority', kind='mergesort')
    .drop_duplicates(subset=['title'], keep='first')
)
cleaned_df = cleaned_df.drop('_priority', axis=1)

print(f"  After deduplication: {len(cleaned_df)} papers")
print(f"  Duplicates removed: {n_after_filter - len(cleaned_df)}")

# add computed fields
cleaned_df['title_length'] = cleaned_df['title'].str.len()
cleaned_df['abstract_length'] = cleaned_df['abstract'].str.len()
cleaned_df['num_authors'] = cleaned_df['authors'].apply(len)

print("\n‚úì Data cleaning complete")

In [None]:
# write the cleaned corpus to parquet and report its size on disk
complete_dataset_path = 'data/raw/complete_dataset.parquet'
cleaned_df.to_parquet(complete_dataset_path, index=False)
complete_dataset_mb = os.path.getsize(complete_dataset_path) / (1024*1024)

print(f"\n‚úì Saved {len(cleaned_df)} papers to data/raw/complete_dataset.parquet")
print(f"  File size: {complete_dataset_mb:.2f} MB")

# how many surviving papers each source contributed
print("\nSource distribution:")
print(cleaned_df['source'].value_counts())

---
# Part 2: Full Preprocessing Pipeline

## 2.1 Define Preprocessing Functions

In [None]:
def clean_text(text):
    """Strip URLs and e-mail addresses, then collapse runs of whitespace.

    Returns an empty string for anything that is not a ``str``.
    """
    if isinstance(text, str):
        # links first, so an address embedded in a URL disappears with it
        without_urls = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        without_emails = re.sub(r'\S+@\S+', '', without_urls)
        # every whitespace run (including newlines) becomes one space
        return re.sub(r'\s+', ' ', without_emails).strip()
    return ""

def remove_special_chars(text):
    """Keep ASCII letters, digits, whitespace and ``. , ! ? -`` only.

    Consecutive repeats of the same punctuation mark are reduced to one.
    """
    allowed_only = re.sub(r'[^a-zA-Z0-9\s.,!?-]', '', text)
    return re.sub(r'([.,!?-])\1+', r'\1', allowed_only)

def detect_language(text):
    """Return the language code langdetect reports for ``text``.

    Returns ``'unknown'`` when langdetect raises ``LangDetectException``
    (for instance when the text has no usable features).

    langdetect's algorithm is randomised, so short or ambiguous texts can be
    labelled differently from run to run. Fixing the detector seed before each
    call makes the reported language reproducible.
    """
    # local import: only this function needs the factory
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'

def segment_sentences(text):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    Best effort: if the tokenizer fails (for example because its data files
    are not installed) the whole text is returned as a single-element list.
    Only ``Exception`` subclasses are absorbed, so interrupting the kernel
    (``KeyboardInterrupt``) during a long preprocessing run still works.
    """
    try:
        return sent_tokenize(text)
    except Exception:
        return [text]

def tokenize_text(text):
    """Run ``text`` through the global spaCy pipeline and return token strings."""
    pieces = []
    for tok in nlp(text):
        pieces.append(tok.text)
    return pieces

def remove_stopwords(tokens):
    """Drop tokens whose lowercase form is in the global ``stop_words`` set."""
    kept = []
    for tok in tokens:
        if tok.lower() in stop_words:
            continue
        kept.append(tok)
    return kept

def lemmatize_tokens(tokens):
    """Lemmatize a token list by re-parsing the space-joined tokens with spaCy.

    The tokens are joined and run through the global ``nlp`` pipeline again,
    so the number of lemmas returned follows spaCy's own tokenisation of the
    joined string.
    """
    joined = ' '.join(tokens)
    lemmas = []
    for tok in nlp(joined):
        lemmas.append(tok.lemma_)
    return lemmas

print("‚úì Preprocessing functions defined")

## 2.2 Complete Preprocessing Pipeline

In [None]:
def preprocess_paper(text, lowercase=True, remove_stops=True, lemmatize=True):
    """Run one abstract through the whole preprocessing chain.

    Args:
        text: raw abstract.
        lowercase: lowercase the cleaned text before tokenising.
        remove_stops: drop stopwords from the token list.
        lemmatize: replace tokens by their lemmas.

    Returns:
        dict with the keys ``cleaned_text``, ``language``, ``sentences``,
        ``num_sentences``, ``tokens`` and ``processed_text``. Empty or
        non-string input yields empty values and language ``'unknown'``.
    """
    # nothing to process: hand back an empty record with the same keys
    if not (isinstance(text, str) and len(text) > 0):
        return dict(
            cleaned_text='',
            language='unknown',
            sentences=[],
            num_sentences=0,
            tokens=[],
            processed_text='',
        )

    # the language is detected on the raw input, before any cleaning
    language = detect_language(text)

    stripped = remove_special_chars(clean_text(text))

    # sentences are split before lowercasing, so they keep their original case
    sentences = segment_sentences(stripped)

    cleaned_text = stripped.lower() if lowercase else stripped

    tokens = tokenize_text(cleaned_text)
    if remove_stops:
        tokens = remove_stopwords(tokens)
    if lemmatize:
        tokens = lemmatize_tokens(tokens)

    # keep only alphanumeric tokens that are longer than two characters
    kept = []
    for tok in tokens:
        if len(tok) > 2 and tok.isalnum():
            kept.append(tok)

    return {
        'cleaned_text': cleaned_text,
        'language': language,
        'sentences': sentences,
        'num_sentences': len(sentences),
        'tokens': kept,
        'processed_text': ' '.join(kept),
    }

print("‚úì Complete preprocessing pipeline defined")

## 2.3 Apply Preprocessing to Dataset

In [None]:
# read back the dataset written at the end of Part 1
df = pd.read_parquet('data/raw/complete_dataset.parquet')

# work on a fixed-seed random subset (or on everything, if the data is small)
sample_size = 500  # change to len(df) for full dataset
needs_sampling = len(df) > sample_size
sample_df = (df.sample(n=sample_size, random_state=42) if needs_sampling else df).copy()

print(f"‚úì Processing {len(sample_df)} papers")

In [None]:
# run every sampled abstract through the preprocessing pipeline
print(f"\nPreprocessing {len(sample_df)} papers...\n")

# keys copied from the pipeline result and from the source row, in output order
result_keys = ('cleaned_text', 'language', 'sentences', 'num_sentences',
               'tokens', 'processed_text')
passthrough_keys = ('source', 'year', 'venue')

processed_data = []

for idx, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    result = preprocess_paper(row['abstract'])

    # identifying metadata first, then pipeline outputs, then remaining metadata
    record = {
        'paper_id': row['paper_id'],
        'title': row['title'],
        'authors': row['authors'],
        'original_abstract': row['abstract'],
    }
    for key in result_keys:
        record[key] = result[key]
    record['num_tokens'] = len(result['tokens'])
    for key in passthrough_keys:
        record[key] = row[key]

    processed_data.append(record)

processed_df = pd.DataFrame(processed_data)

print(f"\n‚úì Preprocessing complete!")

## 2.4 Preprocessing Statistics

In [None]:
print("\n" + "="*60)
print("PREPROCESSING STATISTICS")
print("="*60)

# per-paper token counts
token_stats = processed_df['num_tokens']
print(f"\nToken Statistics:")
print(f"  Average tokens per paper: {token_stats.mean():.1f}")
print(f"  Min tokens: {token_stats.min()}")
print(f"  Max tokens: {token_stats.max()}")
print(f"  Median tokens: {token_stats.median():.1f}")

# per-paper sentence counts
sentence_stats = processed_df['num_sentences']
print(f"\nSentence Statistics:")
print(f"  Average sentences per paper: {sentence_stats.mean():.1f}")
print(f"  Min sentences: {sentence_stats.min()}")
print(f"  Max sentences: {sentence_stats.max()}")

print(f"\nLanguage Distribution:")
print(processed_df['language'].value_counts().head(10))

# vocabulary: flatten the per-paper token lists and count occurrences
all_tokens = [tok for paper_tokens in processed_df['tokens'] for tok in paper_tokens]

token_counts = Counter(all_tokens)
print(f"\nVocabulary:")
print(f"  Vocabulary size: {len(token_counts):,}")
print(f"  Total tokens: {len(all_tokens):,}")

print(f"\nTop 20 most common tokens:")
for token, count in token_counts.most_common(20):
    print(f"  {token}: {count}")

print("\n" + "="*60)

In [None]:
# persist the preprocessed papers
cleaned_papers_path = 'data/processed/cleaned_papers.parquet'
processed_df.to_parquet(cleaned_papers_path, index=False)

# vocabulary summary: sizes, the sorted vocabulary, and the 1000 most frequent tokens
vocab_data = dict(
    vocab_size=len(token_counts),
    total_tokens=len(all_tokens),
    vocabulary=sorted(list(token_counts.keys())),
    token_frequencies=dict(token_counts.most_common(1000)),
)

with open('data/processed/vocabulary.json', 'w') as f:
    json.dump(vocab_data, f, indent=2)

cleaned_papers_mb = os.path.getsize(cleaned_papers_path) / (1024*1024)
print(f"\n‚úì Saved cleaned dataset to data/processed/cleaned_papers.parquet")
print(f"  File size: {cleaned_papers_mb:.2f} MB")
print(f"‚úì Saved vocabulary to data/processed/vocabulary.json")

---
# Part 3: Embeddings Pipeline

## 3.1 Prepare Data for Embeddings

In [None]:
# read back the preprocessed papers written at the end of Part 2
embed_df = pd.read_parquet('data/processed/cleaned_papers.parquet')

# parallel lists, all in the row order of embed_df
# NOTE(review): `abstracts` holds the lowercased, stopword-stripped, lemmatized
# `processed_text`. That suits Word2Vec, but the same list is also passed to the
# sentence-transformer models further down — confirm whether those should
# receive `cleaned_text` / `original_abstract` instead.
abstracts, titles, paper_ids = (
    embed_df[column].tolist()
    for column in ('processed_text', 'title', 'paper_id')
)

print(f"‚úì Loaded {len(embed_df)} papers for embedding generation")

## 3.2 Word2Vec Embeddings (Baseline)

In [None]:
print("\nTraining Word2Vec model...")

# one token list per non-empty processed abstract
tokenized_abstracts = [doc.split() for doc in filter(None, abstracts)]

# training hyper-parameters, kept together for readability
w2v_params = dict(
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=10,
)
w2v_model = Word2Vec(sentences=tokenized_abstracts, **w2v_params)

print(f"‚úì Word2Vec trained")
print(f"  Vocabulary size: {len(w2v_model.wv)}")
print(f"  Vector size: {w2v_model.wv.vector_size}")

# generate document embeddings (average of word vectors)
def get_w2v_embedding(text, model):
    """Embed a document as the mean of its in-vocabulary word vectors.

    Args:
        text: whitespace-separated tokens.
        model: trained Word2Vec model; only ``model.wv`` is used.

    Returns:
        1-D array of length ``model.wv.vector_size``. When none of the words
        is in the vocabulary a zero vector is returned, created with the same
        dtype as the model's vectors so that stacking all document embeddings
        does not silently upcast the whole matrix.
    """
    vectors = [model.wv[word] for word in text.split() if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(model.wv.vector_size, dtype=model.wv.vectors.dtype)

# one averaged vector per abstract, stacked in the same order as `abstracts`
doc_vectors = [get_w2v_embedding(doc, w2v_model) for doc in abstracts]
w2v_embeddings = np.array(doc_vectors)

print(f"‚úì Generated Word2Vec embeddings: {w2v_embeddings.shape}")

## 3.3 SBERT Embeddings (Sentence-BERT)

In [None]:
print("\nLoading SBERT model...")

# pre-trained sentence-transformers checkpoint
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
sbert_dim = sbert_model.get_sentence_embedding_dimension()

print(f"‚úì SBERT model loaded")
print(f"  Model: all-MiniLM-L6-v2")
print(f"  Embedding dimension: {sbert_dim}")

In [None]:
# encode every abstract with SBERT; the result is returned as a NumPy array
print("\nGenerating SBERT embeddings for abstracts...")
abstract_encode_kwargs = dict(show_progress_bar=True, batch_size=32, convert_to_numpy=True)
sbert_abstract_embeddings = sbert_model.encode(abstracts, **abstract_encode_kwargs)

print(f"‚úì SBERT abstract embeddings: {sbert_abstract_embeddings.shape}")

In [None]:
# encode every title with the same SBERT model
print("\nGenerating SBERT embeddings for titles...")
title_encode_kwargs = dict(show_progress_bar=True, batch_size=32, convert_to_numpy=True)
sbert_title_embeddings = sbert_model.encode(titles, **title_encode_kwargs)

print(f"‚úì SBERT title embeddings: {sbert_title_embeddings.shape}")

## 3.4 SciBERT-based Embeddings (SPECTER, Scientific Text)

In [None]:
print("\nLoading SciBERT model...")

# scientific-text encoder
# NOTE(review): the checkpoint loaded here is `allenai-specter`, although the
# variable and messages call it SciBERT — confirm the intended model/naming.
scibert_model = SentenceTransformer('allenai-specter')
scibert_dim = scibert_model.get_sentence_embedding_dimension()

print(f"‚úì SciBERT model loaded")
print(f"  Model: allenai-specter")
print(f"  Embedding dimension: {scibert_dim}")

In [None]:
# encode every abstract with the scientific-text model
print("\nGenerating SciBERT embeddings...")
scibert_encode_kwargs = dict(
    show_progress_bar=True,
    batch_size=16,  # smaller batch for larger model
    convert_to_numpy=True,
)
scibert_embeddings = scibert_model.encode(abstracts, **scibert_encode_kwargs)

print(f"‚úì SciBERT embeddings: {scibert_embeddings.shape}")

## 3.5 Compare Embedding Methods

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

print("\n" + "="*60)
print("EMBEDDING COMPARISON")
print("="*60)

# compare first 10 papers
sample_size = min(10, len(abstracts))

print(f"\nComparing embeddings for first {sample_size} papers:\n")


def _mean_pairwise(sim):
    """Mean of the entries strictly above the diagonal of a similarity matrix."""
    upper = np.triu_indices_from(sim, k=1)
    return np.mean(sim[upper])


# pairwise cosine similarity among the first `sample_size` papers, per method
w2v_sim = cosine_similarity(w2v_embeddings[:sample_size])
sbert_sim = cosine_similarity(sbert_abstract_embeddings[:sample_size])
scibert_sim = cosine_similarity(scibert_embeddings[:sample_size])

# Word2Vec
print(f"Word2Vec:")
print(f"  Dimension: {w2v_embeddings.shape[1]}")
print(f"  Avg pairwise similarity: {_mean_pairwise(w2v_sim):.3f}")

# SBERT
print(f"\nSBERT:")
print(f"  Dimension: {sbert_abstract_embeddings.shape[1]}")
print(f"  Avg pairwise similarity: {_mean_pairwise(sbert_sim):.3f}")

# SciBERT
print(f"\nSciBERT:")
print(f"  Dimension: {scibert_embeddings.shape[1]}")
print(f"  Avg pairwise similarity: {_mean_pairwise(scibert_sim):.3f}")

print("\n" + "="*60)
print("RECOMMENDATION: Use SBERT for general NLP tasks, SciBERT for scientific papers")
print("="*60)

## 3.6 Save Embeddings

In [None]:
# write each embedding matrix to its own .npy file
print("\nSaving embeddings...")

embedding_outputs = [
    ('Word2Vec', 'data/embeddings/word2vec_embeddings.npy', w2v_embeddings),
    ('SBERT abstract', 'data/embeddings/sbert_abstract_embeddings.npy', sbert_abstract_embeddings),
    ('SBERT title', 'data/embeddings/sbert_title_embeddings.npy', sbert_title_embeddings),
    ('SciBERT', 'data/embeddings/scibert_embeddings.npy', scibert_embeddings),
]
for label, npy_path, matrix in embedding_outputs:
    np.save(npy_path, matrix)
    print(f"‚úì Saved {label} embeddings: {matrix.shape}")

# index that maps embedding rows back to papers (row i <-> paper_ids[i])
embedding_dims = {
    'word2vec': w2v_embeddings.shape[1],
    'sbert': sbert_abstract_embeddings.shape[1],
    'scibert': scibert_embeddings.shape[1],
}
paper_index = dict(
    paper_ids=paper_ids,
    titles=titles,
    num_papers=len(paper_ids),
    embedding_methods=['word2vec', 'sbert_abstract', 'sbert_title', 'scibert'],
    embedding_dims=embedding_dims,
)

with open('data/embeddings/paper_index.pkl', 'wb') as f:
    pickle.dump(paper_index, f)

print(f"‚úì Saved paper index: {len(paper_ids)} papers")

# describe every saved embedding file: file name, array shape and source model
created_at = datetime.now().isoformat()

word2vec_entry = {
    'file': 'word2vec_embeddings.npy',
    'shape': list(w2v_embeddings.shape),
    'model': 'Word2Vec (gensim)',
    'params': {'vector_size': 100, 'window': 5},
}
sbert_abstract_entry = {
    'file': 'sbert_abstract_embeddings.npy',
    'shape': list(sbert_abstract_embeddings.shape),
    'model': 'all-MiniLM-L6-v2',
}
sbert_title_entry = {
    'file': 'sbert_title_embeddings.npy',
    'shape': list(sbert_title_embeddings.shape),
    'model': 'all-MiniLM-L6-v2',
}
scibert_entry = {
    'file': 'scibert_embeddings.npy',
    'shape': list(scibert_embeddings.shape),
    'model': 'allenai-specter',
}

embedding_metadata = {
    'created_at': created_at,
    'num_papers': len(paper_ids),
    'embeddings': {
        'word2vec': word2vec_entry,
        'sbert_abstract': sbert_abstract_entry,
        'sbert_title': sbert_title_entry,
        'scibert': scibert_entry,
    },
}

with open('data/embeddings/embedding_metadata.json', 'w') as f:
    json.dump(embedding_metadata, f, indent=2)

print(f"‚úì Saved embedding metadata")

---
# Final Summary

In [None]:
summary_rule = "=" * 80
print("\n" + summary_rule)
print("DAY 1 DELIVERABLES - COMPLETE PIPELINE SUMMARY")
print(summary_rule)

# one list of report lines per pipeline stage, printed in order below
part1_lines = [
    "\nüì• PART 1: DATA INGESTION & NORMALIZATION",
    f"  ‚úì ArXiv papers ingested: {len(arxiv_df):,}",
    f"  ‚úì ACL papers ingested: {len(acl_df):,}",
    f"  ‚úì S2ORC papers ingested: {len(s2_df):,}",
    f"  ‚úì Total combined: {len(combined_df):,}",
    f"  ‚úì After validation & cleaning: {len(cleaned_df):,}",
    f"  ‚úì Output: data/raw/complete_dataset.parquet",
]

part2_lines = [
    "\nüßπ PART 2: FULL PREPROCESSING PIPELINE",
    f"  ‚úì Papers preprocessed: {len(processed_df):,}",
    f"  ‚úì Language detection: ‚úì",
    f"  ‚úì Sentence segmentation: ‚úì",
    f"  ‚úì Avg tokens per paper: {processed_df['num_tokens'].mean():.1f}",
    f"  ‚úì Vocabulary size: {len(token_counts):,}",
    f"  ‚úì Output: data/processed/cleaned_papers.parquet",
]

part3_lines = [
    "\nüî¢ PART 3: EMBEDDINGS PIPELINE",
    f"  ‚úì Word2Vec embeddings: {w2v_embeddings.shape}",
    f"  ‚úì SBERT abstract embeddings: {sbert_abstract_embeddings.shape}",
    f"  ‚úì SBERT title embeddings: {sbert_title_embeddings.shape}",
    f"  ‚úì SciBERT embeddings: {scibert_embeddings.shape}",
    f"  ‚úì Output: data/embeddings/*.npy, paper_index.pkl",
]

for summary_line in part1_lines + part2_lines + part3_lines:
    print(summary_line)

print("\nüìÅ OUTPUT FILES")
output_files = [
    'data/raw/complete_dataset.parquet',
    'data/processed/cleaned_papers.parquet',
    'data/processed/vocabulary.json',
    'data/embeddings/word2vec_embeddings.npy',
    'data/embeddings/sbert_abstract_embeddings.npy',
    'data/embeddings/sbert_title_embeddings.npy',
    'data/embeddings/scibert_embeddings.npy',
    'data/embeddings/paper_index.pkl',
    'data/embeddings/embedding_metadata.json'
]

total_size = 0
for filepath in output_files:
    if os.path.exists(filepath):
        size_mb = os.path.getsize(filepath) / (1024 * 1024)
        total_size += size_mb
        print(f"  ‚úì {filepath} ({size_mb:.2f} MB)")

print(f"\n  Total storage: {total_size:.2f} MB")

closing_rule = "=" * 80

print("\n" + closing_rule)
print("‚úÖ DAY 1 DELIVERABLES COMPLETE!")
print(closing_rule)

# planned follow-up work
print("\nüìä Next Steps (Day 2):")
next_steps = (
    "  1. Train classification models",
    "  2. Build topic models (LDA)",
    "  3. Implement retrieval system (BM25 + semantic)",
    "  4. Create research digest interface",
)
for step in next_steps:
    print(step)

print("\n" + closing_rule)