# Unified Metadata Creation

This notebook combines data from all three sources:
- ArXiv
- ACL Anthology
- S2ORC (Semantic Scholar)

**Goal:** Create a single unified dataset with consistent schema for downstream tasks.

In [None]:
!pip install pandas pyarrow -q

In [None]:
import os
import json
import pandas as pd
from datetime import datetime
from google.colab import drive

# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): every path used below is relative ('data/raw/...', 'data/processed/...'),
# so nothing in this notebook visibly reads from the mount — confirm whether the
# working directory is expected to be changed to a Drive folder first.
drive.mount('/content/drive')

In [None]:
# Make sure the output directory written to by the save/summary cells exists;
# exist_ok=True keeps this cell safe to re-run.
os.makedirs('data/processed', exist_ok=True)

## Load data from all sources

In [None]:
# Read the raw parquet file for each source. A missing file is reported and
# replaced by an empty frame so the remaining sources can still be processed.
raw_inputs = (
    ("Loading ArXiv data...",
     'data/raw/arxiv_papers.parquet',
     "  ArXiv data not found! Run notebook 01 first."),
    ("\nLoading ACL Anthology data...",
     'data/raw/acl_anthology_papers.parquet',
     "  ACL data not found! Run notebook 02 first."),
    ("\nLoading S2ORC data...",
     'data/raw/s2orc_papers.parquet',
     "  S2ORC data not found! Run notebook 03 first."),
)

loaded_frames = []
for banner, parquet_path, missing_message in raw_inputs:
    print(banner)
    try:
        frame = pd.read_parquet(parquet_path)
        print(f"  Loaded {len(frame)} papers")
    except FileNotFoundError:
        print(missing_message)
        frame = pd.DataFrame()
    loaded_frames.append(frame)

# order matches raw_inputs: ArXiv, ACL, S2ORC
arxiv_df, acl_df, s2orc_df = loaded_frames

In [None]:
# list the columns of every source that actually loaded ("N/A" for empty frames)
column_reports = (
    ("ArXiv columns:", arxiv_df),
    ("\nACL columns:", acl_df),
    ("\nS2ORC columns:", s2orc_df),
)
for heading, frame in column_reports:
    if frame.empty:
        print(heading, "N/A")
    else:
        print(heading, frame.columns.tolist())

## Define unified schema

Our standard schema will have these fields:
- `paper_id`: unique identifier
- `title`: paper title
- `authors`: list of author names
- `abstract`: paper abstract
- `venue`: publication venue
- `year`: publication year
- `categories`: topic categories/tags
- `source`: which dataset it came from (arxiv/acl/s2orc)
- `metadata`: dict of additional source-specific fields (e.g. `pdf_url` for ArXiv, `doi` for ACL, `citation_count` for S2ORC)

In [None]:
def normalize_arxiv(df):
    """
    Convert ArXiv dataframe to unified schema.

    Parameters
    ----------
    df : pd.DataFrame
        Raw ArXiv records. Must contain ``paper_id``, ``title``, ``authors``,
        ``abstract``, ``venue``, ``year`` and ``categories``. The columns
        ``primary_category``, ``published`` and ``pdf_url`` are optional and
        fall back to ``''`` inside the metadata dict.

    Returns
    -------
    pd.DataFrame
        One row per input row with the unified columns (``paper_id`` prefixed
        with ``arxiv_``, ``source`` set to ``'arxiv'``, ``metadata`` a dict per
        row). An empty, column-less frame is returned when ``df`` is empty.
    """
    if df.empty:
        return pd.DataFrame()

    def _as_list(value):
        # List columns read back from parquet via pyarrow arrive as numpy
        # arrays, which fail the `isinstance(x, list)` checks used later in
        # this notebook. Turn any array-like into a plain list; leave missing
        # values and scalars/strings untouched.
        if isinstance(value, (str, bytes, dict)) or not hasattr(value, '__iter__'):
            return value
        return value.tolist() if hasattr(value, 'tolist') else list(value)

    normalized = pd.DataFrame({
        'paper_id': 'arxiv_' + df['paper_id'].astype(str),
        'title': df['title'],
        'authors': df['authors'],
        'abstract': df['abstract'],
        'venue': df['venue'],
        'year': df['year'],
        'categories': df['categories'].apply(_as_list),
        'source': 'arxiv',
        # source-specific extras, one dict per row
        'metadata': df.apply(lambda row: {
            'primary_category': row.get('primary_category', ''),
            'published': row.get('published', ''),
            'pdf_url': row.get('pdf_url', '')
        }, axis=1)
    })

    return normalized

In [None]:
def normalize_acl(df):
    """
    Convert ACL dataframe to unified schema.

    The input frame is left untouched (no columns are added to it).

    Parameters
    ----------
    df : pd.DataFrame
        Raw ACL Anthology records. Must contain ``paper_id``, ``title``,
        ``authors``, ``abstract``, ``venue`` and ``year``. The columns ``url``,
        ``doi`` and ``pages`` are optional and fall back to ``''`` inside the
        metadata dict.

    Returns
    -------
    pd.DataFrame
        One row per input row with the unified columns (``paper_id`` prefixed
        with ``acl_`` and ``/`` replaced by ``_``, missing abstracts replaced
        by ``''``, ``source`` set to ``'acl'``). An empty, column-less frame is
        returned when ``df`` is empty.
    """
    if df.empty:
        return pd.DataFrame()

    # ACL papers might not have categories in the same format, so every paper
    # gets the same fixed pair of tags. Built as a local Series (a fresh list
    # per row) instead of being written back onto the caller's dataframe.
    categories = df['venue'].apply(lambda _venue: ['NLP', 'Computational Linguistics'])

    normalized = pd.DataFrame({
        # '/' is a literal separator here, not a regex
        'paper_id': 'acl_' + df['paper_id'].astype(str).str.replace('/', '_', regex=False),
        'title': df['title'],
        'authors': df['authors'],
        'abstract': df['abstract'].fillna(''),  # some might be missing
        'venue': df['venue'],
        'year': df['year'],
        'categories': categories,
        'source': 'acl',
        # source-specific extras, one dict per row
        'metadata': df.apply(lambda row: {
            'url': row.get('url', ''),
            'doi': row.get('doi', ''),
            'pages': row.get('pages', '')
        }, axis=1)
    })

    return normalized

In [None]:
def normalize_s2orc(df):
    """
    Convert S2ORC dataframe to unified schema.

    Parameters
    ----------
    df : pd.DataFrame
        Raw S2ORC records. Must contain ``paper_id``, ``title``, ``authors``,
        ``abstract``, ``venue``, ``year`` and ``fields_of_study``. The columns
        ``citation_count``, ``reference_count``, ``publication_date`` and
        ``external_ids`` are optional and fall back to ``0``, ``0``, ``''`` and
        ``{}`` inside the metadata dict.

    Returns
    -------
    pd.DataFrame
        One row per input row with the unified columns (``paper_id`` prefixed
        with ``s2_``, ``categories`` taken from ``fields_of_study``, ``source``
        set to ``'s2orc'``). An empty, column-less frame is returned when
        ``df`` is empty.
    """
    if df.empty:
        return pd.DataFrame()

    def _as_list(value):
        # List columns read back from parquet via pyarrow arrive as numpy
        # arrays, which fail the `isinstance(x, list)` checks used later in
        # this notebook. Turn any array-like into a plain list; leave missing
        # values and scalars/strings untouched.
        if isinstance(value, (str, bytes, dict)) or not hasattr(value, '__iter__'):
            return value
        return value.tolist() if hasattr(value, 'tolist') else list(value)

    normalized = pd.DataFrame({
        'paper_id': 's2_' + df['paper_id'].astype(str),
        'title': df['title'],
        'authors': df['authors'],
        'abstract': df['abstract'],
        'venue': df['venue'],
        'year': df['year'],
        # S2 has fields_of_study which we use as categories
        'categories': df['fields_of_study'].apply(_as_list),
        'source': 's2orc',
        # source-specific extras, one dict per row
        'metadata': df.apply(lambda row: {
            'citation_count': row.get('citation_count', 0),
            'reference_count': row.get('reference_count', 0),
            'publication_date': row.get('publication_date', ''),
            'external_ids': row.get('external_ids', {})
        }, axis=1)
    })

    return normalized

In [None]:
# run every source through its normalizer and report the resulting row counts
print("Normalizing datasets...\n")

normalized_frames = {}
for label, normalize, raw_frame in (
    ("ArXiv", normalize_arxiv, arxiv_df),
    ("ACL", normalize_acl, acl_df),
    ("S2ORC", normalize_s2orc, s2orc_df),
):
    normalized_frames[label] = normalize(raw_frame)
    print(f"{label} normalized: {len(normalized_frames[label])} papers")

arxiv_normalized = normalized_frames["ArXiv"]
acl_normalized = normalized_frames["ACL"]
s2orc_normalized = normalized_frames["S2ORC"]

## Combine all datasets

In [None]:
# stack the normalized frames, skipping sources that produced no rows
all_dfs = [
    frame
    for frame in (arxiv_normalized, acl_normalized, s2orc_normalized)
    if not frame.empty
]

if not all_dfs:
    print("No data to combine!")
    unified_df = pd.DataFrame()
else:
    unified_df = pd.concat(all_dfs, ignore_index=True)
    print(f"Combined dataset size: {len(unified_df)} papers")

In [None]:
# quick look at the combined frame: shape, columns and rows per source
if not unified_df.empty:
    source_counts = unified_df['source'].value_counts()
    print(
        "Unified dataset info:",
        f"Shape: {unified_df.shape}",
        f"\nColumns: {unified_df.columns.tolist()}",
        "\nSources distribution:",
        source_counts,
        sep="\n",
    )

## Data quality checks and cleaning

In [None]:
print("Initial data quality checks:\n")

# nulls per column
null_counts = unified_df.isnull().sum()
print("Missing values:")
print(null_counts)

# text fields that are present but blank
for heading, column in (("\nEmpty titles:", 'title'), ("Empty abstracts:", 'abstract')):
    print(heading, (unified_df[column] == '').sum())

In [None]:
# remove papers without titles or abstracts
print(f"\nBefore cleaning: {len(unified_df)} papers")

# filter requirements:
# - must have title (more than 10 characters)
# - must have abstract (more than 50 characters)
# - must have at least one author

has_title = unified_df['title'].str.len() > 10
has_abstract = unified_df['abstract'].str.len() > 50
# A missing authors cell (None/NaN) has no length; treat it as "no authors"
# so the row is filtered out instead of raising TypeError from len().
has_author = unified_df['authors'].apply(
    lambda names: hasattr(names, '__len__') and len(names) > 0
)

cleaned_df = unified_df[has_title & has_abstract & has_author].copy()

print(f"After cleaning: {len(cleaned_df)} papers")
print(f"Removed: {len(unified_df) - len(cleaned_df)} papers")

In [None]:
# look for papers that share a title (exact string match only)
print("Checking for duplicate titles...")
shares_title = cleaned_df.duplicated(subset=['title'], keep=False)
duplicate_titles = cleaned_df[shares_title]
print(f"Found {len(duplicate_titles)} papers with duplicate titles")

if len(duplicate_titles) > 0:
    print("\nExample duplicates:")
    # first three distinct titles, in groupby order
    sample_dup = duplicate_titles.groupby('title').first().head(3)
    for title in sample_dup.index:
        same_title = duplicate_titles['title'] == title
        sources = duplicate_titles.loc[same_title, 'source'].tolist()
        print(f"\n'{title}'")
        print(f"  Sources: {sources}")

In [None]:
# for duplicates, keep the one from the best source
# priority: acl > s2orc > arxiv (ACL papers are peer-reviewed)

source_priority = {'acl': 1, 's2orc': 2, 'arxiv': 3}

# Rank rows on a temporary column (cleaned_df itself is not modified), sort
# with a stable algorithm so ties keep their original order and the result is
# reproducible, then keep the first (best-ranked) row for each title.
deduped_df = (
    cleaned_df
    .assign(source_rank=cleaned_df['source'].map(source_priority))
    .sort_values('source_rank', kind='mergesort')
    .drop_duplicates(subset=['title'], keep='first')
    .drop(columns='source_rank')
)

print(f"\nAfter deduplication: {len(deduped_df)} papers")
print(f"Duplicates removed: {len(cleaned_df) - len(deduped_df)}")

## Add computed fields

In [None]:
# add some useful computed fields
print("Adding computed fields...\n")

# text lengths
deduped_df['title_length'] = deduped_df['title'].str.len()
deduped_df['abstract_length'] = deduped_df['abstract'].str.len()

# number of authors
deduped_df['num_authors'] = deduped_df['authors'].apply(len)

# number of categories
# Rows loaded from parquet hold numpy arrays rather than Python lists, so
# accept any sized container; strings, dicts and missing values count as 0.
deduped_df['num_categories'] = deduped_df['categories'].apply(
    lambda x: len(x)
    if hasattr(x, '__len__') and not isinstance(x, (str, bytes, dict))
    else 0
)

print("Computed field statistics:")
print(f"\nTitle length: {deduped_df['title_length'].mean():.1f} chars (avg)")
print(f"Abstract length: {deduped_df['abstract_length'].mean():.1f} chars (avg)")
print(f"Authors per paper: {deduped_df['num_authors'].mean():.1f} (avg)")
print(f"Categories per paper: {deduped_df['num_categories'].mean():.1f} (avg)")

In [None]:
# paper counts for the ten most recent years present in the data
print("\nPapers per year:")
year_dist = (
    deduped_df['year']
    .value_counts()
    .sort_index()
)
print(year_dist.iloc[-10:])

In [None]:
# how the final dataset splits across sources: absolute counts, then shares
source_column = deduped_df['source']

print("\nPapers per source:")
print(source_column.value_counts())

print("\nPercentages:")
print(source_column.value_counts(normalize=True) * 100)

## Save unified dataset

In [None]:
# save to processed directory
output_json = 'data/processed/unified_papers.json'
output_parquet = 'data/processed/unified_papers.parquet'
output_csv = 'data/processed/unified_papers.csv'

print("Saving unified dataset...\n")

# JSON
deduped_df.to_json(output_json, orient='records', indent=2, force_ascii=False)
print(f"Saved to {output_json}")

# Parquet (most efficient)
deduped_df.to_parquet(output_parquet, index=False)
print(f"Saved to {output_parquet}")

# CSV (for easy inspection)
# note: list-valued columns are flattened to '|'-separated strings in CSV
csv_df = deduped_df.copy()
csv_df['authors'] = csv_df['authors'].apply(lambda x: '|'.join(x))
# Categories loaded from parquet are numpy arrays, not lists; join any
# iterable container and write '' only for strings, dicts and missing values.
csv_df['categories'] = csv_df['categories'].apply(
    lambda x: '|'.join(x)
    if hasattr(x, '__iter__') and not isinstance(x, (str, bytes, dict))
    else ''
)
csv_df['metadata'] = csv_df['metadata'].apply(str)
csv_df.to_csv(output_csv, index=False)
print(f"Saved to {output_csv}")

In [None]:
# check file sizes (os is already imported in the imports cell at the top)
print("\nFile sizes:")
for filepath in [output_json, output_parquet, output_csv]:
    if os.path.exists(filepath):
        size_mb = os.path.getsize(filepath) / (1024 * 1024)  # bytes -> MiB
        print(f"  {os.path.basename(filepath)}: {size_mb:.2f} MB")

## Create metadata summary

In [None]:
# build a JSON-serialisable description of the final dataset
creation_date = datetime.now().isoformat()

rows_per_source = deduped_df['source'].value_counts()
year_known = not deduped_df['year'].isna().all()

stat_columns = (
    ('avg_title_length', 'title_length'),
    ('avg_abstract_length', 'abstract_length'),
    ('avg_authors_per_paper', 'num_authors'),
    ('avg_categories_per_paper', 'num_categories'),
)

summary = {
    'creation_date': creation_date,
    'total_papers': len(deduped_df),
    'sources': {
        name: int(rows_per_source.get(name, 0))
        for name in ('arxiv', 'acl', 's2orc')
    },
    'year_range': {
        'min': int(deduped_df['year'].min()) if year_known else None,
        'max': int(deduped_df['year'].max()) if year_known else None,
    },
    'statistics': {
        key: float(deduped_df[column].mean())
        for key, column in stat_columns
    },
    'schema': {
        'fields': deduped_df.columns.tolist(),
        'description': 'Unified metadata from ArXiv, ACL Anthology, and S2ORC'
    }
}

# serialise once, then write the same text to disk and to the cell output
summary_text = json.dumps(summary, indent=2)

summary_path = 'data/processed/dataset_summary.json'
with open(summary_path, 'w') as f:
    f.write(summary_text)

print("Dataset Summary:")
print(summary_text)

## Show sample papers from final dataset

In [None]:
# show samples from each source
print("Sample papers from unified dataset:\n")
print("=" * 80)

for source in ['arxiv', 'acl', 's2orc']:
    source_papers = deduped_df[deduped_df['source'] == source]
    if len(source_papers) > 0:
        print(f"\n{source.upper()} Sample:")
        sample = source_papers.iloc[0]

        # Categories may be a list, a numpy array (after a parquet round-trip)
        # or missing; show up to three as a plain list and never slice a
        # missing value, which would raise TypeError.
        categories = sample['categories']
        if hasattr(categories, '__iter__') and not isinstance(categories, (str, bytes, dict)):
            shown_categories = list(categories)[:3]
        else:
            shown_categories = []

        print(f"Title: {sample['title']}")
        print(f"Authors: {', '.join(sample['authors'][:3])}{'...' if len(sample['authors']) > 3 else ''}")
        print(f"Year: {sample['year']} | Venue: {sample['venue']}")
        print(f"Categories: {shown_categories}")
        print(f"Abstract: {sample['abstract'][:200]}...")
        print("-" * 80)

---
## Summary

✅ Loaded data from all three sources  
✅ Normalized to unified schema  
✅ Combined into single dataset  
✅ Cleaned and deduplicated  
✅ Added computed fields  
✅ Saved in multiple formats (JSON, Parquet, CSV)  
✅ Created dataset summary  

**Files created:**
- `data/processed/unified_papers.parquet` (main file for downstream tasks)
- `data/processed/unified_papers.json` (for inspection)
- `data/processed/unified_papers.csv` (for spreadsheet tools)
- `data/processed/dataset_summary.json` (metadata)

**Next steps:**
- Data preprocessing (text cleaning, tokenization)
- Embeddings generation
- Classification and topic modeling