# 📊 ArticleInfo Parquet Author Sampling Analysis

## 📋 Project Background

This notebook draws a sample of author strings from the **articleInfo.parquet** dataset, stratified by author count, so that author-name quality and patterns can be inspected manually ahead of data cleaning.

### 🎯 Analysis Objectives
- **Data Source**: `data/processed/articleInfo.parquet` (3.8GB, ~30M records)
- **Sampling Strategy**: Up to 10 samples per author-count group (records with no detectable authors are excluded)
- **Column Preservation**: All 16 original columns maintained (the derived `author_count` column is added alongside them)
- **Output**: `new_creator_sample.parquet` for manual inspection

### 📊 Expected Workflow
1. Load ArticleInfo parquet data
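2. Count authors using delimiter-based splitting (`;`, `&`, `and`, `+`, `/`, `\`; commas are not treated as delimiters because they may appear in "Last, First" names)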
3. Sample up to 10 records per author count group (groups with fewer than 10 records are taken in full)
4. Export complete sample data with all columns
5. Display sample for manual quality inspection

---


## ⚙️ Environment Setup & Dependencies


In [None]:
#!/usr/bin/env python3
"""
Sample up to 10 rows for every distinct non-zero author count (based on a
multi-delimiter split of the `authors` column) from articleInfo.parquet,
keeping ALL original columns plus the derived `author_count` column.
Print the sampled author strings and write the sample to
`new_creator_sample.parquet`.

Adapted from: scripts/03_analysis/test_LLM_name_detect_parquet.py
"""

# NOTE(review): `random`, `pyarrow` (pa) and `numpy` are not referenced by the
# cells visible in this notebook; they are kept in case other cells use them.
from pathlib import Path
import random
import re
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import numpy as np
from IPython.display import display
import warnings
# Suppress every warning category to keep the notebook output compact.
# NOTE(review): this also hides deprecation warnings from pandas/pyarrow.
warnings.filterwarnings('ignore')

print("✅ Dependencies loaded successfully")


## 📁 Configuration & File Paths


In [None]:
# ---------------------------------------------------------------------
# Paths and sampling parameters
#
# The project root is taken to be two levels above the current working
# directory, i.e. the notebook is expected to run from notebooks/03_analysis.
BASE_DIR = Path().resolve().parent.parent
PARQUET_PATH = BASE_DIR / "data" / "processed" / "articleInfo.parquet"
OUT_PATH = BASE_DIR / "data" / "processed" / "new_creator_sample.parquet"

# Upper bound on the number of rows drawn from each author-count group.
MAX_SAMPLES = 10

# Author separators: ';', '&', the word 'and' (any case), '+', '/' and '\'.
# The comma is deliberately left out because it can occur inside a single
# name written as "Last, First".
DELIM_RE = re.compile(r"\s*(?:;|&|\band\b|\+|/|\\)\s*", flags=re.IGNORECASE)

# Echo the effective configuration so a run can be audited from its output.
print(f"📂 Project root: {BASE_DIR}")
print(f"📊 Source parquet: {PARQUET_PATH}")
print(f"💾 Output parquet: {OUT_PATH}")
print(f"🔢 Max samples per group: {MAX_SAMPLES}")
print(f"🔍 Author delimiter regex: {DELIM_RE.pattern}")

# Fail fast when the source file is missing; nothing below can run without it.
if not PARQUET_PATH.exists():
    raise SystemExit(f"❌ Parquet file not found: {PARQUET_PATH}")

print(f"✅ Source file verified: {PARQUET_PATH.name}")


## 📊 Data Loading & Author Count Analysis


In [None]:
# Load the complete dataset. Every column is read so that the exported sample
# keeps the full original structure.
print(f"🚀 Loading dataset from {PARQUET_PATH.name} ...")

table = pq.read_table(PARQUET_PATH)
df = table.to_pandas()

print(f"✅ Dataset loaded successfully:")
print(f"  📊 Shape: {df.shape}")
print(f"  📋 Columns: {list(df.columns)}")

# The author-count analysis cannot run without an 'authors' column, so stop
# here and list what is available to help diagnose a schema mismatch.
if 'authors' not in df.columns:
    print("❌ 'authors' column not found!")
    print(f"Available columns: {list(df.columns)}")
    raise SystemExit("Missing 'authors' column")

# Preview a few key columns. Only the columns actually present are shown, so
# one missing optional column (e.g. 'year') no longer suppresses the whole
# preview; this mirrors how the export cell selects its key columns.
# 'authors' is guaranteed to exist at this point, so the selection is never
# empty.
print(f"\n🔬 Sample data (first 3 rows):")
display_cols = ['id', 'title1', 'authors', 'year']
available_display_cols = [col for col in display_cols if col in df.columns]
display(df[available_display_cols].head(3))


In [None]:
# Derive an `author_count` column by splitting each author string on the
# configured delimiters (; & 'and' + / \). Commas are not delimiters.
print("🧮 Computing author counts...")


def _count_authors(raw):
    """Return how many non-blank pieces `raw` yields when split on DELIM_RE.

    A blank or whitespace-only string counts as zero authors.
    """
    if not raw.strip():
        return 0
    return len([piece for piece in DELIM_RE.split(raw) if piece.strip()])


# Missing values become empty strings and everything is coerced to str so the
# helper always receives text.
authors_as_text = df["authors"].fillna("").astype(str)
df["author_count"] = authors_as_text.apply(_count_authors)

# Frequency of each author count, ordered by the count itself.
author_count_dist = df['author_count'].value_counts().sort_index()

total_papers = len(df)
print(f"\n📊 Author Count Distribution (top 10):")
for count, freq in author_count_dist.head(10).items():
    print(f"  {count:2d} authors: {freq:,} papers ({freq/total_papers*100:.2f}%)")

# Headline statistics over the whole dataset.
counts = df['author_count']
print(f"\n📈 Summary Statistics:")
print(f"  📊 Mean authors per paper: {counts.mean():.2f}")
print(f"  📊 Median authors per paper: {counts.median():.1f}")
print(f"  📊 Max authors per paper: {counts.max()}")
print(f"  📊 Papers with 0 authors: {(counts == 0).sum():,}")


## 🎯 Stratified Sampling & Sample Display


In [None]:
# Draw up to MAX_SAMPLES rows from every author-count group, collecting the
# sampled frames for export and the author strings for console inspection.
print("🎯 Performing stratified sampling by author count...")

results = []        # (author_count, [(authors,), ...]) pairs used for printing
sample_frames = []  # sampled DataFrames with all columns, used for the export

for count, grp in df.groupby("author_count"):
    # Rows with no detectable author are excluded from BOTH the printed
    # listing and the exported sample: nothing is appended for this group.
    if count == 0:
        continue

    # Small groups are taken in full; the fixed seed keeps the draw
    # reproducible between runs.
    sample_n = min(len(grp), MAX_SAMPLES)
    sample_df = grp.sample(sample_n, random_state=0).copy()
    sample_frames.append(sample_df)

    # One-element tuples holding just the author string, for pretty printing.
    sample_tuples = list(sample_df[['authors']].itertuples(index=False, name=None))
    results.append((count, sample_tuples))

print(f"✅ Sampling completed: {len(sample_frames)} groups processed")

# Print to console for manual inspection
print("\n" + "="*60)
print("👁️ SAMPLE DATA FOR MANUAL INSPECTION")
print("="*60)

# Order by author count only, so the sample payloads are never compared
# against each other during sorting.
for count, authors_list in sorted(results, key=lambda item: item[0]):
    print(f"\n📝 Author count = {count} (showing {len(authors_list)} samples)")
    print("-" * 50)
    for i, (author_str,) in enumerate(authors_list, 1):
        print(f"{i:2d}. {author_str}")

print("\n" + "="*60)


## 💾 Export Complete Sample with All Columns


In [None]:
# Write the combined sample to parquet, keeping every column of the sampled
# rows, then report what was written.
if sample_frames:
    sample_df_total = pd.concat(sample_frames, ignore_index=True)

    # Select the pyarrow engine explicitly instead of relying on pandas'
    # default engine resolution.
    sample_df_total.to_parquet(OUT_PATH, index=False, engine="pyarrow")

    print(f"✅ Sample parquet written successfully:")
    print(f"  📁 File: {OUT_PATH}")
    print(f"  📊 Records: {len(sample_df_total):,}")
    # NOTE: the column count below includes the derived `author_count`
    # column in addition to the original dataset columns.
    print(f"  📋 Columns: {len(sample_df_total.columns)} (all original columns preserved)")
    print(f"  💾 File size: {OUT_PATH.stat().st_size / 1024:.1f} KB")

    print(f"\n📋 All preserved columns:")
    for i, col in enumerate(sample_df_total.columns, 1):
        print(f"  {i:2d}. {col}")

    # Show only the key columns that exist in this dataset.
    print(f"\n🔬 Sample verification (showing key columns):")
    key_cols = ['id', 'title1', 'authors', 'author_count', 'year', 'publisher1']
    available_key_cols = [col for col in key_cols if col in sample_df_total.columns]
    if available_key_cols:
        display(sample_df_total[available_key_cols].head(5))

    # Announce completion only when a file was actually written; previously
    # this message was also printed when no sample parquet had been created.
    print(f"\n🎉 Analysis completed! Check {OUT_PATH.name} for complete sample data.")

else:
    print("⚠️  No non-empty author rows found; sample parquet not created.")
