# UniProt Publication Data - Exploratory Analysis

This notebook provides examples for exploring the extracted UniProt publication data.

**Prerequisites:** Run the full pipeline first, starting from this notebook's directory:
```bash
cd ..
./run_pipeline.sh
```

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json

# Plot defaults shared by every figure below: seaborn's white-grid theme
# and a 12x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Silence every Python warning for the rest of the session so library
# warnings do not clutter cell output.
# NOTE(review): this is a blanket filter; it also hides warnings that may
# point at real problems. Narrow it by category if that becomes an issue.
import warnings
warnings.filterwarnings('ignore')

## 1. Load Data

In [None]:
# Define data paths
DATA_DIR = Path('../data')

# Every pipeline artifact this notebook reads, keyed by the variable it feeds.
input_paths = {
    'publications': DATA_DIR / 'processed' / 'all_publications.parquet',
    'monthly': DATA_DIR / 'outputs' / 'monthly_publications_long.csv',
    'summaries': DATA_DIR / 'outputs' / 'protein_summaries.csv',
    'global_ts': DATA_DIR / 'outputs' / 'global_monthly_timeseries.csv',
    'stats': DATA_DIR / 'outputs' / 'aggregation_statistics.json',
}

# Fail fast with one actionable message that lists *all* missing inputs,
# rather than a bare error from whichever reader hits a missing file first.
missing_paths = [str(path) for path in input_paths.values() if not path.exists()]
if missing_paths:
    raise FileNotFoundError(
        "Missing pipeline output(s): " + ", ".join(missing_paths)
        + ". Run the full pipeline (./run_pipeline.sh) before using this notebook."
    )

# Load all datasets
print("Loading datasets...")

# Full publication data
publications = pd.read_parquet(input_paths['publications'])
print(f"✓ Loaded {len(publications):,} publications")

# Monthly time series; 'year_month' is parsed to datetime for plotting
monthly = pd.read_csv(input_paths['monthly'])
monthly['year_month'] = pd.to_datetime(monthly['year_month'])
print(f"✓ Loaded {len(monthly):,} monthly observations")

# Protein summaries
summaries = pd.read_csv(input_paths['summaries'])
print(f"✓ Loaded summaries for {len(summaries):,} proteins")

# Global time series; 'year_month' is parsed to datetime for plotting
global_ts = pd.read_csv(input_paths['global_ts'])
global_ts['year_month'] = pd.to_datetime(global_ts['year_month'])
print(f"✓ Loaded global time series ({len(global_ts)} months)")

# Statistics. The encoding is explicit so decoding does not depend on the
# platform's locale default.
with open(input_paths['stats'], encoding='utf-8') as f:
    stats = json.load(f)
print("✓ Loaded statistics")

print("\nAll data loaded successfully!")

## 2. Overview Statistics

In [None]:
# Dataset overview: headline counts, then how many publication rows carry
# each optional field (count and share of all rows).
banner = "=" * 60
total_publications = len(publications)

print(banner)
print("DATASET OVERVIEW")
print(banner)
print(f"Total publications: {total_publications:,}")
print(f"Unique proteins: {publications['accession'].nunique():,}")
print(f"Unique genes: {publications['gene_name'].nunique():,}")
print(f"Date range: {publications['year'].min()} - {publications['year'].max()}")
print("\nPublications with:")
for label, column in (('PMID', 'pmid'), ('DOI', 'doi'), ('Month', 'month')):
    # Non-null entries in this column.
    present = publications[column].notna().sum()
    print(f"  - {label}: {present:,} ({present/total_publications*100:.1f}%)")

## 3. Top Studied Proteins

In [None]:
# Rank proteins by total publication count and keep the 20 highest,
# restricted to the columns worth showing in the printed table.
ranking_columns = ['gene_name', 'accession', 'total_pubs', 'first_pub_year', 'last_pub_year']
top_proteins = summaries.nlargest(n=20, columns='total_pubs').loc[:, ranking_columns]

print("Top 20 Most-Studied Proteins:")
print(top_proteins.to_string(index=False))

In [None]:
# Visualize the most-studied proteins as a horizontal bar chart.
# Bar positions are sized from the data: `top_proteins` holds fewer than 20
# rows when the summaries table is small, and hard-coded range(20) positions
# would then fail with a length mismatch.
n_top = len(top_proteins)
bar_positions = range(n_top)

plt.figure(figsize=(12, 8))
# Values and labels are reversed together so each bar keeps its own label.
plt.barh(bar_positions, top_proteins['total_pubs'].values[::-1])
plt.yticks(bar_positions, top_proteins['gene_name'].values[::-1])
plt.xlabel('Total Publications')
plt.title(f'Top {n_top} Most-Studied Human Proteins')
plt.tight_layout()
plt.show()

## 4. Publication Distribution

In [None]:
# Two side-by-side views of how publications are spread across proteins.
pubs_per_protein = summaries['total_pubs']
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Left panel: histogram with a log-scaled count axis
hist_ax.hist(pubs_per_protein, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set(
    xlabel='Publications per Protein',
    ylabel='Number of Proteins',
    title='Distribution of Publications per Protein',
    yscale='log',
)

# Right panel: box plot of the same values on a log-scaled axis
box_ax.boxplot(pubs_per_protein, vert=True)
box_ax.set(
    ylabel='Publications per Protein',
    title='Publication Count Distribution',
    yscale='log',
)

fig.tight_layout()
plt.show()

# Summary statistics, printed to one decimal place
for stat_label, stat_value in (
    ('Mean', pubs_per_protein.mean()),
    ('Median', pubs_per_protein.median()),
    ('Std', pubs_per_protein.std()),
):
    print(f"{stat_label}: {stat_value:.1f}")

## 5. Temporal Trends

In [None]:
# Global publication trends over time, drawn as two stacked panels.
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
total_ax, split_ax = axes
months = global_ts['year_month']

# Upper panel: total publications per month
total_ax.plot(months, global_ts['total_pubs'], linewidth=1.5)
total_ax.set(
    xlabel='Date',
    ylabel='Total Publications',
    title='Global Publication Trends (All Proteins)',
)
total_ax.grid(True, alpha=0.3)

# Lower panel: the curated and mapped series on shared axes
series_to_plot = (
    ('curated_pubs', 'Curated (Swiss-Prot)'),
    ('mapped_pubs', 'Mapped (TrEMBL)'),
)
for column, legend_label in series_to_plot:
    split_ax.plot(months, global_ts[column], label=legend_label, linewidth=1.5)
split_ax.set(
    xlabel='Date',
    ylabel='Publications',
    title='Curated vs Mapped Publications',
)
split_ax.legend()
split_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 6. Specific Protein Analysis

In [None]:
# Analyze a specific protein (e.g., TP53)
GENE_NAME = 'TP53'  # Change this to any gene of interest

# Look the gene up in the summary table. An unknown gene yields no rows, so
# fail with a clear message instead of the opaque IndexError that `.iloc[0]`
# raises on an empty selection.
gene_matches = summaries[summaries['gene_name'] == GENE_NAME]
if gene_matches.empty:
    raise ValueError(
        f"Gene {GENE_NAME!r} not found in protein summaries; "
        "check the spelling/case against summaries['gene_name']."
    )

# If several summary rows share this gene name, the first one is reported.
protein_info = gene_matches.iloc[0]
print(f"Protein: {protein_info['protein_name']}")
print(f"Gene: {protein_info['gene_name']}")
print(f"Accession: {protein_info['accession']}")
print(f"Total publications: {protein_info['total_pubs']:,}")
print(f"First publication: {protein_info['first_pub_year']}")
print(f"Last publication: {protein_info['last_pub_year']}")
print(f"Average monthly pubs: {protein_info['avg_monthly_pubs']:.2f}")

# Get time series data, ordered chronologically for the line plot
protein_ts = monthly[monthly['gene_name'] == GENE_NAME].sort_values('year_month')

if protein_ts.empty:
    # A gene can be present in the summaries but absent from the monthly table;
    # say so rather than rendering an empty chart.
    print(f"No monthly publication data found for {GENE_NAME}")
else:
    # Plot monthly counts as a line with a shaded area underneath
    plt.figure(figsize=(14, 6))
    plt.plot(protein_ts['year_month'], protein_ts['total_pubs'], linewidth=2)
    plt.fill_between(protein_ts['year_month'], protein_ts['total_pubs'], alpha=0.3)
    plt.xlabel('Date')
    plt.ylabel('Publications per Month')
    plt.title(f'Publication Trends for {GENE_NAME}')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## 7. Yearly Trends

In [None]:
# Count publication rows per calendar year and show them as a bar chart.
yearly_counts = publications.groupby('year').size()

fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(yearly_counts.index, yearly_counts.values, edgecolor='black', alpha=0.7)
ax.set(
    xlabel='Year',
    ylabel='Total Publications',
    title='Publications per Year (All Proteins)',
)
# Horizontal grid lines only, so the bars stay easy to compare.
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

## 8. Compare Multiple Proteins

In [None]:
# Overlay monthly publication counts for a hand-picked set of genes.
GENES_TO_COMPARE = ['TP53', 'BRCA1', 'EGFR', 'TNF', 'INS']

fig, ax = plt.subplots(figsize=(14, 7))

for gene in GENES_TO_COMPARE:
    gene_data = monthly.loc[monthly['gene_name'] == gene].sort_values('year_month')
    if gene_data.empty:
        # Genes with no rows in the monthly table are skipped without a message.
        continue
    ax.plot(gene_data['year_month'], gene_data['total_pubs'], label=gene, linewidth=2, alpha=0.7)

ax.set(
    xlabel='Date',
    ylabel='Publications per Month',
    title='Publication Trends Comparison',
)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 9. Journal Analysis

In [None]:
# Top journals by publication count (at most 20; value_counts() returns them
# in descending order of frequency).
journal_counts = publications['journal'].value_counts().head(20)

# Bar positions are sized from the data: with fewer than 20 distinct journals,
# hard-coded range(20) positions would fail with a length mismatch.
n_journals = len(journal_counts)
bar_positions = range(n_journals)

plt.figure(figsize=(12, 8))
# Values and labels are reversed together so each bar keeps its own label.
plt.barh(bar_positions, journal_counts.values[::-1])
plt.yticks(bar_positions, journal_counts.index[::-1])
plt.xlabel('Number of Publications')
plt.title(f'Top {n_journals} Journals')
plt.tight_layout()
plt.show()

## 10. Custom Analysis

Add your own analysis here!

In [None]:
# Your custom analysis code here
