# Claim 1: UPS Protein Analysis with PertPy
## Testing: "No significant UPS protein alterations across tau-positive versus tau-negative neurons"

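This notebook uses PertPy's PyDESeq2 implementation to test whether UPS proteins show differential expression between tau-positive and tau-negative neurons. If PyDESeq2 cannot be run on the data, the analysis falls back to per-protein two-sample t-tests with Benjamini–Hochberg FDR correction.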

In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pertpy as pt
import scanpy as sc
import seaborn as sns
from matplotlib.patches import Patch
from scipy import stats
# Suppress every warning for the rest of the session so cell output stays compact
warnings.filterwarnings(action='ignore')

# Global figure appearance: whitegrid style sheet plus the 'husl' color cycle
plt.style.use(style='seaborn-v0_8-whitegrid')
sns.set_palette(palette='husl')

## 1. Load Prepared Data

In [None]:
# Read the AnnData file written by the data-preparation step
prepared_path = '../01_data_preparation/prepared_for_pertpy.h5ad'
adata = sc.read_h5ad(prepared_path)

# Report the matrix dimensions, then how the samples split on the
# 'tau_positive' flag (truthy entries vs. entries equal to 0)
print(f"Loaded data: {adata.shape}")
tau_flags = adata.obs['tau_positive']
n_tau_pos = tau_flags.sum()
print(f"Tau-positive samples: {n_tau_pos}")
n_tau_neg = (tau_flags == 0).sum()
print(f"Tau-negative samples: {n_tau_neg}")

## 2. Define Comprehensive UPS Protein List (132 proteins)

In [None]:
# Comprehensive UPS protein list from validated analysis.
# The per-category counts in the comments below add up to 132 entries.
ups_proteins_comprehensive = [
    # Proteasome subunits (43)
    'PSMA1', 'PSMA2', 'PSMA3', 'PSMA4', 'PSMA5', 'PSMA6', 'PSMA7',
    'PSMB1', 'PSMB2', 'PSMB3', 'PSMB4', 'PSMB5', 'PSMB6', 'PSMB7', 'PSMB8', 'PSMB9', 'PSMB10',
    'PSMC1', 'PSMC2', 'PSMC3', 'PSMC4', 'PSMC5', 'PSMC6',
    'PSMD1', 'PSMD2', 'PSMD3', 'PSMD4', 'PSMD5', 'PSMD6', 'PSMD7', 'PSMD8', 'PSMD9', 'PSMD10',
    'PSMD11', 'PSMD12', 'PSMD13', 'PSMD14',
    'PSME1', 'PSME2', 'PSME3', 'PSMF1', 'PSMG1', 'PSMG3',

    # E3 ligases (19)
    'CBL', 'FBXO2', 'FBXO6', 'HECTD1', 'HECTD3', 'HECTD4', 'HERC1', 'HERC2',
    'HUWE1', 'ITCH', 'NEDD4L', 'PARK7', 'RNF31', 'SMURF1', 'TRIM25', 'TRIM32',
    'UBE3A', 'UBE3B', 'UBE3C',

    # E2 enzymes (18)
    'UBE2D1', 'UBE2D3', 'UBE2D4', 'UBE2E2', 'UBE2G1', 'UBE2H', 'UBE2I', 'UBE2K',
    'UBE2L3', 'UBE2L6', 'UBE2M', 'UBE2N', 'UBE2O', 'UBE2Q1', 'UBE2R2', 'UBE2V1', 'UBE2V2', 'UBE2Z',

    # E1 enzymes (7)
    'UBA1', 'UBA2', 'UBA3', 'UBA5', 'UBA6', 'UBB', 'UBC',

    # Deubiquitinases (28)
    'ATXN3', 'BRCC3', 'COPS5', 'COPS6', 'CYLD', 'OTUB1', 'OTUD6B', 'STAMBP',
    'UCHL1', 'UCHL3', 'UCHL5', 'USP4', 'USP5', 'USP7', 'USP8', 'USP9X', 'USP10',
    'USP11', 'USP14', 'USP15', 'USP19', 'USP24', 'USP25', 'USP30', 'USP32',
    'USP46', 'USP47', 'USP48',

    # UPS regulators (9)
    'BAG6', 'NBR1', 'OPTN', 'SQSTM1', 'TAX1BP1', 'UBQLN1', 'UBQLN2', 'UBQLN4', 'VCP',

    # Alternative modifiers (8)
    'ATG12', 'ISG15', 'NEDD8', 'SUMO2', 'SUMO3', 'SUMO4', 'UFM1', 'URM1'
]

# A duplicated symbol would be counted twice in every summary below, so fail
# early if the hand-maintained list ever acquires one.
assert len(set(ups_proteins_comprehensive)) == len(ups_proteins_comprehensive), \
    "ups_proteins_comprehensive contains duplicate entries"

print(f"Total UPS proteins to analyze: {len(ups_proteins_comprehensive)}")

# Check availability in dataset.
# Names come from the 'protein_name' column of adata.var when it exists,
# otherwise from the var index itself.
protein_names = adata.var['protein_name'] if 'protein_name' in adata.var else adata.var.index

# Build the lookup set once instead of re-materializing the full name list
# for every candidate protein.
dataset_proteins = set(protein_names.tolist())
available_ups = [p for p in ups_proteins_comprehensive if p in dataset_proteins]
missing_ups = set(ups_proteins_comprehensive) - set(available_ups)

print(f"Available UPS proteins: {len(available_ups)}/{len(ups_proteins_comprehensive)}")
if missing_ups:
    print(f"Missing proteins: {len(missing_ups)}")
    # Sorted so the printed examples are the same on every run (set order is arbitrary)
    print(f"Examples of missing: {sorted(missing_ups)[:5]}")

## 3. Subset Data to UPS Proteins

In [None]:
# Create subset with only UPS proteins.
# Columns are selected by position: protein_names is walked in var order and
# every position whose name is an available UPS protein is kept.
available_ups_lookup = set(available_ups)  # O(1) membership instead of scanning the list per variable
ups_indices = [i for i, p in enumerate(protein_names) if p in available_ups_lookup]

# Fail here with a clear message rather than later inside the statistical
# code, which cannot work on a matrix with zero variables.
if not ups_indices:
    raise ValueError("None of the UPS proteins were found in the dataset; nothing to analyze")

adata_ups = adata[:, ups_indices].copy()

print(f"UPS subset shape: {adata_ups.shape}")
print(f"Samples: {adata_ups.n_obs}")
print(f"UPS proteins: {adata_ups.n_vars}")

## 4. Run PyDESeq2 Analysis

In [None]:
# Initialize PyDESeq2 with simple design (tau status only)
print("Running PyDESeq2 analysis...")

# PyDESeq2 is given the counts layer through X. The matrix that was in X
# (treated as log2-scale by the rest of this notebook) is parked in
# layers['log2'] so the fallback below can still reach it.
if 'counts' in adata_ups.layers:
    # Stash X only once: when this cell is re-run, X already holds counts and
    # copying it again would overwrite the log2 values with counts.
    if 'log2' not in adata_ups.layers:
        adata_ups.layers['log2'] = adata_ups.X.copy()
    adata_ups.X = adata_ups.layers['counts'].copy()

# Run PyDESeq2
try:
    # Initialize with tau status as the main factor
    pds2 = pt.tl.PyDESeq2(
        adata=adata_ups,
        design="~tau_status",
        refit_cooks=True
    )

    # Fit the model
    pds2.fit()

    # Test contrast between tau positive and negative
    results_df = pds2.test_contrasts(
        pds2.contrast(
            column="tau_status",
            baseline="negative",
            group_to_compare="positive"
        )
    )

    print("✓ PyDESeq2 analysis completed successfully")

except Exception as e:
    # Deliberate best-effort: any PyDESeq2 failure switches to a per-protein
    # t-test so the notebook still produces a result table.
    print(f"PyDESeq2 failed: {e}")
    print("Falling back to traditional t-test approach...")

    # Imported here so statsmodels is only required when the fallback runs
    from statsmodels.stats.multitest import multipletests

    # The t-test and the mean difference reported as log2FoldChange must be
    # computed on log2-scale values, not on the counts swapped into X above.
    # NOTE(review): when no 'counts' layer exists X is used as-is and is
    # assumed to already be log2-scale — confirm against the preparation step.
    log2_matrix = adata_ups.layers['log2'] if 'log2' in adata_ups.layers else adata_ups.X
    if hasattr(log2_matrix, 'toarray'):
        log2_matrix = log2_matrix.toarray()  # densify sparse storage for column slicing
    log2_matrix = np.asarray(log2_matrix, dtype=float)

    tau_pos = (adata_ups.obs['tau_status'] == 'positive').to_numpy()
    tau_neg = (adata_ups.obs['tau_status'] == 'negative').to_numpy()

    # Fallback: Traditional differential expression
    results_list = []
    for i in range(adata_ups.n_vars):
        expr_pos = log2_matrix[tau_pos, i]
        expr_neg = log2_matrix[tau_neg, i]

        # Missing measurements are skipped rather than turning the whole
        # protein's statistics into NaN.
        log2fc = np.nanmean(expr_pos) - np.nanmean(expr_neg)
        tstat, pval = stats.ttest_ind(expr_pos, expr_neg, nan_policy='omit')

        results_list.append({
            'protein': adata_ups.var.index[i],
            'log2FoldChange': float(log2fc),
            'pvalue': float(pval),
            'stat': float(tstat)
        })

    results_df = pd.DataFrame(
        results_list, columns=['protein', 'log2FoldChange', 'pvalue', 'stat']
    )

    # Add adjusted p-values (Benjamini-Hochberg). Proteins whose test could
    # not be computed keep NaN and are excluded from the correction.
    results_df['padj'] = np.nan
    testable = results_df['pvalue'].notna().to_numpy()
    if testable.any():
        results_df.loc[testable, 'padj'] = multipletests(
            results_df.loc[testable, 'pvalue'], method='fdr_bh'
        )[1]

# Later cells read the columns 'protein', 'log2FoldChange', 'pvalue' and
# 'padj'. Map the alternative names some result tables use onto those; the
# rename is skipped for any column that is absent or whose target already
# exists, so it is a no-op for the fallback table.
# NOTE(review): the alias list reflects pertpy's standardized output names as
# best understood — confirm against the installed pertpy version.
column_aliases = {
    'variable': 'protein',
    'log_fc': 'log2FoldChange',
    'p_value': 'pvalue',
    'adj_p_value': 'padj',
}
results_df = results_df.rename(columns={
    src: dst for src, dst in column_aliases.items()
    if src in results_df.columns and dst not in results_df.columns
})
if 'protein' not in results_df.columns:
    # Result tables indexed by variable name: expose the index as a column
    results_df = results_df.rename_axis('protein').reset_index()

# UPS membership was resolved through var['protein_name'] when that column
# exists, so label results with the same names; the prefix-based category
# analysis later in the notebook matches on these labels.
if 'protein_name' in adata_ups.var:
    name_by_var = dict(zip(adata_ups.var.index, adata_ups.var['protein_name']))
    results_df['protein'] = [name_by_var.get(v, v) for v in results_df['protein']]

## 5. Analyze Results

In [None]:
# Headline numbers for the differential-expression table
print("Differential Expression Results Summary:")
print("="*50)

# How many proteins fall below the cutoff, before and after FDR adjustment
sig_threshold = 0.05
n_sig_nominal = results_df['pvalue'].lt(sig_threshold).sum()
n_sig_adjusted = results_df['padj'].lt(sig_threshold).sum()

n_tested = len(results_df)
print(f"Total UPS proteins tested: {n_tested}")
print(f"Significant (p < 0.05): {n_sig_nominal}")
print(f"Significant (FDR < 0.05): {n_sig_adjusted}")
pct_sig_fdr = n_sig_adjusted / n_tested * 100
print(f"Percentage significant (FDR): {pct_sig_fdr:.1f}%")

# Rank by adjusted p-value and show the ten strongest hits
results_df_sorted = results_df.sort_values(by='padj')
print("\nTop 10 Differentially Expressed UPS Proteins:")
top_columns = ['protein', 'log2FoldChange', 'pvalue', 'padj']
print(results_df_sorted.loc[:, top_columns].head(10))

# Effect-size view: store |log2FC| on the table and count entries above 0.5
results_df['abs_log2FC'] = results_df['log2FoldChange'].abs()
large_effect = results_df['abs_log2FC'].gt(0.5).sum()
print(f"\nProteins with large effect (|log2FC| > 0.5): {large_effect}")

## 6. Create Volcano Plot

In [None]:
# Create volcano plot: log2 fold change (x) against -log10 nominal p-value (y)
fig, ax = plt.subplots(figsize=(10, 8))

# Calculate -log10(p-value); the tiny offset keeps p == 0 finite
results_df['neg_log10_pval'] = -np.log10(results_df['pvalue'] + 1e-300)

# Define significance thresholds
pval_threshold = -np.log10(0.05)
fc_threshold = 0.5

# Color points based on significance. Significance uses the FDR-adjusted
# p-value; the conditions are evaluated in order, first match wins.
is_significant = (results_df['padj'] < 0.05).to_numpy()
is_large_effect = (results_df['log2FoldChange'].abs() > fc_threshold).to_numpy()
colors = np.select(
    [is_significant & is_large_effect, is_significant, is_large_effect],
    ['red', 'orange', 'blue'],   # sig & large / sig only / large only
    default='gray',              # neither
).tolist()

# Create scatter plot
scatter = ax.scatter(results_df['log2FoldChange'],
                     results_df['neg_log10_pval'],
                     c=colors, alpha=0.6, s=50)

# Add threshold lines. The horizontal line marks the nominal p = 0.05 level;
# its handle is kept so it can be listed in the legend below.
p_line = ax.axhline(y=pval_threshold, color='black', linestyle='--', alpha=0.3, label='p = 0.05')
ax.axvline(x=fc_threshold, color='black', linestyle='--', alpha=0.3)
ax.axvline(x=-fc_threshold, color='black', linestyle='--', alpha=0.3)

# Label the (up to) ten proteins with the smallest adjusted p-value,
# but only those that actually pass FDR < 0.05
top_proteins = results_df.nsmallest(10, 'padj')
for _, row in top_proteins[top_proteins['padj'] < 0.05].iterrows():
    ax.annotate(row['protein'],
                (row['log2FoldChange'], row['neg_log10_pval']),
                fontsize=8, alpha=0.7)

ax.set_xlabel('Log2 Fold Change (Tau+ vs Tau-)', fontsize=12)
ax.set_ylabel('-Log10(p-value)', fontsize=12)
# Report the number of proteins actually in the results table, which can be
# smaller than the 132-entry list when some are absent from the dataset
ax.set_title('Volcano Plot: UPS Proteins Differential Expression\n'
             f'({len(results_df)} proteins analyzed)',
             fontsize=14, fontweight='bold')

# Add legend: one patch per color class plus the p = 0.05 line, whose label
# would otherwise never be shown because explicit handles are passed
legend_elements = [
    Patch(facecolor='red', alpha=0.6, label='Significant & Large Effect'),
    Patch(facecolor='orange', alpha=0.6, label='Significant'),
    Patch(facecolor='blue', alpha=0.6, label='Large Effect Only'),
    Patch(facecolor='gray', alpha=0.6, label='Not Significant'),
    p_line,
]
ax.legend(handles=legend_elements, loc='upper left')

plt.tight_layout()
plt.savefig('ups_volcano_plot.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nVolcano plot saved as 'ups_volcano_plot.png'")

## 7. Category-Specific Analysis

In [None]:
# Define UPS categories as name prefixes: a protein belongs to a category
# when its label starts with any of that category's prefixes
ups_categories = {
    'Proteasome': ['PSMA', 'PSMB', 'PSMC', 'PSMD', 'PSME', 'PSMF', 'PSMG'],
    'E3_Ligases': ['CBL', 'FBXO', 'HECTD', 'HERC', 'HUWE', 'ITCH', 'NEDD4', 'PARK', 'RNF', 'SMURF', 'TRIM', 'UBE3'],
    'E2_Enzymes': ['UBE2'],
    'E1_Enzymes': ['UBA', 'UBB', 'UBC'],
    'DUBs': ['ATXN', 'BRCC', 'COPS', 'CYLD', 'OTUB', 'OTUD', 'STAMBP', 'UCHL', 'USP'],
    'Regulators': ['BAG', 'NBR', 'OPTN', 'SQSTM', 'TAX', 'UBQLN', 'VCP'],
    'Modifiers': ['ATG', 'ISG', 'NEDD8', 'SUMO', 'UFM', 'URM']
}

# Categorize results.
# Labels are cast to str so a non-string entry cannot break startswith().
protein_labels = results_df['protein'].astype(str)

category_results = {}
for cat_name, prefixes in ups_categories.items():
    prefix_tuple = tuple(prefixes)  # str.startswith accepts a tuple of alternatives
    in_category = protein_labels.map(lambda label: label.startswith(prefix_tuple)).to_numpy(dtype=bool)
    cat_df = results_df[in_category]

    # Categories with no matching protein are left out of the summary
    if cat_df.empty:
        continue

    n_sig = int((cat_df['padj'] < 0.05).sum())
    category_results[cat_name] = {
        'total': len(cat_df),
        'significant': n_sig,
        'percent_sig': n_sig / len(cat_df) * 100,
        'mean_log2FC': cat_df['log2FoldChange'].mean()
    }

# Display category results
print("\nCategory-Specific Analysis:")
print("="*60)
# One row per category, built from row records so the count columns keep an
# integer dtype; passing the column names explicitly also keeps sort_values
# valid when no category matched at all.
summary_columns = ['total', 'significant', 'percent_sig', 'mean_log2FC']
cat_summary = pd.DataFrame(
    list(category_results.values()),
    index=list(category_results.keys()),
    columns=summary_columns,
)
cat_summary = cat_summary.sort_values('percent_sig', ascending=False)
print(cat_summary)

## 8. Test Claim Validity

In [None]:
# Evaluate the claim
print("\n" + "="*60)
print("CLAIM EVALUATION")
print("="*60)
print("Claim: 'No significant UPS protein alterations across tau-positive")
print("       versus tau-negative neurons'")
print()

# Decision criteria
fdr_threshold = 0.05
effect_threshold = 0.5
percent_threshold = 5      # below this share of significant proteins the claim is supported
borderline_threshold = 10  # upper bound of the "partially supported" band
refuted_threshold = 25     # at or above this share the claim is refuted outright

# Calculate metrics directly from the results table so this cell does not
# depend on counters or helper columns created by earlier cells
n_tested = len(results_df)
is_significant = results_df['padj'] < fdr_threshold
n_sig_adjusted = int(is_significant.sum())
percent_sig = n_sig_adjusted / n_tested * 100 if n_tested else float('nan')
n_large_effect = int(
    (is_significant & (results_df['log2FoldChange'].abs() > effect_threshold)).sum()
)

print(f"Analysis Results:")
print(f"- Proteins tested: {n_tested}")
print(f"- Significant (FDR < {fdr_threshold}): {n_sig_adjusted} ({percent_sig:.1f}%)")
print(f"- With large effect size: {n_large_effect}")
print()

# Make decision
if n_tested == 0:
    # Without any tested protein the percentage is undefined; a NaN would
    # fail every comparison below and wrongly end in the REFUTED branch.
    verdict = "INCONCLUSIVE"
    explanation = "No UPS proteins were tested, so the claim cannot be evaluated"
elif percent_sig < percent_threshold:
    if n_sig_adjusted == 0:
        verdict = "STRONGLY SUPPORTED"
        explanation = "No UPS proteins show significant differential expression"
    else:
        verdict = "SUPPORTED"
        explanation = f"Only {percent_sig:.1f}% of UPS proteins show significant changes"
elif percent_sig < borderline_threshold:
    verdict = "PARTIALLY SUPPORTED"
    explanation = f"{percent_sig:.1f}% of proteins changed, borderline significant"
elif percent_sig < refuted_threshold:
    verdict = "PARTIALLY REFUTED"
    explanation = f"{percent_sig:.1f}% of UPS proteins significantly altered"
else:
    verdict = "REFUTED"
    explanation = f"{percent_sig:.1f}% of UPS proteins show significant alterations"

# Display verdict
print(f"VERDICT: {verdict}")
print(f"Explanation: {explanation}")
print()

# List most changed proteins if any (five smallest adjusted p-values)
if n_sig_adjusted > 0:
    print("Most significantly changed UPS proteins:")
    sig_proteins = results_df[is_significant].sort_values('padj')
    for _, row in sig_proteins.head(5).iterrows():
        direction = "↑" if row['log2FoldChange'] > 0 else "↓"
        print(f"  {row['protein']}: {direction} log2FC={row['log2FoldChange']:.2f}, FDR={row['padj']:.3e}")

## 9. Save Results

In [None]:
# Save complete results
output_file = '../05_statistical_reports/claim1_ups_proteins_results.csv'
# Create the report directory if it is missing; to_csv does not create
# parent folders and would otherwise raise.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_file, index=False)
print(f"Results saved to: {output_file}")

# Save significant proteins only (file is written only when there are any)
sig_proteins_df = results_df[results_df['padj'] < 0.05]
if len(sig_proteins_df) > 0:
    sig_output = '../05_statistical_reports/claim1_significant_ups.csv'
    sig_proteins_df.to_csv(sig_output, index=False)
    print(f"Significant proteins saved to: {sig_output}")

# Create summary report.
# The top hit is taken from the current results table (smallest adjusted
# p-value first) instead of a sorted copy made in an earlier cell.
top_hit = results_df.sort_values('padj').iloc[0] if len(results_df) > 0 else None
summary = {
    'claim': 'No significant UPS protein alterations',
    'verdict': verdict,
    'proteins_tested': len(results_df),
    'significant_fdr005': n_sig_adjusted,
    'percent_significant': percent_sig,
    'large_effects': n_large_effect,
    'top_protein': top_hit['protein'] if top_hit is not None else None,
    'top_log2fc': top_hit['log2FoldChange'] if top_hit is not None else None,
    'top_padj': top_hit['padj'] if top_hit is not None else None
}

print("\nAnalysis Summary:")
for key, value in summary.items():
    print(f"  {key}: {value}")

## Summary

This PertPy-based analysis of UPS proteins provides:

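1. **Comprehensive coverage**: a predefined list of 132 UPS proteins is tested (not cherry-picked); only the proteins from that list that are present in the dataset enter the analysis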
2. **Robust statistics**: PyDESeq2, or a per-protein t-test fallback when PyDESeq2 cannot be run, with FDR (Benjamini–Hochberg) correction
3. **Clear visualization**: Volcano plot showing all proteins
4. **Category breakdown**: Analysis by UPS component type
5. **Objective verdict**: Data-driven evaluation of the claim

The analysis determines whether the claim of "no significant UPS alterations" is supported or refuted based on the actual data.