# Data Preparation for PertPy DGE Analysis
## Preparing Proteomics Data for Differential Expression Analysis

This notebook prepares the pool_processed_v2.h5ad dataset for PertPy-based differential expression analysis.

In [None]:
# Import required packages
import json
import warnings

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
# Silence only deprecation-style noise. Numerical RuntimeWarnings (invalid
# casts, overflow, divide-by-zero, empty-slice means) are deliberately left
# visible because they point at real data problems in the QC and
# count-generation cells of this notebook.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Set plotting parameters
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=100, facecolor='white')
sns.set_style('whitegrid')

## 1. Load Proteomics Data

In [None]:
# Read the pooled proteomics AnnData object from disk
adata = sc.read_h5ad('../../data/pool_processed_v2.h5ad')

# Report the overall dimensions (observations x variables)
n_samples, n_proteins = adata.n_obs, adata.n_vars
print(f"Dataset shape: {adata.shape}")
print(f"Number of cells/samples: {n_samples}")
print(f"Number of proteins: {n_proteins}")

# List the annotation columns available on each axis
for heading, frame in (
    ("\nMetadata columns:", adata.obs),
    ("\nProtein annotation columns:", adata.var),
):
    print(heading)
    print(frame.columns.tolist())

## 2. Explore Key Variables for DGE

In [None]:
# Summarise the three variables the DGE design relies on. Each may be stored
# under a CamelCase or a snake_case column name, so resolve the name first and
# report (rather than crash on) any variable that is absent.
tau_column = 'TauStatus' if 'TauStatus' in adata.obs.columns else 'tau_status'
mc1_column = 'MC1' if 'MC1' in adata.obs.columns else 'mc1_score'
pseudotime_column = 'Pseudotime' if 'Pseudotime' in adata.obs.columns else 'pseudotime'

# Check tau status distribution
print("Tau Status Distribution:")
if tau_column in adata.obs.columns:
    # dropna=False keeps unlabelled samples visible in the tally
    print(adata.obs[tau_column].value_counts(dropna=False))
else:
    print("⚠ No tau status column found (looked for 'TauStatus' and 'tau_status')")
print("\n" + "="*50)

# Check MC1 score distribution
if mc1_column in adata.obs.columns:
    print("\nMC1 Score Statistics:")
    print(adata.obs[mc1_column].describe())

# Check pseudotime distribution
if pseudotime_column in adata.obs.columns:
    print("\nPseudotime Statistics:")
    print(adata.obs[pseudotime_column].describe())

## 3. Prepare Design Matrix for PertPy

In [None]:
# Standardize column names for consistency: copy each CamelCase column to its
# snake_case equivalent unless the snake_case column already exists.
column_mapping = {
    'TauStatus': 'tau_status',
    'MC1': 'mc1_score',
    'Pseudotime': 'pseudotime',
    'Age': 'age_at_death'
}

for old_name, new_name in column_mapping.items():
    if old_name in adata.obs.columns and new_name not in adata.obs.columns:
        adata.obs[new_name] = adata.obs[old_name]

# Everything below needs a tau label, so fail here with a clear message
# instead of a bare KeyError on the first column access.
if 'tau_status' not in adata.obs.columns:
    raise KeyError(
        "Neither 'TauStatus' nor 'tau_status' is present in adata.obs; "
        "cannot build the tau design variable"
    )

# Ensure tau_status is categorical
adata.obs['tau_status'] = pd.Categorical(adata.obs['tau_status'])
tau_categories = adata.obs['tau_status'].cat.categories.tolist()
print(f"Tau status categories: {tau_categories}")

# Create binary tau variable for cleaner analysis.
# The comparison is an exact string match, so a differently spelled label
# (e.g. 'Positive', 'tau+') would silently yield an all-zero column. Check the
# label exists before encoding.
TAU_POSITIVE_LABEL = 'positive'
if TAU_POSITIVE_LABEL not in tau_categories:
    raise ValueError(
        f"Expected a tau_status category named {TAU_POSITIVE_LABEL!r} "
        f"but found {tau_categories}; update TAU_POSITIVE_LABEL to match the data"
    )

# Samples without a tau label compare unequal to the positive label and are
# therefore encoded as 0; surface how many there are so this is not silent.
n_unlabelled = int(adata.obs['tau_status'].isna().sum())
if n_unlabelled:
    print(f"⚠ {n_unlabelled} samples have no tau_status and will be encoded as tau_positive = 0")

adata.obs['tau_positive'] = (adata.obs['tau_status'] == TAU_POSITIVE_LABEL).astype(int)
print(f"\nTau positive samples: {adata.obs['tau_positive'].sum()}")
print(f"Tau negative samples: {(adata.obs['tau_positive'] == 0).sum()}")

## 4. Data Quality Checks

In [None]:
# Data quality checks on the expression matrix.
# np.isnan / np.nanmin / np.nanmedian do not accept scipy sparse matrices, and
# adata.X is only densified in a later cell, so take a dense view here without
# modifying adata itself.
expr = adata.X.toarray() if hasattr(adata.X, 'toarray') else np.asarray(adata.X)
n_values = expr.size
if n_values == 0:
    raise ValueError("Expression matrix is empty; nothing to check")

# Check for missing values
print("Missing values in expression matrix:")
missing_expr = np.isnan(expr).sum()
print(f"Total missing: {missing_expr} ({missing_expr / n_values * 100:.2f}%)")

# Check for zero inflation
zero_count = (expr == 0).sum()
print(f"\nZero values: {zero_count} ({zero_count / n_values * 100:.2f}%)")

# Check expression value range (should be log2 transformed)
expr_max = np.nanmax(expr)
print("\nExpression value range:")
print(f"Min: {np.nanmin(expr):.3f}")
print(f"Max: {expr_max:.3f}")
print(f"Mean: {np.nanmean(expr):.3f}")
print(f"Median: {np.nanmedian(expr):.3f}")

# Check if data appears to be log-transformed.
# Heuristic only: a maximum below this bound is treated as log-scale.
LOG_SCALE_MAX = 50
if expr_max < LOG_SCALE_MAX:  # Typical for log-transformed data
    print("\n✓ Data appears to be log-transformed")
else:
    print("\n⚠ Data may not be log-transformed")

## 5. Prepare Protein Annotations

In [None]:
# Show how proteins are currently identified in the var index
print("Sample protein names:")
print(adata.var_names[:5].tolist())

# Pick the source of protein names: a gene-name column when one exists
# (CamelCase preferred over snake_case), otherwise the var index itself.
var_columns = adata.var.columns
if 'GeneName' in var_columns:
    print("\nUsing GeneName for protein identification")
    name_source = adata.var['GeneName']
elif 'gene_name' in var_columns:
    name_source = adata.var['gene_name']
else:
    name_source = adata.var.index
adata.var['protein_name'] = name_source

# Plain Python list of names, used for membership checks further down
protein_list = adata.var['protein_name'].to_list()
print(f"\nTotal proteins available: {len(protein_list)}")
print(f"Sample proteins: {protein_list[:5]}")

## 6. Check Class Balance (Subset for Balanced Analysis if Needed)

In [None]:
# Check class balance
tau_counts = adata.obs['tau_status'].value_counts()
print("Current class distribution:")
print(tau_counts)

# Only classes that actually contain samples take part in the ratio; a
# categorical column can carry categories with a count of zero.
observed_counts = tau_counts[tau_counts > 0]

if len(observed_counts) < 2:
    # With fewer than two populated classes, max/min would be 1.00 (or a
    # division by zero) and the "balanced" message would be misleading.
    imbalance_ratio = float('inf')
    print("\n⚠ Fewer than two tau classes contain samples; a group comparison is not possible")
else:
    # Calculate imbalance ratio
    imbalance_ratio = observed_counts.max() / observed_counts.min()
    print(f"\nImbalance ratio: {imbalance_ratio:.2f}")

    if imbalance_ratio > 2:
        print("\n⚠ Classes are imbalanced (ratio > 2)")
        print("Consider using balanced sampling or weighted analysis")
    else:
        print("\n✓ Classes are reasonably balanced")

## 7. Create PertPy-Compatible Data Structure

In [None]:
# Ensure data is in dense format for PertPy
if hasattr(adata.X, 'toarray'):
    print("Converting sparse matrix to dense...")
    adata.X = adata.X.toarray()

# Create a copy for PertPy analysis
adata_pertpy = adata.copy()

# Factor applied after back-transforming so small intensities do not all
# round to zero when converted to integers.
PSEUDOCOUNT_SCALE = 1000

# Add raw counts if not present (PertPy/DESeq2 expects counts)
if 'raw_counts' not in adata_pertpy.layers:
    # If data is log2 transformed, reverse it for DESeq2
    print("Creating pseudo-raw counts from log2 data...")
    log2_values = np.asarray(adata_pertpy.X, dtype=float)
    missing_mask = np.isnan(log2_values)

    # Convert log2 to linear scale and multiply by scaling factor
    linear_scaled = np.power(2.0, log2_values) * PSEUDOCOUNT_SCALE

    # Guard against values that cannot be represented as int64. This happens
    # when X is not actually on a log2 scale (2**X explodes) or contains +inf.
    observed = linear_scaled[~missing_mask]
    int64_limit = float(np.iinfo(np.int64).max)
    if observed.size and (not np.all(np.isfinite(observed)) or observed.max() >= int64_limit):
        raise ValueError(
            "Back-transformed values overflow int64; adata.X does not look "
            "log2-transformed, so pseudo-counts cannot be derived from it"
        )

    # Casting NaN to an integer is undefined (typically yields a huge negative
    # number), so missing measurements are written as a count of 0 instead.
    # NOTE(review): confirm that 0 is the desired encoding for missing values
    # in the downstream model; imputing or filtering are the alternatives.
    linear_scaled[missing_mask] = 0
    n_missing = int(missing_mask.sum())
    if n_missing:
        print(f"⚠ {n_missing} missing values were set to 0 in the counts layer")

    # Round to integers for DESeq2 (explicit int64: plain int is platform-dependent)
    adata_pertpy.layers['counts'] = np.round(linear_scaled).astype(np.int64)
else:
    adata_pertpy.layers['counts'] = adata_pertpy.layers['raw_counts']

print(f"PertPy-ready data shape: {adata_pertpy.shape}")
print(f"Counts layer added: {adata_pertpy.layers['counts'].shape}")

## 8. Define Protein Sets for Analysis

In [None]:
# Define key protein sets for claims analysis.
# NOTE(review): several entries are alias or legacy symbols rather than the
# current HGNC gene symbols (ANT1/ANT2 = SLC25A4/SLC25A5, LC3A/LC3B =
# MAP1LC3A/MAP1LC3B, ATP5A1/ATP5B = ATP5F1A/ATP5F1B). If the dataset uses the
# current symbols these will be reported as missing below — confirm against
# the names in adata.var before relying on the set sizes.
protein_sets = {
    'ups_proteins': [
        'PSMA1', 'PSMA2', 'PSMA3', 'PSMA4', 'PSMA5', 'PSMA6', 'PSMA7',
        'PSMB1', 'PSMB2', 'PSMB3', 'PSMB4', 'PSMB5', 'PSMB6', 'PSMB7',
        'PSMC1', 'PSMC2', 'PSMC3', 'PSMC4', 'PSMC5', 'PSMC6',
        'PSMD1', 'PSMD2', 'PSMD3', 'PSMD4', 'UBB', 'UBC', 'UCHL1', 'USP14'
    ],
    'mitochondrial': [
        'COX4I1', 'COX5A', 'COX6C', 'ATP5A1', 'ATP5B', 'VDAC1', 'VDAC2',
        'ANT1', 'ANT2', 'TOMM20', 'TOMM40', 'TIMM23', 'TIMM44'
    ],
    'autophagy': [
        'SQSTM1', 'LC3A', 'LC3B', 'GABARAP', 'GABARAPL1', 'GABARAPL2',
        'ATG5', 'ATG7', 'ATG12', 'BECN1', 'WIPI2'
    ],
    'vatpase': [
        'ATP6V0A1', 'ATP6V0A2', 'ATP6V0B', 'ATP6V0C', 'ATP6V0D1',
        'ATP6V1A', 'ATP6V1B1', 'ATP6V1B2', 'ATP6V1C1', 'ATP6V1D'
    ]
}

# Check availability of protein sets.
# A set gives constant-time membership tests; the missing names are kept in
# the order they were declared so the printed report is reproducible.
available_names = set(protein_list)
for set_name, proteins in protein_sets.items():
    available = [p for p in proteins if p in available_names]
    print(f"{set_name}: {len(available)}/{len(proteins)} proteins found")
    missing = [p for p in proteins if p not in available_names]
    if missing:
        if len(missing) > 5:
            # Long lists are truncated to the first five names
            print(f"  Missing: {missing[:5]}...")
        else:
            print(f"  Missing: {missing}")

## 9. Save Prepared Data

In [None]:
# Save the prepared data
output_file = 'prepared_for_pertpy.h5ad'
adata_pertpy.write_h5ad(output_file)
print(f"Prepared data saved to: {output_file}")

# Save protein sets as JSON for reuse (json is imported with the other
# packages in the import cell at the top of the notebook)
with open('protein_sets.json', 'w', encoding='utf-8') as f:
    json.dump(protein_sets, f, indent=2)
print("Protein sets saved to: protein_sets.json")

# Create summary statistics (printed only; nothing is written to disk here)
obs_columns = adata_pertpy.obs.columns
summary = {
    'n_samples': adata_pertpy.n_obs,
    'n_proteins': adata_pertpy.n_vars,
    'tau_positive': int(adata_pertpy.obs['tau_positive'].sum()),
    'tau_negative': int((adata_pertpy.obs['tau_positive'] == 0).sum()),
    'has_pseudotime': 'pseudotime' in obs_columns,
    'has_mc1': 'mc1_score' in obs_columns,
    # NOTE(review): this label is hard-coded, not derived from the
    # log-transform check performed earlier — confirm it holds for the input.
    'data_format': 'log2_transformed',
    'counts_layer': 'counts' in adata_pertpy.layers
}

print("\n" + "="*50)
print("Data Preparation Summary:")
print("="*50)
for key, value in summary.items():
    print(f"{key}: {value}")

## 10. Visualize Data Distribution

In [None]:
# Create visualization of data distribution: a 2x2 overview figure that is
# shown inline and also written to data_distribution_overview.png.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

expr_values = np.asarray(adata_pertpy.X, dtype=float)

# Plot 1: Expression distribution.
# Histogram binning fails when the input contains NaN (the automatic range
# becomes non-finite), so only finite measurements are passed in. Boolean
# indexing also flattens the matrix to 1-D.
finite_values = expr_values[np.isfinite(expr_values)]
axes[0, 0].hist(finite_values, bins=50, alpha=0.7, color='blue')
axes[0, 0].set_xlabel('Log2 Expression')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Global Expression Distribution')

# Plot 2: Sample-wise mean expression (NaN-aware mean per row)
sample_means = np.nanmean(expr_values, axis=1)
tau_colors = ['red' if x == 1 else 'blue' for x in adata_pertpy.obs['tau_positive']]
axes[0, 1].scatter(range(len(sample_means)), sample_means, c=tau_colors, alpha=0.6)
axes[0, 1].set_xlabel('Sample Index')
axes[0, 1].set_ylabel('Mean Log2 Expression')
axes[0, 1].set_title('Sample-wise Mean Expression (Red=Tau+, Blue=Tau-)')

# Plot 3: Protein coverage = fraction of samples with a non-missing value
protein_coverage = np.sum(~np.isnan(expr_values), axis=0) / adata_pertpy.n_obs
axes[1, 0].hist(protein_coverage, bins=50, alpha=0.7, color='green')
axes[1, 0].set_xlabel('Coverage (fraction of samples)')
axes[1, 0].set_ylabel('Number of Proteins')
axes[1, 0].set_title('Protein Coverage Distribution')

# Plot 4: Tau status vs pseudotime (if available)
has_mc1 = 'mc1_score' in adata_pertpy.obs.columns
if 'pseudotime' in adata_pertpy.obs.columns:
    for tau_val, color, label in [(0, 'blue', 'Tau-'), (1, 'red', 'Tau+')]:
        mask = adata_pertpy.obs['tau_positive'] == tau_val
        x_values = adata_pertpy.obs.loc[mask, 'pseudotime']
        # Without an MC1 column the points are drawn on a flat zero line
        y_values = adata_pertpy.obs.loc[mask, 'mc1_score'] if has_mc1 else np.zeros(mask.sum())
        axes[1, 1].scatter(x_values, y_values, c=color, label=label, alpha=0.6)
    axes[1, 1].set_xlabel('Pseudotime')
    # Do not present the zero placeholder as real MC1 measurements
    axes[1, 1].set_ylabel('MC1 Score' if has_mc1 else 'MC1 Score (not available)')
    axes[1, 1].set_title('Disease Progression Markers')
    axes[1, 1].legend()
else:
    axes[1, 1].text(0.5, 0.5, 'Pseudotime not available',
                    ha='center', va='center', transform=axes[1, 1].transAxes)

plt.tight_layout()
plt.savefig('data_distribution_overview.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✓ Data preparation complete!")
print("Ready for PertPy differential expression analysis")

## Summary

This notebook has prepared the proteomics data for PertPy analysis:

1. ✓ Loaded pool_processed_v2.h5ad dataset
2. ✓ Standardized metadata columns
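3. ✓ Created design matrix variables (tau_status, tau_positive, pseudotime, mc1_score)
4. ✓ Generated a counts layer for DESeq2 analysis (pseudo-counts back-transformed from the log2 values when no raw counts are stored)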
5. ✓ Defined protein sets for claims analysis
6. ✓ Saved prepared data for downstream analysis

The data is now ready for differential expression analysis using PertPy's PyDESeq2 implementation.