# PDX Biomarker Analysis

This notebook performs comprehensive biomarker analysis on PDX data including:
- Differential gene expression analysis
- Pathway enrichment analysis (planned; not yet implemented in the cells below)
- Correlation with tumor growth metrics
- Visualizations (volcano plot, expression heatmap, PCA and scree plots)

## Analysis Overview

We will analyze gene expression data from PDX models to identify biomarkers associated with treatment response. The analysis includes both statistical testing and biological interpretation of results.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import spearmanr, pearsonr
import warnings
warnings.filterwarnings('ignore')

# Plot styling: stock matplotlib theme, seaborn "husl" palette, shared figure defaults
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

# Seed NumPy's global RNG so reruns give the same numbers
np.random.seed(42)

# Report library versions so results can be traced back to an environment
print("Libraries imported successfully!")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"Matplotlib version: {plt.matplotlib.__version__}")
print(f"Seaborn version: {sns.__version__}")

## 1. Data Loading and Preprocessing

Load the expression data and tumor growth data for integrated analysis.

In [None]:
# Load expression data (genes as rows, samples as columns) and tumor volume measurements
try:
    expression_data = pd.read_csv('../data/expression_tpm_mock.csv', index_col=0)
    print(f"Expression data loaded: {expression_data.shape[0]} genes x {expression_data.shape[1]} samples")
    print(f"Sample names: {list(expression_data.columns)}")
    
    # Load tumor volume data
    tumor_data = pd.read_csv('../data/tumor_volumes_mock.csv')
    print(f"Tumor data loaded: {len(tumor_data)} measurements")
    print(f"Models: {sorted(tumor_data['Model'].unique())}")
    print(f"Treatment arms: {sorted(tumor_data['Arm'].unique())}")
    
except FileNotFoundError as e:
    print(f"Error loading data: {e}")
    print("Please ensure data files are in the correct location")
    # Re-raise after printing the hint: every later statement needs both tables,
    # so carrying on would only surface as a confusing NameError further down.
    raise

# Display basic statistics
print("\n=== Expression Data Summary ===")
print(f"Mean expression across all genes: {expression_data.values.mean():.2f}")
print(f"Median expression: {np.median(expression_data.values):.2f}")
print(f"Expression range: {expression_data.values.min():.2f} - {expression_data.values.max():.2f}")

print("\n=== Tumor Data Summary ===") 
print(tumor_data.groupby('Arm')['Volume_mm3'].describe())

In [None]:
# Create sample metadata from tumor data: one (Sample, Arm) row per model at baseline (Day 0).
# drop_duplicates() collapses repeated baseline rows for the same model/arm pair.
sample_metadata = (
    tumor_data.loc[tumor_data['Day'] == 0, ['Model', 'Arm']]
    .drop_duplicates()
    .rename(columns={'Model': 'Sample'})
    .set_index('Sample')
)

# A sample that still appears twice is listed under more than one arm. The
# downstream group comparisons need exactly one arm per sample, so stop here
# with a clear message instead of silently mislabelling samples.
ambiguous_samples = sample_metadata.index[sample_metadata.index.duplicated()].unique()
if len(ambiguous_samples) > 0:
    raise ValueError(
        f"Samples assigned to more than one arm at Day 0: {sorted(ambiguous_samples)}"
    )

# Ensure expression samples match metadata.
# Walk the expression columns in their file order (de-duplicated) rather than
# going through a set intersection, whose ordering changes between kernel runs.
common_samples = [
    sample for sample in dict.fromkeys(expression_data.columns)
    if sample in sample_metadata.index
]
print(f"Samples with both expression and metadata: {len(common_samples)}")

# Filter data to common samples
expression_filtered = expression_data[common_samples]
metadata_filtered = sample_metadata.loc[common_samples]

print(f"Final expression data: {expression_filtered.shape}")
print(f"Sample groups: {metadata_filtered['Arm'].value_counts().to_dict()}")

# Log transform expression data (log2(TPM + 1))
expression_log = np.log2(expression_filtered + 1)
print(f"Log-transformed expression range: {expression_log.values.min():.2f} - {expression_log.values.max():.2f}")

## 2. Differential Gene Expression Analysis

Identify genes that are differentially expressed between treatment and control groups.

In [None]:
def perform_differential_expression(expression_data, metadata, group_col='Arm', 
                                   control_group='control', treatment_group='treatment'):
    """
    Perform differential expression analysis using t-tests
    """
    results = []
    
    control_samples = metadata[metadata[group_col] == control_group].index
    treatment_samples = metadata[metadata[group_col] == treatment_group].index
    
    print(f"Control samples: {len(control_samples)}")
    print(f"Treatment samples: {len(treatment_samples)}")
    
    for gene in expression_data.index:
        control_expr = expression_data.loc[gene, control_samples]
        treatment_expr = expression_data.loc[gene, treatment_samples]
        
        # Calculate fold change and log2 fold change
        mean_control = control_expr.mean()
        mean_treatment = treatment_expr.mean()
        fold_change = mean_treatment / mean_control if mean_control > 0 else np.inf
        log2_fc = np.log2(fold_change) if fold_change > 0 and np.isfinite(fold_change) else np.nan
        
        # Perform t-test
        try:
            t_stat, p_value = stats.ttest_ind(treatment_expr, control_expr)
        except:
            t_stat, p_value = np.nan, 1.0
        
        results.append({
            'Gene': gene,
            'Mean_Control': mean_control,
            'Mean_Treatment': mean_treatment,
            'FoldChange': fold_change,
            'Log2FoldChange': log2_fc,
            'T_statistic': t_stat,
            'P_value': p_value
        })
    
    results_df = pd.DataFrame(results)
    
    # Multiple testing correction (Benjamini-Hochberg)
    from scipy.stats import false_discovery_control
    results_df['P_adjusted'] = false_discovery_control(results_df['P_value'].fillna(1.0))
    
    # Add significance flags
    results_df['Significant'] = (results_df['P_adjusted'] < 0.05) & (np.abs(results_df['Log2FoldChange']) > 1)
    
    return results_df

# Perform differential expression analysis
deg_results = perform_differential_expression(expression_log, metadata_filtered)

print(f"Differential expression analysis completed for {len(deg_results)} genes")
print(f"Significant genes (|log2FC| > 1, FDR < 0.05): {deg_results['Significant'].sum()}")

# Display top upregulated and downregulated genes
deg_significant = deg_results[deg_results['Significant']].copy()
deg_significant = deg_significant.sort_values('Log2FoldChange', ascending=False)

# Split by the sign of the fold change. Taking head/tail of one sorted table
# would list the same genes in both tables (and show upregulated genes as
# "downregulated") whenever there are few significant genes.
top_upregulated = deg_significant[deg_significant['Log2FoldChange'] > 0]
top_downregulated = (
    deg_significant[deg_significant['Log2FoldChange'] < 0]
    .sort_values('Log2FoldChange', ascending=True)  # most strongly down first
)

print("\n=== Top 10 Upregulated Genes ===")
print(top_upregulated[['Gene', 'Log2FoldChange', 'P_adjusted']].head(10))

print("\n=== Top 10 Downregulated Genes ===")
print(top_downregulated[['Gene', 'Log2FoldChange', 'P_adjusted']].head(10))

## 3. Data Visualization

Create publication-quality plots: a volcano plot and a heatmap of the top differentially expressed genes. PCA follows in Section 4.

In [None]:
# Create volcano plot
def create_volcano_plot(deg_results, title="Volcano Plot"):
    """Create a volcano plot for differential expression results.

    Parameters
    ----------
    deg_results : pd.DataFrame
        Must contain 'Log2FoldChange', 'P_adjusted' and 'Significant' columns.
    title : str
        Title drawn above the axes.

    Returns
    -------
    matplotlib.figure.Figure
        Figure with log2 fold change on x and -log10(adjusted p) on y;
        significant genes are red, all others light gray.
    """
    
    fig, ax = plt.subplots(figsize=(10, 8))
    
    # Keep rows that can be plotted: finite fold change (np.isfinite is False
    # for NaN and +/-inf) and a non-missing adjusted p-value.
    plottable = np.isfinite(deg_results['Log2FoldChange']) & deg_results['P_adjusted'].notna()

    # .assign() builds a new frame, so the caller's deg_results is never
    # written to through a filtered slice (chained-assignment hazard).
    plot_data = deg_results.loc[plottable].assign(
        NegLog10P=lambda d: -np.log10(d['P_adjusted'] + 1e-10)  # offset avoids log(0)
    )
    
    # Color points based on significance
    colors = ['red' if sig else 'lightgray' for sig in plot_data['Significant']]
    
    # Create scatter plot
    ax.scatter(plot_data['Log2FoldChange'], plot_data['NegLog10P'], 
               c=colors, alpha=0.6, s=30)
    
    # Add significance thresholds
    ax.axhline(y=-np.log10(0.05), color='blue', linestyle='--', alpha=0.7, label='FDR = 0.05')
    ax.axvline(x=1, color='blue', linestyle='--', alpha=0.7)
    ax.axvline(x=-1, color='blue', linestyle='--', alpha=0.7, label='|log2FC| = 1')
    
    # Labels and formatting
    ax.set_xlabel('Log2 Fold Change (Treatment vs Control)')
    ax.set_ylabel('-Log10(Adjusted P-value)')
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)
    
    # Add text with number of significant genes
    n_sig = plot_data['Significant'].sum()
    ax.text(0.02, 0.98, f'Significant genes: {n_sig}', transform=ax.transAxes, 
            verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    
    plt.tight_layout()
    return fig

# Render the volcano plot for the differential expression table
volcano_fig = create_volcano_plot(deg_results)
plt.show()

n_significant_genes = deg_results['Significant'].sum()
print(f"Volcano plot created showing {n_significant_genes} significant genes")

In [None]:
# Create heatmap of top differentially expressed genes
def create_expression_heatmap(expression_data, metadata, deg_results, top_n=40):
    """Create heatmap of top differentially expressed genes.

    Parameters
    ----------
    expression_data : pd.DataFrame
        Expression matrix with genes as rows and samples as columns.
    metadata : pd.DataFrame
        Sample annotations indexed by sample name with an 'Arm' column.
    deg_results : pd.DataFrame
        Differential expression table with 'Gene', 'Log2FoldChange' and
        'Significant' columns.
    top_n : int
        Maximum number of significant genes to show, ranked by |log2FC|.

    Returns
    -------
    matplotlib.figure.Figure or None
        The heatmap figure, or None when no gene is flagged significant.
    """
    
    # Get top significant genes
    significant_genes = deg_results[deg_results['Significant']].copy()
    if len(significant_genes) == 0:
        print("No significant genes found for heatmap")
        return None
    
    # Select top genes by absolute log2 fold change
    significant_genes['AbsLog2FC'] = np.abs(significant_genes['Log2FoldChange'])
    top_genes = significant_genes.nlargest(min(top_n, len(significant_genes)), 'AbsLog2FC')
    
    # Prepare data for heatmap
    heatmap_data = expression_data.loc[top_genes['Gene'], :]
    
    # Z-score normalization (row-wise): transpose so each gene is a column,
    # standardize per column, then transpose back.
    heatmap_data_zscore = heatmap_data.T
    heatmap_data_zscore = (heatmap_data_zscore - heatmap_data_zscore.mean()) / heatmap_data_zscore.std()
    heatmap_data_zscore = heatmap_data_zscore.T
    
    # Create annotation for sample groups; arms outside the two known labels
    # fall back to gray instead of raising KeyError.
    sample_colors = {'control': 'lightblue', 'treatment': 'orange'}
    col_colors = [sample_colors.get(metadata.loc[sample, 'Arm'], 'lightgray')
                  for sample in heatmap_data.columns]
    
    # Create heatmap
    fig, ax = plt.subplots(figsize=(12, max(8, len(top_genes) * 0.3)))
    
    sns.heatmap(heatmap_data_zscore, 
                cmap='RdBu_r', center=0, 
                xticklabels=True, yticklabels=True,
                cbar_kws={'label': 'Z-score'},
                ax=ax)

    # Show each sample's group by shading its tick label with the group color.
    # Without this the legend below would describe colors that never appear.
    for tick_label, group_color in zip(ax.get_xticklabels(), col_colors):
        tick_label.set_bbox(dict(facecolor=group_color, edgecolor='none', pad=1.5))
    
    # Legend for the sample-group colors
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=sample_colors['control'], label='Control'),
                      Patch(facecolor=sample_colors['treatment'], label='Treatment')]
    ax.legend(handles=legend_elements, loc='upper left', bbox_to_anchor=(1.05, 1))
    
    ax.set_title(f'Heatmap of Top {len(top_genes)} Differentially Expressed Genes')
    ax.set_xlabel('Samples')
    ax.set_ylabel('Genes')
    
    plt.tight_layout()
    return fig

# Build the heatmap; the helper returns a falsy value when there is nothing to draw
heatmap_fig = create_expression_heatmap(expression_log, metadata_filtered, deg_results, top_n=40)
if not heatmap_fig:
    print("Could not create heatmap - no significant genes found")
else:
    plt.show()

## 4. Principal Component Analysis (PCA)

Perform PCA to visualize sample relationships and identify patterns in the data.

In [None]:
# Perform PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def perform_pca_analysis(expression_data, metadata, n_components=None):
    """Perform PCA on expression data.

    Parameters
    ----------
    expression_data : pd.DataFrame
        Expression matrix with genes as rows and samples as columns.
    metadata : pd.DataFrame
        Sample annotations indexed by sample name; joined onto the scores.
    n_components : int or None
        Number of components to keep. When None, uses the smallest of
        (samples - 1), the number of genes, and 10.

    Returns
    -------
    (PCA, pd.DataFrame)
        The fitted PCA model and a per-sample frame with columns PC1..PCk
        followed by the metadata columns.

    Raises
    ------
    ValueError
        If n_components is None and the data cannot support even one
        component (fewer than two samples or no genes).
    """
    
    # Transpose data (samples as rows, genes as columns)
    data_for_pca = expression_data.T
    n_samples, n_features = data_for_pca.shape

    if n_components is None:
        # Cap by the number of genes as well: asking for more components than
        # features is invalid, which matters for small gene panels.
        n_components = min(n_samples - 1, n_features, 10)
        if n_components < 1:
            raise ValueError(
                f"PCA needs at least 2 samples and 1 gene; got {n_samples} samples and {n_features} genes"
            )
    
    # Standardize the data (per gene, across samples)
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data_for_pca)
    
    # Perform PCA
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(data_scaled)
    
    # Create PCA results dataframe
    pca_df = pd.DataFrame(pca_result, 
                         columns=[f'PC{i+1}' for i in range(n_components)],
                         index=data_for_pca.index)
    
    # Add metadata
    pca_df = pca_df.join(metadata)
    
    return pca, pca_df

# Perform PCA
pca_model, pca_results = perform_pca_analysis(expression_log, metadata_filtered)

# Take the component count from the fitted model; deriving it from the frame
# width is only right while the metadata has exactly one column.
print(f"PCA completed with {pca_model.n_components_} components")
print(f"Explained variance ratio: {pca_model.explained_variance_ratio_[:5]}")
print(f"Cumulative explained variance: {np.cumsum(pca_model.explained_variance_ratio_[:5])}")

# Create PCA plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# PC1 vs PC2 scatter plot
colors = {'control': 'blue', 'treatment': 'red'}
# dropna(): samples without metadata have a NaN arm and cannot be grouped.
# .get(): arms other than the two known labels are drawn in gray instead of
# raising KeyError.
for arm in pca_results['Arm'].dropna().unique():
    subset = pca_results[pca_results['Arm'] == arm]
    ax1.scatter(subset['PC1'], subset['PC2'], 
               c=colors.get(arm, 'gray'), label=arm, alpha=0.7, s=100)

ax1.set_xlabel(f'PC1 ({pca_model.explained_variance_ratio_[0]:.1%} variance)')
ax1.set_ylabel(f'PC2 ({pca_model.explained_variance_ratio_[1]:.1%} variance)')
ax1.set_title('PCA Plot: PC1 vs PC2')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Scree plot (explained variance)
ax2.plot(range(1, len(pca_model.explained_variance_ratio_) + 1), 
         pca_model.explained_variance_ratio_ * 100, 'bo-')
ax2.set_xlabel('Principal Component')
ax2.set_ylabel('Explained Variance (%)')
ax2.set_title('Scree Plot')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Integration with Tumor Growth Data

Correlate gene expression with tumor growth metrics to identify predictive biomarkers.

In [None]:
# Calculate growth metrics for each model
def calculate_growth_metrics(tumor_data):
    """Calculate growth metrics from tumor volume data.

    Parameters
    ----------
    tumor_data : pd.DataFrame
        Long-format measurements with 'Model', 'Arm', 'Day' and 'Volume_mm3'
        columns.

    Returns
    -------
    pd.DataFrame
        One row per model with columns Model, Arm, InitialVolume, FinalVolume,
        FoldChange, GrowthRate (slope of ln(volume + 1) against day),
        DoublingTime (ln 2 / GrowthRate, inf when the slope is not positive)
        and NumTimepoints. Models with fewer than two usable measurements or
        fewer than two distinct days are skipped. The columns are present
        even when no model qualifies.
    """
    
    metric_columns = ['Model', 'Arm', 'InitialVolume', 'FinalVolume',
                      'FoldChange', 'GrowthRate', 'DoublingTime', 'NumTimepoints']
    growth_metrics = []
    
    for model, group in tumor_data.groupby('Model'):
        # Missing days/volumes would turn the fitted slope into NaN, so drop them first
        group = group.dropna(subset=['Day', 'Volume_mm3']).sort_values('Day')
        
        # A line fit needs at least two distinct time points
        if len(group) < 2 or group['Day'].nunique() < 2:
            continue
            
        # Calculate growth rate using linear regression on log-transformed data
        log_volumes = np.log(group['Volume_mm3'] + 1)
        days = group['Day']
        
        try:
            # Linear fit: log(volume) = intercept + slope * day
            coeffs = np.polyfit(days, log_volumes, 1)
            growth_rate = coeffs[0]  # slope
            
            # Calculate doubling time
            doubling_time = np.log(2) / growth_rate if growth_rate > 0 else np.inf
            
            # Other metrics
            initial_volume = group['Volume_mm3'].iloc[0]
            final_volume = group['Volume_mm3'].iloc[-1]
            # A zero baseline has no meaningful fold change; NaN keeps it out
            # of group means instead of turning them into inf.
            fold_change = final_volume / initial_volume if initial_volume > 0 else np.nan
            
            growth_metrics.append({
                'Model': model,
                'Arm': group['Arm'].iloc[0],
                'InitialVolume': initial_volume,
                'FinalVolume': final_volume,
                'FoldChange': fold_change,
                'GrowthRate': growth_rate,
                'DoublingTime': doubling_time,
                'NumTimepoints': len(group)
            })
            
        except Exception as e:
            # Best effort: report the model that failed and keep going
            print(f"Error calculating growth for model {model}: {e}")
            continue
    
    # Explicit columns keep the schema stable when nothing was computed, so
    # callers can still group by 'Arm' or read 'Model' without a KeyError.
    return pd.DataFrame(growth_metrics, columns=metric_columns)

# Derive per-model growth metrics and summarize them by treatment arm
growth_metrics = calculate_growth_metrics(tumor_data)
print(f"Growth metrics calculated for {len(growth_metrics)} models")
print("\n=== Growth Metrics Summary ===")
arm_means = growth_metrics.groupby('Arm')[['GrowthRate', 'DoublingTime', 'FoldChange']].mean()
print(arm_means)

# Models present in both the growth table and the expression matrix
common_models = list(set(growth_metrics['Model']) & set(expression_log.columns))
n_common_models = len(common_models)
print(f"\nModels with both expression and growth data: {n_common_models}")

if n_common_models > 0:
    has_expression = growth_metrics['Model'].isin(common_models)
    growth_for_corr = growth_metrics[has_expression].set_index('Model')
    expression_for_corr = expression_log[common_models]
    
    n_genes_for_corr = expression_for_corr.shape[0]
    print(f"Data ready for correlation analysis: {n_genes_for_corr} genes x {n_common_models} models")

In [None]:
# Correlation analysis between gene expression and growth metrics
def correlate_expression_with_growth(expression_data, growth_metrics, metric='GrowthRate'):
    """Correlate gene expression with growth metrics"""
    
    correlations = []
    
    for gene in expression_data.index:
        gene_expr = expression_data.loc[gene, growth_metrics.index]
        growth_values = growth_metrics[metric]
        
        # Remove any missing values
        valid_mask = ~(np.isnan(gene_expr) | np.isnan(growth_values) | np.isinf(growth_values))
        
        if valid_mask.sum() < 3:  # Need at least 3 points for correlation
            continue
            
        gene_expr_clean = gene_expr[valid_mask]
        growth_clean = growth_values[valid_mask]
        
        try:
            # Calculate Spearman correlation (rank-based, more robust)
            spear_corr, spear_p = spearmanr(gene_expr_clean, growth_clean)
            
            # Calculate Pearson correlation
            pears_corr, pears_p = pearsonr(gene_expr_clean, growth_clean)
            
            correlations.append({
                'Gene': gene,
                'Spearman_r': spear_corr,
                'Spearman_p': spear_p,
                'Pearson_r': pears_corr,
                'Pearson_p': pears_p,
                'N_samples': len(gene_expr_clean)
            })
            
        except Exception as e:
            continue
    
    corr_df = pd.DataFrame(correlations)
    
    # Multiple testing correction
    if len(corr_df) > 0:
        corr_df['Spearman_p_adj'] = false_discovery_control(corr_df['Spearman_p'].fillna(1.0))
        corr_df['Pearson_p_adj'] = false_discovery_control(corr_df['Pearson_p'].fillna(1.0))
        
        # Flag significant correlations
        corr_df['Significant_Spearman'] = (corr_df['Spearman_p_adj'] < 0.05) & (np.abs(corr_df['Spearman_r']) > 0.5)
        corr_df['Significant_Pearson'] = (corr_df['Pearson_p_adj'] < 0.05) & (np.abs(corr_df['Pearson_r']) > 0.5)
    
    return corr_df

# A correlation needs at least three models; otherwise fall back to an empty table
if len(common_models) < 3:
    print("Insufficient samples for correlation analysis")
    growth_correlations = pd.DataFrame()
else:
    # Correlate every gene with the per-model growth rate
    growth_correlations = correlate_expression_with_growth(expression_for_corr, growth_for_corr, 'GrowthRate')

    n_spearman_hits = growth_correlations['Significant_Spearman'].sum()
    n_pearson_hits = growth_correlations['Significant_Pearson'].sum()
    print(f"Correlation analysis completed for {len(growth_correlations)} genes")
    print(f"Significant Spearman correlations: {n_spearman_hits}")
    print(f"Significant Pearson correlations: {n_pearson_hits}")

    # Rank genes by the magnitude of their Spearman coefficient, strongest first
    growth_correlations_sorted = growth_correlations.sort_values('Spearman_r', key=abs, ascending=False)

    print("\n=== Top 10 Genes Correlated with Growth Rate ===")
    top_columns = ['Gene', 'Spearman_r', 'Spearman_p_adj']
    print(growth_correlations_sorted[top_columns].head(10))

## 6. Summary and Export Results

Generate summary statistics and save results for further analysis.

In [None]:
# Create comprehensive summary of every analysis stage, then write results to ../results
print("="*60)
print("PDX BIOMARKER ANALYSIS SUMMARY")
print("="*60)

print(f"\n📊 DATA OVERVIEW:")
print(f"   • Expression data: {expression_log.shape[0]} genes × {expression_log.shape[1]} samples")
print(f"   • Tumor data: {len(tumor_data)} measurements from {len(tumor_data['Model'].unique())} models")
print(f"   • Treatment groups: {list(tumor_data['Arm'].unique())}")

print(f"\n🧬 DIFFERENTIAL EXPRESSION:")
print(f"   • Total genes analyzed: {len(deg_results)}")
print(f"   • Significant genes (|log2FC|>1, FDR<0.05): {deg_results['Significant'].sum()}")
if deg_results['Significant'].sum() > 0:
    sig_up = (deg_results['Significant'] & (deg_results['Log2FoldChange'] > 0)).sum()
    sig_down = (deg_results['Significant'] & (deg_results['Log2FoldChange'] < 0)).sum()
    print(f"   • Upregulated in treatment: {sig_up}")
    print(f"   • Downregulated in treatment: {sig_down}")

print(f"\n📈 TUMOR GROWTH ANALYSIS:")
print(f"   • Models with growth data: {len(growth_metrics)}")
if len(growth_metrics) > 0:
    ctrl_growth = growth_metrics[growth_metrics['Arm'] == 'control']['GrowthRate'].mean()
    trt_growth = growth_metrics[growth_metrics['Arm'] == 'treatment']['GrowthRate'].mean()
    print(f"   • Mean growth rate (control): {ctrl_growth:.4f} log(volume)/day")
    print(f"   • Mean growth rate (treatment): {trt_growth:.4f} log(volume)/day")
    # The inhibition percentage is relative to the control rate, so it is
    # undefined when that rate is zero or missing (e.g. no control models).
    if pd.notna(ctrl_growth) and ctrl_growth != 0:
        print(f"   • Growth inhibition: {((ctrl_growth - trt_growth) / ctrl_growth * 100):.1f}%")
    else:
        print("   • Growth inhibition: n/a (control growth rate is zero or undefined)")

print(f"\n🔗 EXPRESSION-GROWTH CORRELATIONS:")
if len(growth_correlations) > 0:
    print(f"   • Genes tested for correlation: {len(growth_correlations)}")
    print(f"   • Significant correlations (Spearman): {growth_correlations['Significant_Spearman'].sum()}")
    if growth_correlations['Significant_Spearman'].sum() > 0:
        # Pick the significant gene with the largest |r|; the table is in
        # gene order, so its first significant row is not the strongest one.
        significant_corr = growth_correlations.loc[growth_correlations['Significant_Spearman']]
        top_corr = significant_corr.loc[significant_corr['Spearman_r'].abs().idxmax()]
        print(f"   • Top correlated gene: {top_corr['Gene']} (r = {top_corr['Spearman_r']:.3f})")

print(f"\n📉 PCA RESULTS:")
print(f"   • PC1 explains {pca_model.explained_variance_ratio_[0]:.1%} of variance")
print(f"   • PC2 explains {pca_model.explained_variance_ratio_[1]:.1%} of variance")
print(f"   • Total variance explained (first 2 PCs): {sum(pca_model.explained_variance_ratio_[:2]):.1%}")

print(f"\n💾 SAVING RESULTS...")

# Save results to files
import os
os.makedirs('../results', exist_ok=True)

# Save differential expression results
deg_results.to_csv('../results/differential_expression_results.csv', index=False)
print("   • Differential expression results saved to ../results/differential_expression_results.csv")

# Save growth metrics
growth_metrics.to_csv('../results/tumor_growth_metrics.csv', index=False)
print("   • Growth metrics saved to ../results/tumor_growth_metrics.csv")

# Save correlation results
if len(growth_correlations) > 0:
    growth_correlations.to_csv('../results/expression_growth_correlations.csv', index=False)
    print("   • Expression-growth correlations saved to ../results/expression_growth_correlations.csv")

# Save figures (only those that were actually created earlier in the session)
if 'volcano_fig' in locals():
    volcano_fig.savefig('../results/volcano_plot.png', dpi=300, bbox_inches='tight')
    print("   • Volcano plot saved to ../results/volcano_plot.png")

if 'heatmap_fig' in locals() and heatmap_fig is not None:
    heatmap_fig.savefig('../results/expression_heatmap.png', dpi=300, bbox_inches='tight')
    print("   • Expression heatmap saved to ../results/expression_heatmap.png")

print(f"\n✅ ANALYSIS COMPLETE!")
print("   All results have been saved to the ../results/ directory.")
print("   Review the saved files for detailed results and visualizations.")