# PDX Integrated Analysis

This notebook demonstrates how to integrate multiple data types (tumor volumes, gene expression, and variants) for comprehensive PDX analysis and biomarker discovery.

## Learning Objectives
- Integrate multi-modal PDX data
- Identify molecular determinants of drug response
- Create integrated visualizations
- Build predictive models using combined features

In [None]:
# Import required libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
# Suppress all warnings for the rest of the session (this also hides
# library deprecation and numerical warnings)
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib defaults with seaborn's "husl" colour cycle
plt.style.use('default')
sns.set_palette("husl")

# Report the library versions in use for reproducibility
print("Libraries loaded successfully!")
library_versions = [
    ("Pandas", pd.__version__),
    ("NumPy", np.__version__),
    ("Matplotlib", plt.matplotlib.__version__),
    ("Seaborn", sns.__version__),
]
for library_name, library_version in library_versions:
    print(f"{library_name}: {library_version}")

## 1. Data Loading and Integration

Load all three data types and prepare them for integrated analysis.

In [None]:
# Load tumor volume data: the realistic dataset is preferred (recommended),
# the effective dataset is the fallback when the first file is missing.
tumor_sources = [
    ('../data/tumor_volumes_realistic.csv', "✓ Loaded realistic tumor volume data"),
    ('../data/tumor_volumes_effective.csv', "✓ Loaded effective tumor volume data"),
]

for attempt, (tumor_path, loaded_message) in enumerate(tumor_sources, start=1):
    try:
        tumor_df = pd.read_csv(tumor_path)
    except FileNotFoundError:
        if attempt < len(tumor_sources):
            continue  # another candidate file is still available
        # Every candidate was missing: explain how to generate the data, then fail
        print("❌ Error: No tumor volume data found. Please run:")
        print("python ../src/python/generate_realistic_pdx_data.py")
        raise
    print(loaded_message)
    break

print(f"Tumor data shape: {tumor_df.shape}")
tumor_df.head()

## 2. Response Metric Calculation

Calculate treatment response metrics for each PDX model. The cell below loads the gene expression data used in the later analyses.

In [None]:
# Load realistic gene expression data
expression_file = '../data/expression_tpm_realistic.csv'
try:
    expr_df = pd.read_csv(expression_file)
except FileNotFoundError:
    # Point the reader at the generator script before re-raising
    print(f"❌ Error: Realistic expression data not found at {expression_file}")
    print("Please run: python ../src/python/generate_realistic_pdx_data.py")
    raise
else:
    print("✅ Loaded realistic expression data")

expr_row_count, expr_column_count = expr_df.shape
print(f"Expression data shape: {expr_df.shape}")
# One column is not counted as a gene, so it is subtracted from the column count
print(f"Genes: {expr_column_count-1}, Samples: {expr_row_count}")
expr_df.head()

## 3. Gene Expression Analysis

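Identify genes associated with treatment response. The first cell below loads the variant data; the differential expression analysis follows in the next cell.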

In [None]:
# Load realistic variant data
variant_file = '../data/variants_realistic.csv'
try:
    variant_df = pd.read_csv(variant_file)
except FileNotFoundError:
    # Point the reader at the generator script before re-raising
    print(f"❌ Error: Realistic variant data not found at {variant_file}")
    print("Please run: python ../src/python/generate_realistic_pdx_data.py")
    raise
else:
    print("✅ Loaded realistic variant data")

print(f"Variant data shape: {variant_df.shape}")
variant_df.head()

In [None]:
# Differential expression analysis
def find_response_associated_genes(expr_data, response_data, min_expression=1.0, alpha=0.05):
    """
    Find genes whose expression differs between responders and non-responders.

    Every gene whose mean expression over all rows of ``expr_data`` is at
    least ``min_expression`` is compared between the two groups with a
    two-sided Mann-Whitney U test. P-values are Bonferroni-adjusted over the
    number of genes actually tested.

    Parameters
    ----------
    expr_data : pandas.DataFrame
        Expression matrix with one column per gene. Its index must contain
        the index labels of ``response_data``.
    response_data : pandas.DataFrame
        Must provide a boolean ``IsResponder`` column.
    min_expression : float, default 1.0
        Minimum mean expression a gene needs in order to be tested.
    alpha : float, default 0.05
        Threshold applied to both the raw and the adjusted p-values.

    Returns
    -------
    pandas.DataFrame
        One row per tested gene with the columns ``Gene``, ``ResponderMean``,
        ``NonResponderMean``, ``FoldChange``, ``Log2FC``, ``PValue``,
        ``Significant``, ``AdjustedPValue`` and ``SignificantAdj``, sorted by
        ``PValue``. The frame is empty (but keeps these columns) when no gene
        passes the expression filter or either group has fewer than two
        samples.
    """
    result_columns = [
        'Gene', 'ResponderMean', 'NonResponderMean', 'FoldChange', 'Log2FC',
        'PValue', 'Significant', 'AdjustedPValue', 'SignificantAdj',
    ]

    responders = response_data[response_data['IsResponder']].index
    non_responders = response_data[~response_data['IsResponder']].index

    print(f"Analyzing {len(responders)} responders vs {len(non_responders)} non-responders")

    # Filter genes with sufficient expression
    expressed_genes = expr_data.columns[expr_data.mean() >= min_expression]
    print(f"Analyzing {len(expressed_genes)} expressed genes")

    gene_results = []

    # The group sizes do not depend on the gene, so check them once; the test
    # needs at least two observations per group
    if len(responders) > 1 and len(non_responders) > 1:
        # Slice each group's rows once instead of once per gene
        responder_rows = expr_data.loc[responders]
        non_responder_rows = expr_data.loc[non_responders]

        for gene in expressed_genes:
            responder_expr = responder_rows[gene]
            non_responder_expr = non_responder_rows[gene]

            # Statistical test
            stat, pval = stats.mannwhitneyu(responder_expr, non_responder_expr,
                                            alternative='two-sided')

            # Effect size (fold change); a zero denominator is reported as
            # an infinite fold change with Log2FC set to 0
            mean_resp = responder_expr.mean()
            mean_non_resp = non_responder_expr.mean()
            fold_change = mean_resp / mean_non_resp if mean_non_resp > 0 else np.inf
            log2_fc = np.log2(fold_change) if fold_change > 0 and fold_change != np.inf else 0

            gene_results.append({
                'Gene': gene,
                'ResponderMean': mean_resp,
                'NonResponderMean': mean_non_resp,
                'FoldChange': fold_change,
                'Log2FC': log2_fc,
                'PValue': pval,
                'Significant': pval < alpha
            })

    # Nothing was tested: return an empty table with the expected schema
    # instead of failing on the missing 'PValue' column below
    if not gene_results:
        return pd.DataFrame(columns=result_columns)

    results_df = pd.DataFrame(gene_results)

    # Adjust p-values (Bonferroni), capped at 1
    results_df['AdjustedPValue'] = (results_df['PValue'] * len(results_df)).clip(upper=1.0)
    results_df['SignificantAdj'] = results_df['AdjustedPValue'] < alpha

    return results_df.sort_values('PValue')

# Run differential expression analysis
de_results = find_response_associated_genes(
    expr_data=expr_filtered,
    response_data=response_filtered,
)

# Headline counts: genes tested, nominal hits, hits surviving adjustment
print(f"\nDifferential Expression Results:")
print(f"Total genes tested: {len(de_results)}")
print(f"Significant genes (p < 0.05): {de_results['Significant'].sum()}")
print(f"Significant genes (adjusted p < 0.05): {de_results['SignificantAdj'].sum()}")

# Show top results (de_results is already sorted by p-value)
print("\nTop 10 genes associated with response:")
top_genes = de_results.head(10)
for hit in top_genes.itertuples(index=False):
    print(f"  {hit.Gene}: FC={hit.FoldChange:.2f}, p={hit.PValue:.3e}")

de_results.head()

## 4. Variant Analysis Integration

Analyze how genomic variants contribute to treatment response.

In [None]:
# Create variant matrix
def create_variant_matrix(variant_data, models):
    """
    Create a binary matrix of variant presence/absence.

    Parameters
    ----------
    variant_data : pandas.DataFrame
        Variant table with a ``Gene`` column and a ``Model`` column.
    models : sequence or pandas.Index
        Model identifiers that become the rows of the matrix.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``models`` with one int64 column ``"<gene>_mut"`` per gene
        found in ``variant_data`` (in order of first appearance). A cell is 1
        when that model has at least one variant row for the gene, else 0.
        Rows of ``variant_data`` with a missing ``Gene`` are ignored, so no
        placeholder column is created for them.
    """
    model_index = pd.Index(models)

    # Collect all columns first and build the frame in one step; inserting
    # columns one by one fragments the DataFrame when there are many genes.
    # groupby drops missing gene names and, with sort=False, keeps the genes
    # in order of first appearance.
    mutation_columns = {}
    for gene, gene_variants in variant_data.groupby('Gene', sort=False):
        mutated_models = gene_variants['Model'].unique()
        mutation_columns[f"{gene}_mut"] = model_index.isin(mutated_models).astype('int64')

    return pd.DataFrame(mutation_columns, index=model_index)

# Build the mutation presence/absence matrix, one row per model in common_models
variant_matrix = create_variant_matrix(
    variant_data=variant_df,
    models=common_models,
)
print(f"Variant matrix shape: {variant_matrix.shape}")

# Calculate variant-response associations
def analyze_variant_response_association(variant_matrix, response_data):
    """
    Analyze association between variants and treatment response.

    For every mutation column the response rate of mutated models is compared
    with that of wild-type models, and the 2x2 mutation-by-response table is
    tested with Fisher's exact test (two-sided).

    Parameters
    ----------
    variant_matrix : pandas.DataFrame
        Binary matrix with one ``"<gene>_mut"`` column per gene (1 = mutated,
        0 = wild type).
    response_data : pandas.DataFrame
        Must provide an ``IsResponder`` column and contain every index label
        of ``variant_matrix``.

    Returns
    -------
    pandas.DataFrame
        One row per gene that has both mutated and wild-type models, with the
        columns ``Gene``, ``MutatedModels``, ``WildTypeModels``,
        ``MutatedResponseRate``, ``WildTypeResponseRate``,
        ``ResponseDifference`` and ``PValue``, sorted by ``PValue``. Empty
        (but with these columns) when no gene qualifies.
    """
    result_columns = [
        'Gene', 'MutatedModels', 'WildTypeModels', 'MutatedResponseRate',
        'WildTypeResponseRate', 'ResponseDifference', 'PValue',
    ]
    suffix = '_mut'

    # Align the response labels to the matrix rows once; this does not depend
    # on the gene being analysed
    responses = response_data.loc[variant_matrix.index, 'IsResponder']

    results = []

    for gene_col in variant_matrix.columns:
        # Strip only the trailing suffix so gene names containing "_mut"
        # elsewhere are left intact
        gene = gene_col[:-len(suffix)] if gene_col.endswith(suffix) else gene_col

        mut_status = variant_matrix[gene_col]
        mutated_models = mut_status[mut_status == 1].index
        wild_type_models = mut_status[mut_status == 0].index

        # A comparison needs at least one model in each group
        if len(mutated_models) == 0 or len(wild_type_models) == 0:
            continue

        mut_response_rate = responses[mutated_models].mean()
        wt_response_rate = responses[wild_type_models].mean()

        # Fisher's exact test on the 2x2 table; exact p-values stay valid for
        # small cell counts, where the chi-square approximation is unreliable
        contingency = pd.crosstab(mut_status, responses)
        if contingency.shape == (2, 2):
            _, pval = stats.fisher_exact(contingency.to_numpy())
        else:
            # Only one response class present: no association can be tested
            pval = 1.0

        results.append({
            'Gene': gene,
            'MutatedModels': len(mutated_models),
            'WildTypeModels': len(wild_type_models),
            'MutatedResponseRate': mut_response_rate,
            'WildTypeResponseRate': wt_response_rate,
            'ResponseDifference': mut_response_rate - wt_response_rate,
            'PValue': pval
        })

    # Keep the schema when nothing qualified so sort_values and downstream
    # column access do not fail
    if not results:
        return pd.DataFrame(columns=result_columns)

    return pd.DataFrame(results).sort_values('PValue')

# Run variant-response analysis
variant_response = analyze_variant_response_association(
    variant_matrix=variant_matrix,
    response_data=response_filtered,
)

# Headline counts for the association screen
print(f"\nVariant-Response Association Results:")
print(f"Genes analyzed: {len(variant_response)}")
print(f"Significant associations (p < 0.05): {(variant_response['PValue'] < 0.05).sum()}")

# Show top results (variant_response is already sorted by p-value)
print("\nTop variant-response associations:")
for association in variant_response.head(5).itertuples(index=False):
    print(f"  {association.Gene}: {association.ResponseDifference:.3f} difference, p={association.PValue:.3f}")

variant_response.head()

## 5. Integrated Visualizations

Create comprehensive visualizations combining all data types.

In [None]:
# Create integrated visualization: a 2x3 panel figure summarising response,
# expression and variant results
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# 1. Response distribution
response_counts = response_df['ResponseClass'].value_counts()
axes[0,0].pie(response_counts.values, labels=response_counts.index, autopct='%1.1f%%')
axes[0,0].set_title('Treatment Response Distribution')

# 2. TGI distribution
axes[0,1].hist(response_df['TGI'], bins=15, edgecolor='black', alpha=0.7)
axes[0,1].axvline(response_df['TGI'].mean(), color='red', linestyle='--', label=f'Mean: {response_df["TGI"].mean():.1f}%')
axes[0,1].set_xlabel('Tumor Growth Inhibition (%)')
axes[0,1].set_ylabel('Number of Models')
axes[0,1].set_title('TGI Distribution')
axes[0,1].legend()

# 3. Top differential genes (red = higher in responders, blue = lower or equal)
top_de_genes = de_results.head(10)
y_pos = range(len(top_de_genes))
axes[0,2].barh(y_pos, top_de_genes['Log2FC'], 
               color=['red' if x > 0 else 'blue' for x in top_de_genes['Log2FC']])
axes[0,2].set_yticks(y_pos)
axes[0,2].set_yticklabels(top_de_genes['Gene'])
axes[0,2].set_xlabel('Log2 Fold Change')
axes[0,2].set_title('Top Differentially Expressed Genes')
axes[0,2].axvline(0, color='black', linestyle='-', alpha=0.3)

# 4. Variant frequency
if len(variant_df) > 0:
    # Count distinct models per gene so the axis really shows a number of
    # models; counting variant rows would over-count models that carry
    # several variants in the same gene
    gene_counts = (
        variant_df.groupby('Gene')['Model']
        .nunique()
        .sort_values(ascending=False)
        .head(10)
    )
    axes[1,0].barh(range(len(gene_counts)), gene_counts.values)
    axes[1,0].set_yticks(range(len(gene_counts)))
    axes[1,0].set_yticklabels(gene_counts.index)
    axes[1,0].set_xlabel('Number of Models')
    axes[1,0].set_title('Most Frequently Mutated Genes')
else:
    axes[1,0].text(0.5, 0.5, 'No variant data', ha='center', va='center')
    axes[1,0].set_title('Variant Frequency')

# 5. Variant-response association (red = mutated models respond more often)
if len(variant_response) > 0:
    top_var_resp = variant_response.head(8)
    y_pos = range(len(top_var_resp))
    colors = ['red' if x > 0 else 'blue' for x in top_var_resp['ResponseDifference']]
    axes[1,1].barh(y_pos, top_var_resp['ResponseDifference'], color=colors)
    axes[1,1].set_yticks(y_pos)
    axes[1,1].set_yticklabels(top_var_resp['Gene'])
    axes[1,1].set_xlabel('Response Rate Difference')
    axes[1,1].set_title('Variant-Response Associations')
    axes[1,1].axvline(0, color='black', linestyle='-', alpha=0.3)
else:
    axes[1,1].text(0.5, 0.5, 'No significant\nassociations', ha='center', va='center')
    axes[1,1].set_title('Variant-Response Associations')

# 6. Expression vs Response correlation
if len(de_results) > 0:
    # Volcano plot; p-values are floored at 1e-10 so the y axis stays finite
    log_pvals = -np.log10(de_results['PValue'].clip(lower=1e-10))
    axes[1,2].scatter(de_results['Log2FC'], log_pvals, 
                      c=de_results['Significant'], cmap='coolwarm', alpha=0.6)
    axes[1,2].axhline(-np.log10(0.05), color='red', linestyle='--', alpha=0.5, label='p=0.05')
    axes[1,2].axvline(0, color='black', linestyle='-', alpha=0.3)
    axes[1,2].set_xlabel('Log2 Fold Change')
    axes[1,2].set_ylabel('-Log10(p-value)')
    axes[1,2].set_title('Expression Volcano Plot')
    axes[1,2].legend()
else:
    axes[1,2].text(0.5, 0.5, 'No expression\ndata available', ha='center', va='center')
    axes[1,2].set_title('Expression Analysis')

plt.tight_layout()
# savefig does not create missing directories, so make sure the output
# folder exists before writing
Path('../results').mkdir(parents=True, exist_ok=True)
plt.savefig('../results/integrated_analysis_summary.png', dpi=300, bbox_inches='tight')
plt.show()

print("Integrated analysis visualization saved to: ../results/integrated_analysis_summary.png")

## 6. Principal Component Analysis

Perform PCA on expression data to identify patterns related to response.

In [None]:
# PCA on expression data
# PC1 and PC2 are plotted below, which needs at least two samples and two genes
if expr_filtered.shape[0] >= 2 and expr_filtered.shape[1] >= 2:
    # Select top variable genes for PCA
    gene_vars = expr_filtered.var().sort_values(ascending=False)
    top_variable_genes = gene_vars.head(1000).index  # Top 1000 most variable genes
    
    # Standardize expression data
    scaler = StandardScaler()
    expr_scaled = scaler.fit_transform(expr_filtered[top_variable_genes])
    
    # Perform PCA; the number of components cannot exceed the number of
    # samples or the number of selected genes
    n_components = min(10, expr_scaled.shape[0], expr_scaled.shape[1])
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(expr_scaled)
    
    # Create PCA DataFrame
    pca_df = pd.DataFrame(pca_result, index=expr_filtered.index)
    pca_df.columns = [f'PC{i+1}' for i in range(pca_result.shape[1])]
    
    # Add response information
    pca_df = pca_df.join(response_filtered[['ResponseClass', 'TGI', 'IsResponder']])
    
    # Axis labels shared by both scatter panels
    pc1_label = f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)'
    pc2_label = f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)'
    
    # Plot PCA results
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    
    # PC1 vs PC2 colored by response
    for response_class in pca_df['ResponseClass'].unique():
        mask = pca_df['ResponseClass'] == response_class
        axes[0].scatter(pca_df.loc[mask, 'PC1'], pca_df.loc[mask, 'PC2'], 
                       label=response_class, alpha=0.7, s=60)
    axes[0].set_xlabel(pc1_label)
    axes[0].set_ylabel(pc2_label)
    axes[0].set_title('PCA: Response Classes')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)
    
    # PC1 vs PC2 colored by TGI
    scatter = axes[1].scatter(pca_df['PC1'], pca_df['PC2'], 
                             c=pca_df['TGI'], cmap='viridis', s=60, alpha=0.7)
    plt.colorbar(scatter, ax=axes[1], label='TGI (%)')
    axes[1].set_xlabel(pc1_label)
    axes[1].set_ylabel(pc2_label)
    axes[1].set_title('PCA: TGI Values')
    axes[1].grid(True, alpha=0.3)
    
    # Explained variance
    axes[2].bar(range(1, len(pca.explained_variance_ratio_) + 1), 
               pca.explained_variance_ratio_)
    axes[2].set_xlabel('Principal Component')
    axes[2].set_ylabel('Explained Variance Ratio')
    axes[2].set_title('PCA Explained Variance')
    axes[2].grid(True, alpha=0.3)
    
    plt.tight_layout()
    # savefig does not create missing directories
    Path('../results').mkdir(parents=True, exist_ok=True)
    plt.savefig('../results/pca_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    print(f"PCA completed with {len(top_variable_genes)} genes")
    print(f"First 3 PCs explain {pca.explained_variance_ratio_[:3].sum():.1%} of variance")
    
else:
    print("Insufficient expression data for PCA")

## 7. Predictive Modeling

Build a predictive model using integrated features.

In [None]:
# Prepare features for modeling
def prepare_integrated_features():
    """
    Combine expression, variant, and PCA features for modeling.

    Reads the notebook-level variables ``common_models``, ``de_results``,
    ``expr_filtered``, ``variant_matrix``, ``variant_response`` and, when it
    exists, ``pca_df``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``common_models`` with, in this order:
        ``"<gene>_expr"`` columns for the first 20 genes of ``de_results``
        that are present in ``expr_filtered``; ``"<gene>_mut"`` columns for
        the first 10 genes of ``variant_response`` that are present in
        ``variant_matrix``; and up to the first five ``PC*`` columns of
        ``pca_df``.
    """
    features_df = pd.DataFrame(index=common_models)
    
    # Add top differential expression genes
    if len(de_results) > 0:
        top_genes = de_results.head(20)['Gene'].tolist()  # Top 20 DE genes
        for gene in top_genes:
            if gene in expr_filtered.columns:
                features_df[f"{gene}_expr"] = expr_filtered[gene]
    
    # Add variant information (skipped when there are no associations to rank by)
    if len(variant_matrix) > 0 and len(variant_response) > 0:
        # Add top variant-associated genes
        top_variant_genes = variant_response.head(10)['Gene'].tolist()
        for gene in top_variant_genes:
            col_name = f"{gene}_mut"
            if col_name in variant_matrix.columns:
                features_df[col_name] = variant_matrix[col_name]
    
    # Add PCA components if available. pca_df lives at notebook level, so it
    # has to be looked up in globals(); locals() inside this function only
    # contains the function's own variables and would never find it.
    pca_frame = globals().get('pca_df')
    if pca_frame is not None and len(pca_frame) > 0:
        pc_columns = [col for col in pca_frame.columns if str(col).startswith('PC')]
        for pc_name in pc_columns[:5]:  # Top 5 PCs
            features_df[pc_name] = pca_frame[pc_name]
    
    return features_df

# Prepare features
features_df = prepare_integrated_features()
print(f"Integrated feature matrix: {features_df.shape}")
print(f"Features: {list(features_df.columns)}")

# Missing feature values are imputed with 0 (no feature is dropped)
features_df = features_df.fillna(0)

if len(features_df) > 0 and len(features_df.columns) > 0:
    # Build Random Forest model
    X = features_df
    y = response_filtered.loc[features_df.index, 'IsResponder']
    
    # Train model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X, y)
    
    # Cross-validation. For a classifier an integer cv means stratified
    # folds, so the fold count is capped by the smallest class: more folds
    # than that leaves folds without that class, or raises outright.
    # NOTE: the expression and variant features were ranked using the
    # response labels of all models, so both accuracies printed below are
    # optimistic estimates.
    class_sizes = y.value_counts()
    n_folds = min(5, int(class_sizes.min()))
    if len(class_sizes) >= 2 and n_folds >= 2:
        cv_scores = cross_val_score(rf_model, X, y, cv=n_folds, scoring='accuracy')
    else:
        print("Cross-validation skipped: each response class needs at least 2 models")
        cv_scores = np.array([np.nan])
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    print(f"\nPredictive Model Results:")
    print(f"Cross-validation accuracy: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
    print(f"Training accuracy: {rf_model.score(X, y):.3f}")
    
    print(f"\nTop 10 most important features:")
    for _, row in feature_importance.head(10).iterrows():
        print(f"  {row['Feature']}: {row['Importance']:.3f}")
    
    # Plot feature importance
    plt.figure(figsize=(12, 8))
    top_features = feature_importance.head(15)
    plt.barh(range(len(top_features)), top_features['Importance'])
    plt.yticks(range(len(top_features)), top_features['Feature'])
    plt.xlabel('Feature Importance')
    plt.title('Random Forest Feature Importance\n(Top 15 Features)')
    plt.tight_layout()
    # savefig does not create missing directories
    Path('../results').mkdir(parents=True, exist_ok=True)
    plt.savefig('../results/feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()
    
else:
    print("Insufficient features for predictive modeling")

## 8. Summary and Export Results

Summarize the integrated analysis and export key results.

In [None]:
# Create comprehensive summary
banner = "=" * 60
print(banner)
print("PDX INTEGRATED ANALYSIS SUMMARY")
print(banner)

# --- Data overview -----------------------------------------------------
print(f"\n📊 DATA OVERVIEW:")
print(f"  • Models analyzed: {len(common_models)}")
genes_profiled = expr_filtered.shape[1] if len(expr_filtered) > 0 else 0
print(f"  • Genes profiled: {genes_profiled}")
print(f"  • Variants detected: {len(variant_df)}")

# --- Response characteristics -------------------------------------------
print(f"\n🎯 RESPONSE CHARACTERISTICS:")
if len(response_df) > 0:
    print(f"  • Mean TGI: {response_df['TGI'].mean():.1f}%")
    print(f"  • Response rate: {response_df['IsResponder'].mean():.1%}")
    response_counts = response_df['ResponseClass'].value_counts()
    n_response_models = len(response_df)
    for resp_class, count in response_counts.items():
        print(f"    - {resp_class}: {count} models ({count/n_response_models:.1%})")

# --- Gene expression findings -------------------------------------------
print(f"\n🧬 GENE EXPRESSION FINDINGS:")
if len(de_results) > 0:
    sig_genes = de_results['Significant'].sum()
    print(f"  • Significantly different genes: {sig_genes}")
    if sig_genes > 0:
        # de_results is sorted by p-value, so the first gene in each
        # direction is the most significant one
        up_genes = de_results.loc[de_results['Log2FC'] > 0, 'Gene']
        down_genes = de_results.loc[de_results['Log2FC'] < 0, 'Gene']
        top_up = up_genes.iloc[0] if len(up_genes) > 0 else 'None'
        top_down = down_genes.iloc[0] if len(down_genes) > 0 else 'None'
        print(f"  • Top upregulated: {top_up}")
        print(f"  • Top downregulated: {top_down}")

# --- Variant associations -----------------------------------------------
print(f"\n🔬 VARIANT ASSOCIATIONS:")
if len(variant_response) > 0:
    sig_variants = (variant_response['PValue'] < 0.05).sum()
    print(f"  • Significant variant associations: {sig_variants}")
    if sig_variants > 0:
        top_variant = variant_response.iloc[0]
        print(f"  • Top association: {top_variant['Gene']} (Δ = {top_variant['ResponseDifference']:.3f})")

# --- Predictive model (only if the modeling cell produced scores) --------
print(f"\n🤖 PREDICTIVE MODEL:")
if 'cv_scores' in locals():
    print(f"  • Cross-validation accuracy: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
    print(f"  • Features used: {len(features_df.columns)}")
    if len(feature_importance) > 0:
        print(f"  • Most important feature: {feature_importance.iloc[0]['Feature']}")

# --- Output files ---------------------------------------------------------
print(f"\n📁 OUTPUT FILES:")
print(f"  • Integrated summary: ../results/integrated_analysis_summary.png")
if 'pca_df' in locals():
    print(f"  • PCA analysis: ../results/pca_analysis.png")
if 'feature_importance' in locals():
    print(f"  • Feature importance: ../results/feature_importance.png")

# Export key results; the two analysis tables are written only when non-empty
if len(de_results) > 0:
    de_results.to_csv('../results/differential_expression_results.csv', index=False)
    print(f"  • DE results: ../results/differential_expression_results.csv")

if len(variant_response) > 0:
    variant_response.to_csv('../results/variant_response_associations.csv', index=False)
    print(f"  • Variant associations: ../results/variant_response_associations.csv")

response_df.to_csv('../results/response_metrics.csv', index=False)
print(f"  • Response metrics: ../results/response_metrics.csv")

print(f"\n✅ INTEGRATED ANALYSIS COMPLETE!")
print(banner)