# Pretraining Comparison: GET vs GETDiffusion

This notebook compares pretraining performance between two architectures:
1. **GETDiffusion Model**: GETRegionDiffusion pretraining
2. **GET Model**: GETRegionPretrain pretraining

Both were pretrained on the PBMC dataset. We'll compare their training curves and performance metrics.


In [None]:
import sys
import os

# Make the project importable and run the rest of the notebook from its root,
# so that relative paths used later resolve against the project directory.
PROJECT_ROOT = '/home/yoyomanzoor/Documents/get_multimodel'
if sys.path.count(PROJECT_ROOT) == 0:
    # Front of the list so the project's modules are found first
    sys.path.insert(0, PROJECT_ROOT)
os.chdir(PROJECT_ROOT)


In [None]:
# Folder (relative to the current working directory) that receives the saved
# figures; it is created if missing and left untouched if it already exists.
RESULTS_DIR = 'results'
os.makedirs(RESULTS_DIR, exist_ok=True)
results_abspath = os.path.abspath(RESULTS_DIR)
print(f"Results will be saved to: {results_abspath}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global figure appearance: matplotlib defaults first, then the seaborn grid theme
plt.rcParams.update({
    'figure.dpi': 100,
    'font.family': 'sans-serif',
})
sns.set_style('whitegrid')


## Load Pretraining CSV Logs


In [None]:
# Locations of the metrics.csv files written during each pretraining run
diffusion_log_path = "/home/yoyomanzoor/Crucial/get_data/pretrain_GETDiffusion_old/output/pretrain_diffusion_testing/pretrain_GETDiffusion_old/csv_logs/lightning_logs/version_0/metrics.csv"
transformer_log_path = "/home/yoyomanzoor/Crucial/get_data/pretrain_GET/output/pretrain_diffusion_testing/pretrain_GET/csv_logs/lightning_logs/version_0/metrics.csv"

# Report each path together with whether a file is actually present there
diffusion_log_found = os.path.exists(diffusion_log_path)
transformer_log_found = os.path.exists(transformer_log_path)

print(f"GETDiffusion pretrain log path: {diffusion_log_path}")
print(f"GETDiffusion log exists: {diffusion_log_found}")
print(f"\nGET pretrain log path: {transformer_log_path}")
print(f"GET log exists: {transformer_log_found}")


## Training Loss and Masked Pearson Correlation Comparison


In [None]:
# Load the pretraining CSV logs and plot training loss / masked Pearson curves.
#
# Depending on which log files exist this cell:
#   * both logs      -> overlays both models, saves the comparison figure and
#                       prints final / best values for each metric;
#   * only one log   -> plots that model on its own and saves a per-model figure;
#   * no logs        -> prints the paths that were checked.

# Line colours shared by every panel so each model is always drawn the same way.
DIFFUSION_COLOR = '#e74c3c'
GET_COLOR = '#3498db'


def _plot_metric_curve(ax, df, metric, label, color):
    """Draw `metric` against `step` for one run on `ax`.

    Rows where `metric` is NaN are dropped before plotting. Nothing is drawn
    when `df` lacks a `step` or `metric` column, or when no value was logged.
    """
    if 'step' not in df.columns or metric not in df.columns:
        return
    logged = df[df[metric].notna()]
    if not logged.empty:
        ax.plot(logged['step'], logged[metric],
                label=label, color=color, linewidth=2.5)


def _add_epoch_axis(ax, df):
    """Add a top x-axis to `ax` with an `E<n>` tick at the first step of each epoch in `df`.

    Does nothing when `df` has no `epoch` / `step` columns or no epoch values.
    """
    if 'epoch' not in df.columns or 'step' not in df.columns:
        return
    epoch_steps = df[df['epoch'].notna()].groupby('epoch')['step'].first()
    if len(epoch_steps) == 0:
        return
    epoch_ax = ax.twiny()
    # Copy the step limits so the epoch ticks line up with the curves below
    epoch_ax.set_xlim(ax.get_xlim())
    epoch_ax.set_xticks(epoch_steps.values)
    epoch_ax.set_xticklabels([f'E{int(e)}' for e in epoch_steps.index], fontsize=9)
    epoch_ax.set_xlabel('Epoch', fontsize=10)


def _plot_loss_and_pearson(runs, epoch_df, title_suffix, suptitle, out_name):
    """Plot training loss (left) and masked Pearson (right) for the given runs.

    Args:
        runs: iterable of `(label, dataframe, color)` tuples, one per model.
        epoch_df: dataframe whose `epoch` / `step` columns place the epoch ticks.
        title_suffix: text appended to each panel title (may be empty).
        suptitle: figure-level title.
        out_name: file name of the PNG written inside `RESULTS_DIR`.

    Returns:
        The `(fig, axes)` pair created by `plt.subplots`.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    panels = (
        ('train_loss', 'Training Loss'),
        ('masked_pearson', 'Masked Pearson Correlation'),
    )
    for ax, (metric, metric_label) in zip(axes, panels):
        for label, df, color in runs:
            _plot_metric_curve(ax, df, metric, label, color)
        ax.set_xlabel('Step', fontsize=12)
        ax.set_ylabel(metric_label, fontsize=12)
        ax.set_title(f'{metric_label} Over Steps{title_suffix}', fontsize=12)
        # Only build a legend when a curve was drawn; calling legend() on an
        # axes without labelled artists makes matplotlib emit a warning.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend(fontsize=11)
        ax.grid(True, alpha=0.2, linestyle='--')
        _add_epoch_axis(ax, epoch_df)

    plt.suptitle(suptitle, fontsize=14, y=1.02)
    plt.tight_layout()
    plt.savefig(f'{RESULTS_DIR}/{out_name}', dpi=150, bbox_inches='tight')
    plt.show()
    return fig, axes


def _print_metric_summary(df, metric, run_label, metric_label, higher_is_better, lead=''):
    """Print the last and the best logged value of `metric` for one run.

    "Best" is the maximum when `higher_is_better` is true, otherwise the
    minimum. `lead` is prepended to the first line (used for a blank-line
    separator). Nothing is printed when the column is missing or all NaN.
    """
    if metric not in df.columns:
        return
    values = df[metric].dropna()
    if values.empty:
        return
    if higher_is_better:
        best_word, best_value = 'Maximum', values.max()
    else:
        best_word, best_value = 'Minimum', values.min()
    print(f"{lead}{run_label} - Final {metric_label}: {values.iloc[-1]:.6f}")
    print(f"{run_label} - {best_word} {metric_label}: {best_value:.6f}")


has_diffusion_log = bool(diffusion_log_path) and os.path.exists(diffusion_log_path)
has_get_log = bool(transformer_log_path) and os.path.exists(transformer_log_path)

if has_diffusion_log and has_get_log:
    # Load CSV logs
    df_diffusion = pd.read_csv(diffusion_log_path)
    df_transformer = pd.read_csv(transformer_log_path)

    # Epoch markers are taken from the diffusion run for both panels
    fig, axes = _plot_loss_and_pearson(
        runs=[
            ('GETDiffusion', df_diffusion, DIFFUSION_COLOR),
            ('GET', df_transformer, GET_COLOR),
        ],
        epoch_df=df_diffusion,
        title_suffix='',
        suptitle='Pretraining Comparison: GETDiffusion vs GET',
        out_name='pretrain_training_loss_comparison.png',
    )

    # Print summary statistics
    print("\n" + "="*70)
    print("PRETRAINING SUMMARY")
    print("="*70)
    _print_metric_summary(df_diffusion, 'train_loss', 'GETDiffusion',
                          'Training Loss', higher_is_better=False, lead='\n')
    _print_metric_summary(df_transformer, 'train_loss', 'GET',
                          'Training Loss', higher_is_better=False)
    _print_metric_summary(df_diffusion, 'masked_pearson', 'GETDiffusion',
                          'Masked Pearson', higher_is_better=True, lead='\n')
    _print_metric_summary(df_transformer, 'masked_pearson', 'GET',
                          'Masked Pearson', higher_is_better=True)

elif has_diffusion_log:
    print("Found GETDiffusion logs but not GET logs. Plotting GETDiffusion only...")
    df_diffusion = pd.read_csv(diffusion_log_path)
    fig, axes = _plot_loss_and_pearson(
        runs=[('GETDiffusion', df_diffusion, DIFFUSION_COLOR)],
        epoch_df=df_diffusion,
        title_suffix=' (GETDiffusion)',
        suptitle='Pretraining: GETDiffusion',
        out_name='pretrain_training_loss_getdiffusion.png',
    )

elif has_get_log:
    # Mirror of the GETDiffusion-only case, so an available GET log is plotted
    # instead of being reported as "not found".
    print("Found GET logs but not GETDiffusion logs. Plotting GET only...")
    df_transformer = pd.read_csv(transformer_log_path)
    fig, axes = _plot_loss_and_pearson(
        runs=[('GET', df_transformer, GET_COLOR)],
        epoch_df=df_transformer,
        title_suffix=' (GET)',
        suptitle='Pretraining: GET',
        out_name='pretrain_training_loss_get.png',
    )

else:
    print("CSV log files not found. Please check the log file paths above.")
    print("\nTo generate this plot, ensure CSV log files exist at:")
    print(f"  - GETDiffusion: {diffusion_log_path}")
    print(f"  - GET: {transformer_log_path}")


## Additional Metrics Comparison

Compare the remaining logged metrics, masked R² and masked MSE, between the two models.


In [None]:
# Plot masked R² and masked MSE for both models, then print their final and
# best values. Requires both CSV logs; otherwise only a notice is printed.
logs_available = all(
    path and os.path.exists(path)
    for path in (diffusion_log_path, transformer_log_path)
)

if not logs_available:
    print("Cannot create additional metrics plots - CSV files not found.")
else:
    df_diffusion = pd.read_csv(diffusion_log_path)
    df_transformer = pd.read_csv(transformer_log_path)

    # (dataframe, legend label, line colour) for each model, in drawing order
    runs = (
        (df_diffusion, 'GETDiffusion', '#e74c3c'),
        (df_transformer, 'GET', '#3498db'),
    )
    # (metric column, y-axis label, panel title) for the left and right panels
    panels = (
        ('masked_r2', 'Masked R²', 'Masked R² Over Steps'),
        ('masked_mse', 'Masked MSE', 'Masked MSE Over Steps'),
    )

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    for ax, (metric, y_label, panel_title) in zip(axes, panels):
        for frame, run_label, line_color in runs:
            if 'step' not in frame.columns or metric not in frame.columns:
                continue
            # Keep only the rows where this metric has a value
            logged = frame[frame[metric].notna()]
            if len(logged) > 0:
                ax.plot(logged['step'], logged[metric],
                        label=run_label, color=line_color, linewidth=2.5)
        ax.set_xlabel('Step', fontsize=12)
        ax.set_ylabel(y_label, fontsize=12)
        ax.set_title(panel_title, fontsize=12)
        ax.legend(fontsize=11)
        ax.grid(True, alpha=0.2, linestyle='--')

    plt.suptitle('Pretraining Comparison: Additional Metrics', fontsize=14, y=1.02)
    plt.tight_layout()
    plt.savefig(f'{RESULTS_DIR}/pretrain_additional_metrics_comparison.png', dpi=150, bbox_inches='tight')
    plt.show()

    # Summary: last logged value plus the best one (max for R², min for MSE)
    banner = "="*70
    print("\n" + banner)
    print("ADDITIONAL METRICS SUMMARY")
    print(banner)

    if 'masked_r2' in df_diffusion.columns:
        diff_r2 = df_diffusion['masked_r2'].dropna()
        if len(diff_r2) > 0:
            print(f"\nGETDiffusion - Final Masked R²: {diff_r2.iloc[-1]:.6f}")
            print(f"GETDiffusion - Maximum Masked R²: {diff_r2.max():.6f}")
    if 'masked_r2' in df_transformer.columns:
        trans_r2 = df_transformer['masked_r2'].dropna()
        if len(trans_r2) > 0:
            print(f"GET - Final Masked R²: {trans_r2.iloc[-1]:.6f}")
            print(f"GET - Maximum Masked R²: {trans_r2.max():.6f}")
    if 'masked_mse' in df_diffusion.columns:
        diff_mse = df_diffusion['masked_mse'].dropna()
        if len(diff_mse) > 0:
            print(f"\nGETDiffusion - Final Masked MSE: {diff_mse.iloc[-1]:.6f}")
            print(f"GETDiffusion - Minimum Masked MSE: {diff_mse.min():.6f}")
    if 'masked_mse' in df_transformer.columns:
        trans_mse = df_transformer['masked_mse'].dropna()
        if len(trans_mse) > 0:
            print(f"GET - Final Masked MSE: {trans_mse.iloc[-1]:.6f}")
            print(f"GET - Minimum Masked MSE: {trans_mse.min():.6f}")
