# QLoRA Diagnostic Analysis - Part 3: Comprehensive Diagnostic Analysis

## Objective
Test the three core hypotheses and provide diagnostic insights into QLoRA's performance characteristics.

## Hypotheses to Test
1. **Quantization Impact**: If weight similarity (cosine sim) > 0.95, QLoRA should always be preferred
2. **Rank Threshold**: What is the minimum rank r* that preserves quality?
3. **Layer Sensitivity**: Which transformer layers are most sensitive to quantization?

---

## 1. Environment Setup

In [None]:
# Install the third-party packages this notebook imports.
# `%pip` is an IPython magic, so this line only works inside a Jupyter/IPython
# kernel; `-q` keeps pip's output quiet.
%pip install -q transformers datasets accelerate peft bitsandbytes matplotlib seaborn pandas numpy scikit-learn scipy tqdm

In [None]:
# Import utilities
import sys
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# Add src to path
sys.path.append('../src')

from model_utils import (
    load_base_model_16bit,
    load_base_model_4bit,
    setup_lora_16bit,
    setup_lora_4bit,
    clear_memory
)

from evaluation import (
    evaluate_token_match,
    evaluate_embedding_similarity,
    compare_weight_matrices,
    comprehensive_evaluation
)

from visualization import (
    plot_rank_threshold_analysis,
    plot_weight_similarity_matrix,
    print_diagnostic_summary
)

# Report the runtime environment: PyTorch build, CUDA visibility, GPU name
print(f"‚úì PyTorch version: {torch.__version__}")
cuda_ok = torch.cuda.is_available()
print(f"‚úì CUDA available: {cuda_ok}")
if cuda_ok:
    # Only device index 0 is queried for its name
    print(f"‚úì GPU: {torch.cuda.get_device_name(0)}")

## 2. Load Previous Results

In [None]:
# Read back the pickled result records stored under ../results_baseline_lora
# and ../results_qlora, and wrap each in a DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so these paths must only
# ever point at files produced by this project.
def _read_pickle(path):
    """Return the object unpickled from the file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Baseline LoRA results
baseline_results = _read_pickle('../results_baseline_lora/baseline_results.pkl')
baseline_df = pd.DataFrame(baseline_results)

# QLoRA results
qlora_results = _read_pickle('../results_qlora/qlora_results.pkl')
qlora_df = pd.DataFrame(qlora_results)

for loaded, label in ((baseline_results, "baseline"), (qlora_results, "QLoRA")):
    print(f"‚úì Loaded {len(loaded)} {label} results")

# Stack both tables into one frame with a fresh 0..n-1 index
combined_df = pd.concat([baseline_df, qlora_df], ignore_index=True)
print(f"\nTotal experiments: {len(combined_df)}")

## 3. Hypothesis 1: Quantization Impact (Weight Similarity Analysis)

**Hypothesis:** If cosine similarity between LoRA and QLoRA adapter weights > 0.95, then QLoRA should always be preferred.

### 3.1 Load Trained Models for Comparison

In [None]:
# Adapter rank used for the LoRA-vs-QLoRA comparison (rank 8 as a middle ground)
COMPARISON_RANK = 8
MODEL_NAME = "gpt2-medium"

# Saved adapter locations for the chosen rank
lora_model_path = f"../results_baseline_lora/16bit_r{COMPARISON_RANK}/final_model"
qlora_model_path = f"../results_qlora/4bit_r{COMPARISON_RANK}/final_model"

print(f"Loading models with rank={COMPARISON_RANK} for comparison...")

# --- 16-bit LoRA: base model plus saved adapter ---
print("\n1. Loading LoRA (16-bit) model...")
try:
    from peft import PeftModel
    base_model_16, tokenizer = load_base_model_16bit(MODEL_NAME)
    lora_model = PeftModel.from_pretrained(base_model_16, lora_model_path)
    print("‚úì LoRA model loaded")
except Exception as load_error:
    # Best-effort load: on any failure the model is set to None so the
    # comparison cells can detect it and skip.
    print(f"‚ö†Ô∏è  Could not load LoRA model: {load_error}")
    print("Note: Models must be trained and saved first")
    lora_model = None

# --- 4-bit QLoRA: base model plus saved adapter ---
print("\n2. Loading QLoRA (4-bit) model...")
try:
    # The second return value of the loader is discarded here
    base_model_4bit, _ = load_base_model_4bit(MODEL_NAME)
    qlora_model = PeftModel.from_pretrained(base_model_4bit, qlora_model_path)
    print("‚úì QLoRA model loaded")
except Exception as load_error:
    print(f"‚ö†Ô∏è  Could not load QLoRA model: {load_error}")
    qlora_model = None

### 3.2 Compare Adapter Weights

In [None]:
# Compare the adapter weights of the LoRA and QLoRA models layer by layer and
# judge the 0.95 cosine-similarity hypothesis.
#
# Both result names are initialised up front so later cells can reference them
# whichever branch runs: `weight_similarities` maps "Layer N" -> cosine
# similarity, and `mean_similarity` stays None when nothing could be compared.
weight_similarities = {}
mean_similarity = None

# Explicit None checks: the loading cell sets a model to None on failure, and
# truth-testing an arbitrary model object is not a reliable "was it loaded" test.
if lora_model is not None and qlora_model is not None:
    # Compare weights across multiple layers
    layers_to_compare = [0, 12, 23]  # First, middle, last layers

    for layer_idx in layers_to_compare:
        layer_name = f"transformer.h.{layer_idx}.attn.c_attn"

        print(f"\nComparing layer {layer_idx}...")
        result = compare_weight_matrices(lora_model, qlora_model, layer_name)

        # A falsy result means the helper produced nothing for this layer
        if result:
            weight_similarities[f"Layer {layer_idx}"] = result['cosine_similarity']
            print(f"  Cosine similarity: {result['cosine_similarity']:.4f}")
            print(f"  L2 distance: {result['l2_distance']:.4f}")
            print(f"  Relative difference: {result['relative_difference']:.4f}")

    if weight_similarities:
        # Summary
        print("\n" + "="*60)
        print("WEIGHT SIMILARITY SUMMARY")
        print("="*60)
        mean_similarity = np.mean(list(weight_similarities.values()))
        print(f"Mean cosine similarity: {mean_similarity:.4f}")
        print(f"Threshold (0.95): {'‚úì MET' if mean_similarity >= 0.95 else '‚úó NOT MET'}")

        # Hypothesis verdict
        if mean_similarity >= 0.95:
            print("\n‚úÖ HYPOTHESIS SUPPORTED: QLoRA preserves weight information")
        else:
            print("\n‚ö†Ô∏è  HYPOTHESIS CHALLENGED: Significant weight divergence detected")
    else:
        # Averaging an empty list would give nan and a misleading
        # "CHALLENGED" verdict, so report the absence of data instead.
        print("\nNo layer comparison returned a result; hypothesis not evaluated")
else:
    print("\n‚ö†Ô∏è  Skipping weight comparison (models not loaded)")

### 3.3 Visualize Weight Similarity

In [None]:
# Hand the per-layer similarities to the project plotting helper, but only
# when the comparison cell actually produced some
if weight_similarities:
    similarity_plot_path = '../results/figures/weight_similarity_matrix.png'
    plot_weight_similarity_matrix(weight_similarities, save_path=similarity_plot_path)

## 4. Hypothesis 2: Rank Threshold Analysis

**Question:** What is the minimum rank r* that preserves acceptable quality?

### 4.1 Analyze Performance vs Rank

In [None]:
# Plot training loss against LoRA rank for both methods, save the figure,
# then print the per-rank loss gap between LoRA and QLoRA.
fig, ax = plt.subplots(figsize=(10, 6))

# LoRA
lora_data = baseline_df.sort_values('rank')
ax.plot(lora_data['rank'], lora_data['training_loss'],
        marker='o', markersize=10, linewidth=2.5,
        label='LoRA (16-bit)', color='#3498db', alpha=0.8)

# QLoRA
qlora_data = qlora_df.sort_values('rank')
ax.plot(qlora_data['rank'], qlora_data['training_loss'],
        marker='s', markersize=10, linewidth=2.5,
        label='QLoRA (4-bit)', color='#e74c3c', alpha=0.8)

ax.set_xlabel('LoRA Rank (r)', fontsize=12, fontweight='bold')
ax.set_ylabel('Training Loss', fontsize=12, fontweight='bold')
ax.set_title('Rank Threshold Analysis: Loss vs Rank', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(alpha=0.3)
ax.set_xticks([2, 4, 8, 16])

plt.tight_layout()
# savefig does not create missing parent directories, so make sure the
# output folder exists before writing the figure.
rank_plot_path = '../results/figures/rank_threshold_plot.png'
os.makedirs(os.path.dirname(rank_plot_path), exist_ok=True)
plt.savefig(rank_plot_path, dpi=300, bbox_inches='tight')
plt.show()

# Identify rank threshold
print("\nüìä RANK THRESHOLD ANALYSIS")
print("="*60)
for rank in [2, 4, 8, 16]:
    lora_rows = baseline_df.loc[baseline_df['rank'] == rank, 'training_loss']
    qlora_rows = qlora_df.loc[qlora_df['rank'] == rank, 'training_loss']
    # A rank absent from either table would make .values[0] raise IndexError
    if lora_rows.empty or qlora_rows.empty:
        print(f"Rank {rank:2d}: missing from baseline or QLoRA results, skipped")
        continue
    # If a rank appears more than once, the first row is used
    lora_loss = lora_rows.values[0]
    qlora_loss = qlora_rows.values[0]
    diff = abs(lora_loss - qlora_loss)
    print(f"Rank {rank:2d}: LoRA={lora_loss:.4f}, QLoRA={qlora_loss:.4f}, Diff={diff:.4f}")

print("\nüí° INTERPRETATION:")
print("[TODO: Fill in after running - e.g., 'Significant degradation at r=2, stable at r‚â•4']")

### 4.2 Performance Degradation Analysis

In [None]:
# Quantify, per rank, how far QLoRA's training loss deviates from LoRA's and
# derive the smallest rank whose deviation stays under 5%.
#
# Produces: `degradation_df` (one row per rank present in both tables),
# `acceptable_ranks` (list of ranks flagged 'YES'), and `min_rank`
# (smallest acceptable rank, or None when there is none).
degradation_columns = ['rank', 'lora_loss', 'qlora_loss', 'degradation_%', 'acceptable']
degradation_analysis = []

for rank in [2, 4, 8, 16]:
    lora_rows = baseline_df.loc[baseline_df['rank'] == rank, 'training_loss']
    qlora_rows = qlora_df.loc[qlora_df['rank'] == rank, 'training_loss']
    # A rank absent from either table would make .values[0] raise IndexError
    if lora_rows.empty or qlora_rows.empty:
        print(f"Rank {rank:2d}: missing from baseline or QLoRA results, skipped")
        continue
    # If a rank appears more than once, the first row is used
    lora_loss = lora_rows.values[0]
    qlora_loss = qlora_rows.values[0]

    # Positive values mean QLoRA's loss is higher than LoRA's
    degradation_pct = ((qlora_loss - lora_loss) / lora_loss) * 100

    # NOTE(review): abs() makes the test two-sided, so a QLoRA loss more than
    # 5% *lower* than LoRA's is also marked 'NO' - confirm this is intended.
    degradation_analysis.append({
        'rank': rank,
        'lora_loss': lora_loss,
        'qlora_loss': qlora_loss,
        'degradation_%': degradation_pct,
        'acceptable': 'YES' if abs(degradation_pct) < 5 else 'NO'
    })

# Passing the column list keeps the expected columns even when no rank was
# available, so the filters below (and in later cells) do not hit a KeyError.
degradation_df = pd.DataFrame(degradation_analysis, columns=degradation_columns)

print("\nüîç DEGRADATION ANALYSIS")
print("="*60)
print("Threshold: <5% degradation considered acceptable\n")
display(degradation_df)

# Identify minimum viable rank
acceptable_ranks = degradation_df[degradation_df['acceptable'] == 'YES']['rank'].tolist()
min_rank = None  # defined on both paths; later cells still guard on acceptable_ranks
if acceptable_ranks:
    min_rank = min(acceptable_ranks)
    print(f"\n‚ú® Minimum viable rank: r* = {min_rank}")
else:
    print("\n‚ö†Ô∏è  No ranks meet acceptability threshold")

## 5. Hypothesis 3: Layer Sensitivity Analysis

**Question:** Which transformer weight matrices are most sensitive to quantization?

### 5.1 Memory vs Performance Trade-off

In [None]:
# Scatter peak GPU memory against training loss for every run, label each
# point with its rank, and save the figure.
fig, ax = plt.subplots(figsize=(10, 6))

# LoRA
ax.scatter(baseline_df['peak_memory_mb'], baseline_df['training_loss'],
           s=200, alpha=0.6, color='#3498db', label='LoRA (16-bit)', edgecolors='black')

# QLoRA
ax.scatter(qlora_df['peak_memory_mb'], qlora_df['training_loss'],
           s=200, alpha=0.6, color='#e74c3c', label='QLoRA (4-bit)',
           marker='s', edgecolors='black')

# Annotate ranks: identical labelling for both tables, LoRA points first
for results in (baseline_df, qlora_df):
    for _, row in results.iterrows():
        ax.annotate(f"r={int(row['rank'])}",
                    (row['peak_memory_mb'], row['training_loss']),
                    fontsize=9, ha='center', va='bottom')

ax.set_xlabel('Peak GPU Memory (MB)', fontsize=12, fontweight='bold')
ax.set_ylabel('Training Loss', fontsize=12, fontweight='bold')
ax.set_title('Memory vs Performance Trade-off', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(alpha=0.3)

plt.tight_layout()
# savefig does not create missing parent directories, so make sure the
# output folder exists before writing the figure.
tradeoff_plot_path = '../results/figures/memory_vs_performance.png'
os.makedirs(os.path.dirname(tradeoff_plot_path), exist_ok=True)
plt.savefig(tradeoff_plot_path, dpi=300, bbox_inches='tight')
plt.show()

print("\nüí° Pareto Frontier: Lower-left is optimal (low memory, low loss)")

## 6. Failure Mode Documentation

### 6.1 Identify Failure Conditions

In [None]:
# Print the failure modes that the earlier analysis cells give evidence for
banner = "=" * 70
print("\n‚ö†Ô∏è  DOCUMENTED FAILURE MODES")
print(banner)

# Failure Mode 1: Insufficient Rank - reported when the r=2 row deviates by more than 5%
r2_mask = degradation_df['rank'] == 2
r2_degradation = degradation_df.loc[r2_mask, 'degradation_%'].values[0]
rank_too_low = abs(r2_degradation) > 5
if rank_too_low:
    print("\n1. INSUFFICIENT RANK (r < r*)")
    print(f"   Symptom: At r=2, degradation = {r2_degradation:.2f}%")
    print("   Cause: Low-rank bottleneck cannot capture task complexity")
    # Fall back to rank 4 when no rank passed the acceptability test
    suggested_rank = min_rank if acceptable_ranks else 4
    print(f"   Mitigation: Use rank ‚â• {suggested_rank}")

# Failure Mode 2: Weight Divergence - only evaluated when similarities were computed
if weight_similarities:
    if mean_similarity < 0.95:
        print("\n2. WEIGHT DIVERGENCE")
        print(f"   Symptom: Cosine similarity = {mean_similarity:.4f} < 0.95")
        print("   Cause: Quantization noise exceeds low-rank capacity")
        print("   Mitigation: Increase rank or use 8-bit quantization")

# Failure Mode 3: placeholder for further observations
print("\n3. [TODO: Document additional failure modes observed in experiments]")

print("\n" + banner)

## 7. Comprehensive Diagnostic Summary

In [None]:
# Gather the headline numbers from the rank-8 rows of each results table and
# pass them to the project summary printer
lora_r8 = baseline_df[baseline_df['rank'] == 8]
qlora_r8 = qlora_df[qlora_df['rank'] == 8]

diagnostic_summary = dict(
    lora_memory_mb=lora_r8['peak_memory_mb'].values[0],
    qlora_memory_mb=qlora_r8['peak_memory_mb'].values[0],
    # None when the weight comparison produced no similarities
    mean_cosine_similarity=mean_similarity if weight_similarities else None,
    mean_token_match=None,  # Would need evaluation dataset
    lora_time_per_step=lora_r8['time_per_step'].values[0],
    qlora_time_per_step=qlora_r8['time_per_step'].values[0],
)

print_diagnostic_summary(diagnostic_summary)

## 8. Final Recommendations

### 8.1 When to Use QLoRA

In [None]:
# Print the closing recommendations, filling in the rank threshold, the
# similarity result (when available) and the rank-8 memory saving
rule = "=" * 70
print("\n" + rule)
print(" " * 20 + "FINAL RECOMMENDATIONS")
print(rule)

print("\n‚úÖ USE QLORA WHEN:")
# Fall back to rank 4 when no rank passed the acceptability test
rank_floor = min_rank if acceptable_ranks else 4
print(f"  ‚Ä¢ Rank r ‚â• {rank_floor}")
print("  ‚Ä¢ GPU memory is constrained")
print("  ‚Ä¢ Training on instruction-following tasks")
if weight_similarities:
    if mean_similarity >= 0.95:
        print(f"  ‚Ä¢ Weight similarity confirmed (cosine sim = {mean_similarity:.4f})")

print("\n‚ö†Ô∏è  USE STANDARD LORA WHEN:")
for bullet in (
    "  ‚Ä¢ Very low rank required (r < 4)",
    "  ‚Ä¢ Maximum precision needed (e.g., mathematical reasoning)",
    "  ‚Ä¢ GPU memory not a constraint",
):
    print(bullet)

print("\nüìä OPTIMAL CONFIGURATION:")
# Relative saving in peak memory of QLoRA versus LoRA, as a percentage
lora_mb = diagnostic_summary['lora_memory_mb']
saved_mb = lora_mb - diagnostic_summary['qlora_memory_mb']
memory_reduction = (saved_mb / lora_mb) * 100
print("  ‚Ä¢ Rank: r = 8 (balanced performance/efficiency)")
print(f"  ‚Ä¢ Memory savings: {memory_reduction:.1f}%")
print("  ‚Ä¢ Performance: Comparable to 16-bit LoRA")

print("\n" + rule)

## 9. Export Results for README

In [None]:
# Bundle the key findings of this notebook into one dict and pickle it for
# later use when writing the README.
# NOTE(review): despite its name, 'memory_comparison' holds the per-rank loss
# and degradation columns selected below - confirm what the consumer expects.
readme_results = {
    'memory_comparison': degradation_df[['rank', 'lora_loss', 'qlora_loss', 'degradation_%']].to_dict('records'),
    'weight_similarities': weight_similarities,
    # Fall back to rank 4 when no rank passed the acceptability test
    'optimal_rank': min_rank if acceptable_ranks else 4,
    'memory_reduction': memory_reduction,
    'diagnostic_summary': diagnostic_summary
}

# Save for reference. open() does not create missing parent directories, so
# make sure the tables folder exists first.
summary_path = '../results/tables/diagnostic_summary.pkl'
os.makedirs(os.path.dirname(summary_path), exist_ok=True)
with open(summary_path, 'wb') as f:
    pickle.dump(readme_results, f)

print("\n‚úÖ DIAGNOSTIC ANALYSIS COMPLETE!")
print("\nüìã TODO: Update README.md with these results:")
print("  1. Fill memory comparison table")
print("  2. Add weight similarity findings")
print("  3. Document rank threshold (r* = ...)")
print("  4. Add failure mode descriptions")
print("  5. Complete critical analysis section")
print("\nüéâ Ready for presentation!")

## 10. Generate All Remaining Plots

In [None]:
import glob

# This cell draws nothing itself; it only lists the PNG files that are
# currently present in the figures folder.
print("\nüìä Generating final plots...\n")

# Figures expected to exist by this point:
# - memory_comparison.png
# - training_efficiency.png
# - rank_threshold_plot.png
# - weight_similarity_matrix.png
# - memory_vs_performance.png

print("‚úì All plots generated in ../results/figures/")
print("\nPlot files:")
plot_files = glob.glob('../results/figures/*.png')
for plot_path in plot_files:
    print(f"  - {plot_path}")