# Phase 2 Validation Report (Final Execution)

This notebook verifies the integrity of the data generated in Phase 2:
1. **Baseline Occurrences**: `data/embeddings_baseline.parquet`
2. **DAPT Occurrences**: `data/embeddings_dapt.parquet`
3. **Baseline Anchors**: `data/anchors_baseline.parquet`
4. **DAPT Anchors**: `data/anchors_dapt.parquet`

It performs sanity checks (nulls, dimensions) and geometric comparison (PCA).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import os

# Phase 2 artifacts read by this notebook. The paths are relative, so they
# resolve against the kernel's working directory.
DATA_DIR = "../data"  # single place to change if the artifacts move

BASE_OCC_PATH = f"{DATA_DIR}/embeddings_baseline.parquet"
DAPT_OCC_PATH = f"{DATA_DIR}/embeddings_dapt.parquet"
BASE_ANC_PATH = f"{DATA_DIR}/anchors_baseline.parquet"
DAPT_ANC_PATH = f"{DATA_DIR}/anchors_dapt.parquet"

## 1. Occurrences Verification

In [None]:
def verify_dataset(path, name):
    """Load a parquet dataset and print basic sanity checks.

    Reports the row count, the column names, per-column null counts and the
    length of the vectors in the ``embedding_last4_concat`` column (when that
    column is present).

    Args:
        path: Location of the parquet file to load.
        name: Human-readable label used in the printed report.

    Returns:
        The loaded DataFrame, or ``None`` when no file exists at ``path``.
    """
    if not os.path.exists(path):
        print(f"❌ {name} not found at {path}")
        return None

    df = pd.read_parquet(path)
    print(f"✅ {name}: Loaded {len(df)} rows.")
    print(f"   Columns: {df.columns.tolist()}")

    # An empty frame has no first row to inspect; report it instead of
    # crashing on positional indexing.
    if df.empty:
        print(f"   ⚠️ {name} is empty; skipping null and dimension checks.")
        return df

    # Null check: list only the columns that actually contain missing values.
    null_counts = {
        col: int(count) for col, count in df.isna().sum().items() if count > 0
    }
    if null_counts:
        print(f"   ⚠️ Nulls per column: {null_counts}")
    else:
        print("   Nulls: none")

    # Check dimensions across every non-null vector, not just the first row,
    # so a null first row or ragged vectors are reported rather than missed.
    if "embedding_last4_concat" in df.columns:
        vectors = df["embedding_last4_concat"].dropna()
        dims = sorted({int(len(vec)) for vec in vectors})
        if not dims:
            print("   ⚠️ Last4 Dim: no non-null vectors to measure")
        elif len(dims) == 1:
            print(f"   Last4 Dim: {dims[0]}")
        else:
            print(f"   ⚠️ Last4 Dim: inconsistent lengths {dims}")

    return df

# Load and report on the per-occurrence embeddings: baseline first, then DAPT.
df_base = verify_dataset(path=BASE_OCC_PATH, name="Baseline Occs")
df_dapt = verify_dataset(path=DAPT_OCC_PATH, name="DAPT Occs")

## 2. Anchors Verification

In [None]:
# Load and report on the anchor embeddings: baseline first, then DAPT.
df_anc_base = verify_dataset(path=BASE_ANC_PATH, name="Baseline Anchors")
df_anc_dapt = verify_dataset(path=DAPT_ANC_PATH, name="DAPT Anchors")

## 3. Geometric Comparison (PCA: Baseline vs DAPT)
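This section visualizes how DAPT training shifted the embedding space for the anchors. Both sets are projected with a single PCA fitted on the combined vectors, so the two point clouds share the same axes.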

In [None]:
def plot_comparison(df1, df2, label1, label2, vector_col="embedding_last4_concat"):
    """Scatter-plot two embedding sets in a shared 2-D PCA projection.

    A single PCA is fitted on the vectors of both frames stacked together, so
    the two point clouds are drawn in the same coordinate system.

    Args:
        df1: First DataFrame (plotted in blue), or ``None``.
        df2: Second DataFrame (plotted in red), or ``None``.
        label1: Legend/title label for ``df1``.
        label2: Legend/title label for ``df2``.
        vector_col: Name of the column holding one vector per row.

    Returns:
        None. The plot is skipped (with a printed reason, except when a frame
        is ``None``) if either frame is ``None``, lacks ``vector_col``, has no
        non-null vectors, or the combined data is too small for a
        2-component PCA.
    """
    if df1 is None or df2 is None:
        return

    stacked = []
    for label, df in ((label1, df1), (label2, df2)):
        if vector_col not in df.columns:
            print(f"❌ {label}: column '{vector_col}' not found; skipping PCA plot.")
            return
        # Null entries cannot be stacked into a matrix, so drop them first.
        vectors = df[vector_col].dropna()
        if vectors.empty:
            print(f"❌ {label}: no vectors in '{vector_col}'; skipping PCA plot.")
            return
        stacked.append(np.stack(vectors.to_list()))

    vecs1, vecs2 = stacked
    all_vecs = np.vstack([vecs1, vecs2])

    # A 2-component PCA needs at least 2 samples and 2 features.
    if min(all_vecs.shape) < 2:
        print(f"❌ Not enough data for a 2-D PCA (matrix shape {all_vecs.shape}).")
        return

    pca = PCA(n_components=2)
    coords = pca.fit_transform(all_vecs)

    # Rows keep their stacking order: df1's vectors first, then df2's.
    c1 = coords[:len(vecs1)]
    c2 = coords[len(vecs1):]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(c1[:, 0], c1[:, 1], alpha=0.5, label=label1, c='blue')
    ax.scatter(c2[:, 0], c2[:, 1], alpha=0.5, label=label2, c='red')

    var_pc1, var_pc2 = pca.explained_variance_ratio_
    ax.set_xlabel(f"PC1 ({var_pc1:.1%} of variance)")
    ax.set_ylabel(f"PC2 ({var_pc2:.1%} of variance)")
    ax.legend()
    # Build the title from the labels so it stays correct for any pair of sets.
    ax.set_title(f"PCA Projection: {label1} (Blue) vs {label2} (Red)")
    plt.show()

# Compare the anchor embeddings of the two models in one PCA projection.
plot_comparison(
    df1=df_anc_base,
    df2=df_anc_dapt,
    label1="Baseline",
    label2="DAPT",
)