# Phase 2 Data Verification & EDA

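This notebook loads the Phase 2 embedding outputs (Parquet files) and checks that the canonical file satisfies the Phase 3 requirements: required columns present, no nulls in critical columns, and populated embedding vectors. It then plots basic distributions of the occurrences.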

In [None]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set up the visual style used by every figure drawn later in the notebook.
# NOTE(review): 'seaborn-v0_8' appears to be the style name used by newer
# matplotlib releases (older ones call it 'seaborn') — confirm against the
# matplotlib version this project pins.
plt.style.use('seaborn-v0_8')
# Apply seaborn's "talk" plotting context on top of the matplotlib style
# (presumably for larger, presentation-sized text — verify).
sns.set_context("talk")

In [None]:
# Display label -> parquet path for each file the execution loop inspects.
# The label 'Canonical (Target)' is matched literally by that loop to decide
# which file gets the strict requirement checks and plots, so keep the key
# text in sync with it. Paths are relative to the notebook's working directory.
FILES = {
    'Canonical (Target)': '../data/embeddings_occurrences.parquet',
    'Baseline (Old)': '../data/embeddings_baseline.parquet',
    'DAPT (Old)': '../data/embeddings_dapt.parquet'
}

def inspect_data(name, path):
    """Read one parquet file and print a quick structural overview of it.

    Parameters
    ----------
    name : str
        Human-readable label shown in the banner line.
    path : str
        Location of the parquet file to read.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when reading fails for any reason
        (the error is printed instead of being raised).
    """
    rule = '=' * 20
    print(f"\n{rule} INSPECTING: {name} {rule}")
    print(f"Loading {path}...")

    try:
        frame = pd.read_parquet(path)
    except Exception as err:
        # Best effort: report the problem and let the caller skip this file.
        print(f"FAILED to load parquet: {err}")
        return None

    column_names = frame.columns.tolist()
    print(f"Shape: {frame.shape}")
    print("\nColumns:")
    print(column_names)

    # `display` is the notebook's rich renderer, so the preview shows as a table.
    preview = frame.head(3)
    print("\nSample (first 3 rows):")
    display(preview)

    print("\nSchema & Types:")
    print(frame.dtypes)

    return frame

In [None]:
def _report_embedding_dim(df, column, label, lead=""):
    """Print the length of the first non-null vector in ``column``, or flag the column as all-null."""
    non_null = df[column].dropna()
    if non_null.empty:
        # Covers both an empty frame and a column holding only nulls; taking
        # .iloc[0] here would raise IndexError.
        print(f"{lead}❌ Embedding {label} is ALL NULL")
        return
    print(f"{lead}Embedding {label} Shape: {len(non_null.iloc[0])}")


def verify_requirements(df):
    """Check a frame against the required column groups and print a report.

    The report has two parts:

    1. For each requirement group (A-F), whether all of its columns exist.
    2. Validation checks on the columns that do exist: null counts in the
       required columns, the ``published_at`` min/max, the ``model_variant``
       value counts, and the length of the first non-null vector in each
       embedding column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to verify. Missing columns are reported, never raised on.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    print("\n--- VERIFYING REQUIREMENTS (Canonical) ---")

    # Requirement groups, numbered 2.2 A-F in the original comments
    # (presumably sections of the Phase 3 requirements — TODO confirm).
    # The labels are reused verbatim in the printed messages.
    required_groups = [
        # A) Identification
        ("A) Identificación", ['occurrence_id', 'run_id', 'model_id', 'model_variant',
                               'layer_strategy', 'pooling_strategy']),
        # B) Temporal information
        ("B) Temporal", ['published_at', 'year', 'month', 'year_month']),
        # C) Source
        ("C) Source", ['newspaper', 'source_api', 'url']),
        # D) Linguistics (marked CRITICAL in the original notes)
        ("D) Linguistic", ['keyword_canonical', 'keyword_matched', 'char_start',
                           'char_end', 'token_start', 'token_end']),
        # E) Context -- context_window is optional, so it is not required here
        ("E) Context", ['context_sentence']),
        # F) Embeddings
        ("F) Embedding", ['embedding_contextual_last4', 'embedding_contextual_penultimate']),
    ]

    for label, columns in required_groups:
        missing = [c for c in columns if c not in df.columns]
        if missing:
            print(f"❌ MISSING {label} columns: {missing}")
        else:
            print(f"✅ {label} columns present")

    # 3. Validation logic
    print("\n--- VALIDATION CHECKS ---")

    # Null check over every required column that actually exists in the frame.
    critical_cols = [c for _, columns in required_groups for c in columns]
    present_critical = [c for c in critical_cols if c in df.columns]
    if not present_critical:
        # Without this guard the check would report "no nulls" for a frame
        # that has none of the columns at all.
        print("⚠️ No critical columns present; null check skipped")
    else:
        nulls = df[present_critical].isnull().sum()
        if nulls.sum() > 0:
            print("⚠️ NULLS FOUND in critical columns:")
            print(nulls[nulls > 0])
        else:
            print("✅ No nulls in critical columns")

    # Date range
    if 'published_at' in df.columns:
        print(f"\nDate Range: {df['published_at'].min()} to {df['published_at'].max()}")

    # Distribution of model variants
    if 'model_variant' in df.columns:
        print("\nModel Variants distribution:")
        print(df['model_variant'].value_counts())

    # Embedding dimensions: both columns go through the same guarded helper so
    # an all-null column is reported instead of crashing the verification.
    if 'embedding_contextual_last4' in df.columns:
        _report_embedding_dim(df, 'embedding_contextual_last4', 'Last4', lead="\n")

    if 'embedding_contextual_penultimate' in df.columns:
        _report_embedding_dim(df, 'embedding_contextual_penultimate', 'Penultimate')

In [None]:
def advanced_eda(df):
    """Draw the exploratory plots for the occurrences in ``df``.

    Two figures are produced, each only when its columns exist:

    * a bar chart of row counts per ``year_month`` value;
    * a count plot of ``model_variant`` split by ``layer_strategy``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to plot. Missing columns simply skip the corresponding figure.

    Returns
    -------
    None
        Figures are shown with ``plt.show()``.
    """
    print("\n--- ADVANCED EDA PLOTS ---")

    # Temporal distribution
    if 'year_month' in df.columns:
        # sort_index() orders the bars by year_month value explicitly instead
        # of relying on the ordering value_counts(sort=False) happens to give.
        month_counts = df['year_month'].value_counts().sort_index()
        if month_counts.empty:
            # Nothing to draw (empty frame or all-null column); a bar plot
            # of an empty series would fail, so report and move on.
            print("⚠️ 'year_month' has no non-null values; skipping temporal plot")
        else:
            fig, ax = plt.subplots(figsize=(12, 6))
            month_counts.plot(kind='bar', ax=ax, rot=45)
            ax.set_title("Distribution of Occurrences by Year-Month")
            fig.tight_layout()
            plt.show()

    # Model variant x layer strategy
    if {'model_variant', 'layer_strategy'}.issubset(df.columns):
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.countplot(data=df, x='model_variant', hue='layer_strategy', ax=ax)
        ax.set_title("Count of Occurrences by Model Variant and Layer Strategy")
        plt.show()

In [None]:
# EXECUTION
# Load every configured file. Only the canonical target goes through the
# strict requirement checks and plots; the other files get a light summary.
dfs = {}
for name, path in FILES.items():
    df = inspect_data(name, path)
    if df is None:
        # Loading failed and was already reported by inspect_data.
        continue
    dfs[name] = df

    if name != 'Canonical (Target)':
        print(f"Skipping strict verification for {name} (Old Schema predicted)")
        if 'model_variant' in df.columns:
            print(f"Model Variants: {df['model_variant'].unique()}")

        # Null dates were an earlier finding, so report how many there are.
        if 'published_at' in df.columns:
            print(f"Null Dates: {df['published_at'].isnull().sum()} / {len(df)}")
        continue

    verify_requirements(df)
    advanced_eda(df)