**Verificar PASO 0: SCD-2 Market Cap Dimension**

In [None]:
# ============================================================================
# VERIFICACIÓN PASO 0: Dimensión SCD-2 Market Cap
# ============================================================================
import polars as pl
import json
from pathlib import Path
from datetime import date
import os

# NOTE: every path used below is relative, so the working directory is moved
# to the project root before anything else runs.
PROJECT_ROOT = Path(r"D:\04_TRADING_SMALLCAPS")
os.chdir(PROJECT_ROOT)
print(f"Working directory: {os.getcwd()}")

banner = "=" * 80
print(banner)
print("VERIFICACIÓN PASO 0: SCD-2 MARKET CAP DIMENSION")
print(banner)

# ----------------------------------------------------------------------------
# 1. CHECK THAT THE EXPECTED FILES EXIST
# ----------------------------------------------------------------------------
print("\n[1] VERIFICACIÓN DE ARCHIVOS GENERADOS")
print("-" * 80)

scd2_dir = Path("processed/ref/market_cap_dim")
dim_file = scd2_dir / "market_cap_dim.parquet"
manifest_file = scd2_dir / "MANIFEST.json"
success_file = scd2_dir / "_SUCCESS"

# Report every file (not just the first missing one) before deciding whether
# to abort, so a single run shows the complete picture.
found_flags = []
for path in (dim_file, manifest_file, success_file):
    found = path.exists()
    found_flags.append(found)
    print(f"{'✅' if found else '❌'} {path.name}: {'EXISTS' if found else 'MISSING'}")

files_ok = all(found_flags)
if not files_ok:
    raise FileNotFoundError("❌ FALLO: Archivos del PASO 0 no encontrados")

print("\n✅ RESULTADO: Todos los archivos existen")

: 

In [None]:
# ----------------------------------------------------------------------------
# 2. LOAD AND REPORT THE MANIFEST
# ----------------------------------------------------------------------------
print("\n[2] VERIFICACIÓN DE MANIFEST.JSON")
print("-" * 80)

# JSON text is UTF-8; pass the encoding explicitly so the platform locale
# (e.g. cp1252 on Windows) is never used to decode the file.
with open(manifest_file, encoding="utf-8") as f:
    manifest = json.load(f)

print(f"Timestamp generación: {manifest['timestamp']}")
print(f"Total tickers: {manifest['total_tickers']:,}")
print(f"Total periodos SCD-2: {manifest['total_periods']:,}")
print(f"Rango temporal: {manifest['date_range']['min']} → {manifest['date_range']['max']}")

manifest_coverage = manifest['market_cap_coverage']
manifest_total = manifest_coverage['total']
print(f"\nCobertura global:")
for label, key in (("market_cap", "with_cap"), ("shares_outstanding", "with_shares")):
    covered = manifest_coverage[key]
    # A manifest reporting total == 0 must not crash the report.
    pct = 100 * covered / manifest_total if manifest_total else 0.0
    print(f"  - {label}: {covered:,} / {manifest_total:,} ({pct:.1f}%)")

# ----------------------------------------------------------------------------
# 3. LOAD THE SCD-2 DIMENSION
# ----------------------------------------------------------------------------
print("\n[3] CARGA Y VALIDACIÓN DE DIMENSIÓN SCD-2")
print("-" * 80)

dim = pl.read_parquet(dim_file)

print(f"Shape: {dim.shape}")
print(f"\nSchema:")
row_count = len(dim)
for col, dtype in dim.schema.items():
    null_count = dim[col].null_count()
    # Guard the percentage so an empty parquet file still prints its schema.
    null_pct = 100 * null_count / row_count if row_count else 0.0
    print(f"  {col:25s} {str(dtype):15s} (nulls: {null_count:,} = {null_pct:.1f}%)")

In [None]:
# ----------------------------------------------------------------------------
# 4. CHECK SCD-2 INTEGRITY
# ----------------------------------------------------------------------------
print("\n[4] VERIFICACIÓN INTEGRIDAD SCD-2")
print("-" * 80)

# 4.1 A period is invalid when it ends before it starts. Equality is
# accepted by this test, so the printed label says "<=".
invalid_ranges = dim.filter(pl.col("effective_to") < pl.col("effective_from"))
ranges_ok = len(invalid_ranges) == 0
# The icon follows the real outcome instead of always showing a check mark.
print(f"{'✅' if ranges_ok else '❌'} Rangos válidos (effective_from <= effective_to): {ranges_ok}")
if not ranges_ok:
    print(f"   ❌ FALLO: {len(invalid_ranges)} rangos inválidos")
    print(invalid_ranges.head())

# 4.2 Within each ticker, every period must end exactly where the next one
# starts. Rows with no following period (next_from is null) are skipped.
gaps_overlaps = (
    dim
    .sort(["ticker", "effective_from"])
    .with_columns([
        pl.col("effective_from").shift(-1).over("ticker").alias("next_from")
    ])
    .filter(
        (pl.col("next_from").is_not_null()) &
        (pl.col("effective_to") != pl.col("next_from"))
    )
)
periods_contiguous = len(gaps_overlaps) == 0
print(f"{'✅' if periods_contiguous else '❌'} Sin gaps/overlaps entre periodos: {periods_contiguous}")
if not periods_contiguous:
    # Every flagged row has both dates present and different, so it is
    # either a gap (ends before the next start) or an overlap (ends after).
    n_gaps = len(gaps_overlaps.filter(pl.col("effective_to") < pl.col("next_from")))
    n_overlaps = len(gaps_overlaps) - n_gaps
    print(f"   ⚠️ WARNING: {len(gaps_overlaps)} posibles gaps/overlaps")
    print(f"      gaps: {n_gaps:,} | overlaps: {n_overlaps:,}")
    print(gaps_overlaps.head())

# 4.3 Count the periods whose effective_to equals the 2099-12-31 sentinel
# (informational only; nothing is asserted here).
open_periods = dim.filter(pl.col("effective_to") == date(2099, 12, 31))
print(f"✅ Periodos abiertos (effective_to=2099-12-31): {len(open_periods):,}")

In [None]:
# ----------------------------------------------------------------------------
# 5. CHECK COVERAGE FOR THE HYBRID UNIVERSE
# ----------------------------------------------------------------------------
print("\n[5] VERIFICACIÓN COBERTURA UNIVERSO HÍBRIDO")
print("-" * 80)

# The universe is the set of "ticker=<SYMBOL>" entries found in daily_cache.
cache_root = Path("processed/daily_cache")
cache_tickers = {p.name.replace('ticker=', '') for p in cache_root.glob('ticker=*')}

print(f"Tickers en daily_cache (universo actual): {len(cache_tickers):,}")

# Keep only the SCD-2 rows that belong to the universe. One ticker may own
# several rows (one per period), so rows and tickers are counted separately.
dim_universe = dim.filter(pl.col("ticker").is_in(list(cache_tickers)))
universe_tickers = set(dim_universe["ticker"].unique().to_list())
n_periods = len(dim_universe)

print(f"Tickers en SCD-2 (universo): {len(universe_tickers):,} ({n_periods:,} periodos)")

# Tickers present in daily_cache but with no row at all in the dimension
# would otherwise go unnoticed, because the checks below only look at rows
# that exist.
absent_tickers = sorted(cache_tickers - universe_tickers)
if absent_tickers:
    print(f"  ⚠️ WARNING: {len(absent_tickers):,} tickers de daily_cache sin registro en SCD-2: "
          f"{', '.join(absent_tickers[:10])}")

# Row-level coverage of market_cap and shares_outstanding.
cap_coverage = dim_universe.filter(pl.col("market_cap").is_not_null())
shares_coverage = dim_universe.filter(pl.col("shares_outstanding").is_not_null())

print(f"\nCobertura UNIVERSO HÍBRIDO:")
for label, covered in (("market_cap", cap_coverage), ("shares_outstanding", shares_coverage)):
    # Guard the division (empty universe) and only show a check mark when
    # coverage is actually complete.
    pct = 100 * len(covered) / n_periods if n_periods else 0.0
    icon = "✅" if n_periods and len(covered) == n_periods else "⚠️"
    print(f"  {icon} {label}: {len(covered):,} / {n_periods:,} ({pct:.1f}%)")

# CRITICAL: market_cap coverage is expected to be 100% before continuing.
if len(cap_coverage) < n_periods:
    missing = dim_universe.filter(pl.col("market_cap").is_null())
    n_missing_tickers = missing["ticker"].n_unique()
    print(f"\n  ⚠️ WARNING: {n_missing_tickers:,} tickers ({len(missing):,} periodos) sin market_cap:")
    print(missing.select(["ticker", "effective_from", "effective_to"]).head(10))

In [None]:
# ----------------------------------------------------------------------------
# 6. CHECK THE MARKET CAP DISTRIBUTION
# ----------------------------------------------------------------------------
print("\n[6] DISTRIBUCIÓN DE MARKET CAP (UNIVERSO HÍBRIDO)")
print("-" * 80)

# Both the statistics and the buckets ignore rows without a market cap, so
# the filter is applied once and reused.
universe_with_cap = dim_universe.filter(pl.col("market_cap").is_not_null())

cap_stats = universe_with_cap.select([
    pl.col("market_cap").min().alias("min"),
    pl.col("market_cap").quantile(0.25).alias("p25"),
    pl.col("market_cap").median().alias("median"),
    pl.col("market_cap").quantile(0.75).alias("p75"),
    pl.col("market_cap").max().alias("max"),
])

print("Estadísticas market_cap (USD):")
for name, value in cap_stats.to_dicts()[0].items():
    # With no non-null market caps the aggregates come back as None, which
    # cannot be rendered with a numeric format spec.
    if value is None:
        print(f"  {name:10s}: N/A")
    else:
        print(f"  {name:10s}: ${value:,.0f}")

# Label each row with its bucket; thresholds are in USD and each bucket's
# upper bound is exclusive.
cap_ranges = universe_with_cap.with_columns([
    pl.when(pl.col("market_cap") < 50_000_000).then(pl.lit("< $50M (Nano)"))
    .when(pl.col("market_cap") < 300_000_000).then(pl.lit("$50M-$300M (Micro)"))
    .when(pl.col("market_cap") < 2_000_000_000).then(pl.lit("$300M-$2B (Small)"))
    .when(pl.col("market_cap") < 10_000_000_000).then(pl.lit("$2B-$10B (Mid)"))
    .otherwise(pl.lit("> $10B (Large)"))
    .alias("cap_range")
])

print("\nDistribución por rango:")
# NOTE(review): recent Polars releases deprecate pl.count() in favour of
# pl.len(); kept as-is because the installed version is not visible here.
distribution = cap_ranges.group_by("cap_range").agg([
    pl.count().alias("count")
]).sort("count", descending=True)
print(distribution)

In [None]:
# ----------------------------------------------------------------------------
# 7. SAMPLE OF SPECIFIC TICKERS
# ----------------------------------------------------------------------------
print("\n[7] MUESTRA DE TICKERS ESPECÍFICOS")
print("-" * 80)

# Draw up to 5 distinct tickers from the universe. Sampling is done on the
# unique, sorted ticker list so that (a) a ticker with several periods cannot
# be drawn twice and (b) the seeded draw sees the same input order every run.
# The size is capped because sampling more items than exist is an error.
unique_tickers = dim_universe["ticker"].unique().sort()
sample_size = min(5, len(unique_tickers))
sample_tickers = unique_tickers.sample(n=sample_size, seed=42).to_list()

print(f"Tickers seleccionados: {', '.join(sample_tickers)}")
print("\nDetalle:")

for ticker in sample_tickers:
    # Only the first stored row for the ticker is shown.
    row = dim.filter(pl.col("ticker") == ticker).head(1).to_dicts()[0]
    market_cap = row['market_cap']
    shares = row['shares_outstanding']

    print(f"\n  {ticker}:")
    print(f"    effective_from: {row['effective_from']}")
    print(f"    effective_to: {row['effective_to']}")
    # Compare against None explicitly: a stored value of 0 is data, not NULL.
    print(f"    market_cap: ${market_cap:,.0f}" if market_cap is not None else "    market_cap: NULL")
    print(f"    shares_outstanding: {shares:,.0f}" if shares is not None else "    shares_outstanding: NULL")

In [None]:
# ----------------------------------------------------------------------------
# 8. CHECK THE TEMPORAL JOIN (SIMULATION)
# ----------------------------------------------------------------------------
print("\n[8] SIMULACIÓN JOIN TEMPORAL SCD-2")
print("-" * 80)

# Simulate the join for one specific date.
test_date = date(2025, 10, 21)
print(f"Fecha de prueba: {test_date}")

# Join condition: effective_from <= test_date < effective_to
joined = dim_universe.filter(
    (pl.col("effective_from") <= test_date) &
    (pl.col("effective_to") > test_date)
)

# Coverage is measured in tickers, not rows: dim_universe holds one row per
# period, so dividing active rows by all rows would understate coverage for
# any ticker that has history.
total_tickers = dim_universe["ticker"].n_unique()
active_tickers = joined["ticker"].n_unique()
# An active period may still carry a null market_cap, so "valid market cap"
# is counted separately from "has an active period".
tickers_with_cap = joined.filter(pl.col("market_cap").is_not_null())["ticker"].n_unique()
coverage_pct = 100 * tickers_with_cap / total_tickers if total_tickers else 0.0

print(f"Tickers con periodo vigente en {test_date}: {active_tickers:,}")
print(f"Tickers con market_cap válido en {test_date}: {tickers_with_cap:,}")
print(f"Cobertura: {coverage_pct:.1f}%")

# A ticker must match at most one period on any given date.
# NOTE(review): recent Polars releases deprecate pl.count() in favour of
# pl.len(); kept as-is because the installed version is not visible here.
duplicates = (
    joined
    .group_by("ticker")
    .agg(pl.count().alias("count"))
    .filter(pl.col("count") > 1)
)

if len(duplicates) == 0:
    print("✅ Sin duplicados en join temporal")
else:
    print(f"❌ FALLO: {len(duplicates)} tickers con múltiples periodos válidos")
    print(duplicates.head())

# Show a sample of the joined rows; the size is capped because sampling more
# rows than exist is an error.
sample_rows = min(10, len(joined))
print(f"\nMuestra join ({sample_rows} tickers):")
print(joined.sample(n=sample_rows, seed=42).select([
    "ticker", "effective_from", "effective_to", "market_cap", "shares_outstanding"
]))

In [None]:




# ----------------------------------------------------------------------------
# 9. FINAL VERDICT
# ----------------------------------------------------------------------------
rule = "=" * 80
print("\n" + rule)
print("VERIFICACIÓN FINAL - PASO 0")
print(rule)

# Collect the outcome of every earlier section under a readable label.
checks = dict([
    ("Archivos generados", files_ok),
    ("Rangos SCD-2 válidos", len(invalid_ranges) == 0),
    ("Sin gaps/overlaps", len(gaps_overlaps) == 0),
    ("Cobertura universo 100%", len(cap_coverage) == len(dim_universe)),
    ("Join temporal sin duplicados", len(duplicates) == 0),
])

for label, ok in checks.items():
    print(f"{'✅' if ok else '❌'} {label}")

# The step passes only when every individual check passed.
all_passed = all(checks.values())

print("\n" + rule)
if not all_passed:
    print("❌ FALLO: Verificar errores arriba")
else:
    print("✅✅✅ PASO 0 COMPLETADO EXITOSAMENTE ✅✅✅")
    print("Dimensión SCD-2 lista para uso en PASO 1 (daily_cache)")
print(rule)