# Validación Fase D.4: ML Dataset (Quick Version)

**Fecha**: 2025-10-28  
**Objetivo**: Validación rápida del dataset ML sin cargar todo el global

## Verificaciones

1. Conteo de archivos
2. Metadata verification
3. Sample validation (10 files)
4. Train/Valid split sizes
5. Schema and required columns check (sample file)

In [None]:
import polars as pl
import numpy as np
from pathlib import Path
import json
import os

# Every path below is relative, so switch to the project root first.
os.chdir('D:/04_TRADING_SMALLCAPS')

# Input/output directories, all living under processed/.
DATASETS_DIR, BARS_DIR, LABELS_DIR, WEIGHTS_DIR = (
    Path('processed') / sub
    for sub in ('datasets', 'bars', 'labels', 'weights')
)

# Confirm the setup and show where we ended up.
print(
    "✅ Librerías importadas",
    f"Working directory: {os.getcwd()}",
    sep="\n",
)

## 1. Conteo de Archivos

In [None]:
print("=== CONTEO DE ARCHIVOS ===")
print()

# Source files: every matching parquet found recursively under each directory.
bars_files = list(BARS_DIR.rglob('dollar_imbalance.parquet'))
labels_files = list(LABELS_DIR.rglob('labels.parquet'))
weights_files = list(WEIGHTS_DIR.rglob('weights.parquet'))

print("Archivos fuente:")
print(f"  Bars:    {len(bars_files):>6,}")
print(f"  Labels:  {len(labels_files):>6,}")
print(f"  Weights: {len(weights_files):>6,}")
print()

# Daily datasets: one dataset.parquet per daily/<dir>/date=<value>/ folder.
daily_files = list(DATASETS_DIR.glob('daily/*/date=*/dataset.parquet'))
print(f"Daily datasets: {len(daily_files):,}")
print()

# Critical files
global_file = DATASETS_DIR / 'global' / 'dataset.parquet'
train_file = DATASETS_DIR / 'splits' / 'train.parquet'
valid_file = DATASETS_DIR / 'splits' / 'valid.parquet'
meta_file = DATASETS_DIR / 'meta.json'


def _describe_file(path):
    """Return "True (<size> MB)" if *path* exists, else "False (missing)".

    Calling ``stat()`` on a missing file raises ``FileNotFoundError``; catching
    it lets the report show the missing file instead of aborting the cell.
    """
    try:
        size_mb = path.stat().st_size / 1024**2
    except FileNotFoundError:
        return "False (missing)"
    return f"True ({size_mb:.1f} MB)"


print("Archivos críticos:")
print(f"  Global: {_describe_file(global_file)}")
print(f"  Train:  {_describe_file(train_file)}")
print(f"  Valid:  {_describe_file(valid_file)}")
print(f"  Meta:   {meta_file.exists()}")
print()

# Coverage = daily datasets per bars file. Guard the empty case so a missing
# or empty bars directory is reported rather than raising ZeroDivisionError.
if bars_files:
    coverage = len(daily_files) / len(bars_files) * 100
    print(f"✅ Cobertura: {coverage:.2f}%")
else:
    coverage = 0.0
    print("⚠️  Cobertura: n/a (no se encontraron archivos de bars)")

## 2. Metadata Verification

In [None]:
print("=== METADATA ===")
print()

# JSON text is UTF-8; be explicit so the read does not depend on the
# platform's locale encoding (e.g. a Windows code page).
with open(meta_file, 'r', encoding='utf-8') as f:
    meta = json.load(f)

# Dump the metadata; lists are summarised by length to keep the output short.
for key, value in meta.items():
    if isinstance(value, list):
        print(f"{key}: {len(value)} items")
    else:
        print(f"{key}: {value}")
print()

# Feature columns the dataset is expected to carry.
expected_features = [
    'ret_1', 'range_norm', 'vol_f', 'dollar_f', 'imb_f',
    'ret_1_ema10', 'ret_1_ema30', 'range_norm_ema20',
    'vol_f_ema20', 'dollar_f_ema20', 'imb_f_ema20',
    'vol_z20', 'dollar_z20', 'n'
]

# `or []` also covers a key that is present but null in the JSON.
actual = set(meta.get('feature_columns_example') or [])
expected = set(expected_features)

if actual == expected:
    print(f"✅ {len(expected)} features correctas")
else:
    # Report each direction separately and sorted, so the output is
    # deterministic and shows which side is missing.
    print("❌ Features mismatch:")
    print(f"   missing:    {sorted(expected - actual, key=str)}")
    print(f"   unexpected: {sorted(actual - expected, key=str)}")

## 3. Sample Daily Files Validation

In [None]:
print("=== VALIDACIÓN SAMPLE (10 archivos) ===")
print()

import random

# Seeded so the same files are drawn on every run; at most 10 are sampled.
random.seed(42)
sample_files = random.sample(daily_files, min(10, len(daily_files)))

for parquet_path in sample_files:
    # Path layout: .../<ticker>/date=<value>/dataset.parquet
    partition_dir = parquet_path.parent
    ticker = partition_dir.parent.name
    date = partition_dir.name.split('=')[1]

    frame = pl.read_parquet(parquet_path)
    n_rows, n_cols = frame.shape
    print(f"{ticker} {date}: {n_rows} rows, {n_cols} cols")

    # null_count() yields a one-row frame; add up that row across all columns.
    null_counts = frame.null_count()
    total_nulls = sum(null_counts[name][0] for name in null_counts.columns)

    if not total_nulls:
        print(f"  ✅ Sin nulls")
    else:
        print(f"  ⚠️  {total_nulls} nulls detected")

print("\n✅ Sample validation completada")

## 4. Train/Valid Split Verification (Row Counts Only)

In [None]:
print("=== TRAIN/VALID SPLITS ===")
print()

# Lazy scan + row-count aggregation: only the number of rows is requested.
# pl.len() replaces the deprecated argument-less pl.count().
print("Reading train file (scan only)...")
train_count = pl.scan_parquet(train_file).select(pl.len()).collect()[0, 0]

print("Reading valid file (scan only)...")
valid_count = pl.scan_parquet(valid_file).select(pl.len()).collect()[0, 0]

print(f"Train: {train_count:,} rows")
print(f"Valid: {valid_count:,} rows")
print()

# Split proportions; guard against two empty splits (division by zero).
total = train_count + valid_count
if total:
    train_pct = train_count / total * 100
    valid_pct = valid_count / total * 100
    print(f"Train: {train_pct:.1f}%")
    print(f"Valid: {valid_pct:.1f}%")
else:
    train_pct = valid_pct = 0.0
    print("⚠️  Train and valid are both empty; split percentages not available")
print()

# Check against metadata (a missing key falls back to 0 and shows a mismatch).
expected_train = meta.get('train_rows', 0)
expected_valid = meta.get('valid_rows', 0)

train_match = "✅" if train_count == expected_train else "❌"
valid_match = "✅" if valid_count == expected_valid else "❌"

print(f"{train_match} Train: {train_count:,} == {expected_train:,}")
print(f"{valid_match} Valid: {valid_count:,} == {expected_valid:,}")

## 5. Schema Verification (Sample)

In [None]:
print("=== SCHEMA VERIFICATION ===")
print()

# The first sampled daily file is used to inspect the schema.
sample_df = pl.read_parquet(sample_files[0])

print("Schema (sample daily file):")
for col_name, col_dtype in zip(sample_df.columns, sample_df.dtypes):
    print(f"  {col_name}: {col_dtype}")
print()

# Required = timestamp/label/weight columns plus every expected feature.
required = ['anchor_ts', 'label', 'weight', *expected_features]
present_cols = set(sample_df.columns)
missing = [name for name in required if name not in present_cols]

if missing:
    print(f"❌ Missing columns: {missing}")
else:
    print(f"✅ All required columns present ({len(required)} total)")

## 6. Resumen Final

In [None]:
# Final summary. The verdict is derived from the results computed in the
# earlier cells instead of being printed unconditionally, so a failed check
# can no longer be reported as "validated". When every check passes the
# output is identical to the previous hard-coded summary.
features_ok = actual == expected
columns_ok = not missing
train_ok = train_count == expected_train
valid_ok = valid_count == expected_valid
all_ok = features_ok and columns_ok and train_ok and valid_ok

# Suffixes are empty on success and carry the failure detail otherwise.
train_note = "" if train_ok else f"  ❌ (meta: {expected_train:,})"
valid_note = "" if valid_ok else f"  ❌ (meta: {expected_valid:,})"
features_note = "" if features_ok else "  ❌"
columns_text = "All present" if columns_ok else f"❌ Missing {missing}"

print("="*60)
print("RESUMEN VALIDACIÓN FASE D.4 (QUICK)")
print("="*60)
print()
print("✅ CHECKS PASSED:" if all_ok else "❌ CHECKS FAILED (see marks below):")
print(f"  Daily datasets:   {len(daily_files):,}")
print(f"  Coverage:         {coverage:.2f}%")
print(f"  Train rows:       {train_count:,}{train_note}")
print(f"  Valid rows:       {valid_count:,}{valid_note}")
print(f"  Features:         {len(actual & expected)}/{len(expected)}{features_note}")
print(f"  Required columns: {columns_text}")
print()
print("📁 OUTPUT:")
print(f"  {global_file}")
print(f"  {train_file}")
print(f"  {valid_file}")
print()
print("="*60)
if all_ok:
    print("✅ FASE D.4 VALIDADA (Quick Check)")
else:
    print("❌ FASE D.4 NO VALIDADA (Quick Check)")
print("="*60)