In [1]:
# ========================================
# CELDA 1: Imports y configuración
# ========================================
import pandas as pd
import polars as pl
from pathlib import Path
import numpy as np

# Configuración
BASE_DIR = Path(r"D:\04_TRADING_SMALLCAPS")  # Ruta absoluta a tu proyecto
DAILY_DIR = BASE_DIR / "raw" / "polygon" / "ohlcv_daily"
INTRADAY_DIR = BASE_DIR / "raw" / "polygon" / "ohlcv_intraday_1m"

# Tickers de ejemplo para inspección
# SAMPLE_TICKERS = ["AAPL", "MSFT", "GOOGL", "TSLA", "AMZN"]

print(f"📂 Base dir: {BASE_DIR}")
print(f"📊 Daily dir: {DAILY_DIR.exists()}")
print(f"📊 Intraday dir: {INTRADAY_DIR.exists()}")

📂 Base dir: D:\04_TRADING_SMALLCAPS
📊 Daily dir: True
📊 Intraday dir: True


In [20]:
# ========================================
# CELDA 2: Inspeccionar datos DAILY
# ========================================

# ========================================
# CELDA 2 (ALTERNATIVA): Elegir ticker específico
# ========================================

# Obtener lista de tickers descargados
available_tickers = sorted([d.name for d in DAILY_DIR.iterdir() if d.is_dir()])
print(f"📊 Total tickers disponibles: {len(available_tickers):,}")
print(f"\n🔝 Primeros 50 tickers:")
for i, t in enumerate(available_tickers[:50], 1):
    print(f"{i:3d}. {t}", end="   ")
    if i % 5 == 0:
        print()  # Nueva línea cada 5 tickers

# Seleccionar ticker (cámbialo por el que quieras)
ticker = available_tickers[10]  # Puedes cambiar el índice
# O buscar uno específico:
# ticker = next((t for t in available_tickers if t.startswith('A')), available_tickers[0])

print(f"\n\n🔍 Ticker seleccionado: {ticker}")
ticker_dir = DAILY_DIR / ticker

# Leer todos los archivos parquet del ticker
parquet_files = list(ticker_dir.rglob("*.parquet"))
print(f"📁 Archivos encontrados: {len(parquet_files)}")

# Cargar y concatenar todos los años
dfs_daily = []
for pq_file in parquet_files:
    df = pd.read_parquet(pq_file)
    dfs_daily.append(df)

df_daily = pd.concat(dfs_daily, ignore_index=True)

# Ordenar por fecha
if 'date' in df_daily.columns:
    df_daily = df_daily.sort_values('date').reset_index(drop=True)

print(f"\n{'='*60}")
print(f"📊 DAILY DATA - {ticker}")
print(f"{'='*60}")
print(f"Total filas: {len(df_daily):,}")
print(f"Total columnas: {len(df_daily.columns)}")
if 'date' in df_daily.columns:
    print(f"Período: {df_daily['date'].min()} → {df_daily['date'].max()}")

📊 Total tickers disponibles: 8,618

🔝 Primeros 50 tickers:
  1. AABA     2. AAC     3. AACB     4. AACI     5. AACQ   
  6. AACT     7. AADI     8. AAGR     9. AAI    10. AAIC   
 11. AAM    12. AAME    13. AAMI    14. AAN    15. AAN.A   
 16. AANw    17. AAOI    18. AAPC    19. AAQC    20. AARD   
 21. AAT    22. AATC    23. AAUC    24. AAV    25. AAWW   
 26. ABAC    27. ABAT    28. ABAX    29. ABC    30. ABCD   
 31. ABCL    32. ABCO    33. ABDC    34. ABDw    35. ABEO   
 36. ABGI    37. ABH    38. ABHw    39. ABIL    40. ABIO   
 41. ABK    42. ABL    43. ABLV    44. ABMD    45. ABN   
 46. ABOS    47. ABP    48. ABS    49. ABSI    50. ABST   


🔍 Ticker seleccionado: AAM
📁 Archivos encontrados: 2

📊 DAILY DATA - AAM
Total filas: 263
Total columnas: 10
Período: 2024-09-16 → 2025-10-24


In [21]:
# ========================================
# CELDA 3: Ver columnas y tipos de datos DAILY
# ========================================

print("📋 Columnas disponibles (DAILY):")
print(df_daily.dtypes)
print(f"\nTotal atributos: {len(df_daily.columns)}")
# ========================================
# CELDA 4: Ver primeras 5 filas DAILY
# ========================================

print("🔝 Primeras 5 filas (DAILY):")
df_daily.head(5)
# ========================================
# CELDA 5: Estadísticas descriptivas DAILY
# ========================================

print("📊 Estadísticas descriptivas (DAILY):")
df_daily.describe()
# ========================================
# CELDA 6: Info adicional DAILY
# ========================================

print(f"📅 Rango temporal:")
print(f"   Inicio: {df_daily['date'].min()}")
print(f"   Fin:    {df_daily['date'].max()}")
print(f"   Días:   {len(df_daily):,}")

if 'v' in df_daily.columns:
    print(f"\n💰 Volumen:")
    print(f"   Promedio: {df_daily['v'].mean():,.0f}")
    print(f"   Máximo:   {df_daily['v'].max():,.0f}")
    print(f"   Mínimo:   {df_daily['v'].min():,.0f}")
    print(f"   Días sin volumen: {(df_daily['v'] == 0).sum()}")

if 'c' in df_daily.columns and 'o' in df_daily.columns:
    df_daily['return_pct'] = ((df_daily['c'] - df_daily['o']) / df_daily['o']) * 100
    print(f"\n📈 Retorno diario (%):")
    print(f"   Promedio: {df_daily['return_pct'].mean():.2f}%")
    print(f"   Máximo:   {df_daily['return_pct'].max():.2f}%")
    print(f"   Mínimo:   {df_daily['return_pct'].min():.2f}%")

📋 Columnas disponibles (DAILY):
ticker     object
date       object
t           int64
o         float64
h         float64
l         float64
c         float64
v         float64
n           int64
vw        float64
dtype: object

Total atributos: 10
🔝 Primeras 5 filas (DAILY):
📊 Estadísticas descriptivas (DAILY):
📅 Rango temporal:
   Inicio: 2024-09-16
   Fin:    2025-10-24
   Días:   263

💰 Volumen:
   Promedio: 130,584
   Máximo:   4,034,485
   Mínimo:   102
   Días sin volumen: 0

📈 Retorno diario (%):
   Promedio: -0.00%
   Máximo:   1.26%
   Mínimo:   -0.93%


In [22]:
# ========================================
# CELDA 7: Inspeccionar datos INTRADAY (1-MINUTE)
# ========================================

# Seleccionar mismo ticker
print(f"🔍 Analizando INTRADAY: {ticker}")

# Buscar año más reciente (ej: 2024)
year_dir = ticker_dir / "year=2024"
if not year_dir.exists():
    # Buscar último año disponible
    year_dirs = sorted([d for d in ticker_dir.iterdir() if d.is_dir()])
    year_dir = year_dirs[-1] if year_dirs else None

if year_dir:
    print(f"📂 Año: {year_dir.name}")
    
    # Leer primer mes como muestra (o todos si quieres)
    parquet_files = list(year_dir.rglob("*.parquet"))[:1]  # Solo 1 mes para no saturar
    print(f"📁 Archivo muestra: {parquet_files[0].parent.name}/{parquet_files[0].name}")
    
    df_minute = pd.read_parquet(parquet_files[0])
    
    # Ordenar por minute
    if 'minute' in df_minute.columns:
        df_minute = df_minute.sort_values('minute').reset_index(drop=True)
    
    print(f"\n{'='*60}")
    print(f"📊 INTRADAY DATA (1-MINUTE) - {ticker}")
    print(f"{'='*60}")
    print(f"Total filas: {len(df_minute):,}")
    print(f"Total columnas: {len(df_minute.columns)}")
    if 'minute' in df_minute.columns:
        print(f"Período: {df_minute['minute'].min()} → {df_minute['minute'].max()}")

🔍 Analizando INTRADAY: AAM
📂 Año: year=2024
📁 Archivo muestra: year=2024/daily.parquet

📊 INTRADAY DATA (1-MINUTE) - AAM
Total filas: 74
Total columnas: 10


In [23]:
# ========================================
# CELDA 8: Ver columnas y tipos de datos INTRADAY
# ========================================

print("📋 Columnas disponibles (INTRADAY 1-MINUTE):")
print(df_minute.dtypes)
print(f"\nTotal atributos: {len(df_minute.columns)}")
# ========================================
# CELDA 9: Ver primeras 5 filas INTRADAY
# ========================================

print("🔝 Primeras 5 filas (INTRADAY 1-MINUTE):")
df_minute.head(5)
# ========================================
# CELDA 10: Estadísticas descriptivas INTRADAY
# ========================================

print("📊 Estadísticas descriptivas (INTRADAY 1-MINUTE):")
df_minute.describe()

📋 Columnas disponibles (INTRADAY 1-MINUTE):
ticker     object
date       object
t           int64
o         float64
h         float64
l         float64
c         float64
v         float64
n           int64
vw        float64
dtype: object

Total atributos: 10
🔝 Primeras 5 filas (INTRADAY 1-MINUTE):
📊 Estadísticas descriptivas (INTRADAY 1-MINUTE):


Unnamed: 0,t,o,h,l,c,v,n,vw
count,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0
mean,1730927000000.0,10.045746,10.050616,10.037393,10.042736,323708.6,65.662162,10.042765
std,2707477000.0,0.03401,0.034138,0.031339,0.03144,668027.4,94.112186,0.031328
min,1726459000000.0,9.98,9.98,9.96,9.98,407.0,6.0,9.9708
25%,1728554000000.0,10.02,10.02715,10.0157,10.02,9152.75,11.75,10.02
50%,1730912000000.0,10.05,10.06,10.04,10.05,76008.5,29.5,10.05005
75%,1733267000000.0,10.06,10.07125,10.0539,10.0675,248059.2,90.25,10.061175
max,1735621000000.0,10.18,10.18,10.1,10.11,4034485.0,553.0,10.102


In [25]:

# ========================================
# CELDA 11: Info adicional INTRADAY
# ========================================

if 'minute' in df_minute.columns:
    print(f"⏰ Rango temporal:")
    print(f"   Inicio: {df_minute['minute'].min()}")
    print(f"   Fin:    {df_minute['minute'].max()}")
    print(f"   Total minutos: {len(df_minute):,}")

if 'v' in df_minute.columns:
    print(f"\n💰 Volumen por minuto:")
    print(f"   Promedio: {df_minute['v'].mean():,.0f}")
    print(f"   Máximo:   {df_minute['v'].max():,.0f}")
    print(f"   Minutos sin vol: {(df_minute['v'] == 0).sum()}")

if 'n' in df_minute.columns:
    print(f"\n🔢 Transacciones por minuto:")
    print(f"   Promedio: {df_minute['n'].mean():.1f}")
    print(f"   Máximo:   {df_minute['n'].max():,.0f}")
    print(f"   Mínimo:   {df_minute['n'].min():,.0f}")


💰 Volumen por minuto:
   Promedio: 323,709
   Máximo:   4,034,485
   Minutos sin vol: 0

🔢 Transacciones por minuto:
   Promedio: 65.7
   Máximo:   553
   Mínimo:   6


In [26]:

# ========================================
# CELDA 12: Contar todos los tickers descargados
# ========================================

# Contar tickers en daily
daily_tickers = [d.name for d in DAILY_DIR.iterdir() if d.is_dir()]
print(f"📊 Total tickers DAILY:    {len(daily_tickers):,}")

# Contar tickers en intraday
intraday_tickers = [d.name for d in INTRADAY_DIR.iterdir() if d.is_dir()]
print(f"📊 Total tickers INTRADAY: {len(intraday_tickers):,}")

# Intersección
common = set(daily_tickers) & set(intraday_tickers)
print(f"📊 En ambos datasets:      {len(common):,}")

# Diferencias
only_daily = set(daily_tickers) - set(intraday_tickers)
only_intraday = set(intraday_tickers) - set(daily_tickers)

if only_daily:
    print(f"\n⚠️  Solo en daily: {len(only_daily)}")
    print(f"   Ejemplos: {sorted(list(only_daily))[:10]}")

if only_intraday:
    print(f"\n⚠️  Solo en intraday: {len(only_intraday)}")
    print(f"   Ejemplos: {sorted(list(only_intraday))[:10]}")

📊 Total tickers DAILY:    8,618
📊 Total tickers INTRADAY: 8,621
📊 En ambos datasets:      8,615

⚠️  Solo en daily: 3
   Ejemplos: ['ADSw', 'AEBIV', 'HW']

⚠️  Solo en intraday: 6
   Ejemplos: ['ADSW', 'ASTI', 'Hw', 'MURAV', 'RNVA', '_batch_temp']


`ojo, hay que normalizar texto`

In [None]:
# ========================================
# CELDA 13: Comparar múltiples tickers 
# ========================================

# Cargar varios tickers para comparar
comparison_tickers = ['AABA','AAC','AACB','AACI','AACQ']
comparison_data = {}

for ticker in comparison_tickers:
    ticker_dir = DAILY_DIR / ticker
    if ticker_dir.exists():
        parquet_files = list(ticker_dir.rglob("*.parquet"))
        dfs = [pd.read_parquet(f) for f in parquet_files]
        df = pd.concat(dfs, ignore_index=True)
        comparison_data[ticker] = {
            'rows': len(df),
            'date_min': df['date'].min() if 'date' in df.columns else None,
            'date_max': df['date'].max() if 'date' in df.columns else None,
            'columns': len(df.columns),
            'avg_volume': df['v'].mean() if 'v' in df.columns else None
        }
  
# Mostrar comparación
comparison_df = pd.DataFrame(comparison_data).T
print("📊 Comparación entre tickers (DAILY):")
comparison_df

📊 Comparación entre tickers (DAILY):


Unnamed: 0,rows,date_min,date_max,columns,avg_volume
AABA,577,2017-06-19,2019-10-02,10,8304419.242634
AAC,3659,2004-01-02,2023-11-06,10,145948.593926
AACB,475,2004-01-02,2025-10-24,10,31782.010526
AACI,545,2021-11-10,2025-10-24,10,56618.730275
AACQ,202,2020-09-04,2021-06-24,10,1443952.30198
