# Data Exploration

This notebook tests the data processing utilities on the base table and the static_0 table group.

In [None]:
import sys
# Put the parent directory first on the import path so the `src` package
# imported below resolves (assumes the notebook is run from a directory one
# level below the project root — TODO confirm).
sys.path.insert(0, "..")

import polars as pl
from src.data_processing import (
    load_table_group,
    downcast_dtypes,
    drop_high_missing_cols,
    drop_high_cardinality_string_cols,
    preprocess_table,
    get_table_info,
)

In [None]:
# Data directory passed as the first argument to load_table_group() in the
# cells below. The path is relative, so it is resolved against the current
# working directory.
DATA_PATH = "../data/"

## Load Base Table

In [None]:
# Read the training split of the "base" table group, report its shape, and
# preview the first rows as the cell output.
base = load_table_group(DATA_PATH, "base", split="train")
base_shape = base.shape
print(f"Base table shape: {base_shape}")
base.head()

In [None]:
# Summarise the raw base table. As the last expression in the cell, the value
# returned by get_table_info() is rendered as the cell output.
get_table_info(base)

In [None]:
# Run the preprocessing pipeline on the base table, report the resulting
# shape, then show the post-processing summary as the cell output.
base_processed = preprocess_table(base)
base_processed_shape = base_processed.shape
print(f"\nAfter preprocessing: {base_processed_shape}")
get_table_info(base_processed)

## Load Static_0 Table

This table has multiple chunks (static_0_0, static_0_1, etc.) that need to be concatenated.

In [None]:
# Read the training split of the "static_0" table group (stored as several
# chunks that the loader is expected to concatenate), report its shape, and
# preview the first rows as the cell output.
static_0 = load_table_group(DATA_PATH, "static_0", split="train")
static_0_shape = static_0.shape
print(f"Static_0 table shape: {static_0_shape}")
static_0.head()

In [None]:
# Capture the summary of the raw static_0 table; `info_before` is reused by
# the downcast comparison in a later cell.
info_before = get_table_info(static_0)

# Report each field of the summary on its own line.
shape_before = info_before['shape']
print(f"Shape: {shape_before}")

memory_before_mb = info_before['estimated_memory_mb']
print(f"Memory: {memory_before_mb:.2f} MB")

dtype_counts_before = info_before['dtype_counts']
print(f"Dtype counts: {dtype_counts_before}")

high_missing_before = info_before['columns_with_high_missing']
print(f"Columns with >50% missing: {len(high_missing_before)}")

In [None]:
# Exercise downcast_dtypes() and compare the estimated memory footprint
# against the raw table summary (`info_before`, computed in an earlier cell).
static_0_downcasted = downcast_dtypes(static_0)
info_downcasted = get_table_info(static_0_downcasted)

mb_before_downcast = info_before['estimated_memory_mb']
print(f"Memory before downcast: {mb_before_downcast:.2f} MB")

mb_after_downcast = info_downcasted['estimated_memory_mb']
print(f"Memory after downcast: {mb_after_downcast:.2f} MB")

# Relative saving, expressed as a percentage of the original footprint.
reduction_pct = (1 - mb_after_downcast / mb_before_downcast) * 100
print(f"Memory reduction: {reduction_pct:.1f}%")

In [None]:
# Exercise drop_high_missing_cols() with a 0.98 threshold and compare the
# column count before and after.
_, n_cols_before = static_0.shape
print(f"Columns before: {n_cols_before}")

static_0_no_missing = drop_high_missing_cols(static_0, threshold=0.98)
_, n_cols_no_missing = static_0_no_missing.shape
print(f"Columns after (threshold=0.98): {n_cols_no_missing}")

In [None]:
# Exercise drop_high_cardinality_string_cols() with a cap of 10,000 unique
# values and report how many columns remain.
static_0_no_high_card = drop_high_cardinality_string_cols(
    static_0,
    max_unique=10_000,
)
_, n_cols_no_high_card = static_0_no_high_card.shape
print(f"Columns after dropping high-cardinality strings: {n_cols_no_high_card}")

In [None]:
# Run the complete preprocessing pipeline on static_0 in one call, report the
# final shape, then show the resulting summary as the cell output.
static_0_processed = preprocess_table(static_0)
static_0_processed_shape = static_0_processed.shape
print(f"\nFinal shape after full preprocessing: {static_0_processed_shape}")
get_table_info(static_0_processed)

## Summary

The data processing utilities provide:
- `load_table_group()`: Load and concatenate chunked parquet files
- `downcast_dtypes()`: Reduce memory by casting float64→float32, int64→int32
- `drop_high_missing_cols()`: Remove columns with missing rate > threshold
- `drop_high_cardinality_string_cols()`: Remove string columns with too many unique values
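- `preprocess_table()`: Apply all preprocessing steps in one call
- `get_table_info()`: Summarise a table (shape, estimated memory, dtype counts, columns with a high missing rate)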