In [1]:
import os
import shutil
from datetime import datetime, timezone
import feature_engineering.technical_indicators as ti
import kagglehub
import polars as pl
from dotenv import load_dotenv
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


# Cleaning and Exploration

### Download Parquet files

In [2]:
def sync_and_save_parquet(dataset_slug: str, target_dirname: str = "data"):
    """Download a Kaggle dataset and copy its parquet files into a local folder.

    Environment variables are loaded from a ``.env`` file with ``load_dotenv``.
    The dataset is then re-downloaded through ``kagglehub`` on every call
    (``force_download=True``), and every ``*.parquet`` file found directly
    inside the download directory is copied to
    ``<current working directory>/<target_dirname>``.

    Args:
        dataset_slug: Dataset identifier handed to ``kagglehub.dataset_download``.
        target_dirname: Destination folder, relative to the current working
            directory. It may be nested (e.g. ``"data/raw"``); missing parent
            folders are created.

    Returns:
        None. Progress is reported with ``print``. If the download contains no
        parquet files, a message is printed and nothing is copied.

    Raises:
        EnvironmentError: If ``KAGGLE_USERNAME`` or ``KAGGLE_KEY`` is unset or
            empty after loading ``.env``.
    """
    # 1. Load credentials from .env
    load_dotenv()

    if not os.getenv("KAGGLE_USERNAME") or not os.getenv("KAGGLE_KEY"):
        raise EnvironmentError("KAGGLE_USERNAME or KAGGLE_KEY not found in .env file.")

    # 2. Define data path relative to where the code is running
    project_root = Path.cwd()
    local_data_dir = project_root / target_dirname
    # parents=True so a nested target_dirname works instead of raising
    # FileNotFoundError when an intermediate folder does not exist yet.
    local_data_dir.mkdir(parents=True, exist_ok=True)

    # 3. Download the dataset (bypassing the local cache)
    print(f"Fetching latest data from Kaggle: {dataset_slug}...")
    # force_download=True ensures kagglehub checks for the latest version and re-downloads
    cache_path = Path(kagglehub.dataset_download(dataset_slug, force_download=True))

    # 4. Find all parquet files in the fresh download (top level only, not recursive)
    parquet_files = list(cache_path.glob("*.parquet"))

    if not parquet_files:
        print("No parquet files found in the dataset.")
        return

    # 5. Copy into the local data folder, overwriting files with the same name
    print(f"Syncing {len(parquet_files)} files to {local_data_dir}...")
    for file in parquet_files:
        destination = local_data_dir / file.name

        # shutil.copy2 copies bytes and file metadata directly, avoiding a
        # read/write round-trip through Polars.
        shutil.copy2(file, destination)
        print(f" -> Updated {file.name}")

    print(f"\nSync Complete. Files are located in: {local_data_dir}")

# Run the sync for this project's Kaggle dataset.
DATASET_SLUG = "braydenmcarthur/10x-crypto-ohlcv-2024-2025"

try:
    sync_and_save_parquet(DATASET_SLUG)
except Exception as exc:
    # Report the failure as text instead of aborting the notebook run.
    print(f"Error: {exc}")

Fetching latest data from Kaggle: braydenmcarthur/10x-crypto-ohlcv-2024-2025...
Downloading to /home/zoltesh/.cache/kagglehub/datasets/braydenmcarthur/10x-crypto-ohlcv-2024-2025/3.archive...


100%|██████████| 38.7M/38.7M [00:00<00:00, 83.2MB/s]

Extracting files...





Syncing 10 files to /home/zoltesh/projects/intelligent_diversification/src/data...
 -> Updated DOGE-USDC.parquet
 -> Updated XRP-USDC.parquet
 -> Updated BTC-USDC.parquet
 -> Updated BCH-USDC.parquet
 -> Updated LINK-USDC.parquet
 -> Updated ADA-USDC.parquet
 -> Updated LTC-USDC.parquet
 -> Updated ETH-USDC.parquet
 -> Updated SOL-USDC.parquet
 -> Updated AVAX-USDC.parquet

Sync Complete. Files are located in: /home/zoltesh/projects/intelligent_diversification/src/data


### Create the lazyframes

In [3]:
# One LazyFrame per raw parquet file, keyed by symbol (insertion order is
# alphabetical, matching the file names under data/).
all_lf = {
    symbol: pl.scan_parquet(f'data/{symbol}-USDC.parquet')
    for symbol in ('ADA', 'AVAX', 'BCH', 'BTC', 'DOGE', 'ETH', 'LINK', 'LTC', 'SOL', 'XRP')
}

# Per-symbol names bound to the same LazyFrame objects held in all_lf.
ada_lf = all_lf['ADA']
avax_lf = all_lf['AVAX']
bch_lf = all_lf['BCH']
btc_lf = all_lf['BTC']
doge_lf = all_lf['DOGE']
eth_lf = all_lf['ETH']
link_lf = all_lf['LINK']
ltc_lf = all_lf['LTC']
sol_lf = all_lf['SOL']
xrp_lf = all_lf['XRP']

### Identify sparsity

In [4]:
# Report, per symbol and overall, the percentage of 5-minute bars that are
# missing relative to a complete grid.
# Expected bars per symbol: (366 + 365) days * 288 five-minute bars per day.
EXPECTED_ROWS_PER_SYMBOL = 210_528

total_rows = 0
# Running sum of the per-symbol percentages (not itself a percentage); kept
# so the name remains defined for any later cell that reads it.
total_missing = 0
for symbol, lf in all_lf.items():
    # Count rows lazily rather than materializing the whole frame (twice).
    n_rows = lf.select(pl.len()).collect().item()
    missing = (EXPECTED_ROWS_PER_SYMBOL - n_rows) / EXPECTED_ROWS_PER_SYMBOL * 100
    total_missing += missing
    print(f"{symbol}: {missing:.2f}% missing")
    total_rows += n_rows

# Derive the overall expectation from the number of symbols instead of a
# second hard-coded constant.
expected_total_rows = EXPECTED_ROWS_PER_SYMBOL * len(all_lf)
total_sparsity = (expected_total_rows - total_rows) / expected_total_rows * 100
print(f"Total sparsity: {total_sparsity:.2f}%")


ADA: 0.06% missing
AVAX: 0.08% missing
BCH: 0.06% missing
BTC: 0.04% missing
DOGE: 0.05% missing
ETH: 0.05% missing
LINK: 0.05% missing
LTC: 0.05% missing
SOL: 0.05% missing
XRP: 0.05% missing
Total sparsity: 0.05%


In [5]:
# Total number of raw rows across all symbols. Each file is counted with a
# lazy pl.len() query so no full frame has to be loaded just to read its height.
total = 0
for symbol, lf in all_lf.items():
    total += lf.select(pl.len()).collect().item()
print(total)


2104132


# Detect gaps (missing timestamps) and forward-fill them

In [6]:
# Destination folder for the gap-filled parquet files (created if absent).
OUT_DIR = Path("data_cleaned")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Bar spacing used to rebuild the full timestamp grid.
INTERVAL = "5m"

# Columns carried through the cleaning step: prices first, then volume.
PRICE_COLS = ["open", "high", "low", "close"]
VOLUME_COLS = ["volume"]
VALUE_COLS = [*PRICE_COLS, *VOLUME_COLS]

def impute_gaps_ffill(lf: pl.LazyFrame, every: str = "5m") -> tuple[pl.LazyFrame, int]:
    """Reindex an OHLCV frame onto a regular time grid and forward-fill the gaps.

    The input is reduced to ``timestamp`` plus ``VALUE_COLS``, de-duplicated on
    ``timestamp``, and left-joined onto a complete grid running from the
    smallest to the largest timestamp in steps of ``every``. Grid rows with no
    matching input row are filled from the most recent earlier row.

    Args:
        lf: Lazy frame containing ``timestamp`` and every column in
            ``VALUE_COLS``. ``timestamp`` is cast to ``Datetime("ms")``
            (assumes it holds epoch milliseconds -- TODO confirm against the
            raw files).
        every: Grid spacing as a polars duration string.

    Returns:
        A tuple ``(out, imputed_count)``. ``out`` is a lazy frame with
        ``timestamp`` as Int64 epoch milliseconds followed by ``VALUE_COLS``
        as Float64, one row per grid point. ``imputed_count`` is the number of
        grid rows whose ``open`` was null after the join and before filling.

    Raises:
        ValueError: If the input contains no timestamps, so no grid can be built.
    """
    base = (
        lf.select(["timestamp", *VALUE_COLS])
          .with_columns(
              pl.col("timestamp").cast(pl.Datetime("ms")),
              *[pl.col(c).cast(pl.Float64) for c in VALUE_COLS],
          )
          .unique(subset="timestamp", keep="first")
          .sort("timestamp")
    )

    # Derive start/end from the data (small collect).
    bounds = base.select(
        pl.col("timestamp").min().alias("start"),
        pl.col("timestamp").max().alias("end"),
    ).collect()
    start = bounds["start"][0]
    end = bounds["end"][0]
    if start is None or end is None:
        raise ValueError("Cannot impute gaps: input frame has no timestamps.")

    full_index = pl.datetime_range(
        start, end, interval=every, time_unit="ms", eager=True
    )

    skeleton = pl.DataFrame({"timestamp": full_index}).lazy()

    # A forward fill is only meaningful on time-ordered rows, and polars does
    # not guarantee row order after a join unless asked to, so sort explicitly
    # rather than relying on the skeleton's order surviving the join.
    joined = skeleton.join(base, on="timestamp", how="left").sort("timestamp")

    # Rows missing from the input show up as null `open` after the left join.
    imputed_count = int(
        joined.select(pl.col("open").is_null().sum()).collect().item()
    )

    out = (
        joined.with_columns(
            # NOTE(review): volume is forward-filled too, so synthetic bars
            # repeat the previous bar's volume -- confirm this is intended
            # rather than filling volume with 0.
            pl.col(VALUE_COLS).fill_null(strategy="forward")
        )
        # Convert back to integer epoch milliseconds for storage.
        .with_columns(pl.col("timestamp").dt.timestamp("ms").cast(pl.Int64()))
        .select(["timestamp", *VALUE_COLS])
    )

    return out, imputed_count

# Gap-fill every raw symbol frame and stream the result to OUT_DIR.
for symbol, lf in all_lf.items():
    out_path = OUT_DIR / f"{symbol}-USDC.parquet"
    out_lf, imputed = impute_gaps_ffill(lf, every=INTERVAL)
    # sink_parquet executes the lazy query and writes the file.
    out_lf.sink_parquet(out_path)
    print(f"{symbol}: imputed {imputed} rows -> {out_path}")

ADA: imputed 121 rows -> data_cleaned/ADA-USDC.parquet
AVAX: imputed 161 rows -> data_cleaned/AVAX-USDC.parquet
BCH: imputed 129 rows -> data_cleaned/BCH-USDC.parquet
BTC: imputed 94 rows -> data_cleaned/BTC-USDC.parquet
DOGE: imputed 105 rows -> data_cleaned/DOGE-USDC.parquet
ETH: imputed 101 rows -> data_cleaned/ETH-USDC.parquet
LINK: imputed 106 rows -> data_cleaned/LINK-USDC.parquet
LTC: imputed 107 rows -> data_cleaned/LTC-USDC.parquet
SOL: imputed 112 rows -> data_cleaned/SOL-USDC.parquet
XRP: imputed 112 rows -> data_cleaned/XRP-USDC.parquet


In [7]:
# Validate every cleaned file: no duplicate timestamps, no gaps, no nulls, and
# exactly the row count implied by its own [min, max] timestamp range.
CLEANED_DIR = Path("data_cleaned")
STEP_MS = 300_000  # 5 minutes

# The boolean columns print "True"/"False", so both need width 5; a width of 4
# for Gaps shifts every row one character to the right of the header.
print(f"{'File':<20} | {'Rows':>8} | {'DupTS':>5} | {'Gaps':>5} | {'Nulls':>5} | {'Status'}")
print("-" * 70)

for path in sorted(CLEANED_DIR.glob("*.parquet")):
    df = pl.read_parquet(path).select(["timestamp", "open", "high", "low", "close", "volume"]).sort("timestamp")

    ts = df["timestamp"]
    # Consecutive differences; the first element has no predecessor and is dropped.
    d = ts.diff().drop_nulls()

    # Any step other than exactly STEP_MS counts as a gap (this also catches
    # duplicates, whose step is 0).
    has_gaps = not (d == STEP_MS).all()
    has_dups = ts.n_unique() != ts.len()

    # expected row count for a perfectly continuous series over [min, max]
    expected_rows = int((ts.max() - ts.min()) // STEP_MS + 1)
    wrong_count = df.height != expected_rows

    # Total nulls across all selected columns.
    nulls = int(df.null_count().sum_horizontal().item())

    status = "OK" if (nulls == 0 and not has_gaps and not has_dups and not wrong_count) else "ERROR"
    print(f"{path.name:<20} | {df.height:>8} | {str(has_dups):>5} | {str(has_gaps):>5} | {nulls:>5} | {status}")

    if status == "ERROR":
        # show the first few offending deltas
        bad = df.select(
            pl.col("timestamp"),
            pl.col("timestamp").diff().alias("dt"),
        ).filter(pl.col("dt").is_not_null() & (pl.col("dt") != STEP_MS)).head(10)
        if bad.height:
            print(bad)

File                 |     Rows | DupTS | Gaps | Nulls | Status
----------------------------------------------------------------------
ADA-USDC.parquet     |   210528 | False | False |     0 | OK
AVAX-USDC.parquet    |   210528 | False | False |     0 | OK
BCH-USDC.parquet     |   210528 | False | False |     0 | OK
BTC-USDC.parquet     |   210528 | False | False |     0 | OK
DOGE-USDC.parquet    |   210528 | False | False |     0 | OK
ETH-USDC.parquet     |   210528 | False | False |     0 | OK
LINK-USDC.parquet    |   210528 | False | False |     0 | OK
LTC-USDC.parquet     |   210528 | False | False |     0 | OK
SOL-USDC.parquet     |   210528 | False | False |     0 | OK
XRP-USDC.parquet     |   210528 | False | False |     0 | OK


# Create Cleaned LazyFrames

In [8]:
# One LazyFrame per cleaned parquet file, keyed by symbol (same order as the
# raw mapping).
all_lf_cleaned = {
    symbol: pl.scan_parquet(f'data_cleaned/{symbol}-USDC.parquet')
    for symbol in ('ADA', 'AVAX', 'BCH', 'BTC', 'DOGE', 'ETH', 'LINK', 'LTC', 'SOL', 'XRP')
}

# Per-symbol names bound to the same LazyFrame objects held in all_lf_cleaned.
ada_lf_cleaned = all_lf_cleaned['ADA']
avax_lf_cleaned = all_lf_cleaned['AVAX']
bch_lf_cleaned = all_lf_cleaned['BCH']
btc_lf_cleaned = all_lf_cleaned['BTC']
doge_lf_cleaned = all_lf_cleaned['DOGE']
eth_lf_cleaned = all_lf_cleaned['ETH']
link_lf_cleaned = all_lf_cleaned['LINK']
ltc_lf_cleaned = all_lf_cleaned['LTC']
sol_lf_cleaned = all_lf_cleaned['SOL']
xrp_lf_cleaned = all_lf_cleaned['XRP']

In [9]:
# Show the first and last bar of each cleaned series as UTC datetimes.
for symbol, lf in all_lf_cleaned.items():
    ts_min, ts_max = lf.select(
        min_dt_utc=pl.from_epoch(pl.col("timestamp"), time_unit="ms").min().dt.replace_time_zone("UTC"),
        max_dt_utc=pl.from_epoch(pl.col("timestamp"), time_unit="ms").max().dt.replace_time_zone("UTC"),
    ).collect().row(0)
    print(f"{symbol} {ts_min} {ts_max}")

ADA 2024-01-01 00:00:00+00:00 2025-12-31 23:55:00+00:00
AVAX 2024-01-01 00:00:00+00:00 2025-12-31 23:55:00+00:00
BCH 2024-01-01 00:00:00+00:00 2025-12-31 23:55:00+00:00
BTC 2024-01-01 00:00:00+00:00 2025-12-31 23:55:00+00:00
DOGE 2024-01-01 00:00:00+00:00 2025-12-31 23:55:00+00:00
ETH 2024-01-01 00:00:00+00:00 2025-12-31 23:55:00+00:00
LINK 2024-01-01 00:00:00+00:00 2025-12-31 23:55:00+00:00
LTC 2024-01-01 00:00:00+00:00 2025-12-31 23:55:00+00:00
SOL 2024-01-01 00:00:00+00:00 2025-12-31 23:55:00+00:00
XRP 2024-01-01 00:00:00+00:00 2025-12-31 23:55:00+00:00


# Technical Indicators

### ATR

In [14]:

# Materialize the cleaned BTC frame and pass it to the project's ATR helper
# with period=14; the displayed result carries the extra column `atr_14_15m`.
# NOTE(review): tf='15m' presumably selects the timeframe the ATR is computed
# on -- confirm against feature_engineering.technical_indicators.add_atr.
new_df = ti.add_atr(df=btc_lf_cleaned.collect(), tf='15m', period=14)

In [15]:
# Inspect the last 100 rows of the ATR-augmented frame.
new_df.tail(100)

timestamp,open,high,low,close,volume,atr_14_15m
i64,f64,f64,f64,f64,f64,f64
1752547800000,118295.96,118414.72,118228.98,118382.33,18.311497,524.665625
1752552000000,117148.03,117210.02,117000.0,117003.81,60.289319,545.086205
1752564300000,117000.0,117057.39,116899.99,116900.0,106.246233,535.855581
1752584400000,117179.93,117311.7,117133.38,117311.7,48.193969,532.899496
1752592800000,116220.32,116220.32,115840.31,115915.81,143.871487,555.244508
…,…,…,…,…,…,…
1753367700000,118508.16,118775.99,118469.69,118765.41,49.183092,567.936498
1753379100000,119258.14,119301.48,119249.81,119286.36,16.155273,567.177772
1753391700000,118650.37,118672.6,118629.0,118630.86,17.802014,569.324968
1753399200000,118478.07,118478.07,118388.16,118422.12,37.764929,561.548183
