In [32]:
# Pull data 

API_KEY = os.getenv("COINGECKO_API_KEY")  # REQUIRED
assert API_KEY, "Set COINGECKO_API_KEY in your environment."

BASE = "https://pro-api.coingecko.com/api/v3"   # Pro endpoint only
VS = "usd"

# ---- rate limiting (tune to your Pro plan) ----
CALLS_PER_MIN = 200
SLEEP_SEC = 60.0 / CALLS_PER_MIN

def _get(url, params=None, max_retries=3):
    headers = {"x-cg-pro-api-key": API_KEY}
    for i in range(max_retries):
        r = requests.get(url, params=params, headers=headers, timeout=30)
        if r.status_code == 200:
            return r.json()
        if r.status_code in (429, 500, 502, 503, 504):
            time.sleep(SLEEP_SEC * (i+1) * 2)
            continue
        r.raise_for_status()
    raise RuntimeError(f"GET failed: {url} {r.status_code} {r.text[:200]}")

def list_top_coins(n=300, vs=VS):
    out, page = [], 1
    while len(out) < n:
        params = dict(vs_currency=vs, order="market_cap_desc",
                      per_page=250, page=page, price_change_percentage="24h")
        data = _get(f"{BASE}/coins/markets", params=params)
        if not data: break
        out.extend([{"id": c["id"], "symbol": c["symbol"], "name": c["name"]} for c in data])
        page += 1
        time.sleep(SLEEP_SEC)
    return out[:n]

def market_chart_close(coin_id, days, vs=VS):
    """
    /coins/{id}/market_chart
    Granularity is automatic: ~5min for last day, hourly up to 90d, daily beyond.
    """
    params = dict(vs_currency=vs, days=days)
    data = _get(f"{BASE}/coins/{coin_id}/market_chart", params=params)
    time.sleep(SLEEP_SEC)
    df = pd.DataFrame(data.get("prices", []), columns=["timestamp_ms", "close"])
    if df.empty:
        return df
    df["timestamp"] = pd.to_datetime(df["timestamp_ms"], unit="ms", utc=True)
    df = df.drop(columns=["timestamp_ms"]).set_index("timestamp").sort_index()
    df = df[~df.index.duplicated(keep="last")]
    return df

# ---------- Daily (max history) ----------
def fetch_daily_max(coin_id):
    df = market_chart_close(coin_id, days="max")
    if df.empty:
        return df
    # Strict daily at 00:00 UTC; last sample per day
    return df.resample("1D").last().dropna()

# ---------- 5-minute RETURNS (last ~24h) ----------
def fetch_5min_returns_last_day(coin_id, log_returns=False):
    """
    Build a strict 5-minute grid from last ~1 day native samples (~5m),
    then compute returns on that 5-minute grid.
    - Simple returns: pct_change()
    - Log returns: diff(log(price))
    """
    df1 = market_chart_close(coin_id, days=1)  # native ~5m stamps (irregular)
    if df1.empty:
        return df1

    # Align to exact 5-minute grid using last observed close within each bin.
    m5_close = df1.resample("5min").last()

    # Drop leading/trailing bins with no data at all
    m5_close = m5_close.loc[df1.index.min():df1.index.max()].dropna(how="all")

    if m5_close.empty:
        return m5_close  # nothing to compute

    if log_returns:
        # log returns: ln(P_t) - ln(P_{t-1})
        m5_ret = (m5_close.apply(lambda s: pd.Series(pd.Series(s).apply(lambda v: None if pd.isna(v) else v)))  # no-op, keeps dtype stable
                  .apply(lambda col: (col.apply(lambda v: pd.NA if pd.isna(v) else v)).astype(float)))
        m5_ret = (m5_close["close"].apply(lambda v: None if pd.isna(v) else v)).astype(float)
        m5_ret = pd.Series(m5_ret, index=m5_close.index).apply(lambda x: pd.NA if pd.isna(x) else x)
        m5_ret = pd.Series(pd.Series(m5_close["close"]).astype(float)).apply(lambda x: x)
        # Simpler, clearer:
        import numpy as np
        m5_ret = pd.Series(np.log(m5_close["close"]), index=m5_close.index).diff().to_frame(name="return")
    else:
        # simple percentage returns
        m5_ret = m5_close["close"].pct_change().to_frame(name="return")

    # Drop the first NaN return
    m5_ret = m5_ret.dropna()
    return m5_ret

def fetch_many(top_n=50, log_returns=False):
    coins = list_top_coins(n=top_n)
    out = {}
    for c in coins:
        cid = c["id"]
        try:
            daily = fetch_daily_max(cid)
            m5_returns = fetch_5min_returns_last_day(cid, log_returns=log_returns)
            out[cid] = {
                "meta": c,
                "daily": daily,
                "m5_returns": m5_returns,   # column name 'return'
            }
            print(f"OK {cid:20s}  m5_ret={len(m5_returns):5d}  daily={len(daily):6d}")
        except Exception as e:
            print(f"ERR {cid}: {e}")
    return out

# ---------- Main: wide pivots & saves (DAILY + 5-MIN RETURNS) ----------
if __name__ == "__main__":
    TOP_N = 250
    LOG_RETURNS = False   # set True for log returns

    data = fetch_many(top_n=TOP_N, log_returns=LOG_RETURNS)
    keep = list(data.keys())[:TOP_N]

    # Wide DAILY closes
    wide_daily = pd.concat(
        [data[cid]["daily"].rename(columns={"close": cid}) for cid in keep if not data[cid]["daily"].empty],
        axis=1
    ).sort_index()

    wide_daily = wide_daily.sort_index().ffill()  # Fill NA values
    
    wide_daily.to_parquet("coingecko_daily_close_topN.parquet")

    # Wide 5-MIN RETURNS (last ~24h)
    wide_5m_returns = pd.concat(
        [data[cid]["m5_returns"].rename(columns={"return": cid}) for cid in keep if not data[cid]["m5_returns"].empty],
        axis=1
    ).sort_index()

    # Drop timestamps where all coins are NaN (edge bins)
    wide_5m_returns = wide_5m_returns.dropna(how="all")

    wide_5m_returns = wide_5m_returns.sort_index().ffill()  # Fill NA values

    wide_5m_returns.to_parquet("coingecko_5min_returns_topN.parquet")

    print("DAILY tail:\n", wide_daily.tail())
    print("5-MIN RETURNS tail:\n", wide_5m_returns.tail())

OK bitcoin               m5_ret=  283  daily=  4496
OK ethereum              m5_ret=  282  daily=  3666
OK ripple                m5_ret=  284  daily=  4395
OK tether                m5_ret=  282  daily=  3807
OK binancecoin           m5_ret=  282  daily=  2894
OK solana                m5_ret=  283  daily=  1958
OK usd-coin              m5_ret=  281  daily=  2512
OK staked-ether          m5_ret=  282  daily=  1703
OK tron                  m5_ret=  282  daily=  2842
OK dogecoin              m5_ret=  283  daily=  4264
OK cardano               m5_ret=  283  daily=  2864
OK chainlink             m5_ret=  282  daily=  2842
OK wrapped-steth         m5_ret=  283  daily=  1414
OK wrapped-bitcoin       m5_ret=  285  daily=  2393
OK hyperliquid           m5_ret=  283  daily=   265
OK wrapped-beacon-eth    m5_ret=  284  daily=   842
OK stellar               m5_ret=  284  daily=  4027
OK sui                   m5_ret=  282  daily=   841
OK ethena-usde           m5_ret=  283  daily=   614
OK wrapped-e

In [31]:
#wide_daily.head()
#wide_daily.columns
#wide_5m_returns.head()
#wide_5m_returns.tail()
#wide_15m.columns
# Plain CSV (includes timestamp index as first column)
#wide_daily.to_csv("coingecko_daily_close_topN.csv", index=True, index_label="timestamp")
#wide_5m_returns.to_csv("coingecko_5min_returns_topN.csv", index=True, index_label="timestamp")


In [35]:
# Process 5 minute returns/prep for use in model

import numpy as np
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler, MissingValuesFiller
from darts.utils.timeseries_generation import datetime_attribute_timeseries as dt_attrs

# --- Assumptions ---
# wide_5m_returns: pandas DataFrame, index is 5-minute UTC timestamps,
# one column per coin (e.g., "bitcoin", "ethereum", ...), values are 5-min returns.
# If you have prices instead, change the comments accordingly.

# 0) Housekeeping: make sure the index is named and sorted
wide_5m_returns = wide_5m_returns.sort_index()
wide_5m_returns.index.name = "timestamp"

# wide_5m_returns: ensure UTC-naive DateTimeIndex (Darts requirement)
wide_5m_returns = wide_5m_returns.copy()

# If index isn't datetime yet, parse as UTC first
if not isinstance(wide_5m_returns.index, pd.DatetimeIndex):
    wide_5m_returns.index = pd.to_datetime(wide_5m_returns.index, utc=True)

# If tz-aware, convert to UTC then drop tz info
if wide_5m_returns.index.tz is not None:
    wide_5m_returns.index = (
        wide_5m_returns.index.tz_convert("UTC").tz_localize(None)
    )

# 1) Build per-coin TimeSeries list from WIDE data
series_list = []
coin_ids = list(wide_5m_returns.columns)
for coin in coin_ids:
    df_coin = wide_5m_returns[[coin]].reset_index().rename(columns={coin: "y"})
    # fill_missing_dates=True ensures a regular 5-min grid even if there were gaps
    ts = TimeSeries.from_dataframe(
        df_coin,
        time_col="timestamp",
        value_cols="y",
        fill_missing_dates=True,
        freq="5min",
    )
    series_list.append(ts)

# 2) Handle missing values in the target
# For RETURNS, a common choice is to fill missing with 0 (no move).
filler_y = MissingValuesFiller(fill="auto")  # uses pandas interpolate()
series_list = [filler_y.transform(ts) for ts in series_list]

# (If your target were PRICES, you’d more likely use forward-fill:
#   MissingValuesFiller(fill="ffill")
# )

# 3) Build past covariates per series (time-of-day & day-of-week)
#    We generate them on each series' own time index so they align perfectly.
covs_list = []
for s in series_list:
    tod = dt_attrs(s.time_index, attribute="hour", one_hot=True, dtype=np.float32)
    dow = dt_attrs(s.time_index, attribute="day_of_week", one_hot=True, dtype=np.float32)
    covs = tod.stack(dow)  # multivariate TimeSeries with all one-hot columns
    covs_list.append(covs)

# (Optional) You can add more past covariates here, e.g.:
# - minute-of-hour one-hot (attribute="minute")
# - rolling volatility computed from returns and wrapped into a TimeSeries
# Just ensure the covariates share the same time index as each target series.

# 4) Scale targets and covariates globally
#    Using a single scaler per “family” helps global models.
scaler_y = Scaler()   # targets (returns)
scaler_x = Scaler()   # covariates (one-hots)

series_list_scaled = scaler_y.fit_transform(series_list)
covs_list_scaled   = scaler_x.fit_transform(covs_list)


In [36]:
# Define model
from darts.models import TFTModel
from darts.utils.likelihood_models import QuantileRegression

H = 12         # horizon in steps (e.g., 12 * 5-min = next hour)
K = 400        # number of sampled paths per forecast
RANDOM_STATE = 42

# Simple, robust TFT; tune later
model = TFTModel(
    input_chunk_length=96,          # 8 hours of lookback
    output_chunk_length=H,
    hidden_size=32,
    lstm_layers=2,
    dropout=0.10,
    add_relative_index=True,
    likelihood=QuantileRegression(quantiles=[0.05, 0.5, 0.95]),
    batch_size=64,
    n_epochs=40,
    random_state=RANDOM_STATE,
)


In [37]:
# train the model
# Fit on scaled returns with scaled covariates (from your earlier code)
model.fit(series=series_list_scaled, past_covariates=covs_list_scaled, verbose=True)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

   | Name                              | Type                             | Params | Mode 
------------------------------------------------------------------------------------------------
0  | train_metrics                     | MetricCollection                 | 0      | train
1  | val_metrics                       | MetricCollection                 | 0      | train
2  | input_embeddings                  | _MultiEmbedding                  | 0      | train
3  | static_covariates_vsn             | _VariableSelectionNetwork        | 0      | train
4  | encoder_vsn                       | _VariableSelectionNetwork        | 42.5 K | train
5  | decoder_vsn                       | _VariableSelectionNetwork        | 896    | train
6  | static_context_grn                | _GatedResidualNetwork            | 4.3 K  | train
7  | static_context_hidden_encoder_grn | _GatedResidualNetwork 

Training: |                                                                                                   …

`Trainer.fit` stopped: `max_epochs=40` reached.


TFTModel(output_chunk_shift=0, hidden_size=32, lstm_layers=2, num_attention_heads=4, full_attention=False, feed_forward=GatedResidualNetwork, dropout=0.1, hidden_continuous_size=8, categorical_embedding_sizes=None, add_relative_index=True, loss_fn=None, likelihood=QuantileRegression(quantiles=[0.05, 0.5, 0.95], prior_strength=1.0), norm_type=LayerNorm, use_static_covariates=True, input_chunk_length=96, output_chunk_length=12, batch_size=64, n_epochs=40, random_state=42)

In [None]:
# Helper functions to assit convert outputs into more tangible results
import numpy as np
import pandas as pd

# Uses your existing market_chart_close()
def get_last_price_usd(coin_id: str) -> float:
    df1 = market_chart_close(coin_id, days=1)  # last ~day
    if df1.empty:
        return np.nan
    return float(df1["close"].iloc[-1])

def summarize_paths_from_returns(returns_paths: np.ndarray, last_price: float):
    """
    returns_paths: shape (H, K) of *simple* returns (r_t)
    last_price:    current USD price

    Outputs dict with prob_up and summary stats in log space and USD.
    """
    # cumulative log-return on each path (robust composition)
    cum_log = np.sum(np.log1p(returns_paths), axis=0)   # (K,)
    prob_up = float((cum_log > 0).mean())

    # summary in log space
    p50_log = float(np.median(cum_log))
    p05_log = float(np.quantile(cum_log, 0.05))
    p95_log = float(np.quantile(cum_log, 0.95))
    exp_log = float(cum_log.mean())

    # translate to USD change on each path
    # price_T = last_price * exp(cum_log); USD_change = price_T - last_price
    usd_changes = last_price * (np.exp(cum_log) - 1.0)  # (K,)

    p50_usd = float(np.median(usd_changes))
    p05_usd = float(np.quantile(usd_changes, 0.05))
    p95_usd = float(np.quantile(usd_changes, 0.95))
    exp_usd = float(usd_changes.mean())

    return {
        "prob_up": prob_up,
        "exp_logret": exp_log,
        "p50_logret": p50_log,
        "p05_logret": p05_log,
        "p95_logret": p95_log,
        "exp_usd": exp_usd,
        "p50_usd": p50_usd,
        "p05_usd": p05_usd,
        "p95_usd": p95_usd,
    }


In [None]:
# Another set of helpers and a test run
def forecast_coin_summary(coin_id: str, idx: int) -> dict:
    """
    coin_id: coin string (e.g., 'bitcoin'), matching coin_ids[idx]
    idx:     index into series_list_scaled, covs_list_scaled

    Returns a dict with coin, last_price, and summary stats.
    """
    # 1) Predict K sampled paths in *scaled* space
    samples_scaled = model.predict(
        n=H,
        series=series_list_scaled[idx],
        past_covariates=covs_list_scaled[idx],
        num_samples=K,
        verbose=False,
    )
    # 2) Invert scaling back to RETURNS
    samples = scaler_y.inverse_transform(samples_scaled)

    # 3) Extract array: (H, 1, K) -> (H, K)
    arr = samples.all_values(copy=False)[:, 0, :]  # returns

    # 4) Latest price (USD)
    last_price = get_last_price_usd(coin_id)

    # 5) Summarize paths
    stats = summarize_paths_from_returns(arr, last_price)
    stats.update({"coin": coin_id, "last_price": float(last_price)})
    return stats

# Run for a few coins
summaries = []
for i, coin in enumerate(coin_ids[:20]):  # e.g., first 20
    try:
        summaries.append(forecast_coin_summary(coin, i))
    except Exception as e:
        print(f"ERR {coin}: {e}")

summary_df = pd.DataFrame(summaries).set_index("coin")
print(summary_df.head())
