# Prep Environment

In [11]:
import os

import pandas as pd
import numpy as np
import yfinance as yf
from datetime import date
from tqdm import tqdm

from statsmodels.tsa.stattools import acf
import statsmodels.api as sm

# Prep Utilities

In [None]:
def calc_autocorr(series, lag=1):
    """Return the autocorrelation of ``series`` at the requested lag.

    Parameters
    ----------
    series : pandas.Series
        Observations in their natural order.
    lag : int, default 1
        Number of periods by which the series is shifted against itself.

    Returns
    -------
    float
        Correlation between ``series`` and ``series.shift(lag)``.
    """
    shifted = series.shift(lag)
    return series.corr(shifted)

def calc_half_life(series):
    """Estimate the mean-reversion half-life implied by an AR(1) fit.

    The regression ``delta_t = alpha + beta * x_{t-1}`` (with
    ``delta_t = x_t - x_{t-1}``) is fitted by ordinary least squares, so the
    implied AR(1) coefficient is ``phi = 1 + beta``. A shock decays to half
    its size after ``-ln(2) / ln(phi)`` periods.

    Parameters
    ----------
    series : pandas.Series
        Observations in time order. NaNs are dropped before fitting.

    Returns
    -------
    float
        * ``-ln(2) / ln(1 + beta)`` when ``-1 < beta < 0`` (mean reverting).
        * ``np.inf`` when ``beta >= 0`` (no pull back toward the mean).
        * ``np.nan`` when ``beta <= -1`` (sign-flipping dynamics, half-life
          undefined) or when the input has too few points or no variation
          to fit a slope.
    """
    clean = series.dropna()
    # After dropna() only the first element of shift/diff is NaN, so slicing it
    # off keeps the two arrays aligned observation by observation.
    lagged = clean.shift(1).iloc[1:].to_numpy(dtype=float)
    delta = clean.diff().iloc[1:].to_numpy(dtype=float)

    # A slope cannot be identified from fewer than two points or a constant regressor.
    if lagged.size < 2 or lagged.max() == lagged.min():
        return np.nan

    # Closed-form OLS slope of delta on lagged (with intercept):
    # cov(lagged, delta) / var(lagged).
    lag_dev = lagged - lagged.mean()
    beta = np.dot(lag_dev, delta - delta.mean()) / np.dot(lag_dev, lag_dev)

    if beta >= 0:
        # phi >= 1: unit root or explosive, shocks never halve.
        return np.inf
    if beta <= -1:
        # phi <= 0: log(phi) is undefined, so no meaningful half-life.
        return np.nan
    return -np.log(2) / np.log1p(beta)

def calc_hurst(series):
    """Estimate the Hurst exponent from the scaling of lagged differences.

    For each lag ``k`` the standard deviation of ``x[t + k] - x[t]`` is
    computed. For a self-similar process this grows like ``k ** H``, so ``H``
    is the slope of ``log(std)`` against ``log(k)``. This is the
    lagged-difference scaling estimator, not rescaled-range (R/S) analysis.

    Parameters
    ----------
    series : pandas.Series
        Observations in time order. NaNs are dropped first.

    Returns
    -------
    float
        Estimated exponent, or ``np.nan`` when fewer than 100 observations
        remain or any lag has a non-positive / non-finite dispersion (the
        log-log fit is undefined in that case).
    """
    # Work on a plain array: subtracting two slices of a pandas Series aligns
    # on index labels, which turns every overlap into x - x = 0.
    values = series.dropna().to_numpy(dtype=float)
    n_obs = values.size
    if n_obs < 100:
        return np.nan

    lags = np.arange(2, min(100, n_obs // 2))
    tau = np.array([np.std(values[lag:] - values[:-lag]) for lag in lags])
    if not np.all(np.isfinite(tau) & (tau > 0)):
        return np.nan

    # std of the lag-k difference scales as k ** H, so the slope is H itself
    # (no factor of two, since tau is a std rather than sqrt(std)).
    slope, _intercept = np.polyfit(np.log(lags), np.log(tau), 1)
    return slope

def rolling_metrics(data, window_autocorr=120, window_halflife=252, window_hurst=500):
    """Append rolling autocorrelation, half-life and Hurst columns to a copy of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``"Return"`` column; the input frame is not modified.
    window_autocorr, window_halflife, window_hurst : int
        Rolling window length (in rows) used for each respective metric.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``"Autocorr"``, ``"HalfLife"`` and ``"Hurst"`` added.

    Notes
    -----
    NOTE(review): windows run over rows in their existing order; if ``data``
    stacks several tickers, the caller presumably splits by ticker first - confirm.
    """
    metric_specs = (
        ("Autocorr", window_autocorr, calc_autocorr),
        ("HalfLife", window_halflife, calc_half_life),
        ("Hurst", window_hurst, calc_hurst),
    )

    result = data.copy()
    returns = result["Return"]
    for column, window, metric in metric_specs:
        # raw=False hands each window to the metric as a pandas Series.
        result[column] = returns.rolling(window).apply(metric, raw=False)

    return result

# Ingest Data

In [5]:
# Read the asset universe; its 'Code' column supplies the ticker symbols.
asset_list_path = "../config/50 Biggest Market Capitalization - Aug 2025.csv"
asset_list_df = pd.read_csv(asset_list_path)
asset_list_df

Unnamed: 0,Code,Listed Stocks,Number of Listed Shares,Market Capitalization IDR,Market Capitalization %
0,BREN,PT Barito Renewables Energy Tbk.,133786220000,1207420636,8.51
1,BBCA,Bank Central Asia Tbk.,122042299500,985491568,6.95
2,DCII,DCI Indonesia Tbk,2383745900,811605885,5.72
3,DSSA,Dian Swastatika Sentosa Tbk,7705523200,764387901,5.39
4,TPIA,PT Chandra Asri Pacific Tbk,86511545092,713720247,5.03
5,BBRI,PT Bank Rakyat Indonesia (Persero) Tbk,150043411587,607675817,4.28
6,BYAN,Bayan Resources Tbk,33333335000,605000030,4.27
7,AMMN,PT Amman Mineral Internasional Tbk.,72518217656,569268009,4.01
8,BMRI,Bank Mandiri (Persero) Tbk.,92399999996,437052000,3.08
9,TLKM,Telkom Indonesia (Persero) Tbk.,99062216600,310064738,2.19


In [13]:
# Download window: from target_date up to today, expressed as a day count.
today = date.today()
target_date = date(2022, 1, 1)
# target_date = date(2025, 8, 15)
time_difference = today - target_date
number_of_days = time_difference.days
# Yahoo Finance symbols for this list are the exchange code plus a ".JK" suffix.
market_code = (asset_list_df['Code'] + ".JK").tolist()
all_data = []

file_path = "../data/yfinance_idx_ticker_data.csv"

# The CSV acts as a cache: once it exists it is reused as-is, so delete it to
# force a fresh download with a newer end date.
if os.path.isfile(file_path):
    print("File Already Exists")
    raw_df = pd.read_csv(file_path)
    # CSV stores dates as text; convert so the cached frame matches the dtype
    # a fresh download would produce.
    if "Date" in raw_df.columns:
        raw_df["Date"] = pd.to_datetime(raw_df["Date"])
else:
    print("File Doesn't Exists, Downloading...")
    for ticker in tqdm(market_code, desc="Fetching OHLCV"):
        try:
            df = yf.download(
                ticker,
                period=f"{number_of_days}d",
                progress=False,
                threads=False,
                auto_adjust=False,
            )
            if df.empty:
                raise ValueError("No data returned")

            df = df.reset_index()
            # Drop the second column level only when there is one; a flat
            # column index would make droplevel raise and skip the ticker.
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.droplevel(level=1)
            df["Ticker"] = ticker
            all_data.append(df)
        except Exception as e:
            # Best effort: report the failing ticker and carry on with the rest.
            print("Ticker", ticker, "Error:", e)

    if not all_data:
        raise RuntimeError("No ticker data could be downloaded; nothing to cache.")

    raw_df = pd.concat(all_data, axis=0).reset_index(drop=True)
    # Make sure the target folder exists so the finished download is not lost.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    raw_df.to_csv(file_path, index=False)

raw_df

File Already Exists


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Ticker
0,2023-10-09,973.590942,975.0,975.0,975.0,975.0,22298500,BREN.JK
1,2023-10-10,1213.244019,1215.0,1215.0,1060.0,1060.0,13625200,BREN.JK
2,2023-10-11,1512.810425,1515.0,1515.0,1515.0,1515.0,38431400,BREN.JK
3,2023-10-12,1887.268555,1890.0,1890.0,1820.0,1890.0,219262300,BREN.JK
4,2023-10-13,2356.589111,2360.0,2360.0,2240.0,2360.0,39504100,BREN.JK
...,...,...,...,...,...,...,...,...
58724,2025-09-16,474.000000,474.0,490.0,472.0,488.0,262315700,MBMA.JK
58725,2025-09-17,480.000000,480.0,484.0,464.0,474.0,247355800,MBMA.JK
58726,2025-09-18,470.000000,470.0,484.0,466.0,484.0,199379800,MBMA.JK
58727,2025-09-19,470.000000,470.0,490.0,468.0,470.0,250337700,MBMA.JK


# Preprocess Data

In [None]:
# Take a deep copy of the raw frame; later edits to base_df leave raw_df untouched.
base_df = raw_df.copy(deep=True)

