# Prep Environment

In [15]:
import os

import pandas as pd
import numpy as np
import yfinance as yf
from datetime import date
from tqdm import tqdm
import matplotlib.pyplot as plt

from statsmodels.tsa.stattools import acf
import statsmodels.api as sm
from hurst import compute_Hc, random_walk

# Prep Utilities

In [81]:
def calc_autocorr(series, lag=1):
    """Return the autocorrelation of ``series`` at the given ``lag``.

    The value is the Pearson correlation between the series and a copy of
    itself shifted forward by ``lag`` positions (``lag`` defaults to 1).
    """
    lagged = series.shift(lag)
    return series.corr(lagged)

def calc_half_life(series, method='ar1'):
    """
    Estimate half-life of mean reversion for a time series.

    Fits the AR(1) relation x_t = alpha + phi * x_{t-1} + eps_t by ordinary
    least squares (with intercept) and converts the persistence ``phi`` into
    a half-life, -ln(2) / ln(phi).

    Parameters
    ----------
    series : array-like or pd.Series
        1D time series of values (assumed equally spaced in time).
        Missing values are dropped before fitting.
    method : {'diff', 'ar1'}, default='ar1'
        'diff' - regress Δx_t on x_{t-1}, then phi = 1 + slope
                 (common in mean-reversion literature).
        'ar1'  - regress x_t on x_{t-1}, phi = slope (simple AR(1) estimate).
        Both parameterisations describe the same fitted line, so they yield
        the same phi up to floating-point rounding.

    Returns
    -------
    float
        Estimated half-life in time steps. Returns np.nan if the series is
        not mean-reverting (phi outside the open interval (0, 1)) or if the
        lagged values have no variation, in which case the slope is undefined.

    Raises
    ------
    ValueError
        If fewer than 5 non-missing observations remain, or if ``method`` is
        not one of 'diff' / 'ar1'.
    """
    x = pd.Series(series).dropna().astype(float).to_numpy()

    if len(x) < 5:
        raise ValueError("Series too short to estimate half-life (need >= 5 observations)")

    if method not in ('diff', 'ar1'):
        raise ValueError("Unknown method: choose 'diff' or 'ar1'")

    x_lag = x[:-1]
    # Regression target: the change Δx_t for 'diff', the level x_t for 'ar1'.
    target = np.diff(x) if method == 'diff' else x[1:]

    # Closed-form OLS slope for a single regressor with intercept:
    # slope = sum((x_lag - mean) * (y - mean)) / sum((x_lag - mean)^2)
    lag_dev = x_lag - x_lag.mean()
    denom = np.dot(lag_dev, lag_dev)
    if denom == 0:
        # Constant lagged values (e.g. a window of identical observations):
        # the slope cannot be identified, so report "not mean-reverting"
        # instead of failing inside a rolling apply.
        return np.nan

    slope = np.dot(lag_dev, target - target.mean()) / denom
    phi = 1.0 + slope if method == 'diff' else slope

    # Only 0 < phi < 1 corresponds to monotone decay toward the mean;
    # the negated form also catches a non-finite phi.
    if not (0 < phi < 1):
        return np.nan

    return -np.log(2) / np.log(phi)

def calc_hurst(series):
    """Estimate the Hurst exponent of a price series with the R/S method.

    Missing values are dropped, the remainder is passed to ``compute_Hc``
    (``kind='price'``, ``simplified=True``), and only the first element of
    its three-value result is returned.

    NOTE(review): ``compute_Hc`` presumably enforces a minimum series length;
    confirm against the ``hurst`` package before using short windows.
    """
    cleaned = series.dropna()
    hurst_exponent, _constant, _rs_points = compute_Hc(
        cleaned,
        kind='price',
        simplified=True,
    )
    return hurst_exponent

# Ingest Data

In [3]:
# Load the list of assets to analyse from the config CSV; the 'Code' column
# holds the ticker codes used by the download cell.
asset_list_df = pd.read_csv("../config/50 Biggest Market Capitalization - Aug 2025.csv")
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, which suggests
# the CSV was written with its index; consider index_col=0 if it is not needed.
asset_list_df

Unnamed: 0,Code,Listed Stocks,Number of Listed Shares,Market Capitalization IDR,Market Capitalization %
0,BREN,PT Barito Renewables Energy Tbk.,133786220000,1207420636,8.51
1,BBCA,Bank Central Asia Tbk.,122042299500,985491568,6.95
2,DCII,DCI Indonesia Tbk,2383745900,811605885,5.72
3,DSSA,Dian Swastatika Sentosa Tbk,7705523200,764387901,5.39
4,TPIA,PT Chandra Asri Pacific Tbk,86511545092,713720247,5.03
5,BBRI,PT Bank Rakyat Indonesia (Persero) Tbk,150043411587,607675817,4.28
6,BYAN,Bayan Resources Tbk,33333335000,605000030,4.27
7,AMMN,PT Amman Mineral Internasional Tbk.,72518217656,569268009,4.01
8,BMRI,Bank Mandiri (Persero) Tbk.,92399999996,437052000,3.08
9,TLKM,Telkom Indonesia (Persero) Tbk.,99062216600,310064738,2.19


In [83]:
# Download window: daily bars from target_date onward.
today = date.today()
target_date = date(2020, 1, 1)
# target_date = date(2025, 8, 15)
# Calendar-day span of the window. It is no longer passed to the download
# (see `start=` below) but the names stay defined in case other cells use them.
time_difference = today - target_date
number_of_days = time_difference.days
# Ticker symbols to request: asset code plus the ".JK" suffix.
market_code = (asset_list_df['Code'] + ".JK").tolist()
all_data = []

file_path = "../data/yfinance_idx_ticker_data.csv"

if os.path.isfile(file_path):
    print("File Already Exists")
    # Parse 'Date' so the cached path yields the same dtype as a fresh download.
    raw_df = pd.read_csv(file_path, parse_dates=["Date"])
else:
    print("File Doesn't Exists, Downloading...")
    failed_tickers = []
    for ticker in tqdm(market_code, desc="Fetching OHLCV"):
        try:
            df = yf.download(
                ticker,
                # An explicit start date anchors the window at target_date;
                # a day-count `period` does not guarantee that first bar.
                start=target_date.isoformat(),
                progress=False,
                threads=False,
                auto_adjust=False,
            )
            if df.empty:
                raise ValueError("No data returned")

            df = df.reset_index()
            # Flatten (field, ticker) column pairs to plain field names; skip
            # when the columns are already single-level.
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.droplevel(level=1)
            df["Ticker"] = ticker
            all_data.append(df)
        except Exception as e:
            # Best effort: report the failing ticker and continue with the rest.
            print("Ticker", ticker, "Error:", e)
            failed_tickers.append(ticker)

    if not all_data:
        # pd.concat([]) would fail with an opaque message; be explicit instead.
        raise RuntimeError(
            "No ticker data could be downloaded; failed tickers: "
            + ", ".join(failed_tickers)
        )

    raw_df = pd.concat(all_data, axis=0).reset_index(drop=True)
    # Make sure the cache directory exists before writing.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    raw_df.to_csv(file_path, index=False)

raw_df

File Doesn't Exists, Downloading...


Fetching OHLCV: 100%|██████████| 50/50 [00:18<00:00,  2.75it/s]


Price,Date,Adj Close,Close,High,Low,Open,Volume,Ticker
0,2023-10-09,973.590942,975.0,975.0,975.0,975.0,22298500,BREN.JK
1,2023-10-10,1213.244019,1215.0,1215.0,1060.0,1060.0,13625200,BREN.JK
2,2023-10-11,1512.810425,1515.0,1515.0,1515.0,1515.0,38431400,BREN.JK
3,2023-10-12,1887.268555,1890.0,1890.0,1820.0,1890.0,219262300,BREN.JK
4,2023-10-13,2356.589355,2360.0,2360.0,2240.0,2360.0,39504100,BREN.JK
...,...,...,...,...,...,...,...,...
85880,2025-09-17,480.000000,480.0,484.0,464.0,474.0,247355800,MBMA.JK
85881,2025-09-18,470.000000,470.0,484.0,466.0,484.0,199379800,MBMA.JK
85882,2025-09-19,470.000000,470.0,490.0,468.0,470.0,250337700,MBMA.JK
85883,2025-09-22,510.000000,510.0,510.0,476.0,476.0,556976700,MBMA.JK


# Data Preprocessing

In [84]:
# Build the analysis frame: add each ticker's close-to-close simple return
# (computed in date order within the ticker), then drop every row that
# contains a missing value and renumber the rows.
base_df = (
    raw_df
    .assign(
        Return=lambda frame: (
            frame
            .sort_values(by=['Ticker', 'Date'])
            .groupby('Ticker')['Close']
            .pct_change()
        )
    )
    .dropna()
    .reset_index(drop=True)
)
base_df

Price,Date,Adj Close,Close,High,Low,Open,Volume,Ticker,Return
0,2023-10-10,1213.244019,1215.0,1215.0,1060.0,1060.0,13625200,BREN.JK,0.246154
1,2023-10-11,1512.810425,1515.0,1515.0,1515.0,1515.0,38431400,BREN.JK,0.246914
2,2023-10-12,1887.268555,1890.0,1890.0,1820.0,1890.0,219262300,BREN.JK,0.247525
3,2023-10-13,2356.589355,2360.0,2360.0,2240.0,2360.0,39504100,BREN.JK,0.248677
4,2023-10-16,2746.025635,2750.0,2950.0,2400.0,2940.0,227720100,BREN.JK,0.165254
...,...,...,...,...,...,...,...,...,...
85830,2025-09-17,480.000000,480.0,484.0,464.0,474.0,247355800,MBMA.JK,0.012658
85831,2025-09-18,470.000000,470.0,484.0,466.0,484.0,199379800,MBMA.JK,-0.020833
85832,2025-09-19,470.000000,470.0,490.0,468.0,470.0,250337700,MBMA.JK,0.000000
85833,2025-09-22,510.000000,510.0,510.0,476.0,476.0,556976700,MBMA.JK,0.085106


# Measure Mean Reversion

In [None]:
def _rolling_metric_by_ticker(frame, column, window, func):
    """Apply ``func`` over a trailing ``window`` of ``frame[column]`` per ticker.

    ``frame`` must already be ordered by date within each ticker. The result
    is a Series indexed by the frame's own row labels, so it can be assigned
    straight back as a new column. Rows without a full window are NaN.
    """
    rolled = frame.groupby('Ticker')[column].rolling(window).apply(func, raw=False)
    # groupby-rolling prepends the group key as index level 0; drop it so the
    # values align with the frame's row index on assignment.
    return rolled.reset_index(level=0, drop=True)


result_csv_path = '../output/revertiveness_result.csv'

# Trailing window lengths, in rows per ticker.
window_autocorr = 60
window_halflife = 120
window_hurst = 200

if os.path.isfile(result_csv_path):
    print("File Already Exists")
    result_df = pd.read_csv(result_csv_path)
else:
    # sort_values returns a new frame, so base_df itself is left untouched.
    result_df = base_df.sort_values(by=['Ticker', 'Date'])

    result_df['Autocorr'] = _rolling_metric_by_ticker(
        result_df, 'Return', window_autocorr, calc_autocorr
    )
    result_df['HalfLife'] = _rolling_metric_by_ticker(
        result_df, 'Return', window_halflife, calc_half_life
    )

    # Rolling Hurst exponent on Close is currently disabled; uncomment to add it.
    # result_df['Hurst'] = _rolling_metric_by_ticker(result_df, 'Close', window_hurst, calc_hurst)

    # Make sure the output directory exists before writing the cache file.
    os.makedirs(os.path.dirname(result_csv_path), exist_ok=True)
    result_df.to_csv(result_csv_path, index=False)

result_df

Price,Date,Adj Close,Close,High,Low,Open,Volume,Ticker,Return,Autocorr,HalfLife
75125,2024-12-06,7975.0,7975.0,7975.0,7975.0,7975.0,446600,AADI.JK,0.199248,,
75126,2024-12-09,9550.0,9550.0,9550.0,9550.0,9550.0,43658100,AADI.JK,0.197492,,
75127,2024-12-10,10275.0,10275.0,11375.0,9600.0,10500.0,228636600,AADI.JK,0.075916,,
75128,2024-12-11,9600.0,9600.0,10450.0,9575.0,10275.0,100219900,AADI.JK,-0.065693,,
75129,2024-12-12,9200.0,9200.0,9775.0,9025.0,9525.0,71848600,AADI.JK,-0.041667,,
...,...,...,...,...,...,...,...,...,...,...,...
59896,2025-09-17,1735.0,1735.0,1765.0,1715.0,1740.0,12700500,UNVR.JK,-0.002874,-0.242029,
59897,2025-09-18,1710.0,1710.0,1755.0,1705.0,1745.0,11695200,UNVR.JK,-0.014409,-0.227606,
59898,2025-09-19,1715.0,1715.0,1725.0,1700.0,1715.0,22797600,UNVR.JK,0.002924,-0.239715,
59899,2025-09-22,1715.0,1715.0,1730.0,1710.0,1725.0,12021400,UNVR.JK,0.000000,-0.239740,0.158388
