In [2]:
import numpy as np # fast math + arrays
import pandas as pd # read CSVs, data tables
import scipy.stats as st # probability models + .fit()
import matplotlib.pyplot as plt # plotting
from statsmodels.distributions.empirical_distribution import ECDF # ECDF: nonparametric cumulative curve

# Notebook-wide matplotlib defaults, applied one key at a time.
plt.rcParams["axes.spines.top"] = False    # hide the top spine for cleaner axes
plt.rcParams["axes.spines.right"] = False  # hide the right spine as well
plt.rcParams["axes.grid"] = True           # a grid makes values easier to read off
plt.rcParams["grid.alpha"] = 0.25          # keep that grid faint

def plot_ecdf(data, ax=None, **kw):
    """
    Draw the empirical CDF (ECDF) of `data` as a step plot.

    WHY: an ECDF shows every data point and needs no bins, which makes it
    convenient for comparing against model CDFs.

    Parameters
    ----------
    data : array-like
        Sample handed to statsmodels' ECDF.
    ax : matplotlib Axes, optional
        Axes to draw on. When None, the current axes (plt.gca()) is used.
    **kw
        Forwarded unchanged to ax.step (e.g. label=..., color=...).

    Returns
    -------
    The axes that was drawn on, so calls can be chained or overlaid.
    """
    ec = ECDF(data)  # ECDF object exposing the step coordinates as .x / .y

    # Test against None explicitly instead of relying on truthiness, so a
    # caller-supplied axes object is never silently swapped for plt.gca().
    if ax is None:
        ax = plt.gca()

    ax.step(ec.x, ec.y, where="post", **kw)
    ax.set_xlabel("Wait (min)")
    ax.set_ylabel("F_hat(x)")  # F-hat = empirical CDF
    return ax

def aic(logL, k):
    """
    Return the Akaike Information Criterion, AIC = 2*k - 2*logL.

    logL : maximized log-likelihood of the fitted model.
    k    : number of free parameters in the model.

    A lower AIC means a better trade-off between fit and complexity; values
    are only comparable between models fitted to the same dataset.
    """
    complexity_penalty = 2 * k
    goodness_of_fit = 2 * logL
    return complexity_penalty - goodness_of_fit

def mean_from_params(dist_name, params):
    """
    Convert SciPy .fit() parameters -> model mean (same units as the fitted data).

    SciPy returns (shape, loc, scale) in this order for the three supported
    distributions. 'loc' shifts the distribution; 'scale' stretches it.

    Parameters
    ----------
    dist_name : str
        One of "gamma", "weibull_min" or "lognorm".
    params : sequence of 3 numbers
        (shape, loc, scale) as returned by the distribution's .fit().

    Returns
    -------
    float
        The model mean, or np.nan when `dist_name` is not a supported model.
    """
    # The Weibull mean needs the gamma *function*, which lives in
    # scipy.special. st.gamma is the gamma *distribution*: calling it returns
    # a frozen distribution object, and multiplying that by `scale` raises
    # TypeError. Imported locally so the block does not depend on new
    # file-level imports.
    from scipy.special import gamma as gamma_fn

    if dist_name == "gamma":
        a, loc, scale = params  # a = shape (k)
        return loc + a * scale  # Gamma mean = loc + k*theta

    if dist_name == "weibull_min":
        c, loc, scale = params  # c = shape
        # Weibull mean = loc + scale * Gamma(1 + 1/c)
        return loc + scale * gamma_fn(1 + 1 / c)

    if dist_name == "lognorm":
        s, loc, scale = params  # s = sigma (log-scale SD); scale = exp(mu)
        # Lognormal mean = loc + scale * exp(sigma^2 / 2)
        return loc + scale * np.exp(s**2 / 2)

    return np.nan  # unknown model

In [35]:
# Load the bars, parse 'Open time' as datetimes, and discard the columns that
# are not used below. Whatever columns remain besides 'Open time' become the
# analysed values (presumably a single close-price column -- confirm against
# the CSV header).
df = pd.read_csv('./data/vix.csv', parse_dates=['Open time']).drop(
    columns=['Open', 'High', 'Low', 'Volume', 'Trade count', 'VWAP']
)

# Window boundaries: three months on either side of the split date.
three_mo_before = pd.Timestamp('2024-04-15')
split_date = pd.Timestamp('2024-07-15')
three_mo_after = pd.Timestamp('2024-10-15')

# PRE  = [three_mo_before, split_date)   -- split date excluded
# POST = [split_date, three_mo_after]    -- both ends included
# Each window is reduced to its non-timestamp columns, cast to float and
# converted to a 2-D NumPy array (rows x remaining columns).
pre = (
    df.loc[(df['Open time'] >= three_mo_before) & (df['Open time'] < split_date)]
    .drop(columns=['Open time'])
    .astype(float)
    .to_numpy()
)
post = (
    df.loc[(df['Open time'] <= three_mo_after) & (df['Open time'] >= split_date)]
    .drop(columns=['Open time'])
    .astype(float)
    .to_numpy()
)

# NOTE(review): .size counts every array element, so "n" equals the number of
# rows only if exactly one value column survived the drops above -- confirm.
print(f"PRE n={pre.size}, mean=${pre.mean():.2f}")
print(f"POST n={post.size}, mean=${post.mean():.2f}")

PRE n=370, mean=$47.47
POST n=390, mean=$49.97
