In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Plots
import matplotlib.pyplot as plt

# Models
import pmdarima as pm
from pmdarima.arima import auto_arima
from arch import arch_model

In [2]:
# Read and preprocess data

# Load the combined price file for each ticker (one raw frame per symbol).
aapl, nee, lly = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data_files/AAPL_combined.csv.gz",
        "data_files/NEE_combined.csv.gz",
        "data_files/LLY_combined.csv.gz",
    )
)


def data_preprocess(df):
    """Tidy a raw price frame and return it.

    The frame is modified in place (and also returned, so the call can be
    used in an assignment):

    * the numbered price columns ('1. open', ..., '5. volume') and the
      'Unnamed: 0' column are renamed to plain names;
    * a stray 'Unnamed: 0.1' column is removed when present;
    * 'timestamp' is converted with ``pd.to_datetime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must contain an 'Unnamed: 0' (or already-renamed
        'timestamp') column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after cleaning.
    """
    column_aliases = {
        'Unnamed: 0': 'timestamp',
        '1. open': 'open',
        '2. high': 'high',
        '3. low': 'low',
        '4. close': 'close',
        '5. volume': 'volume',
    }
    leftover_column = 'Unnamed: 0.1'

    # Rename first so every later step can rely on the short column names.
    df.rename(columns=column_aliases, inplace=True)

    # Only some inputs carry this extra column, so drop it conditionally.
    if leftover_column in df.columns:
        df.drop(columns=[leftover_column], inplace=True)

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    return df

# Clean every raw frame; each name is rebound to its preprocessed result.
aapl, nee, lly = (data_preprocess(raw_prices) for raw_prices in (aapl, nee, lly))

In [3]:
# Compute daily volatility

def daily_volatility(df):
    """
    Aggregate bars to daily level and compute a daily volatility proxy.

    For every calendar date the chronologically first ``open`` and the
    chronologically last ``close`` are taken, and volatility is defined as

        sqrt(log(1 + (close - open) / open))

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'timestamp', 'open' and 'close' columns. Rows may be in
        any order; 'timestamp' may be datetime-like or parseable strings.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'date' and 'volatility'. Days whose log return is negative
        (square root undefined) or missing are dropped; a day with a log
        return of exactly zero is kept with volatility 0.
    """
    # Select columns and drop NAs; the copy keeps the caller's frame untouched.
    bars = df[['timestamp', 'open', 'close']].dropna().copy()

    # Convert timestamp to datetime and order the rows chronologically so that
    # 'first' / 'last' below really are the day's opening and closing bars,
    # whatever order the input arrives in. Mergesort is stable, so rows that
    # share a timestamp keep their original relative order.
    bars['timestamp'] = pd.to_datetime(bars['timestamp'])
    bars = bars.sort_values('timestamp', kind='mergesort')
    bars['date'] = bars['timestamp'].dt.date

    # Group by date to get first open and last close
    daily = bars.groupby('date').agg(
        open=('open', 'first'),
        close=('close', 'last')
    ).reset_index()

    # Log return: log(1 + (close - open) / open), i.e. log(close / open)
    daily['log_return'] = np.log(1 + (daily['close'] - daily['open']) / daily['open'])

    # sqrt is only defined for non-negative values: mask negative (and missing)
    # log returns to NaN, then take the square root in one vectorised step.
    # Zero is a valid input (sqrt(0) == 0), so it is kept.
    non_negative_return = daily['log_return'].where(daily['log_return'] >= 0)
    daily['volatility'] = np.sqrt(non_negative_return)

    # Return only date and volatility, dropping rows where the value is NaN
    return daily[['date', 'volatility']].dropna()

# One daily-volatility frame per ticker.
aapl_vol, nee_vol, lly_vol = (
    daily_volatility(prices) for prices in (aapl, nee, lly)
)


In [4]:
# Compute daily returns

def get_daily_returns(df):
    """
    Aggregate bars to daily level and compute the open-to-close log return.

    For every calendar date the chronologically first ``open`` and the
    chronologically last ``close`` are taken; the return is
    ``log(close / open) * 100`` (i.e. expressed in percent).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'timestamp', 'open' and 'close' columns. Rows may be in
        any order; 'timestamp' may be datetime-like or parseable strings.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'date' and 'scaled_return', with NaN rows dropped.
    """
    # Work on a copy restricted to the needed columns, without missing values.
    bars = df[['timestamp', 'open', 'close']].dropna().copy()

    # Order the rows chronologically so that 'first' / 'last' below really are
    # the day's opening and closing bars regardless of the input row order.
    # Mergesort is stable: rows sharing a timestamp keep their relative order.
    bars['timestamp'] = pd.to_datetime(bars['timestamp'])
    bars = bars.sort_values('timestamp', kind='mergesort')
    bars['date'] = bars['timestamp'].dt.date

    # Aggregation: first open and last close of each date
    daily = bars.groupby('date').agg(
        open=('open', 'first'),
        close=('close', 'last')
    ).reset_index()

    # Log return, scaled by 100 so the series is in percent
    daily['log_return'] = np.log(daily['close'] / daily['open'])
    daily['scaled_return'] = daily['log_return'] * 100

    return daily[['date', 'scaled_return']].dropna()

# One daily-return frame per ticker.
aapl_returns, nee_returns, lly_returns = (
    get_daily_returns(prices) for prices in (aapl, nee, lly)
)

# Build Baseline Models

### EWMA

### ARIMA

In [7]:
def fit_arima(df):
    """Fit an ARIMA model to the 'volatility' column via ``auto_arima``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'volatility' column; its values are passed to
        ``auto_arima`` as a plain array.

    Returns
    -------
    The model object returned by ``auto_arima``, called with
    ``seasonal=False``, ``error_action='ignore'`` and
    ``suppress_warnings=True``.
    """
    volatility_values = df['volatility'].values

    return auto_arima(
        volatility_values,
        seasonal=False,
        error_action='ignore',
        suppress_warnings=True,
    )

In [8]:
# Fit one ARIMA baseline per ticker on its daily-volatility series.
aapl_arima, nee_arima, lly_arima = (
    fit_arima(vol_frame) for vol_frame in (aapl_vol, nee_vol, lly_vol)
)

### GARCH

In [10]:
def fit_garch_pq(df, p, q):
    """Fit a zero-mean GARCH model of the given orders to 'scaled_return'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'scaled_return' column.
    p, q :
        Forwarded unchanged to ``arch_model`` as its ``p`` and ``q`` arguments.

    Returns
    -------
    The result object returned by the model's ``fit`` method.
    """
    garch_spec = arch_model(
        df['scaled_return'],
        mean='Zero',
        vol='Garch',
        p=p,
        q=q,
    )
    # disp='off' is passed so the fit does not print its progress output.
    return garch_spec.fit(disp='off')

In [12]:
# Per-ticker GARCH orders.
# NOTE(review): the (p, q) choices are not derived anywhere in the visible
# cells — presumably from an earlier order search; confirm.
aapl_garch = fit_garch_pq(aapl_returns, p=3, q=2)
nee_garch = fit_garch_pq(nee_returns, p=2, q=2)
lly_garch = fit_garch_pq(lly_returns, p=1, q=1)

                       Zero Mean - GARCH Model Results                        
Dep. Variable:          scaled_return   R-squared:                       0.000
Mean Model:                 Zero Mean   Adj. R-squared:                  0.001
Vol Model:                      GARCH   Log-Likelihood:               -2749.35
Distribution:                  Normal   AIC:                           5504.70
Method:            Maximum Likelihood   BIC:                           5520.53
                                        No. Observations:                 1444
Date:                Mon, Dec 08 2025   Df Residuals:                     1444
Time:                        11:47:20   Df Model:                            0
                             Volatility Model                             
                 coef    std err          t      P>|t|    95.0% Conf. Int.
--------------------------------------------------------------------------
omega          0.5354      0.235      2.279  2.267e-02 [7.495e-0