In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, RegressorMixin

In [36]:
def calculate_aic(n, mse, num_params):
    aic = n * np.log(mse) + 2 * num_params
    return aic

In [37]:
def calculate_bic(n, mse, num_params):
    bic = n * np.log(mse) + num_params * np.log(n)
    return bic

In [2]:
aapl_df = pd.read_csv("data_files/AAPL/AAPL_combined.csv.gz")
lly_df = pd.read_csv("data_files/LLY/LLY_combined.csv.gz")
nee_df = pd.read_csv("data_files/NEE/NEE_combined.csv.gz")

# Data Preprocessing
def data_preprocess(df):
    # Rename columns
    df.rename(columns={'Unnamed: 0': 'timestamp',
                       '1. open': 'open',
                       '2. high': 'high',
                       '3. low': 'low',
                       '4. close': 'close',
                       '5. volume': 'volume'}, inplace=True)
    
    # Drop unnecessary columns
    if 'Unnamed: 0.1' in df.columns.tolist():
        df.drop(columns=['Unnamed: 0.1'], inplace=True)

    # Handle data types
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    return df

aapl_df = data_preprocess(aapl_df)
lly_df = data_preprocess(lly_df)
nee_df = data_preprocess(nee_df)

In [3]:
#Load in AAPL data, make returns, variance, and volatility
AAPL_returns = pd.DataFrame({
    "time_stamp": aapl_df['timestamp'],
    "log_return": np.log(1+(aapl_df['close']-aapl_df['open'])/aapl_df['open'])
})
AAPL_returns['time_stamp'] = pd.to_datetime(AAPL_returns['time_stamp'], errors='coerce')
daily_variance_AAPL = AAPL_returns.groupby(AAPL_returns['time_stamp'].dt.floor("1D"))["log_return"].var()

In [4]:
LLY_returns = pd.DataFrame({
    "time_stamp": lly_df["timestamp"],
    "log_return": np.log(1+(lly_df['close']-lly_df['open'])/lly_df['open'])
})
LLY_returns['time_stamp'] = pd.to_datetime(LLY_returns['time_stamp'], errors='coerce')
daily_variance_LLY = LLY_returns.groupby(LLY_returns['time_stamp'].dt.floor("1D"))["log_return"].var()

In [5]:
NEE_returns = pd.DataFrame({
    "time_stamp": nee_df["timestamp"],
    "log_return": np.log(1+(nee_df['close']-nee_df['open'])/nee_df['open'])
})
NEE_returns['time_stamp'] = pd.to_datetime(NEE_returns['time_stamp'], errors='coerce')
daily_variance_NEE = NEE_returns.groupby(NEE_returns['time_stamp'].dt.floor("1D"))["log_return"].var()

In [7]:
daily_volatility_AAPL = np.sqrt(daily_variance_AAPL)
daily_volatility_LLY = np.sqrt(daily_variance_LLY)
daily_volatility_NEE = np.sqrt(daily_variance_NEE)


In [None]:
daily_volatility_AAPL = pd.DataFrame(daily_volatility_AAPL)
train = daily_volatility_AAPL.values[:1214, :]
test = daily_volatility_AAPL.values[1214:, :]
X_train = train.reshape(-1,1)
y_train = train
X_test = test.reshape(-1,1)
y_test = test

In [33]:
class EWMA_Estimator(BaseEstimator, RegressorMixin):
    def __init__(self, alpha=0.5):
        self.alpha = alpha
        self.last_ewma_ = None
    def fit(self, X, y=None):
        X_series = pd.Series(X.ravel())
        ewma_train = X_series.ewm(alpha=self.alpha, adjust=False).mean()
        self.last_ewma_ = ewma_train.iloc[-1]
        return self
    def predict(self,X):
        return np.full(len(X), self.last_ewma_)


In [31]:
param_grid = {
    'alpha': [0.1, 0.3, 0.5, 0.7, 0.9]
}
tscv = TimeSeriesSplit(n_splits=5)
ewma_model = EWMA_Estimator()
grid_search = GridSearchCV(
    estimator=ewma_model,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_alpha = grid_search.best_params_['alpha']
print("Best alpha: ", best_alpha)
min_rmse = -grid_search.best_score_
print("Best score: ", min_rmse)

Best alpha:  0.7
Best score:  0.0003031270361351625


In [38]:
final_ewma_model= EWMA_Estimator(alpha=best_alpha)
final_ewma_model.fit(X_train)
predictions = final_ewma_model.predict(X_test)
test_rmse = sqrt(mean_squared_error(y_test, predictions))
print("RMSE: ", test_rmse)
mse = mean_squared_error(y_test, predictions)
print("MSE: ", mse)
num_params = len(final_ewma_model.get_params(deep=True))
print(num_params)
n = len(y_train)
aic = calculate_aic(n, mse, num_params)
print("AIC: ", aic)
bic = calculate_bic(n, mse, num_params)
print("BIC: ", bic)

RMSE:  0.0006444392492897706
MSE:  4.1530194602516306e-07
1
AIC:  -17836.831640595792
BIC:  -17831.729964624174


In [39]:
daily_volatility_LLY = pd.DataFrame(daily_volatility_LLY)
train = daily_volatility_LLY.values[:1214, :]
test = daily_volatility_LLY.values[1214:, :]
X_train = train.reshape(-1,1)
y_train = train
X_test = test.reshape(-1,1)
y_test = test

In [40]:
grid_search.fit(X_train, y_train)
best_alpha = grid_search.best_params_['alpha']
print("Best alpha: ", best_alpha)
min_rmse = -grid_search.best_score_
print("Best score: ", min_rmse)

Best alpha:  0.3
Best score:  0.0002944452431946679


In [41]:
final_ewma_model= EWMA_Estimator(alpha=best_alpha)
final_ewma_model.fit(X_train)
predictions = final_ewma_model.predict(X_test)
test_rmse = sqrt(mean_squared_error(y_test, predictions))
print("RMSE: ", test_rmse)
mse = mean_squared_error(y_test, predictions)
print("MSE: ", mse)
num_params = len(final_ewma_model.get_params(deep=True))
print(num_params)
n = len(y_train)
aic = calculate_aic(n, mse, num_params)
print("AIC: ", aic)
bic = calculate_bic(n, mse, num_params)
print("BIC: ", bic)

RMSE:  0.0006305868065751203
MSE:  3.976397206266082e-07
1
AIC:  -17889.591431944227
BIC:  -17884.48975597261


In [42]:
daily_volatility_NEE = pd.DataFrame(daily_volatility_NEE)
train = daily_volatility_NEE.values[:1214, :]
test = daily_volatility_NEE.values[1214:, :]
X_train = train.reshape(-1,1)
y_train = train
X_test = test.reshape(-1,1)
y_test = test

In [43]:
grid_search.fit(X_train, y_train)
best_alpha = grid_search.best_params_['alpha']
print("Best alpha: ", best_alpha)
min_rmse = -grid_search.best_score_
print("Best score: ", min_rmse)

Best alpha:  0.5
Best score:  0.00022822873610299542


In [44]:
final_ewma_model= EWMA_Estimator(alpha=best_alpha)
final_ewma_model.fit(X_train)
predictions = final_ewma_model.predict(X_test)
test_rmse = sqrt(mean_squared_error(y_test, predictions))
print("RMSE: ", test_rmse)
mse = mean_squared_error(y_test, predictions)
print("MSE: ", mse)
num_params = len(final_ewma_model.get_params(deep=True))
print(num_params)
n = len(y_train)
aic = calculate_aic(n, mse, num_params)
print("AIC: ", aic)
bic = calculate_bic(n, mse, num_params)
print("BIC: ", bic)

RMSE:  0.0003118546797667472
MSE:  9.725334129242045e-08
1
AIC:  -19599.179047692327
BIC:  -19594.07737172071


In [9]:
daily_volatility_AAPL.name = 'volatility'
daily_volatility_LLY.name = 'volatility'
daily_volatility_NEE.name = 'volatility'

In [None]:
daily_volatility_AAPL.name = 'volatility'
daily_volatility_AAPL = daily_volatility_AAPL.to_frame()
daily_volatility_AAPL['EWM_90'] = daily_volatility_AAPL['volatility'].shift(1).ewm(alpha=.90).mean()
daily_volatility_AAPL['EWM_50'] = daily_volatility_AAPL['volatility'].shift(1).ewm(alpha=.50).mean()
daily_volatility_AAPL['EWM_10'] = daily_volatility_AAPL['volatility'].shift(1).ewm(alpha=.10).mean()