In [1]:
import os
import pandas as pd
import numpy as np
from statsforecast import StatsForecast

from lib.utils import get_polygon_root, get_polygon_freq, log_returns

  from tqdm.autonotebook import tqdm


In [4]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess dataframe for statsforecast
    """
    df = df.ffill() # fill missing values with previous value
    df = df.dropna() # drop remaining missing values (first row)
    assert df.isnull().sum().sum() == 0, "Missing values in dataframe"

    df.rename(columns={'est': 'ds'}, inplace=True) # rename column to ds
    df.sort_values(by='ds', inplace=True) # sort by ds (have to do this before getting log returns)
    df.reset_index(drop=True, inplace=True) # reset index after sorting

    df.loc[:, 'log_rt'] = log_returns(df['close']) # get log returns

    df = df[['ticker', 'log_rt', 'ds']].rename(columns={
    'log_rt' : 'y',
    'ticker' : 'unique_id'
    }) # select only relevant columns and rename

    return df

In [24]:
data = f"{get_polygon_root()}/sp500-5yr-hourly"
all_tickers = set([path.split("_")[0] for path in os.listdir(data) if path.endswith(".csv")])
all_paths = set([path for path in os.listdir(data) if path.endswith(".csv")])

tickers = ['MSFT', 'AAPL', 'GOOG']
paths = [f"{ticker}_2018-7"

In [26]:
paths

['CFG_2018-07-03_2023-07-01.csv',
 'TER_2018-07-03_2023-07-01.csv',
 'TROW_2018-07-03_2023-07-01.csv',
 'PCAR_2018-07-03_2023-07-01.csv',
 'GILD_2018-07-03_2023-07-01.csv',
 'WYNN_2018-07-03_2023-07-01.csv',
 'CSGP_2018-07-03_2023-07-01.csv',
 'NOW_2018-07-03_2023-07-01.csv',
 'COO_2018-07-03_2023-07-01.csv',
 'MET_2018-07-03_2023-07-01.csv',
 'ETN_2018-07-03_2023-07-01.csv',
 'BG_2018-07-03_2023-07-01.csv',
 'EFX_2018-07-03_2023-07-01.csv',
 'AFL_2018-07-03_2023-07-01.csv',
 'PAYC_2018-07-03_2023-07-01.csv',
 'EXR_2018-07-03_2023-07-01.csv',
 'MRK_2018-07-03_2023-07-01.csv',
 'AVGO_2018-07-03_2023-07-01.csv',
 'ALGN_2018-07-03_2023-07-01.csv',
 'IVZ_2018-07-03_2023-07-01.csv',
 'VTR_2018-07-03_2023-07-01.csv',
 'BEN_2018-07-03_2023-07-01.csv',
 'SO_2018-07-03_2023-07-01.csv',
 'ABBV_2018-07-03_2023-07-01.csv',
 'VRTX_2018-07-03_2023-07-01.csv',
 'RE_2018-07-03_2023-07-01.csv',
 'STZ_2018-07-03_2023-07-01.csv',
 'MTCH_2018-07-03_2023-07-01.csv',
 'PRU_2018-07-03_2023-07-01.csv',
 'KEYS

In [25]:
dfs = [pd.read_csv(f"{data}/{path}") for path in paths]
returns = [preprocess(df) for df in dfs]

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [19]:
concat = pd.concat(returns)

In [22]:
concat

Unnamed: 0,unique_id,y,ds
0,MSFT,,2018-07-03 09:00:00
1,MSFT,-0.000150,2018-07-03 10:00:00
2,MSFT,-0.006189,2018-07-03 11:00:00
3,MSFT,-0.000202,2018-07-03 12:00:00
4,MSFT,-0.001010,2018-07-03 13:00:00
...,...,...,...
10051,MSFT,-0.000858,2023-06-30 12:00:00
10052,MSFT,-0.000932,2023-06-30 13:00:00
10053,MSFT,-0.000617,2023-06-30 14:00:00
10054,MSFT,0.000485,2023-06-30 15:00:00


In [21]:
StatsForecast.plot(concat, unique_id='MSFT', y='y', ds='ds', freq=get_polygon_freq())

TypeError: _StatsForecast.plot() got an unexpected keyword argument 'unique_id'