In [4]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
import pandas_datareader.data as web
from tqdm.auto import tqdm

In [5]:
def fetch_economic_data(indicators, start, end, api_key=None):
    """Download each requested series from FRED, skipping any that fail.

    Args:
        indicators: Iterable of series identifiers; each one is passed to
            ``web.DataReader`` with the ``"fred"`` source.
        start: Start of the requested range, forwarded unchanged.
        end: End of the requested range, forwarded unchanged.
        api_key: Optional key forwarded to ``web.DataReader`` (default None).

    Returns:
        dict mapping every indicator that was fetched successfully to the
        object returned by ``web.DataReader``. Indicators whose fetch raised
        are reported with ``print`` and omitted from the result.
    """
    fetched = {}
    for series_id in tqdm(indicators):
        try:
            fetched[series_id] = web.DataReader(
                series_id, "fred", start, end, api_key=api_key
            )
        except Exception as err:
            # Best-effort download: one failing series must not abort the rest.
            print(f"Could not fetch data for {series_id}. Error: {err}")
    return fetched

In [6]:
import pickle

# Load the saved mapping of indicator descriptions to series codes.
# NOTE(review): pickle.load can execute arbitrary code from the file; this
# assumes the .pkl was produced by this project — confirm before reusing it
# with files from elsewhere.
with open("data/raw/economic_indicators.pkl", mode="rb") as indicators_file:
    economic_indicators_dict = pickle.load(indicators_file)

economic_indicators_dict

{'Gross Domestic Product': 'GDP',
 'Consumer Price Index for All Urban Consumers': 'CPIAUCNS',
 'Civilian Unemployment Rate': 'UNRATE',
 'Effective Federal Funds Rate': 'FEDFUNDS',
 'Trade Weighted U.S. Dollar Index': 'DTWEXM',
 '10-Year Treasury Constant Maturity Minus 2-Year Treasury Constant Maturity': 'T10Y2Y',
 'M2 Money Stock': 'M2',
 'Personal Consumption Expenditures': 'PCE',
 'Homeownership Rate in the U.S.': 'HOANBS',
 'Total Business Inventories': 'BUSINV',
 'Industrial Production Index': 'INDPRO',
 'Real Personal Income Less Current Transfer Receipts': 'REALLN',
 'Real Disposable Personal Income': 'W875RX1',
 'Total Nonfarm Payrolls': 'PAYEMS',
 'Personal Saving Rate': 'PSAVERT',
 'Corporate Profits After Tax': 'A939RX0Q048SBEA',
 'National Financial Conditions Index': 'NFCI',
 'Velocity of M2 Money Stock': 'WM2NS',
 'University of Michigan: Consumer Sentiment': 'UMCSENT',
 'Crude Oil Prices': 'OILPRICE',
 'High Yield Corporate Bond Yield': 'BAMLH0A0HYM2',
 '10-Year Treasur

In [7]:
# The series codes are the dict's values; keep them in the dict's own order.
economic_indicators = list(economic_indicators_dict.values())
# Show them as one comma-separated string (pprint wraps the long line).
pprint(', '.join(economic_indicators))

('GDP, CPIAUCNS, UNRATE, FEDFUNDS, DTWEXM, T10Y2Y, M2, PCE, HOANBS, BUSINV, '
 'INDPRO, REALLN, W875RX1, PAYEMS, PSAVERT, A939RX0Q048SBEA, NFCI, WM2NS, '
 'UMCSENT, OILPRICE, BAMLH0A0HYM2, GS10, CP, STLFSI, USSLIND, GDPC1, USROA')


In [8]:
# Read the FRED API key from the environment instead of committing it to the
# notebook; a key that has been checked in should be treated as leaked and
# rotated. When FRED_API_KEY is unset this yields None, which is the same
# default that fetch_economic_data uses for its api_key parameter.
api_key = os.environ.get("FRED_API_KEY")
economic_data = fetch_economic_data(
    economic_indicators, "1990-01-01", "2023-09-07", api_key=api_key
)

  0%|          | 0/27 [00:00<?, ?it/s]

In [9]:
# Infer the native sampling frequency of every downloaded series from its
# index; pd.infer_freq yields None when it cannot detect a regular frequency.
freq = {
    series_id: pd.infer_freq(frame.index)
    for series_id, frame in economic_data.items()
}
freq

{'GDP': 'QS-OCT',
 'CPIAUCNS': 'MS',
 'UNRATE': 'MS',
 'FEDFUNDS': 'MS',
 'DTWEXM': 'B',
 'T10Y2Y': 'B',
 'M2': 'W-MON',
 'PCE': 'MS',
 'HOANBS': 'QS-OCT',
 'BUSINV': 'MS',
 'INDPRO': 'MS',
 'REALLN': 'MS',
 'W875RX1': 'MS',
 'PAYEMS': 'MS',
 'PSAVERT': 'MS',
 'A939RX0Q048SBEA': 'QS-OCT',
 'NFCI': 'W-FRI',
 'WM2NS': 'W-MON',
 'UMCSENT': 'MS',
 'OILPRICE': 'MS',
 'BAMLH0A0HYM2': None,
 'GS10': 'MS',
 'CP': 'QS-OCT',
 'STLFSI': 'W-FRI',
 'USSLIND': 'MS',
 'GDPC1': 'QS-OCT',
 'USROA': 'QS-OCT'}

In [136]:
# Put every series on a common business-day ("B") grid, keeping the last
# observation that falls in each bin.
economic_data_resampled = {
    series_id: frame.resample("B").last()
    for series_id, frame in economic_data.items()
}
# Re-infer the frequencies to confirm that every series is now on the "B" grid.
freq_resampled = {
    series_id: pd.infer_freq(frame.index)
    for series_id, frame in economic_data_resampled.items()
}
freq_resampled

{'GDP': 'B',
 'CPIAUCNS': 'B',
 'UNRATE': 'B',
 'FEDFUNDS': 'B',
 'DTWEXM': 'B',
 'T10Y2Y': 'B',
 'M2': 'B',
 'PCE': 'B',
 'HOANBS': 'B',
 'BUSINV': 'B',
 'INDPRO': 'B',
 'REALLN': 'B',
 'W875RX1': 'B',
 'PAYEMS': 'B',
 'PSAVERT': 'B',
 'A939RX0Q048SBEA': 'B',
 'NFCI': 'B',
 'WM2NS': 'B',
 'UMCSENT': 'B',
 'OILPRICE': 'B',
 'BAMLH0A0HYM2': 'B',
 'GS10': 'B',
 'CP': 'B',
 'STLFSI': 'B',
 'USSLIND': 'B',
 'GDPC1': 'B',
 'USROA': 'B'}

In [137]:
# Place all resampled series side by side, aligned on their date index.
econ_df = pd.concat(list(economic_data_resampled.values()), axis=1)
# Fill gaps with Akima interpolation first, then run a linear pass over
# whatever the first pass left unfilled.
# NOTE(review): interpolating between observations uses the *later*
# observation to fill earlier dates, so the daily values contain look-ahead
# information — confirm this is acceptable for the downstream use.
econ_df_interpolated = (
    econ_df
    .interpolate(method='akima')
    .interpolate(method='linear')
)
# econ_df_interpolated.to_csv('data/raw/econ_data_with_nulls.csv')

In [138]:
# Discard every indicator column that still contains a NaN after interpolation.
econ_df_interpolated_no_nulls = econ_df_interpolated.dropna(axis="columns")

In [139]:
# Preview the first three rows of the fully populated indicator frame.
econ_df_interpolated_no_nulls.head(n=3)

Unnamed: 0_level_0,GDP,CPIAUCNS,UNRATE,FEDFUNDS,M2,PCE,HOANBS,INDPRO,REALLN,W875RX1,...,PSAVERT,A939RX0Q048SBEA,WM2NS,UMCSENT,OILPRICE,GS10,CP,USSLIND,GDPC1,USROA
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-01,5872.701,127.4,5.4,8.23,3161.1,3730.7,88.482,61.644,765.8089,6798.6,...,8.0,37617.0,3181.9,93.0,22.641,8.21,268.869,1.71,9364.259,0.75
1990-01-02,5873.884,127.41667,5.396936,8.229793,3161.699245,3730.112311,88.477923,61.665975,766.118885,6799.630247,...,8.033922,37617.900197,3185.659726,92.799287,22.64528,8.220386,269.066101,1.693781,9364.801212,0.749525
1990-01-03,5875.061336,127.433607,5.393852,8.229631,3162.250543,3729.545632,88.473838,61.687625,766.428515,6800.6355,...,8.066905,37618.777405,3189.069155,92.600421,22.649263,8.230666,269.262463,1.678138,9365.338038,0.749048


In [124]:
import yfinance as yf

# Download window: the date span covered by the cleaned economic data.
# One day is added to the end, presumably because yfinance treats `end` as
# exclusive — confirm against the yfinance documentation.
start = econ_df_interpolated_no_nulls.index.min()
end = econ_df_interpolated_no_nulls.index.max() + pd.DateOffset(days=1)

url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

# The first table on the page lists the constituents; its "Symbol" column
# holds the tickers. Dots are swapped for dashes before the symbols are
# handed to yfinance.
# NOTE(review): this is the *current* constituent list applied to a window
# starting at `start`, so the universe may carry survivorship bias — confirm
# this is intended.
ticker_list = [
    symbol.replace('.', '-') for symbol in pd.read_html(url)[0]["Symbol"]
]

# auto_adjust=False is passed explicitly: the "Adj Close" column selected
# below only exists when prices are not auto-adjusted, and newer yfinance
# releases switch auto-adjustment on by default (which raises a KeyError here).
stock_prices = yf.download(
    ticker_list, start=start, end=end, auto_adjust=False
)["Adj Close"]

stock_prices.head()

[*********************100%***********************]  503 of 503 completed


Unnamed: 0_level_0,A,AAL,AAPL,ABBV,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-02,,,0.263761,,1.837212,,,1.18834,1.021221,4.656209,...,,4.213065,4.034966,0.855866,,,,,1.818956,
1990-01-03,,,0.265531,,1.843844,,,1.247023,0.96817,4.656209,...,,4.200062,3.994619,0.855866,,,,,1.851438,
1990-01-04,,,0.266417,,1.840529,,,1.305707,0.954908,4.630906,...,,4.096036,3.954268,0.821631,,,,,1.851438,
1990-01-05,,,0.267302,,1.82063,,,1.335048,0.954908,4.428461,...,,4.057025,3.934095,0.872984,,,,,1.851438,
1990-01-08,,,0.269072,,1.829963,,,1.352692,0.954908,4.479073,...,,4.018018,3.994619,0.838749,,,,,1.851438,


In [128]:
# Keep only the tickers that have a price on every date in the window;
# any column containing a NaN is dropped.
stock_prices_no_nulls = stock_prices.dropna(axis="columns")
stock_prices_no_nulls

Unnamed: 0_level_0,AAPL,ABT,ADBE,ADI,ADM,ADP,ADSK,AEP,AFL,AIG,...,WM,WMB,WMT,WRB,WST,WY,XEL,XOM,XRAY,ZION
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-02,0.263761,1.837212,1.188340,1.021221,4.656209,2.720687,4.496894,5.993506,0.649151,106.041237,...,0.863414,1.503941,3.640321,1.006519,2.758530,3.328015,4.213065,4.034966,0.855866,1.818956
1990-01-03,0.265531,1.843844,1.247023,0.968170,4.656209,2.707051,4.608617,5.970888,0.635339,105.915619,...,0.863414,1.537046,3.640321,1.021632,2.740262,3.254712,4.200062,3.994619,0.855866,1.851438
1990-01-04,0.266417,1.840529,1.305707,0.954908,4.630906,2.693412,4.496894,5.880419,0.621527,104.407829,...,0.863414,1.489751,3.621010,1.012564,2.776799,3.225389,4.096036,3.954268,0.821631,1.851438
1990-01-05,0.267302,1.820630,1.335048,0.954908,4.428461,2.672957,4.496894,5.744716,0.621527,101.266846,...,0.935365,1.494481,3.582385,1.012564,2.813335,3.210728,4.057025,3.934095,0.872984,1.851438
1990-01-08,0.269072,1.829963,1.352692,0.954908,4.479073,2.679774,4.454997,5.722102,0.630735,100.638641,...,0.935365,1.513399,3.630667,1.009541,2.813335,3.210728,4.018018,3.994619,0.838749,1.851438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-31,187.869995,102.900002,559.340027,180.919998,79.300003,253.342285,221.940002,78.400002,74.570000,58.520000,...,156.076355,34.077545,162.610001,61.860001,406.899994,32.750000,57.130001,111.190002,37.090000,35.500000
2023-09-01,189.460007,102.849998,563.210022,182.429993,79.790001,254.476608,220.020004,77.519997,74.769997,59.240002,...,156.305328,34.235447,161.570007,62.430000,407.690002,32.750000,56.509998,113.519997,37.480000,36.439999
2023-09-05,189.699997,100.879997,564.880005,181.539993,78.879997,251.591049,219.460007,76.000000,74.199997,58.599998,...,156.325241,33.988724,160.270004,61.750000,403.670013,31.920000,55.590000,113.529999,36.689999,36.130001
2023-09-06,182.910004,101.559998,561.940002,182.179993,77.459999,249.800003,222.199997,76.379997,74.230003,59.189999,...,155.270004,33.396587,161.470001,61.529999,404.850006,32.389999,56.340000,114.510002,36.700001,34.820000


In [129]:
# Log return = first difference of log prices. The first row has no previous
# price to difference against, so it is sliced off positionally.
log_returns = np.log(stock_prices_no_nulls).diff().iloc[1:]
log_returns.head(n=3)

Unnamed: 0_level_0,AAPL,ABT,ADBE,ADI,ADM,ADP,ADSK,AEP,AFL,AIG,...,WM,WMB,WMT,WRB,WST,WY,XEL,XOM,XRAY,ZION
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-03,0.006689,0.003603,0.048202,-0.053346,0.0,-0.005025,0.024541,-0.003781,-0.021506,-0.001185,...,0.0,0.021773,0.0,0.014903,-0.006644,-0.022272,-0.003091,-0.01005,0.0,0.0177
1990-01-04,0.003331,-0.001799,0.045985,-0.013793,-0.005449,-0.005051,-0.024541,-0.015268,-0.021979,-0.014338,...,0.0,-0.031254,-0.005319,-0.008915,0.013245,-0.00905,-0.02508,-0.010153,-0.040822,0.0
1990-01-05,0.003317,-0.010871,0.022223,0.0,-0.0447,-0.007623,0.0,-0.023348,0.0,-0.030546,...,0.080043,0.00317,-0.010724,0.0,0.013072,-0.004556,-0.00957,-0.005115,0.060625,0.0


In [140]:
# Conform the indicator frame to exactly the dates present in log_returns;
# any date missing from the indicator frame would show up as a NaN row.
econ_df_interpolated_no_nulls_reindexed = econ_df_interpolated_no_nulls.reindex(
    index=log_returns.index
)

In [141]:
# Total number of missing cells across the reindexed indicator frame.
econ_df_interpolated_no_nulls_reindexed.isna().sum().sum()

0

In [142]:
# Total number of missing cells across the log-return frame.
log_returns.isna().sum().sum()

0

In [143]:
# Persist the daily log returns as CSV.
log_returns.to_csv(path_or_buf='data/cleaned/log_returns_daily.csv')

In [144]:
# Verify that the returns and the indicators share exactly the same index.
# Index.equals compares the full indexes, so a length mismatch is caught too;
# a zip-based pairwise loop would stop at the shorter index and pass silently.
assert log_returns.index.equals(econ_df_interpolated_no_nulls_reindexed.index), (
    "log_returns and econ_df_interpolated_no_nulls_reindexed have different indexes"
)

In [145]:
# Persist the date-aligned daily indicator frame as CSV.
econ_df_interpolated_no_nulls_reindexed.to_csv(
    path_or_buf='data/cleaned/econ_data_daily.csv'
)