In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import List
import os
import datetime

# Data Preprocessing

In [2]:
# Input CSV locations, given relative to the working directory the notebook runs in.
# Each of these is passed to createDataFrame below, which requires a column whose
# header upper-cases to "DATE".
# NOTE(review): the descriptions below are inferred from the file names only — confirm.
sp500File = "../Data/Raw Data/ScrapedSP500.csv"  # presumably S&P 500 history (base table of the merge)
unEmpFile = "../Data/Raw Data/unemployment.csv"  # presumably unemployment figures
tenYearRateFile = "../Data/Raw Data/10yr_treasury_yield.csv"  # presumably 10-year treasury yield
recExpFile = "../Data/Raw Data/recession_indicator.csv"  # presumably a recession indicator series
cpiFile = "../Data/Raw Data/consumer_price_index.csv"  # presumably consumer price index
vixFile = "../Data/Raw Data/VIX_History.csv"  # presumably VIX history

In [3]:
def createDataFrame(file, dateOne=None, dateTwo=None, year=None, inBet=False,
                    expectedTradingDays=252):
    """Load a CSV, add a parsed ``DATETIME`` column and optionally filter by date.

    All column headers are upper-cased, then the ``DATE`` column is parsed with
    ``pd.to_datetime`` and inserted as the first column, ``DATETIME``.

    Parameters
    ----------
    file : str or path-like
        CSV file to read. It must contain a column whose header upper-cases
        to ``DATE``.
    dateOne, dateTwo : datetime-like, optional
        Inclusive lower / upper bound, used only when ``inBet`` is true.
        A bound left as ``None`` is treated as open-ended.
    year : int, optional
        Calendar year to keep. Only applied when ``inBet`` is false.
    inBet : bool, default False
        When true, keep rows with ``dateOne <= DATETIME <= dateTwo`` and
        ignore ``year``.
    expectedTradingDays : int or None, default 252
        Row count required for a single-year slice of ``ScrapedSP500.csv``.
        The number of trading sessions differs between years, so callers can
        override it; ``None`` disables the check.

    Returns
    -------
    pandas.DataFrame
        The (possibly filtered) frame. The original row index is preserved.

    Raises
    ------
    KeyError
        If the file has no ``DATE`` column.
    AssertionError
        If a single-year slice of ``ScrapedSP500.csv`` does not contain
        ``expectedTradingDays`` rows.
    """
    # Read data
    df = pd.read_csv(file)

    # Capitalize header so later lookups do not depend on the CSV's casing
    df = df.rename(columns=str.upper)

    if "DATE" not in df.columns:
        raise KeyError(f"{file!r} has no DATE column (found: {list(df.columns)})")

    # Create DateTime
    df.insert(0, "DATETIME", pd.to_datetime(df["DATE"]))

    if inBet:
        # Vectorised boolean mask; rows with an unparseable date (NaT) compare
        # false against any bound and are therefore dropped by it.
        dates = df["DATETIME"]
        mask = pd.Series(True, index=df.index)
        if dateOne is not None:
            mask &= dates >= dateOne
        if dateTwo is not None:
            mask &= dates <= dateTwo
        df = df[mask]
    elif isinstance(year, int):
        df = df[df["DATETIME"].dt.year == year]

        # Sanity check: df now holds only rows of `year`, so its length is the
        # number of sessions found for that year.
        if expectedTradingDays is not None and os.path.basename(file) == "ScrapedSP500.csv":
            assert len(df) == expectedTradingDays, "Missing Days"

    return df

# Build Full Dataset
def buildFullData(dataSets: pd.DataFrame, listOfDataSets: List, how='left', on='DATETIME') -> pd.DataFrame:
    """Merge every frame in ``listOfDataSets`` onto ``dataSets``, one after another.

    Columns of a merged-in frame that clash with an existing column get a
    suffix; existing columns keep their name. The first five frames use the
    suffixes ``_unEmp``, ``_tenYearRate``, ``_recExp``, ``_cpi`` and ``_vix``
    (in that order). Any further frame gets ``_ds<position>`` (1-based), so
    no frame is silently skipped.

    The duplicated ``DATE`` column that each merged-in frame contributes
    (``DATE<suffix>``) is dropped from the result.

    Parameters
    ----------
    dataSets : pandas.DataFrame
        Base frame; it is the left side of every merge.
    listOfDataSets : iterable of pandas.DataFrame
        Frames to merge in, in order.
    how, on
        Forwarded unchanged to ``pandas.merge``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the inputs are not modified.
    """
    names = ['_unEmp', '_tenYearRate', '_recExp', '_cpi', '_vix']
    duplicateDateCols = set()

    for position, data in enumerate(listOfDataSets):
        # zip(names, ...) would stop after len(names) frames; fall back to a
        # positional suffix instead of dropping the remaining frames.
        suffix = names[position] if position < len(names) else f"_ds{position + 1}"
        # merge already returns a new object, so the `copy` keyword is not needed.
        dataSets = pd.merge(dataSets, data, how=how, on=on, suffixes=(None, suffix))
        duplicateDateCols.add("DATE" + suffix)

    # Drop only the suffixed DATE duplicates created above; a substring test on
    # "DATE_" would also remove unrelated columns such as "UPDATE_TIME".
    redundant = [col for col in dataSets.columns if col in duplicateDateCols]
    return dataSets.drop(columns=redundant)

In [4]:
# Inclusive date window applied to every input file.
d1 = datetime.datetime(2020, 12, 11)
d2 = datetime.datetime(2021, 12, 31)

# Load all six sources with the same window; the files are read in the order
# listed, and each result is bound to its matching name on the left.
sp500Df, unEmpDf, tenYearRateDf, recExpDf, cpiDf, vixDf = (
    createDataFrame(csvPath, dateOne=d1, dateTwo=d2, inBet=True)
    for csvPath in (
        sp500File,
        unEmpFile,
        tenYearRateFile,
        recExpFile,
        cpiFile,
        vixFile,
    )
)

In [5]:
# Indicator frames to join onto the S&P 500 frame. The order matters:
# buildFullData assigns its column suffixes by position in this list.
economicIndicators = [unEmpDf, tenYearRateDf, recExpDf, cpiDf, vixDf]

# Left join on DATETIME, so every S&P 500 row is kept.
fullData = buildFullData(
    sp500Df,
    economicIndicators,
    how='left',
    on='DATETIME',
)

In [6]:
# Write out the merged data as CSV; index=False leaves the DataFrame index out of the file.
# NOTE(review): the output is written into the "Raw Data" folder alongside the inputs —
# confirm that is intended for a derived file.
outFile = "../Data/Raw Data/fullData.csv"
fullData.to_csv(outFile, index=False)