In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import List
import os

# Data Preprocessing

In [20]:
# Raw input CSVs; every file sits in the same folder, addressed relative to this notebook.
rawDataDir = "../Data/Raw Data"

sp500File = f"{rawDataDir}/ScrapedSP500.csv"
unEmpFile = f"{rawDataDir}/unemployment.csv"
tenYearRateFile = f"{rawDataDir}/10yr_treasury_yield.csv"
recExpFile = f"{rawDataDir}/recession_indicator.csv"
cpiFile = f"{rawDataDir}/consumer_price_index.csv"
vixFile = f"{rawDataDir}/VIX_History.csv"

In [21]:
def createDataFrame(file, year=None, expectedTradingDays=252):
    """Load a raw CSV with upper-cased headers and a parsed DATETIME column.

    Parameters
    ----------
    file : str
        Path to a CSV file. It must contain a date column that is named
        ``DATE`` once the headers have been upper-cased.
    year : int, optional
        When given, only rows whose DATETIME falls in this calendar year are
        kept. When omitted, every row is returned.
    expectedTradingDays : int, optional
        Row count required for ``ScrapedSP500.csv`` after filtering to
        ``year``. The default of 252 is the count this notebook expects for
        the year it loads; pass another value when loading a year with a
        different number of rows.

    Returns
    -------
    pd.DataFrame
        The file contents with upper-cased column names and a leading
        ``DATETIME`` column. The original row index is preserved.

    Raises
    ------
    KeyError
        If the file has no ``DATE`` column.
    AssertionError
        If ``year`` is given, the file is ``ScrapedSP500.csv`` and the number
        of rows in that year differs from ``expectedTradingDays``.
    """
    #Read data
    df = pd.read_csv(file)

    #Capitalize Header
    df = df.rename(columns=str.upper)

    #Create DateTime
    df.insert(0, "DATETIME", pd.to_datetime(df['DATE']))

    if year is not None:
        #Boolean Mask
        mask = df['DATETIME'].dt.year == year
        # Copy so callers get an independent frame rather than a slice of df.
        df = df[mask].copy()

        #Sanity Check
        # Only meaningful for a single year: comparing against year=None would
        # count zero rows and always fail.
        if os.path.basename(file) == "ScrapedSP500.csv":
            assert len(df) == expectedTradingDays, "Missing Days"

    return df

#Build Full Dataset
def buildFullData(dataSets: pd.DataFrame, listOfDataSets: List, how='left', on='DATETIME', names=None) -> pd.DataFrame:
    """Merge a base frame with a list of other frames, one after another.

    Parameters
    ----------
    dataSets : pd.DataFrame
        Base frame; it is the left side of the first merge.
    listOfDataSets : list of pd.DataFrame
        Frames merged onto the base in list order.
    how, on
        Passed through to ``pd.merge`` for every merge.
    names : list of str, optional
        Suffix applied to right-hand columns that clash with an existing
        column, paired positionally with ``listOfDataSets``. Defaults to
        ``['_unEmp', '_tenYearRate', '_recExp', '_cpi', '_vix']``.

    Returns
    -------
    pd.DataFrame
        The merged frame without the duplicated ``DATE<suffix>`` columns that
        the merges create.

    Raises
    ------
    ValueError
        If there are more frames than suffixes. Previously the extra frames
        were silently left out of the result.
    """
    if names is None:
        names = ['_unEmp', '_tenYearRate', '_recExp', '_cpi', '_vix']

    listOfDataSets = list(listOfDataSets)
    if len(listOfDataSets) > len(names):
        raise ValueError(
            f"Got {len(listOfDataSets)} data sets but only {len(names)} suffixes; "
            "pass a longer `names` list"
        )

    usedNames = list(names)[:len(listOfDataSets)]
    for name, data in zip(usedNames, listOfDataSets):
        # pd.merge already returns a new frame, so no copy keyword is needed.
        dataSets = pd.merge(dataSets, data, how=how, on=on, suffixes=(None, name))

    # Drop only the DATE duplicates produced by the suffixes above; a plain
    # substring test on "DATE_" would also remove columns such as "UPDATE_FLAG".
    duplicateDateCols = {"DATE" + name for name in usedNames}
    dropCols = [col for col in dataSets.columns if col in duplicateDateCols]
    return dataSets.drop(columns=dropCols)

In [22]:
# Every series is restricted to the same calendar year.
analysisYear = 2021

sp500Df = createDataFrame(file=sp500File, year=analysisYear)
unEmpDf = createDataFrame(file=unEmpFile, year=analysisYear)
tenYearRateDf = createDataFrame(file=tenYearRateFile, year=analysisYear)
recExpDf = createDataFrame(file=recExpFile, year=analysisYear)
cpiDf = createDataFrame(file=cpiFile, year=analysisYear)
vixDf = createDataFrame(file=vixFile, year=analysisYear)

In [23]:
# Order matters: buildFullData pairs these frames positionally with its
# column suffixes ('_unEmp', '_tenYearRate', '_recExp', '_cpi', '_vix').
economicIndicators = [unEmpDf, tenYearRateDf, recExpDf, cpiDf, vixDf]

# Left-join each indicator onto the S&P 500 rows by DATETIME.
fullData = buildFullData(
    dataSets=sp500Df,
    listOfDataSets=economicIndicators,
    how='left',
    on='DATETIME',
)

In [24]:
#Write out the data
# to_csv keeps its default index=True, so the row index is written as the first column.
outFile = "../Data/Raw Data/fullData.csv"
fullData.to_csv(path_or_buf=outFile)