In [30]:
import pandas as pd

# S&P500 (USA): reduce the raw export to one row per year plus its YoY change
# https://ca.investing.com/indices/us-spx-500-historical-data
sp = pd.read_csv('./Datasets/S&P500-USA.csv', thousands=",")

# Only rows whose 'Date' text contains a YYYY-01-01 date are kept.
pattern = r'\d\d\d\d-01-01'
jan_first_mask = sp['Date'].str.contains(pattern)
sp = sp[jan_first_mask].reset_index(drop=True)

# Reverse the row order before computing the change; presumably the export is
# newest-first, so this makes pct_change run oldest -> newest (TODO confirm).
sp = sp.iloc[::-1]

yoy_percent = sp['Price'].pct_change() * 100
sp = (
    sp.assign(**{'YoY Change': yoy_percent})
      .drop(columns=['Vol.'])
      .dropna()  # drops the first row (no previous price) and any row with a gap
)

sp.to_csv('./Datasets/S&P500-USA-Clean.csv')

In [81]:
# Clean Macrotrends Datasets
# These datasets provide us the value of the index on each business day between certain dates
# We need to find the values on the first business day of each year, so we can calculate YoY change.

def clean_macrotrend_df(df, name):
    """Reduce a daily index series to one January row per year and save it.

    For each calendar year the earliest January row is kept, the
    year-over-year percentage change of ``value`` is computed between
    consecutive kept rows, and the result is written to
    ``./Datasets/{name}-Clean.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with a ``date`` column that ``pd.to_datetime`` can parse and
        a ``value`` column. The frame passed in is left unmodified.
    name : str
        File stem used to build the output path.

    Returns
    -------
    None
        The cleaned frame is only written to disk.

    Notes
    -----
    * Rows are sorted by date before grouping, so the result does not depend
      on the row order of the input file.
    * ``GroupBy.first()`` takes the first non-null entry of each column, so a
      missing ``value`` on the first January row is filled from the next one.
    * Rows containing any missing value are dropped; this always removes the
      first year, which has no previous value to compare against.
    * A year without any January row is simply absent, so the change reported
      for the following year then spans more than twelve months.
    """
    dates = pd.to_datetime(df['date'])

    # assign() builds a new frame, so the caller's DataFrame is never mutated.
    prepared = df.assign(date=dates, year=dates.dt.year)

    # Stable sort: input that is already chronological keeps its exact order.
    january = prepared[dates.dt.month == 1].sort_values('date', kind='mergesort')

    # Code From: https://stackoverflow.com/questions/71002941/get-the-first-row-of-each-group-of-unique-values-in-another-column
    yearly = january.groupby('year', as_index=False).first()

    yearly['YoY Change'] = yearly['value'].pct_change() * 100

    yearly = yearly.dropna().reset_index(drop=True)
    yearly.to_csv(f'./Datasets/{name}-Clean.csv')

In [82]:
# Clean Macrotrends Datasets
# Run function on all of our current MacroTrends Datasets
# https://www.macrotrends.net/charts/stock-indexes

def _load_and_clean_macrotrend(stem):
    """Read ./Datasets/{stem}.csv, run clean_macrotrend_df on it, and return the frame read.

    Using one stem for both the input path and the name passed to
    clean_macrotrend_df keeps the raw file and its cleaned output paired.
    """
    raw = pd.read_csv(f'./Datasets/{stem}.csv')
    clean_macrotrend_df(raw, stem)
    return raw


# One call per index; each name stays bound to the frame that was read.
bovespa = _load_and_clean_macrotrend('BOVESPA-BRA')
cac = _load_and_clean_macrotrend('CAC40-FRA')
dax = _load_and_clean_macrotrend('DAX30-DEU')
hang_seng = _load_and_clean_macrotrend('HangSeng-HKG')
nikkei = _load_and_clean_macrotrend('Nikkei225-JPN')
shanghai = _load_and_clean_macrotrend('Shanghai-CHN')
