In [2]:
import pandas as pd

In [3]:
# Read the SPY price and dividend histories into DataFrames, parsing the date
# column of each file ('Date' / 'Pay Date') as a pandas datetime dtype.
# The relative paths use forward slashes so they resolve on Windows, macOS and
# Linux alike (a backslash is only treated as a path separator on Windows).
spy_price_data = pd.read_csv('../0_Raw_Data/SPY_price_history.csv', parse_dates=['Date'])
spy_dividend_data = pd.read_csv('../0_Raw_Data/SPY_dividends_history.csv', parse_dates=['Pay Date'])

In [4]:
# Left join: keep every price row and attach the dividend row (if any) whose
# 'Pay Date' equals that price row's 'Date'
merged_raw_data = pd.merge(
    spy_price_data,
    spy_dividend_data,
    how='left',
    left_on='Date',
    right_on='Pay Date',
)

In [5]:
# Drop rows dated February 29 (leap days). Per the original author's note, leaving
# them in causes errors in the get_growth function defined further down.
is_leap_day = (merged_raw_data['Date'].dt.month == 2) & (merged_raw_data['Date'].dt.day == 29)
# .copy() makes the filtered result an independent frame, so assigning to one of
# its columns afterwards does not trigger pandas' SettingWithCopyWarning.
merged_raw_data = merged_raw_data[~is_leap_day].copy()

In [6]:
# Replace the pandas datetime values in 'Date' with plain datetime.date objects,
# obtained through the .dt.date accessor
date_only = pd.to_datetime(merged_raw_data['Date']).dt.date
merged_raw_data['Date'] = date_only

In [7]:
# Retrieve the relative change in share value over an n-year interval
def get_growth(current_date, dataframe, price_col_name, delta=pd.Timedelta(days=5*365)):
    """Return the price on ``current_date`` divided by the price ``delta`` earlier.

    The result is a ratio, not a percentage: 1.5 means the price on
    ``current_date`` is 1.5 times the price on ``current_date - delta``.

    Parameters
    ----------
    current_date
        Date to evaluate; it is compared for equality against the values in
        ``dataframe['Date']`` and must support ``current_date - delta``.
    dataframe : pandas.DataFrame
        Frame with a 'Date' column and the price column named by
        ``price_col_name``.
    price_col_name : str
        Name of the column holding the prices to compare.
    delta : pandas.Timedelta, optional
        Look-back interval. The default is 5 * 365 = 1825 days, which is a
        fixed day count rather than five calendar years.

    Returns
    -------
    float
        ``price(current_date) / price(current_date - delta)``, or NaN when
        either date has no row in ``dataframe``. If a date matches several
        rows, the first match is used.
    """
    past_date = current_date - delta

    dates = dataframe['Date']
    current_prices = dataframe.loc[dates == current_date, price_col_name].to_numpy()
    past_prices = dataframe.loc[dates == past_date, price_col_name].to_numpy()

    # A date without a matching row has no price to compare against. Return NaN
    # in every such case instead of failing with IndexError when both lookups
    # come back empty.
    if current_prices.size == 0 or past_prices.size == 0:
        return float('nan')

    return current_prices[0] / past_prices[0]

In [8]:
# For every row, compute the 'Close' price on that row's date relative to the
# 'Close' price one default get_growth interval earlier. Note the stored value is
# the ratio returned by get_growth (e.g. 1.75), not a percentage.
def _growth_for_row(row):
    return get_growth(row["Date"], merged_raw_data, "Close")

merged_raw_data["Perc Growth Last 5 Years"] = merged_raw_data.apply(
    _growth_for_row,
    axis=1,
    result_type="expand",
)
merged_raw_data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Declare Date,Ex-Div Date,Record Date,Pay Date,Frequency,Amount,Adj. Amount,Perc Growth Last 5 Years
0,1993-01-29,43.968750,43.968750,43.750000,43.937500,25.627350,1003200,,,,NaT,,,,
1,1993-02-01,43.968750,44.250000,43.968750,44.250000,25.809603,480500,,,,NaT,,,,
2,1993-02-02,44.218750,44.375000,44.125000,44.343750,25.864313,201300,,,,NaT,,,,
3,1993-02-03,44.406250,44.843750,44.375000,44.812500,26.137709,529400,,,,NaT,,,,
4,1993-02-04,44.968750,45.093750,44.468750,45.000000,26.247080,531500,,,,NaT,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7330,2022-03-08,419.619995,427.209991,415.119995,416.250000,416.250000,164772700,,,,NaT,,,,1.757367
7331,2022-03-09,425.140015,429.510010,422.820007,427.410004,427.410004,116990800,,,,NaT,,,,1.798183
7332,2022-03-10,422.519989,426.429993,420.440002,425.480011,425.480011,93972700,,,,NaT,,,,
7333,2022-03-11,428.119995,428.769989,419.529999,420.070007,420.070007,95529600,,,,NaT,,,,


In [None]:
# TODO: create a "Percentage Yield" column (the intended calculation is not yet specified)

In [44]:
# References
### https://stackoverflow.com/questions/33518124/how-to-apply-a-function-on-every-row-on-a-dataframe

In [None]:
# input time range, investment interval size
def get_growth_by_strategy(start_date: pd.Timestamp, end_date: pd.Timestamp, period_size: pd.Timedelta, inv_per_period: float,
                           dataframe=None, price_col_name="Close"):
    """Return the value at ``end_date`` of investing a fixed amount at regular intervals.

    An investment of ``inv_per_period`` is assumed at ``start_date`` and then
    every ``period_size`` up to and including ``end_date``. Each investment is
    scaled by the price ratio that ``get_growth`` reports between its own date
    and ``end_date``, and the scaled amounts are summed.

    Parameters
    ----------
    start_date, end_date
        Bounds of the investment window; anything ``pd.Timestamp`` accepts.
    period_size : pandas.Timedelta
        Spacing between consecutive investments.
    inv_per_period : float
        Amount invested at each investment time.
    dataframe : pandas.DataFrame, optional
        Price data passed on to ``get_growth``. Defaults to the notebook-level
        ``merged_raw_data`` frame. Its 'Date' column is assumed to hold
        ``datetime.date`` values, as produced earlier in this notebook.
    price_col_name : str, optional
        Price column passed on to ``get_growth``; defaults to "Close".

    Returns
    -------
    float
        Sum of ``inv_per_period * growth`` over all investment times. The sum
        is NaN if ``get_growth`` returns NaN for any investment time.

    Raises
    ------
    AssertionError
        If the window is not longer than one ``period_size``.
    """
    start_ts = pd.Timestamp(start_date)
    end_ts = pd.Timestamp(end_date)
    assert end_ts - start_ts > period_size, 'end_date - start_date must be greater than period_size'

    if dataframe is None:
        # Fall back to the frame built by the earlier cells of this notebook.
        dataframe = merged_raw_data

    # period_size is the spacing between investments, so it must be passed as
    # `freq`; the third positional argument of date_range is `periods` (a count).
    investment_times = pd.date_range(start_ts, end_ts, freq=period_size)

    # get_growth looks prices up by calendar date and expects the look-back as a
    # timedelta, so hand it the end day and the whole-day distance back to each
    # investment time.
    end_day = end_ts.date()
    end_midnight = end_ts.normalize()
    growth_ratios = [
        get_growth(end_day, dataframe, price_col_name, end_midnight - invest_ts.normalize())
        for invest_ts in investment_times
    ]
    return sum(inv_per_period * ratio for ratio in growth_ratios)
    