# Distribution of annual returns for major indices

Different tickers are available:
- S&P 500 Total Returns: S&P 500 index with dividends reinvested _(ticker ^SP500TR)_
- S&P 500 Total Returns in real terms: the same index net of inflation

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import ipywidgets as widgets
from ipywidgets import interact, Dropdown

In [None]:
def load_data_csv(file_path):
    """Load a CSV file as a date-indexed pandas Series.

    The first column of the file is parsed as dates and becomes the index;
    the column that follows it is returned as the data. Any further columns
    are ignored.
    """
    frame = pd.read_csv(file_path)

    # Parse the leading column as datetimes and promote it to the index.
    date_column = frame.columns[0]
    frame[date_column] = pd.to_datetime(frame[date_column])
    indexed = frame.set_index(date_column)

    # With the date column moved to the index, the data column is now first.
    return indexed[indexed.columns[0]]

In [None]:
# Read the index catalogue and key each row's settings by its 'index_id'.
df_indexes = pd.read_csv('index_config.csv')
df_indexes = df_indexes.set_index('index_id')
indexes = df_indexes.to_dict(orient='index')

# Attach to every index the series stored under data/ in the file named by its 'code' entry.
for key in indexes:
    indexes[key]['data'] = load_data_csv('data/' + indexes[key]['code'])

In [None]:
# Overall span of calendar years covered by the loaded series, across ALL indexes.
# Each generator reads the per-index dict it is iterating over, so every series
# contributes to the bounds. int() keeps the results plain Python integers.
data_min_year = int(min(details['data'].index.year.min() for details in indexes.values()))
data_max_year = int(max(details['data'].index.year.max() for details in indexes.values()))

In [None]:
def compute_future_returns(series, period_years=1.5, dates_per_year=256):
    """Return the annualized forward return of ``series`` over a holding period.

    For each observation, the value ``int(period_years * dates_per_year)``
    rows further down the series is divided by the current value, and the
    resulting growth factor is converted to a per-year rate.

    Args:
        series: pandas Series of index levels (assumed to be in chronological
            order with roughly ``dates_per_year`` rows per year).
        period_years: Length of the holding period in years. Must be non-zero.
        dates_per_year: Number of observations per year in ``series``.

    Returns:
        pandas Series of annualized returns expressed as fractions
        (``0.05`` means 5% per year), limited to the rows for which the
        computation yields a value.
    """
    delta = int(period_years * dates_per_year)

    # Growth factor over the holding period. The last `delta` rows have no
    # future value to compare against and come out as NaN.
    growth = series.shift(-delta) / series

    # Annualize the growth factor directly. Going through "factor - 1" and
    # back to "+ 1" would only add floating-point rounding (a tiny factor
    # would collapse to exactly 0 before the root is taken).
    annualized = growth ** (1 / period_years) - 1

    # Keep only the dates for which a future return could be computed.
    return annualized.dropna()

In [None]:
def bucketize(axis_max):
    """Pad ``axis_max`` by 0.1 and round it up to a preset axis limit.

    Returns the first of 0.2, 0.3 and 0.5 that is strictly greater than the
    padded value; if none is, the padded value itself is returned.
    """
    padded = axis_max + 0.1
    candidate_limits = (0.2, 0.3, 0.5)
    return next((limit for limit in candidate_limits if padded < limit), padded)

In [None]:
# One (index_name, index_id) pair per configured index, used as the dropdown's options.
dropdown_options = [
    (details['index_name'], index_id)
    for index_id, details in indexes.items()
]

In [None]:
@interact(
    index=Dropdown(options=dropdown_options, description='Index'),
    period_years=widgets.IntSlider(value=10, min=1, max=min(50, data_max_year - data_min_year), description='Holding Years'),
    year_interval=widgets.IntRangeSlider(value=[1950, data_max_year], min=data_min_year, max=data_max_year, description='Date interval')
)
def plot_returns_distribution(index, year_interval, period_years):
    """Plot a histogram of annualized forward returns for one index.

    Args:
        index: Key into ``indexes`` selecting which index to plot.
        year_interval: ``(start_year, end_year)`` pair bounding the data used.
        period_years: Holding period, in years, over which returns are computed.

    Displays a message instead of a plot when the interval is shorter than
    the holding period or when no return can be computed from the selection.
    """
    start_year, end_year = year_interval
    if end_year - start_year < period_years:
        display('Please select a year interval larger than the holding time horizon')
        return
    start_date = f'{start_year}-01-01'
    end_date = f'{end_year+1}-01-01'

    series = indexes[index]['data']
    series = series.loc[start_date:].loc[:end_date]
    # Annualized returns, converted from fractions to percentage points.
    returns = compute_future_returns(series, period_years, indexes[index]['points_per_year']) * 100

    # The selected years may hold too few data points for this index to look
    # a full holding period ahead; there is nothing to plot in that case.
    if returns.empty:
        display('No returns can be computed for the selected interval and holding period')
        return

    lowest, highest = returns.min(), returns.max()

    # Plot results
    plt.figure(figsize=(10, 5))
    # One-percent-wide bins whose edges enclose every return: round the lower
    # edge down and the upper edge up, and include the upper edge itself
    # (np.arange excludes its stop value). Values outside the edges would be
    # silently left out of the histogram.
    first_edge = int(np.floor(lowest))
    last_edge = max(int(np.ceil(highest)), first_edge + 1)
    bins = np.arange(first_edge, last_edge + 1, 1)
    counts, _, _ = plt.hist(returns, bins=bins, edgecolor='k', alpha=0.7)

    title = indexes[index]['title']
    plt.title(f'Distribution of {title} over {period_years} Years')
    plt.xlabel('Annual Return')
    ax = plt.gca()
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0f}%'))
    # Symmetric x-axis around 0%, wide enough for the largest return in
    # absolute value; bucketize works on fractions, hence the /100 and *100.
    x_axis_max = max(-lowest, highest)
    x_axis_max = bucketize(x_axis_max/100) * 100
    plt.xlim(-x_axis_max, x_axis_max)
    # Keep the y-axis at 100 or more so sparse histograms are not blown up.
    y_axis_max = max(100, counts.max() + 5)
    plt.ylim(0, y_axis_max)
    plt.show()

    display(f'Median: {returns.median():.1f}%')
    display(f'Nb of datapoints: {returns.shape[0]}')

_Data from Shiller (http://www.econ.yale.edu/~shiller/data.htm). Source available on [GitHub](https://github.com/alxbck/distribution_plot/)._