# IMPORTS 

In [1]:
import numpy as np
import pandas as pd
import sys
import gc
import joblib
import pathlib
import json
import glob
from tqdm.notebook import tqdm

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
style.use('seaborn-colorblind')

import plotly.graph_objects as go
import yfinance as yf
import statistics as st

# Introduction:
## The Monday Effect
According to Investopedia.com, **The Monday Effect** is "a financial theory that suggests that stock market returns will follow the prevailing trends from the previous Friday when it opens the following Monday." Then for a Security A, if A was trending down on Friday January 1st, the intraday percentage change $$\mathrm{pct\_change}(A, \mathrm{''Friday, January, 01''}) < 0,$$ then the Monday Effect implies that the $$\mathrm{pct\_change}(A, \mathrm{''Monday, January, 04''}) < 0$$
and vise versa if the Friday experienced a posituve percentage change.

## The History of The Monday Effect
So I certainly am not the first person to ever look at this effect. **The Monday Effect** is a financial markets anomaly that was originally discovered and researched by Frank Cross in 1973 (Seen here: ["Frank Cross' Research on The Monday Effect"](https://dailyspeculations.com/scholarly/faj.v29.n6.67.pdf )). 
Here's the TLDR:
* Cross looked at the Standard & Poor's 500 Composite Index from 1953 to 1970. 1953 is significant because it was the first year the market closed on Saturdays.
* The probability that the SP500 Composite Index trended positive on a Monday given it also trended up on the preceding Friday is ~$48.8\%$
* The average percent change of the SP500 Composite on Monday's after a positive trending Friday was $0\%$
* The median percent change of the SP500 Copmositve on Monday's after a positive trending Friday was $0\%$
* The probability that the SP500 Composite Index trended positive on a Monday given it also trended down on the preceding Friday is ~$24.0\%$
* The average percent change of the SP500 Composite on Monday's after a negative trending Friday was $-0.48\%$
* The median percent change of the SP500 Copmositve on Monday's after a negative trending Friday was $-0.40\%$

## What Factors Impact The Monday Effect?
To begin, **The Monday Effect**, is not a perfect estimator of where the market will be headed on a Monday. There are so many factors welded deep into the market that dictate where the market goes. These factors include, earnings, breaking news, supply and demand, even global economic and social conditions (just as seen with the COVID-19 pandemic in 2020 and the subsequent high volatility markets since then).

However, part of this research will try to find out if we can dig a little deeper and find out if there are predictors built into the market. Frank Cross used the SP500 Composite Index to perform his research. I aim to replicate this trial first with the SPDR S&P 500 ETF Trust, an exchange-traded fund which tracks the S&P 500 stock market index, but using a more recent scope of time. The second goal of this project will be to breakdown the S&P 500 into the securities that form it and to use a Bayesian Classifier Algorithm to see if we can use the trends of each individual seucrity to predict how the overall index will move.

In [2]:
# Get historical data
spy_df = yf.Ticker("SPY").history(start = "2000-01-01", end = "2022-06-01", period = "1d", interval = "1d")
# Reset index so we can do some manipulation
spy_df = spy_df.reset_index(0)
# We are analyzing by day so we need to add a day of the week to each 
spy_df['Dayofweek'] = pd.to_datetime(spy_df['Date']).dt.dayofweek
# Drop the rcolumns that are excess data
spy_df = spy_df.drop(columns=['High','Low','Dividends', 'Stock Splits'])
# Add a daily percentage change column 
spy_df['Pct_Change'] = (spy_df['Close'] / spy_df['Open'] - 1)*100
# Add a boolean that represents the percentage change (will be useful to make comparisons later on)
spy_df['Change'] = spy_df['Pct_Change'] > 0
# Rearrange some of these new columns now
spy_df = spy_df.reindex(columns = ['Date', 'Dayofweek', 'Open', 'Close', 'Volume', 'Pct_Change', 'Change'])

## How do we define pairings between Fridays and Mondays historically?
So, the Monday Effect technically is only defined in general as a pattern between Fridays and the following Monday. However, because of bank holidays, a Friday does not necessarily need to be the last day of a week and a Monday does not necessarily need to be the first day of the week, in this context Therefore, I had to come up with a different way to keep track of "Friday"-"Monday" pairings.

In [3]:
# To make it easier to keep track of the days, I also add the week and the year
spy_df['Week'] = spy_df['Date'].dt.isocalendar().week
spy_df['Year'] = spy_df['Date'].dt.isocalendar().year

In [4]:
# Lets take a look at our dataframe after some of that processing
spy_df.head()

Unnamed: 0,Date,Dayofweek,Open,Close,Volume,Pct_Change,Change,Week,Year
0,2000-01-03,0,97.598048,95.746483,8164300,-1.897133,False,1,2000
1,2000-01-04,1,94.491508,92.002182,8089800,-2.634444,False,1,2000
2,2000-01-05,2,92.125648,92.166794,12177900,0.044663,True,1,2000
3,2000-01-06,3,91.919908,90.685532,6227200,-1.342883,False,1,2000
4,2000-01-07,4,92.372517,95.952209,8066500,3.875278,True,1,2000


In [5]:
# identify the "friday" "monday" pairings
pairs = []
for index, item in spy_df.iterrows():
    # given current day, check if the next day belongs to the next week, if so then we can keep current day and the next day
    if index+1 < len(spy_df) and item.Week != spy_df.iloc[index+1].Week:
        pairs.append((spy_df.iloc[index], spy_df.iloc[index+1]))

In [6]:
pos_pos = []
pos_neg = []
neg_pos = []
neg_neg = []
for pair in pairs:
    day1 = pair[0]
    day2 = pair[1]
    if day1.Change and day2.Change:
        pos_pos.append(day2.Pct_Change)
    elif day1.Change and not day2.Change:
        pos_neg.append(day2.Pct_Change)
    elif not day1.Change and day2.Change:
        neg_pos.append(day2.Pct_Change)
    else:
        neg_neg.append(day2.Pct_Change)

total_pos = (pos_pos + pos_neg)
total_neg = (neg_pos + neg_neg)
avg_pos_pos = st.mean(pos_pos)
avg_pos_neg = st.mean(pos_neg)
avg_neg_pos = st.mean(neg_pos)
avg_neg_neg = st.mean(neg_neg)

print(f'{len(pos_pos) * 100/len(total_pos)}% of the {len(total_pos)} positive "Fridays" were followed by a positive "Monday" with an average increase of {avg_pos_pos}')
print(f'{len(pos_neg) * 100/len(total_pos)}% of the {len(total_pos)} positive "Fridays" were followed by a negative "Monday" with an average decrease of {avg_pos_neg}')
print(f'This yields an average change after a positive "Friday" of {st.mean(total_pos)} with standard deviation {st.stdev(total_pos)}')
print(f'{len(neg_pos) * 100/len(total_neg)}% of the {len(total_neg)} negative "Fridays" were followed by a positive "Monday" with an average increase of {avg_neg_pos}')
print(f'{len(neg_neg) * 100/len(total_pos)}% of the {len(total_neg)} negative "Fridays" were followed by a negative "Monday" with an average decrease of {avg_neg_neg}')
print(f'This yields an average change after a negative "Friday" of {st.mean(total_neg)} with standard deviation {st.stdev(total_neg)}')


52.84280936454849% of the 598 positive "Fridays" were followed by a positive "Monday" with an average increase of 0.5694921169309747
47.15719063545151% of the 598 positive "Fridays" were followed by a negative "Monday" with an average decrease of -0.6281685319027377
This yields an average change after a positive "Friday" of 0.004709001594675536 with standard deviation 0.9774843130697795
53.06479859894921% of the 571 negative "Fridays" were followed by a positive "Monday" with an average increase of 0.667249966724717
44.81605351170568% of the 571 negative "Fridays" were followed by a negative "Monday" with an average decrease of -0.7693877391011102
This yields an average change after a negative "Friday" of -0.0070388339080705185 with standard deviation 1.0082340297788552


# FIX THIS LINE
Define a random variable $T_{598}$ that defines the sum of the returns from the last 598 days. 
Then this variable has approximately a normal distribution by the Central Limit Theorem as $598 > 30$. 
Then we can define $T_{598} \approx N(598*\mu,n*\sigma^2) \approx N(598*,n*\sigma^2)$

In [7]:

1000*(1.004709)**(598)

16599.498957870026

In [8]:

598*(0.9774843130697797)*2

349552.30029100546

In [9]:
import math