## Check if we should use Close or Adj Close
Compare the SPY prices from Yahoo (Close and Adj Close) with the underlying prices recorded in the option chains, to decide which Yahoo series should be used and how large the discrepancies are, if any.

In [1]:
from dotenv import dotenv_values
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
import warnings
from scipy import stats as st
from tqdm.notebook import tqdm

warnings.filterwarnings("ignore")
import sys
from pathlib import Path

# in jupyter (lab / notebook), based on notebook path:
# build the path to the "src" directory located in the parent of the
# current working directory.
module_path = str(Path.cwd().parents[0] / "src")


# Append it to the import search path only once, so re-running this cell
# does not add duplicate entries.
if module_path not in sys.path:
    sys.path.append(module_path)
    
from MMAR.MMAR import MMAR

### Load variables

In [2]:
# Date range passed to the Yahoo Finance download when no cached file exists.
start_date = "2005-01-01"
end_date = "2022-12-31"
# Location of the CSV file used as a local cache for the SPY prices.
df_path = "../data/raw/spy.csv"

In [3]:
# Read the key/value settings stored in the project's .env file.
config = dotenv_values("../config/.env")

In [4]:
# API key taken from the .env settings; a missing entry raises KeyError here.
# NOTE(review): no later cell visible in this notebook uses ALPHA_API — confirm it is still needed.
ALPHA_API = config["ALPHA_API"]

In [5]:
def get_df(path: str) -> pd.DataFrame:
    """Return the SPY price history, using ``path`` as a local CSV cache.

    If ``path`` already exists, it is read back with its first column used
    as a parsed date index. Otherwise the prices are downloaded from Yahoo
    Finance for the notebook-level ``start_date``/``end_date`` range and
    written to ``path`` so later runs can reuse them.

    Parameters
    ----------
    path : str
        Location of the CSV cache file.

    Returns
    -------
    pd.DataFrame
        The cached frame, or the freshly downloaded one.
    """
    if os.path.isfile(path):
        return pd.read_csv(path, parse_dates=True, index_col=0)

    df = yf.download("SPY", start=start_date, end=end_date)

    # Never cache an empty result (presumably a failed request — verify
    # against the yfinance behaviour): an empty file would be picked up by
    # every later run instead of triggering a new download.
    if not df.empty:
        # Create the cache directory on first use so that to_csv cannot fail
        # after the download has already succeeded.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        df.to_csv(path)
    return df

### Get files

In [6]:
# Load the SPY prices from the CSV cache at df_path (downloaded first if the file is missing).
spy = get_df(df_path)

In [7]:
# Load the processed option-chain file. ``parse_dates=True`` only asks pandas
# to parse the index, which does nothing when no ``index_col`` is given, so the
# quote-date column is named explicitly to get real timestamps.
opt_chain = pd.read_csv(
    "../data/processed/chain2018.csv.gz", parse_dates=["QUOTE_DATE"]
)

In [33]:
# Underlying price recorded in the option chain: keep the distinct
# (quote date, underlying price) pairs and index them by a datetime "Date".
opt_prices = (
    opt_chain[["QUOTE_DATE", "UNDERLYING_LAST"]]
    .drop_duplicates()
    .assign(QUOTE_DATE=lambda frame: pd.to_datetime(frame["QUOTE_DATE"]))
    .rename(columns={"QUOTE_DATE": "Date", "UNDERLYING_LAST": "Last"})
    .set_index("Date")
)

In [34]:
# Display the option-chain underlying prices indexed by date.
opt_prices

Unnamed: 0_level_0,Last
Date,Unnamed: 1_level_1
2018-01-02,268.81
2018-01-03,270.47
2018-01-04,271.56
2018-01-05,273.41
2018-01-08,273.94
...,...
2018-12-24,234.34
2018-12-26,246.02
2018-12-27,248.21
2018-12-28,247.80


In [23]:
# Yahoo "Close" column restricted to the 2018-01-02 .. 2019-01-01 label range.
spy_close = spy["Close"].loc["2018-01-02":"2019-01-01"]

In [26]:
# Display the selected Yahoo Close series.
spy_close

Date
2018-01-02    268.769989
2018-01-03    270.470001
2018-01-04    271.609985
2018-01-05    273.420013
2018-01-08    273.920013
                 ...    
2018-12-24    234.339996
2018-12-26    246.179993
2018-12-27    248.070007
2018-12-28    247.750000
2018-12-31    249.919998
Name: Close, Length: 251, dtype: float64

In [30]:
# Yahoo "Adj Close" column restricted to the 2018-01-02 .. 2019-01-01 label range.
spy_adj_close = spy["Adj Close"].loc["2018-01-02":"2019-01-01"]

In [37]:
# Align Yahoo Close and the option-chain price on their date index and add
# the signed difference (Yahoo minus option chain) as "Diff".
cmp_opt_close = pd.concat([spy_close, opt_prices], axis=1).assign(
    Diff=lambda frame: frame["Close"] - frame["Last"]
)

In [38]:
# Summary statistics over the rows with no missing value, i.e. the dates present in both sources.
cmp_opt_close.dropna().describe()

Unnamed: 0,Close,Last,Diff
count,251.0,251.0,251.0
mean,274.339641,274.333665,0.005976
std,10.079264,10.068101,0.075917
min,234.339996,234.34,-0.340006
25%,268.830002,268.835,-0.03001
50%,273.980011,273.96,0.009989
75%,281.035004,281.03,0.044999
max,293.579987,293.6,0.309994


In [43]:
# Dates present in the option-chain prices but absent from the Yahoo Close series.
opt_prices.index[~opt_prices.index.isin(spy_close.index)].tolist()

[Timestamp('2018-12-05 00:00:00')]

There is an issue with the dates! The Yahoo file has one day fewer: 251 instead of 252. The 5th of December 2018 is missing from the Yahoo data because of President Bush's funeral.
![Yahoo Finance](yf20181205.png)

In [45]:
# Inspect the option-chain prices in the 2018-12-02 .. 2018-12-10 window, which contains 2018-12-05.
opt_prices.loc["2018-12-02":"2018-12-10"]

Unnamed: 0_level_0,Last
Date,Unnamed: 1_level_1
2018-12-03,279.22
2018-12-04,270.32
2018-12-05,270.25
2018-12-06,269.84
2018-12-07,263.63
2018-12-10,264.1


In [41]:
# Same comparison for Adj Close: align it with the option-chain price on the
# date index and add the signed difference (Yahoo minus option chain).
cmp_opt_adj_close = pd.concat([spy_adj_close, opt_prices], axis=1).assign(
    Diff=lambda frame: frame["Adj Close"] - frame["Last"]
)
# Summary statistics over the dates present in both sources.
cmp_opt_adj_close.dropna().describe()

Unnamed: 0,Adj Close,Last,Diff
count,251.0,251.0,251.0
mean,249.856832,274.333665,-24.476834
std,9.2598,10.068101,1.524398
min,215.959854,234.34,-27.380648
25%,243.984322,268.835,-25.461583
50%,249.113602,273.96,-24.66692
75%,256.136581,281.03,-23.833257
max,267.770569,293.6,-18.380146


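We should definitely use Close. The error is acceptable overall: the mean signed difference from the option-chain price is below 1 cent (standard deviation about 7.6 cents, largest deviation about 34 cents), whereas Adj Close is about 24 dollars lower on average.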