In [1]:
#!pip install pandas_datareader

In [2]:
import datetime

import numpy as np
import pandas as pd
import pandas_datareader.data as web
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

In [3]:
start = datetime.date.today() - datetime.timedelta(365 * 20)
end = datetime.date.today()
prices = web.DataReader(["^GSPC"], "yahoo", start, end)["Adj Close"]

# Rename column to make names more intuitive
prices = prices.rename(columns={"^GSPC": "SP500"})
df = np.log(prices) - np.log(prices.shift(1))
df = df.iloc[1:, 0:]

TypeError: string indices must be integers

In [None]:
df.head()

### 1.1 Are returns symmetric?


In [None]:
(len(df[df.SP500 > df.SP500.mean()])) / (len(df))

### 1.2 Is Volatility constant?

In [None]:
vols = pd.DataFrame(df.SP500.rolling(50).std()).rename(columns={"SP500": "S&P 500 STD"})
# set figure size
plt.figure(figsize=(12, 5))
# plot using rolling average
sns.lineplot(
    x="Date",
    y="S&P 500 STD",
    data=vols,
    label="S&P 500 50 day standard deviation rolling avg",
)

## 2. Are Stock Returns Normally Distributed?


In [None]:
df.hist(bins=100)

### 2.1 Conducting a normality test

In [None]:
stats.normaltest((np.array(df.SP500)))

### 2.2 Testing Skewness and Kurtosis


In [None]:
stats.jarque_bera((np.array(df.SP500))).pvalue

### 2.3 Where Does Our Gaussian Distribution Break Down?


In [None]:
dfMax = df.SP500.max()
dfMin = df.SP500.min()
print(
    "Min return of sample data is %.4f and the maximum return of sample data is %.4f"
    % (dfMin, dfMax)
)

In [None]:
df.SP500.min()

In [None]:
(df.SP500.min() - df.SP500.mean()) / df.SP500.std()

In [None]:
(df.SP500.max() - df.SP500.mean()) / df.SP500.std()

In [None]:
stats.norm.cdf(-10.45)

In [None]:
(3 * df.SP500.std()) + df.SP500.mean()

In [None]:
(-3 * df.SP500.std()) + df.SP500.mean()

In [None]:
df[(df["SP500"] > 0.03699) | (df["SP500"] < -0.0364)].tail()

In [None]:
len(df[(df["SP500"] > 0.05) | (df["SP500"] < -0.05)])

## 3. Non-Gaussian Distributions


In [None]:
stats.t.rvs(df=5030, size=5000)

In [None]:
# generate t distribution with sample size 10000
x = stats.t.rvs(df=5030, size=10000)

# create plot of t distribution
plt.hist(x, density=True, edgecolor="black", bins=50)

In [None]:
t_stat, p = stats.ttest_ind(df["SP500"], stats.t.rvs(df=5030, size=5031))
print(f"t={t_stat}, p={p}")