In [1]:
! pip install yfinance
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
! pip install plotly
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, coint
from numpy import log
import numpy as np



In [16]:
# Analysis window settings.
# `time_period` is not read by any active code visible in this notebook; only
# commented-out `history(period=time_period)` calls refer to it.
time_period = "1y"
# `start_date` / `end_date` are passed as history(start=..., end=...) by the
# price-download code further down.
start_date = "2018-01-01"
end_date = "2020-04-01"


In [3]:
from scipy.odr import Model, Data, ODR
from scipy.stats import linregress
import numpy as np

def orthoregress(x, y):
    """Fit a straight line to (x, y) with Orthogonal Distance Regression.

    The call shape mirrors scipy.stats.linregress so the two can be swapped.
    Arguments:
    x: x data
    y: y data
    Returns:
    [slope, intercept, nan, nan, nan]
    An ordinary least squares fit supplies the starting slope and intercept;
    scipy.odr then refines them by minimising orthogonal distances.
    """
    ols_fit = linregress(x, y)
    # The first two fields of the linregress result are slope and intercept.
    starting_params = ols_fit[0:2]
    solver = ODR(Data(x, y), Model(f), beta0=starting_params)
    fitted = solver.run()
    return [*fitted.beta, np.nan, np.nan, np.nan]

def f(p, x):
    """Straight-line model ``p[0] * x + p[1]``, in the (params, x) form handed to scipy.odr.Model."""
    slope = p[0]
    intercept = p[1]
    return slope * x + intercept

In [None]:
# Ad-hoc call of the ODR helper.
# NOTE(review): `X` and `Y` are not assigned in any visible cell before this one, so this
# presumably relied on leftover kernel state — confirm or remove.
orthoregress(X,Y)

In [47]:
def generate_historical_prices(stock1, stock2):
    """Fetch the closing prices of two tickers and align them in one frame.

    Arguments:
    stock1: ticker symbol for the first column
    stock2: ticker symbol for the second column
    Returns:
    DataFrame with one 'Close' column per ticker, each named after its symbol.
    Rows where either ticker has no value are dropped.
    The download window comes from the notebook-level start_date / end_date.
    """
    close_columns = [
        yf.Ticker(symbol).history(start=start_date, end=end_date)['Close'].rename(symbol)
        for symbol in (stock1, stock2)
    ]
    return pd.concat(close_columns, axis=1, sort=False).dropna()

def generate_pct_returns(stock1, stock2):
    """Simple period-over-period returns for two tickers.

    Arguments:
    stock1: ticker symbol for the first column
    stock2: ticker symbol for the second column
    Returns:
    DataFrame of P[t] / P[t-1] - 1 per ticker; the first row (which has no
    previous price) is dropped.
    """
    # Download once and reuse: every call to generate_historical_prices hits the data source.
    prices = generate_historical_prices(stock1, stock2)
    # Current price over the previous row's price. The earlier form divided the
    # previous price by the current one, which yields the inverse ratio.
    pct_returns = prices / prices.shift(1) - 1
    return pct_returns.dropna()

def generate_log_pct_returns(stock1, stock2):
    """Log returns for two tickers.

    Arguments:
    stock1: ticker symbol for the first column
    stock2: ticker symbol for the second column
    Returns:
    DataFrame of log(P[t] / P[t-1]) per ticker; the first row (which has no
    previous price) is dropped.
    """
    # Download once and reuse: every call to generate_historical_prices hits the data source.
    prices = generate_historical_prices(stock1, stock2)
    # Current price over the previous row's price, so a price rise gives a positive log return.
    log_pct_returns = log(prices / prices.shift(1))
    # Return the cleaned frame itself: dropna(inplace=True) evaluates to None, which is
    # what this function used to hand back to its callers.
    return log_pct_returns.dropna()

def plot_2stock_line_chart(dataset):
    """Draw the first two columns of ``dataset`` as overlaid lines against its index.

    Arguments:
    dataset: DataFrame whose first column is drawn in deepskyblue and whose
             second column is drawn in dimgray; column labels become trace names
    """
    fig = go.Figure()
    for position, colour in enumerate(('deepskyblue', 'dimgray')):
        trace = go.Scatter(
            x=dataset.index,
            y=dataset.iloc[:, position],
            name=dataset.columns.values[position],
            line_color=colour,
            opacity=0.8,
        )
        fig.add_trace(trace)
    fig.show()

def plot_2stock_scatter_chart(dataset):
    """Scatter the first column of ``dataset`` (x axis) against its second column (y axis)."""
    first_column = dataset.iloc[:, 0]
    second_column = dataset.iloc[:, 1]
    fig = px.scatter(x=first_column, y=second_column)
    fig.show()

def check_stationarity(spread, cutoff=0.05):
    """Run an augmented Dickey-Fuller test on ``spread``, print the verdict and plot the series.

    Arguments:
    spread: pandas Series to test; its name (if any) appears in the printed message
    cutoff: significance level the ADF p-value is compared against
    Returns:
    None. The p-value and the verdict are printed and a line chart is shown.
    """
    p_value = adfuller(spread)[1]
    print(f"p-value is {p_value!s}")
    # A Series can have name None (or a non-string name); concatenating that straight
    # into the message raises TypeError, so build a printable label first.
    label = "<unnamed>" if spread.name is None else str(spread.name)
    verdict = "has been rejected" if p_value < cutoff else "has NOT been rejected"
    print(f"The null hypothesis that the series {label} is non-stationary {verdict}")
    fig = px.line(x=spread.index, y=spread)
    fig.show()

def linear_regression(seriesX, seriesY, TLS=False):
    """Regress seriesY on seriesX, by OLS (default) or by orthogonal regression.

    Arguments:
    seriesX: explanatory series
    seriesY: dependent series
    TLS: when True, fit with orthoregress instead of OLS
    Returns:
    TLS=True:  whatever orthoregress returns, i.e. [slope, intercept, nan, nan, nan]
    TLS=False: the fitted statsmodels OLS results object; the slope is found in
               its params under seriesX's name, next to the added constant
    """
    if TLS:
        return orthoregress(seriesX, seriesY)
    # sm.OLS only fits the columns it is given, so the intercept has to be added explicitly.
    # NOTE(review): the original comment here argued the hedge-ratio fit should pass
    # through the origin (no constant), yet the code has always added one — confirm
    # which behaviour is intended before changing it.
    design = sm.add_constant(seriesX)
    # The summary table is not built here: inside a function its value was discarded.
    # Callers that want it can call .summary() on the returned results object.
    return sm.OLS(seriesY, design).fit()

def get_spread(seriesX, seriesY, hedge):
    """Return the hedged spread ``seriesY - hedge * seriesX`` as a Series named "spread".

    Arguments:
    seriesX: series being hedged against
    seriesY: base series
    hedge: multiplier applied to seriesX
    """
    # Plain rename() returns the renamed Series. The inplace=True form is documented
    # by pandas as returning None, so its return value must not be relied upon.
    return (seriesY - hedge * seriesX).rename("spread")

def plot_single_line_chart(dataset):
    """Plot one series as a line chart against its index.

    Arguments:
    dataset: a pandas Series, or a DataFrame holding a column named "data"
    """
    # The previous body plotted `dataset.data`. That only resolves for a frame with a
    # "data" column; a plain Series has no such attribute in current pandas. Keep the
    # column lookup for frames and plot a Series directly, as check_stationarity does.
    columns = getattr(dataset, "columns", None)
    values = dataset["data"] if columns is not None and "data" in columns else dataset
    fig = px.line(x=dataset.index, y=values)
    fig.show()

def check_price_cointegration(stock1, stock2):
    """Test the price spread of two tickers for stationarity, in both regression directions.

    For each direction the base stock's price is regressed on the other stock's
    price with OLS, the fitted slope is used as the hedge ratio, and the
    resulting spread is passed to check_stationarity. stock2 is the base first,
    then stock1.
    """
    historical_prices = generate_historical_prices(stock1, stock2)

    # (base, other) pairs, in the order the reports are printed.
    for base, other in ((stock2, stock1), (stock1, stock2)):
        print(f"===== USING {base} AS BASE STOCK =====")
        regressor = historical_prices[other]
        regressand = historical_prices[base]
        fitted = linear_regression(regressor, regressand)

        # The OLS slope is stored in params under the regressor's column label.
        ratio = fitted.params[other]
        print(f"The hedge ratio is {ratio!s}. That is, for every stock of {base}, you should hedge {ratio!s} of {other}")
        check_stationarity(get_spread(regressor, regressand, ratio))

def check_TLS_price_cointegration(stock1, stock2):
    """Test the price spread of two tickers for stationarity, using orthogonal-regression hedge ratios.

    Same two-direction procedure as check_price_cointegration, except that the
    hedge ratio is the first element (the slope) of the list returned by
    linear_regression in TLS mode. stock2 is the base first, then stock1.
    """
    historical_prices = generate_historical_prices(stock1, stock2)

    # (base, other) pairs, in the order the reports are printed.
    for base, other in ((stock2, stock1), (stock1, stock2)):
        print(f"===== USING {base} AS BASE STOCK =====")
        regressor = historical_prices[other]
        regressand = historical_prices[base]
        fitted = linear_regression(regressor, regressand, True)

        ratio = fitted[0]
        print(f"The hedge ratio is {ratio!s}. That is, for every stock of {base}, you should hedge {ratio!s} of {other}")
        check_stationarity(get_spread(regressor, regressand, ratio))
    

In [45]:
# Pair of tickers analysed by the cells below.
stock1 = "CBA.AX"
stock2 = "NAB.AX"
# Scatter stock1's percentage returns (x axis) against stock2's (y axis).
plot_2stock_scatter_chart(generate_pct_returns(stock1, stock2))

AttributeError: 'NoneType' object has no attribute 'iloc'

In [48]:
# Overlay the two tickers' percentage-return series as lines over time.
plot_2stock_line_chart(generate_pct_returns(stock1, stock2))

In [38]:
# Stationarity check of the price spread in both regression directions, with OLS hedge ratios.
check_price_cointegration(stock1, stock2)

===== USING WBC.AX AS BASE STOCK =====
The hedge ratio is 0.9091585906955356. That is, for every stock of WBC.AX, you should hedge 0.9091585906955356 of NAB.AX
p-value is 0.209285654942775
The null hypothesis that the series spread is non-stationary has NOT been rejected


===== USING NAB.AX AS BASE STOCK =====
The hedge ratio is 0.9163019904334104. That is, for every stock of NAB.AX, you should hedge 0.9163019904334104 of WBC.AX
p-value is 0.06790546732070792
The null hypothesis that the series spread is non-stationary has NOT been rejected


In [39]:
# Stationarity check of the price spread in both regression directions, with
# orthogonal-regression (TLS) hedge ratios.
check_TLS_price_cointegration(stock1, stock2)

===== USING WBC.AX AS BASE STOCK =====
The hedge ratio is 0.9957131906744559. That is, for every stock of WBC.AX, you should hedge 0.9957131906744559 of NAB.AX
p-value is 0.12487284914150515
The null hypothesis that the series spread is non-stationary has NOT been rejected


===== USING NAB.AX AS BASE STOCK =====
The hedge ratio is 1.0042879140844783. That is, for every stock of NAB.AX, you should hedge 1.0042879140844783 of WBC.AX
p-value is 0.12486359336350716
The null hypothesis that the series spread is non-stationary has NOT been rejected


In [None]:
banks = yf.Tickers('jpm ms c msft')
tech = yf.Tickers('msft aapl goog')



# aged healthcare
jhc = yf.Ticker("JHC.AX")
ehe = yf.Ticker("EHE.AX")


def _close(ticker, label):
    """Close-price series of `ticker` over the configured window, renamed to `label`."""
    return ticker.history(start=start_date, end=end_date)['Close'].rename(label)


# historical_prices = pd.concat([banks.tickers.JPM.history(period=time_period)['Close'].rename("JPM"), banks.tickers.MS.history(period=time_period)['Close'].rename("MS"),banks.tickers.C.history(period=time_period)['Close'].rename("CITI")], axis = 1, sort=False)
# Each US series is fetched through yf.Ticker (the same call generate_historical_prices
# uses) rather than attribute access on `banks.tickers`, whose container type has
# differed between yfinance releases.
# NOTE(review): JHC is joined in but not read by the visible cells below; because the
# frame keeps only dates present in all four columns, it presumably shrinks the
# sample — confirm it belongs here.
historical_prices = pd.concat(
    [
        _close(yf.Ticker("JPM"), "JPM"),
        _close(yf.Ticker("MS"), "MS"),
        _close(yf.Ticker("C"), "CITI"),
        _close(jhc, "JHC"),
    ],
    axis=1,
    sort=False,
).dropna()

# historical_prices = pd.concat([cpu.history(period=time_period)['Close'].rename("CPU"), lnk.history(period=time_period)['Close'].rename("LNK")], axis = 1, sort=False)

# Simple and log returns: the current close relative to the previous row's close.
# The first row has no previous close and is dropped from both frames.
historical_pct_returns = (historical_prices / historical_prices.shift(1) - 1).dropna()

historical_log_pct_returns = log(historical_prices / historical_prices.shift(1)).dropna()

different_data_sets = [historical_prices, historical_pct_returns, historical_log_pct_returns]

# Index 0 selects raw prices; 1 selects simple returns; 2 selects log returns.
data_of_choice = different_data_sets[0]

X1 = data_of_choice["CITI"]
X2 = data_of_choice["MS"]
data_of_choice


In [None]:
# Overlay the two selected series against the shared index of data_of_choice.
fig = go.Figure()
for trace_y, trace_name, trace_colour in (
    (X1, "CITI", 'deepskyblue'),
    (X2, "MS", 'dimgray'),
):
    fig.add_trace(go.Scatter(
        x=data_of_choice.index,
        y=trace_y,
        name=trace_name,
        line_color=trace_colour,
        opacity=0.8,
    ))
fig.show()

In [None]:
# Scatter X1 (x axis) against X2 (y axis) to eyeball how closely the two series move together.
fig = px.scatter(x=X1, y=X2)
fig.show()


# Testing for Cointegration
Source: https://robotwealth.com/practical-pairs-trading/
Engle-Granger method: use an OLS regression to estimate a stationary beta (the hedge ratio). Here, I'm using the returns of the two stocks as the regression variables instead of prices, since CAPM uses returns rather than prices, and log returns instead of simple returns, since log returns are compounding returns. Then run the augmented Dickey-Fuller test to check the stationarity of the linear combination of X1 and X2 formed with beta.

In [None]:
# For further research: why do we need to add constant
# (sm.OLS fits only the columns it is given, so without this constant column the
# regression would be forced through the origin.)
# NOTE(review): this rebinds X1 to the output of add_constant; the plotting cells
# above also read X1, so they see a different object if re-run after this cell.
X1 = sm.add_constant(X1)

# Regress X2 on X1 (with the added constant).
model = sm.OLS(X2, X1).fit()

model.summary()

In [None]:
# Hedge ratio = fitted OLS coefficient on the CITI column; spread = MS - hedge * CITI.
hedge = model.params["CITI"]
# Plain rename() returns the renamed Series; the inplace=True form is documented by
# pandas as returning None, so its return value should not be assigned.
spread = (data_of_choice["MS"] - hedge * data_of_choice["CITI"]).rename("spread")
fig = px.line(x=spread.index, y=spread)
fig.show()

In [None]:
# Summary statistics of the spread. This replaces a stray `spread.?` help query,
# which is not valid Python syntax.
spread.describe()




In [None]:
# ADF test and line plot for the spread built from the OLS hedge ratio.
check_stationarity(spread)

In [None]:
# NOTE(review): `X3` is not assigned in any visible cell; on a fresh kernel this would
# raise NameError — confirm whether this is leftover scratch.
X3


In [None]:
# Display X1. Its contents depend on whether the add_constant cell has been run.
X1