In [7]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import statsmodels as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.regression.linear_model import OLS
path = os.getcwd()

data = pd.read_csv(path + '/close.csv')
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace = True)
tickers = list(data.columns.values)
data = data/data.iloc[0]
result_dict = {}
trading_data = data.loc[data.index > '2017-09-08']
data = data.loc[data.index < '2017-09-09']
potential_pairs = pd.read_csv(path + '/potential.csv', index_col = 0)
adf = {}

def half_life(ts):
    ts = np.asarray(ts)
    delta_ts = np.diff(ts)
    lag_ts = np.vstack([ts[1:], np.ones(len(ts[1:]))]).T
    beta = np.linalg.lstsq(lag_ts, delta_ts, rcond = -1)
    return (np.log(2) / beta[0])[0]

for j in range(len(potential_pairs)):
        first = potential_pairs.iloc[j]['first']
        second = potential_pairs.iloc[j]['second']
        pearson = potential_pairs.iloc[j]['Pearson']
        t = adfuller(data[first] - data[second])
        hl = half_life(data[first] - data[second])
        nt = adfuller(trading_data[first] - trading_data[second])
        nhl = half_life(trading_data[first] - trading_data[second])
        adf[potential_pairs.index.values[j]] = [t[0], t[1], hl, pearson, nt[0], nt[1], nhl]

adf_result = pd.DataFrame.from_dict(adf, orient = 'index', 
columns = ['Test Statistic', 'p-value', 'half-life', 'pearson', 'n1', 'n2','n3'])

adf_result = adf_result[adf_result['p-value'] < 0.02]
adf_result[['Test Statistic','p-value','half-life','pearson']]['PG-DUK']

Unnamed: 0,Test Statistic,p-value,half-life,pearson
PG-DUK,-3.876395,0.002216,27.180089,0.813922
UPS-D,-4.587889,0.000136,19.720975,0.880926
T-SO,-3.458724,0.009113,32.458678,0.709809
WMT-LIN,-3.458822,0.009110,33.890595,0.721946
KO-SO,-3.624921,0.005302,32.301119,0.770294
MMM-MMC,-3.343227,0.013053,25.863462,0.984076
MMC-APH,-3.337618,0.013278,27.554077,0.980270
HON-ADP,-3.204657,0.019717,30.024345,0.964932
MRK-SRE,-4.528344,0.000175,18.756311,0.887195
MRK-D,-4.632523,0.000112,20.747811,0.872579


In [17]:
adf = adf_result[['Test Statistic','p-value','half-life','pearson']]
adf.loc[['HON-NEE','TXN-SYK','BDX-SYK','HON-DHR','JPM-PNC']]

Unnamed: 0,Test Statistic,p-value,half-life,pearson
HON-NEE,-5.403225,3e-06,15.826612,0.977802
TXN-SYK,-3.565022,0.006466,32.548481,0.971142
BDX-SYK,-3.592615,0.005904,34.757947,0.981747
HON-DHR,-3.203565,0.019779,39.021054,0.96264
JPM-PNC,-3.264542,0.01654,43.673891,0.980251


In [30]:
df = data['HON'] - data['NEE']
df1 = data['TXN'] - data['SYK']
df2 = data['BDX'] - data['SYK']
df3 = data['HON'] - data['DHR']
df4 = data['JPM']- data['PNC']
df = pd.DataFrame({'HON-NEE':df, 'TXN-SYK':df1, 'BDX-SYK':df2, 'HON-DHR':df3, 'JPM-PNC':df4})
#df.columns = ['HON-NEE','TXN-SYK','BDX-SYK','HON-DHR','JPM-PNC']
df

Unnamed: 0_level_0,HON-NEE,TXN-SYK,BDX-SYK,HON-DHR,JPM-PNC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-02,0.000000,0.000000,0.000000,0.000000,0.000000
2013-01-03,0.003719,-0.017342,0.007179,-0.005709,-0.005542
2013-01-04,0.019296,-0.029758,-0.003883,0.013786,-0.000114
2013-01-07,0.025290,-0.033057,-0.006569,0.014250,-0.008232
2013-01-08,0.004228,-0.042100,0.004282,0.011808,0.006884
2013-01-09,0.023673,-0.054333,-0.018307,-0.012719,0.007556
2013-01-10,0.012036,-0.049257,-0.024903,-0.017877,0.006490
2013-01-11,0.006225,-0.049566,-0.023896,-0.015922,0.024574
2013-01-14,0.014451,-0.052579,-0.017636,-0.005666,0.028829
2013-01-15,0.013609,-0.066784,-0.024949,-0.010773,0.033979
