In [26]:
import numpy as np
import pandas as pd
import statsmodels
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint, adfuller
import pickle

import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style="whitegrid")

In [27]:
# Load the closing-price table and key the rows by the "Date" column in a
# single expression, so `data` is bound exactly once.
data = pd.read_csv("./data/nasdaq100_4year_closingData.csv").set_index("Date")

In [28]:
def stationarity_test(X, cutoff=0.01):
    """Run an Augmented Dickey-Fuller test on ``X`` and print the verdict.

    The null hypothesis of ``adfuller`` is that a unit root exists (the series
    is non-stationary), so only a p-value below ``cutoff`` is reported as
    evidence that the series is stationary.

    Parameters
    ----------
    X : array-like
        Series to test. When it carries a ``name`` attribute (e.g. a pandas
        Series) that name appears in the message; otherwise a placeholder is
        printed instead.
    cutoff : float, default 0.01
        Significance level. P-values strictly below it are reported as
        "likely stationary", everything else as "likely non-stationary".

    Returns
    -------
    None
        The result is only printed.
    """
    pvalue = adfuller(X)[1]

    # The name may be missing (plain array), None (unnamed Series) or a
    # non-string label such as an integer column key. Concatenating it
    # directly would raise, so normalise it to text first.
    name = getattr(X, "name", None)
    label = "<unnamed>" if name is None else str(name)

    verdict = "stationary" if pvalue < cutoff else "non-stationary"
    print('p-value = ' + str(pvalue) + ' The series ' + label + ' is likely ' + verdict + '.')

In [29]:
def find_cointegrated_pairs(data, cutoff=0.05):
    """Run ``coint`` on every unordered pair of columns in ``data``.

    Each pair is first trimmed to the most recent ``k`` rows, where ``k`` is
    the smaller of the two columns' non-null counts. This supports columns
    whose missing values are all at the start; a column with missing values
    elsewhere fails the assertion below.

    Parameters
    ----------
    data : pandas.DataFrame
        One column per series, rows in chronological order.
    cutoff : float, default 0.05
        A pair is reported when its p-value is strictly below this level.

    Returns
    -------
    score_matrix : numpy.ndarray, shape (n, n)
        First element returned by ``coint`` for each pair ``(i, j)`` with
        ``i < j``; all other entries stay 0.
    pvalue_matrix : numpy.ndarray, shape (n, n)
        Second element returned by ``coint`` for each pair ``(i, j)`` with
        ``i < j``; all other entries stay 1.
    pairs : list of tuple
        ``(column_i, column_j, score, pvalue)`` for every pair whose p-value
        is below ``cutoff``.

    Raises
    ------
    AssertionError
        If a pair has no usable rows, or if missing values remain after
        trimming.
    """
    columns = data.columns
    n = len(columns)
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    pairs = []

    # Non-null counts depend only on the column, so compute them once instead
    # of rebuilding the masks for every pair inside the double loop.
    valid_counts = data.notnull().sum()
    total_rows = len(data)

    for i in range(n):
        for j in range(i + 1, n):
            overlap = int(min(valid_counts.iloc[i], valid_counts.iloc[j]))
            # An explicit start index avoids the `series[-0:]` trap, where a
            # zero overlap would silently select the whole series.
            assert overlap > 0, (
                "no non-null observations available for pair (%s, %s)"
                % (columns[i], columns[j])
            )
            start = total_rows - overlap
            S1 = data.iloc[start:, i]
            S2 = data.iloc[start:, j]
            assert S1.notnull().all(), (
                "column %s has missing values after its first valid observation"
                % (columns[i],)
            )
            assert S2.notnull().all(), (
                "column %s has missing values after its first valid observation"
                % (columns[j],)
            )

            result = coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < cutoff:
                pairs.append((columns[i], columns[j], score, pvalue))
    return score_matrix, pvalue_matrix, pairs

In [30]:
# Scan every column pair in `data`. `pairs` collects the
# (column_i, column_j, score, pvalue) tuples whose p-value is below 0.05;
# the two matrices hold the score and p-value of every (i, j) pair with i < j.
score_matrix, pvalue_matrix, pairs = find_cointegrated_pairs(data)

In [31]:
# Number of column pairs that passed the p-value filter.
# NOTE(review): if `data` holds roughly 100 tickers, about 4950 pairs were
# tested, so ~5% of them (~250) would pass a 0.05 threshold by chance alone.
# Confirm the column count and consider a multiple-comparison correction.
len(pairs)

266

In [32]:
# Persist the selected pairs. The payload is a binary pickle even though the
# path ends in ".txt", so it has to be read back with pickle.load in 'rb' mode
# (and only from a trusted source, since unpickling can execute code).
# The context manager closes the handle even if pickle.dump raises.
with open('./data/pairs.txt', 'wb') as file:
    pickle.dump(pairs, file)