# Check Cointegration for 2 clusters - found by K-Medoids

In [3]:
import statsmodels
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint, adfuller
import pandas as pd
import numpy as np
import itertools
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.pylab as pit
from matplotlib.pylab import rcParams
%matplotlib inline
from matplotlib import style
style.use('ggplot')
rcParams['figure.figsize'] = 20,10

In [4]:
df = pd.DataFrame()
df = pd.read_csv('C:/Users/archana.parihar/Downloads/ClustersFromR_2_K-Medoids.csv')
dfCluster1m = df.loc[df['Cluster'] == 1]
dfCluster2m = df.loc[df['Cluster'] == 2]


In [5]:
badCompanies = ['BMY', 'DHR', 'ES', 'ICE', 'ORCL', 'O', 'IQV', 'COTY', 'FOXA', 'FOX', 'NWSA', 'NWS', 'ALLE', 'GOOG', 'NAVI', 'INFO', 'SYF', 'CFG', 'QRVO', 'WRK', 'KHC', 'PYPL', 'HPE', 'HPQ', 'CSRA', 'WLTW', 'UA', 'FTV', 'EVHC', 'HLT', 'DXC', 'BHGE', 'BHF', 'DWDP', 'APTV', 'CBG', 'CHK', 'HCN', 'LUK', 'PCLN', 'PDCO', 'SIG', 'SNI', 'WYN', 'CSRA', 'ABMD', 'BKNG', 'CBRE', 'IPGP', 'JEF', 'MSCI', 'NKTR', 'SIVB', 'TTWO', 'WELL']

In [6]:
df1 = pd.DataFrame()
df1 = pd.read_csv('C:/Users/archana.parihar/Downloads/allStocks13-14-15.csv', parse_dates=True,index_col=0)


In [7]:
def test_CointegrationOfCombinedSeries(timeSeries):
    coint = False    
    #plot rolling statistics to visualize whether it is stationary or not
    ma = timeSeries.rolling(window=100).mean()
    mstd = timeSeries.rolling(window=100).std()
    
   # timeSeries.plot()
   # ma.plot(color='blue')
   # mstd.plot(color='black')
    
    #perform dickey fuller to check quantitatively
    dftest = adfuller(timeSeries,autolag= 'AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#lags used','No of observations used'])
    for key,value in dftest[4].items():
        dfoutput['Critical value (%s)'%key]= value

    #print(dfoutput) 
    #if dfoutput['Critical value (10%)']<-2.57  :
       # print('The pair is most likely Cointegrated')
        #coint = True
    #if dfoutput['Critical value (10%)']>-2.57  :
        #print('The pair is most likely NOT Cointegrated')
    if dfoutput['p-value']<=0.02:
        coint= True
    return coint

def ImproveStationarity_and_test_Cointegration(dfX,dfY):
    #Convert closePrice of Time series X to log-Moving average time series for better value of regression coefficient
    dfX_logScale = np.log(dfX['close'])
    Xma = dfX_logScale.rolling(window=100).mean()
    logScaleMinusMovingAverage1 = dfX_logScale - Xma
    logScaleMinusMovingAverage1.dropna(inplace=True)

    #Convert closePrice of Time series Y to log-Moving average time series for better value of regression coefficient
    dfY_logScale = np.log(dfY['close'])
    Yma = dfY_logScale.rolling(window=100).mean()
    logScaleMinusMovingAverage2 = dfY_logScale - Yma
    logScaleMinusMovingAverage2.dropna(inplace=True)

    #Using linear regression to calculate the value of b for linearly combines Time series
    logScaleMinusMovingAverage1 = sm.add_constant(logScaleMinusMovingAverage1)
    results = sm.OLS(logScaleMinusMovingAverage2,logScaleMinusMovingAverage1['close']).fit()
    b = results.params['close']
    z = logScaleMinusMovingAverage1['close'] - (b * logScaleMinusMovingAverage2)
    coint = test_CointegrationOfCombinedSeries(z)
    return coint

In [8]:
def coIntTestForCluster(dfCluster,ClusterNumber):
    print('Cluster number: '+ str(ClusterNumber))
    ListOfCompInOneCluster = dfCluster['Name'].tolist()
    print('Old length of cluster : '+ str(len(ListOfCompInOneCluster)))
    for i in range(0,len(badCompanies)):
        if badCompanies[i] in ListOfCompInOneCluster :
            #print("Bad Company found in Cluster: " , badCompanies[i])
            ListOfCompInOneCluster.remove(badCompanies[i])
    print('New length of cluster : '+ str(len(ListOfCompInOneCluster)))

    ListOfPairs = list(itertools.combinations(ListOfCompInOneCluster, 2))
    #print(ListOfPairs[:2])
    #print(len(ListOfPairs))
    #print(ListOfPairs[0][0],ListOfPairs[0][1])

    pair_result_dict = {}
    for x in range(0,len(ListOfPairs)):
        pair_result_dict[ListOfPairs[x]] = []

    for i in range(0,len(ListOfPairs)):
        dfX = df1.loc[df1['Name']==ListOfPairs[i][0]]
        dfY = df1.loc[df1['Name']==ListOfPairs[i][1]]
        Result = ImproveStationarity_and_test_Cointegration(dfX,dfY)
        pair_result_dict[ListOfPairs[i]] = Result

    ListOfValuesCluster_2=list(pair_result_dict.values())
    print('Total pairs:  ' + str(len(ListOfValuesCluster_2)))
    print('Number of True: ' + str(ListOfValuesCluster_2.count(True)))
    print('Number of False: ' + str(ListOfValuesCluster_2.count(False)))
    print('---------------------------------------------------------------')
    return pair_result_dict

listOfClusters = [dfCluster1m,dfCluster2m]
listOfResultDict = []

for c in range(0,len(listOfClusters)):
    res = coIntTestForCluster(listOfClusters[c],c)
    listOfResultDict.append(res)

Cluster number: 0
Old length of cluster : 117
New length of cluster : 107
Total pairs:  5671
Number of True: 1485
Number of False: 4186
---------------------------------------------------------------
Cluster number: 1
Old length of cluster : 388
New length of cluster : 354
Total pairs:  62481
Number of True: 11995
Number of False: 50486
---------------------------------------------------------------


In [9]:
ListOfCointPairs = []
for key, value in listOfResultDict[0].items():
    if value == True:
        ListOfCointPairs.append(key)
        
print(len(ListOfCointPairs))

dfCoInt = pd.DataFrame({'Pairs':ListOfCointPairs})
dfCoInt.head(5)
dfCoInt.to_csv('C:/Users/archana.parihar/Downloads/ResultsSnP/ListOfPairs_2_Medoid_Cluster_1_pvalue_02.csv',index=False, sep=',',encoding='utf-8')

1485


In [10]:
ListOfCointPairs = []

for key, value in listOfResultDict[1].items():
    if value == True:
        ListOfCointPairs.append(key)
        
print(len(ListOfCointPairs))

dfCoInt = pd.DataFrame({'Pairs':ListOfCointPairs})
dfCoInt.head(5)
dfCoInt.to_csv('C:/Users/archana.parihar/Downloads/ResultsSnP/ListOfPairs_2_Medoid_Cluster_2_pvalue_02.csv',index=False, sep=',',encoding='utf-8')

11995
