**IMPORTS & LOADING CLEANED PRICES**

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.regression.linear_model import OLS
import statsmodels.api as sm

In [5]:
# Read the basket's price table (first CSV column becomes the parsed date
# index), carry the last known value forward over gaps, and discard any row
# whose values exactly repeat an earlier row.
prices = (
    pd.read_csv("../data/basket_prices.csv", index_col=0, parse_dates=True)
    .ffill()
    .drop_duplicates()
)
prices.head()

Unnamed: 0_level_0,AAPL,AMZN,GOOGL,META,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-02,24.237556,15.426,26.296133,77.969337,39.858459
2015-01-05,23.554739,15.1095,25.795088,76.717064,39.491913
2015-01-06,23.556961,14.7645,25.158482,75.683426,38.912289
2015-01-07,23.88728,14.921,25.084486,75.683426,39.406681
2015-01-08,24.80508,15.023,25.171888,77.700989,40.565964


**JOHANSEN COINTEGRATION TEST AND EXTRACTION OF THE FIRST COINTEGRATING VECTOR (WEIGHTS)**

In [8]:
# Johansen needs a 2-D numpy array with one column per price series.
arr = prices.values
# Forward-filling cannot repair gaps at the very start of a column, and any
# NaN/inf would silently poison the eigen-decomposition, so fail loudly here.
if not np.isfinite(arr).all():
    raise ValueError("prices contains NaN or infinite values; clean the data before running the Johansen test")
# det_order=0 includes a CONSTANT term in the test (statsmodels uses -1 for
# "no deterministic terms" and 1 for a linear trend).
# k_ar_diff is the number of lagged differences in the VECM (try 1..5).
jres = coint_johansen(arr, det_order=0, k_ar_diff=1)
# statsmodels orders the eigenvalues from largest to smallest, so column 0 of
# `evec` is the eigenvector belonging to the LARGEST eigenvalue, i.e. the
# first cointegrating vector.
cj_vector = jres.evec[:, 0]  # length = number of assets
# Normalize so the absolute weights sum to 1 for interpretability.
cj_weights = cj_vector / np.sum(np.abs(cj_vector))


In [9]:
# Label each normalized weight with its ticker, then print a heading followed
# by the labelled vector.
weights_df = pd.Series(data=cj_weights, index=prices.columns)
print("Johansen cointegrating vector (normalized):", weights_df, sep="\n")

Johansen cointegrating vector (normalized):
AAPL     0.014951
AMZN    -0.062103
GOOGL   -0.663455
META     0.051216
MSFT     0.208274
dtype: float64


**CONSTRUCTING THE SPREAD & PLOTTING**

In [11]:
# The spread is the weighted sum of the price columns; the dot product already
# carries the price index, so only the series name has to be set.
spread = prices.dot(cj_weights).rename("spread")