In [4]:
import yfinance as yf
import pandas as pd
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np

selected_etfs = ['XLU','XLRE','VPU','SHY','VGSH','ARKK','ARKW']

prices = yf.download(selected_etfs, start = '2015-01-01', end = '2019-12-31')['Close']

pair1 = ['XLU','XLRE']
pair2 = ['VPU','XLRE']
pair3 = ['SHY','VGSH']
pair4 = ['ARKK','ARKW']

prices = prices.dropna()

prices.info()





  prices = yf.download(selected_etfs, start = '2015-01-01', end = '2019-12-31')['Close']
[*********************100%***********************]  7 of 7 completed

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1064 entries, 2015-10-08 to 2019-12-30
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ARKK    1064 non-null   float64
 1   ARKW    1064 non-null   float64
 2   SHY     1064 non-null   float64
 3   VGSH    1064 non-null   float64
 4   VPU     1064 non-null   float64
 5   XLRE    1064 non-null   float64
 6   XLU     1064 non-null   float64
dtypes: float64(7)
memory usage: 66.5 KB





In [None]:
def generate_labels(z, upper=1.0, lower=-1.0):
    return np.where(z > upper, -1,    # SHORT spread
           np.where(z < lower, 1,     # LONG spread
           0))                        # No trade



Ticker,ARKK,ARKW,SHY,VGSH,VPU,XLRE,XLU
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-10-08,17.498104,17.684343,71.649452,50.419682,69.783493,21.580568,32.115837
2015-10-09,17.890663,18.050226,71.649452,50.427929,69.540924,21.544861,31.962561
2015-10-12,17.750784,17.959162,71.632614,50.43618,70.128967,21.680582,32.247223
2015-10-13,17.687614,17.838829,71.657875,50.43618,69.923141,21.544861,32.174236
2015-10-14,17.597372,17.757519,71.759064,50.518593,69.901062,21.544861,32.166927


In [40]:
import pandas as pd
import statsmodels.api as sm

# Assumes `prices` is your DataFrame of adjusted-close prices,
# and `all_pairs` is a list of tuples: [(etf1, etf2), …]

lookback = 30
spread_records = []

for etf_a, etf_b in all_pairs:
    # Align prices and drop any NaNs for the pair
    pair_prices = pd.concat([prices[etf_a], prices[etf_b]], axis=1).dropna()
    a = pair_prices[etf_a]
    b = pair_prices[etf_b]

    # 1) Estimate hedge ratio via OLS
    model = sm.OLS(a, sm.add_constant(b)).fit()
    hedge_ratio = model.params[etf_b]

    # 2) Compute spread and rolling statistics
    spread = a - hedge_ratio * b
    rolling_mean = spread.rolling(window=lookback).mean()
    rolling_std = spread.rolling(window=lookback).std()
    z_score = (spread - rolling_mean) / rolling_std
    rolling_volatility = z_score.rolling(window=lookback).std()

    # 3) Build DataFrame for this pair
    df_pair = pd.DataFrame({
        'Date': spread.index,
        'pair': f"{etf_a}-{etf_b}",
        'spread': spread,
        'rolling_mean': rolling_mean,
        'rolling_std': rolling_std,
        'z_score': z_score,
        'rolling_volatility': rolling_volatility
    })

    spread_records.append(df_pair)

# Combine all pairs into a single long-form DataFrame
spread_long = pd.concat(spread_records, ignore_index=True)

# 4) Add lagged features for z_score and spread
spread_long['z_score_lag1'] = spread_long.groupby('pair')['z_score'].shift(1)
spread_long['spread_lag1'] = spread_long.groupby('pair')['spread'].shift(1)

spread_long['z_score_lag2'] = spread_long.groupby('pair')['z_score'].shift(2)
spread_long['spread_lag2'] = spread_long.groupby('pair')['spread'].shift(2)

spread_long['z_score_lag3'] = spread_long.groupby('pair')['z_score'].shift(3)
spread_long['spread_lag3'] = spread_long.groupby('pair')['spread'].shift(3)


# 5) Final cleanup
spread_long.dropna(inplace=True)
spread_long.sort_values(['pair', 'Date'], inplace=True)
spread_long.reset_index(drop=True, inplace=True)

# `spread_long` now contains columns:
# ['Date', 'pair', 'spread', 'rolling_mean', 'rolling_std',
#  'z_score', 'rolling_volatility', 'z_score_lag1', 'spread_lag1']


spread_long.head()
spread_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                4024 non-null   datetime64[ns]
 1   pair                4024 non-null   object        
 2   spread              4024 non-null   float64       
 3   rolling_mean        4024 non-null   float64       
 4   rolling_std         4024 non-null   float64       
 5   z_score             4024 non-null   float64       
 6   rolling_volatility  4024 non-null   float64       
 7   z_score_lag1        4024 non-null   float64       
 8   spread_lag1         4024 non-null   float64       
 9   z_score_lag2        4024 non-null   float64       
 10  spread_lag2         4024 non-null   float64       
 11  z_score_lag3        4024 non-null   float64       
 12  spread_lag3         4024 non-null   float64       
dtypes: datetime64[ns](1), float64(11), object(1)
mem

In [30]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
spread_long['pair_encoded'] = le.fit_transform(spread_long['pair'])

spread_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          0 non-null      datetime64[ns]
 1   pair          0 non-null      object        
 2   z_score       0 non-null      float64       
 3   mean          0 non-null      float64       
 4   std           0 non-null      float64       
 5   spread        0 non-null      float64       
 6   rolling_std   0 non-null      float64       
 7   rolling_mean  0 non-null      float64       
 8   r_volatility  0 non-null      float64       
 9   label         0 non-null      int64         
 10  z_score_lag1  0 non-null      float64       
 11  spread_lag1   0 non-null      float64       
 12  z_score_lag2  0 non-null      float64       
 13  spread_lag2   0 non-null      float64       
 14  z_score_lag3  0 non-null      float64       
 15  spread_lag3   0 non-null      float64       
 16  z_

In [None]:
features = ['pair_encoded','z_score_lag1','rolling_std','rolling_mean','spread','spread_lag1','z_score','r_volatility']
target = 'label'

X = spread_long[features]
y = spread_long[target]

print(X)


from sklearn.model_selection import train_test_split 


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False  # shuffle=False preserves time order
)


Empty DataFrame
Columns: [pair_encoded, z_score_lag1, rolling_std, rolling_mean, spread, spread_lag1, z_score, volatility]
Index: []


In [None]:
import xgboost as xgb

model = xgb.XGBClassifier(
    objective='multi:softprob',  # For multi-class (long, short, neutral)
    num_class=3,
    eval_metric='mlogloss',
    use_label_encoder=False
)

model.fit(X_train, y_train)

model = xgb.XGBClassifier(
    n_estimators=100,        # Number of boosting rounds (trees)
    learning_rate=0.1,       # Step size shrinkage (how much model learns each round)
    max_depth=3,             # Max depth of a tree (controls complexity)
    subsample=0.8,           # Row subsampling for regularization
    colsample_bytree=0.8,    # Feature subsampling
    objective='multi:softmax', # Use 'multi:softmax' for multiclass (e.g., long/short/neutral)
    num_class=3,             # Number of classes in your labels
    random_state=42
)


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=XGBClassifier(objective='multi:softmax', num_class=3),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_


In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
