In [None]:
import yfinance as yf
import pandas as pd
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np

selected_etfs = ['XLU','XLRE','VPU','SHY','VGSH','ARKK','ARKW']

prices = yf.download(selected_etfs, start = '2015-01-01', end = '2019-12-31')['Close']

pair1 = ['XLU','XLRE']
pair2 = ['VPU','XLRE']
pair3 = ['SHY','VGSH']
pair4 = ['ARKK','ARKW']

prices = prices.dropna()

prices.info()





  prices = yf.download(selected_etfs, start = '2010-01-01', end = '2019-12-31')['Close']
[*********************100%***********************]  7 of 7 completed

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1064 entries, 2015-10-08 to 2019-12-30
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ARKK    1064 non-null   float64
 1   ARKW    1064 non-null   float64
 2   SHY     1064 non-null   float64
 3   VGSH    1064 non-null   float64
 4   VPU     1064 non-null   float64
 5   XLRE    1064 non-null   float64
 6   XLU     1064 non-null   float64
dtypes: float64(7)
memory usage: 66.5 KB





In [57]:
def generate_labels(z, upper=1.0, lower=-1.0):
    return np.where(z > upper, -1,    # SHORT spread
           np.where(z < lower, 1,     # LONG spread
           0))                        # No trade

In [None]:

all_pairs = [pair1,pair2,pair3,pair4]

# store the spread series for every pair
spreads = {}

for pair in all_pairs:

    a = prices[pair[0]]
    b = prices[pair[1]]

    X = sm.add_constant(b)
    model = sm.OLS(a, X).fit()
    hedge_ratio = model.params[pair[1]]
                               
    spread = a - hedge_ratio * b

    # Normalize the spread

    lookback = 30

    spread_mean = spread.rolling(window=lookback).mean()
    spread_std = spread.rolling(window=lookback).std()

    spreads['rolling_std'] = spread_std
    spreads['rolling_mean'] = spread_mean

    z_spread = (spread - spread_mean) / spread_std

    spreads[f"{pair[0]}-{pair[1]}"] = z_spread


    pair_name = f"{pair[0]}-{pair[1]}"
    spreads[f"{pair_name}_spread"] = spread
    spreads[f"{pair_name}_z"] = z_spread
    spreads[f"{pair_name}_mean"] = spread_mean
    spreads[f"{pair_name}_std"] = spread_std


# Step 1: Create spread_df like before
spread_df = pd.DataFrame(spreads)  # columns are pairs, index is datetime

spread_df.dropna(inplace=True)
spread_df.reset_index(inplace=True)

melted_z = spread_df.melt(id_vars='Date', value_name='z_score', var_name='pair')
melted_z = melted_z[melted_z['pair'].str.endswith('_z')].copy()
melted_z['pair'] = melted_z['pair'].str.replace('_z', '', regex=False)

melted_mean = spread_df.melt(id_vars='Date', value_name='mean', var_name='pair')
melted_mean = melted_mean[melted_mean['pair'].str.endswith('_mean')].copy()
melted_mean['pair'] = melted_mean['pair'].str.replace('_mean', '', regex=False)

melted_std = spread_df.melt(id_vars='Date', value_name='std', var_name='pair')
melted_std = melted_std[melted_std['pair'].str.endswith('_std')].copy()
melted_std['pair'] = melted_mean['pair'].str.replace('_std', '', regex=False)

melted_spread = spread_df.melt(id_vars='Date', value_name='spread', var_name='pair')
melted_spread = melted_spread[melted_spread['pair'].str.endswith('_spread')].copy()
melted_spread['pair'] = melted_spread['pair'].str.replace('_spread', '', regex=False)

spread_long = melted_z.merge(melted_mean, on=['Date', 'pair'])
spread_long = spread_long.merge(melted_std, on=['Date', 'pair'])
spread_long = spread_long.merge(melted_spread, on=['Date', 'pair'])

# Now each row is a single (pair, date, z_score)

spread_long['label'] = generate_labels(spread_long['z_score'])

print(spread_long.head())

spread_long.info()
spread_long['label'].value_counts()


Empty DataFrame
Columns: [Date, pair, z_score, mean, std, spread, label]
Index: []
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     0 non-null      datetime64[ns]
 1   pair     0 non-null      object        
 2   z_score  0 non-null      float64       
 3   mean     0 non-null      float64       
 4   std      0 non-null      float64       
 5   spread   0 non-null      float64       
 6   label    0 non-null      int64         
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 132.0+ bytes


Series([], Name: count, dtype: int64)

In [2]:
# create features


spread_long['z_score_lag1'] = spread_long.groupby('pair')['z_score'].shift(1)
spread_long['spread_lag1'] = spread_long.groupby('pair')['spread'].shift(1)
spread_long['z_score_lag2'] = spread_long.groupby('pair')['z_score'].shift(2)
spread_long['spread_lag2'] = spread_long.groupby('pair')['spread'].shift(2)
spread_long['z_score_lag3'] = spread_long.groupby('pair')['z_score'].shift(3)
spread_long['spread_lag3'] = spread_long.groupby('pair')['spread'].shift(3)
spread_long['z_score_lag4'] = spread_long.groupby('pair')['z_score'].shift(4)
spread_long['spread_lag4'] = spread_long.groupby('pair')['spread'].shift(4)

spread_long['rolling_std'] = spread_long.groupby('pair')['std'].rolling(lookback).std().reset_index(level=0, drop=True)
spread_long['rolling_mean'] = spread_long.groupby('pair')['mean'].rolling(lookback).mean().reset_index(level=0, drop=True)
spread_long['volatility'] = spread_long.groupby('pair')['z_score'].rolling(lookback).std().reset_index(level=0, drop=True)


# Drop NaNs
spread_long.dropna(inplace=True)


NameError: name 'spread_long' is not defined

In [63]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
spread_long['pair_encoded'] = le.fit_transform(spread_long['pair'])

spread_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          0 non-null      datetime64[ns]
 1   pair          0 non-null      object        
 2   z_score       0 non-null      float64       
 3   mean          0 non-null      float64       
 4   std           0 non-null      float64       
 5   spread        0 non-null      float64       
 6   label         0 non-null      int64         
 7   z_score_lag1  0 non-null      float64       
 8   spread_lag1   0 non-null      float64       
 9   rolling_std   0 non-null      float64       
 10  rolling_mean  0 non-null      float64       
 11  volatility    0 non-null      float64       
 12  pair_encoded  0 non-null      float64       
dtypes: datetime64[ns](1), float64(10), int64(1), object(1)
memory usage: 132.0+ bytes


In [None]:
features = ['pair_encoded','z_score_lag1','rolling_std','rolling_mean','spread','spread_lag1','z_score','volatility']
target = 'label'

X = spread_long[features]
y = spread_long[target]

from sklearn.model_selection import train_test_split 


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False  # shuffle=False preserves time order
)


In [None]:
import xgboost as xgb

model = xgb.XGBClassifier(
    objective='multi:softprob',  # For multi-class (long, short, neutral)
    num_class=3,
    eval_metric='mlogloss',
    use_label_encoder=False
)

model.fit(X_train, y_train)

model = xgb.XGBClassifier(
    n_estimators=100,        # Number of boosting rounds (trees)
    learning_rate=0.1,       # Step size shrinkage (how much model learns each round)
    max_depth=3,             # Max depth of a tree (controls complexity)
    subsample=0.8,           # Row subsampling for regularization
    colsample_bytree=0.8,    # Feature subsampling
    objective='multi:softmax', # Use 'multi:softmax' for multiclass (e.g., long/short/neutral)
    num_class=3,             # Number of classes in your labels
    random_state=42
)


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=XGBClassifier(objective='multi:softmax', num_class=3),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_


In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
