In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import joblib


In [3]:
ibm = pd.read_csv('ibm.csv',index_col=0)
hp = pd.read_csv('hp.csv',index_col=0)

ibm.sort_index(inplace=True)
hp.sort_index(inplace=True)

In [4]:
spr = ibm[['close']] - hp[['close']]
spr.dropna(inplace=True)
spr["spread"] = spr["close"]
spr.drop("close",axis=1, inplace=True)

In [5]:
# Reference https://blog.quantinsti.com/pairs-trading-basics/
interval = 24
threshold = 1.5
spr["rolling_mean"] = spr['spread'].rolling(interval).mean()
spr["rolling_std"] = spr['spread'].rolling(interval).std()
spr.dropna(inplace=True)
spr["upper_threshold"] = threshold*spr["rolling_std"]
spr["lower_threshold"] = - threshold*spr["rolling_std"]
spr["z-score"] = (spr['spread'] - spr["rolling_mean"]) / spr["rolling_std"]
### 1 is go short (sell ibm buy hp), 0 is hold, -1 is go long (sell hp buy ibm) ... I think
# spr["label"] = 1 if spr[spr["z-score"] > spr["upper_threshold"]] else 0 if spr[spr["z-score"] < spr["lower_threshold"]] else -1
spr['label'] = spr.apply(lambda x: 1 if x["z-score"] > x["upper_threshold"] else 0 if x["z-score"] < x["lower_threshold"] else -1, axis=1)
spr['label'].value_counts()


-1    1724
 1     814
 0     691
Name: label, dtype: int64

In [6]:
#Train - Test 80/20 split
split = round(0.8*len(spr))
train, test = spr[:split],spr[split:]
x_train = train.copy()
y_train = x_train["label"]
x_train.drop(["z-score", "upper_threshold", "lower_threshold", "rolling_mean", "rolling_std", "label"], axis=1, inplace=True)
x_test = test.copy()
y_test = x_test["label"]
x_test.drop(["z-score", "upper_threshold", "lower_threshold", "rolling_mean", "rolling_std", "label"], axis=1, inplace=True)

In [12]:
param_grid={'penalty':['l1','l2'], 'C':[1e-3, 1e-2, 1e-1, 1, 10]}

# Create a based model
lr = LogisticRegression()
# Instantiate the grid search model
model = GridSearchCV(estimator=LogisticRegression(solver="liblinear"), param_grid=param_grid) #helps to test different kinds of hype params
model.fit(x_train,y_train)
y_pred = model.predict(x_test)



In [13]:
print(classification_report(y_test,y_pred))
print(accuracy_score(y_pred, y_test))
print(model.best_params_)

              precision    recall  f1-score   support

          -1       0.54      1.00      0.70       350
           0       0.00      0.00      0.00       113
           1       0.00      0.00      0.00       183

    accuracy                           0.54       646
   macro avg       0.18      0.33      0.23       646
weighted avg       0.29      0.54      0.38       646

0.541795665634675
{'C': 0.001, 'penalty': 'l1'}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
joblib.dump(model, 'logreg2.pkl')

['logreg2.pkl']