In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import joblib


In [4]:
# Load both price tables, using the first CSV column as the index,
# and order each one by that index.
ibm = pd.read_csv('ibm.csv', index_col=0).sort_index()
hp = pd.read_csv('hp.csv', index_col=0).sort_index()

In [5]:
# Spread = IBM close minus HP close. The subtraction aligns the two frames on
# their index; rows present on only one side become NaN and are dropped.
spr = (
    (ibm[["close"]] - hp[["close"]])
    .dropna()
    .rename(columns={"close": "spread"})
)

In [6]:
# Reference: https://blog.quantinsti.com/pairs-trading-basics/
threshold = 2   # multiplier applied to the rolling std to build the upper/lower bands
interval = 24   # rolling-window length (in rows) for the spread mean and std
#df with spread
def strategy_labeling(df, interval, threshold):
    """Add rolling statistics, a z-score and a signal label to ``df``.

    The frame is modified in place (columns are added and rows containing
    NaN are dropped) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``"spread"`` column.
    interval : int
        Rolling-window length, in rows, for the mean and standard deviation.
    threshold : float
        Multiplier applied to the rolling standard deviation to form
        ``upper_threshold`` / ``lower_threshold``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the added columns ``rolling_mean``, ``rolling_std``,
        ``upper_threshold``, ``lower_threshold``, ``z-score`` and ``label``.

    Label encoding (as implemented):
        1  -> z-score above ``upper_threshold``
        0  -> z-score below ``lower_threshold``
        -1 -> everything else (including rows whose z-score is NaN)
    Presumably 1 = short the spread, 0 = long the spread, -1 = hold; the
    trading interpretation is not established by this code -- confirm.
    """
    df["rolling_mean"] = df["spread"].rolling(interval).mean()
    df["rolling_std"] = df["spread"].rolling(interval).std()
    # Rows without a complete rolling window carry NaN statistics; drop them
    # (this also drops rows that have NaN in any other column).
    df.dropna(inplace=True)
    df["upper_threshold"] = threshold * df["rolling_std"]
    df["lower_threshold"] = -threshold * df["rolling_std"]
    df["z-score"] = (df["spread"] - df["rolling_mean"]) / df["rolling_std"]
    # NOTE(review): the z-score is already expressed in standard deviations,
    # yet it is compared with threshold * rolling_std rather than with
    # +/- threshold. Kept as-is so existing labels do not change -- confirm intent.
    #
    # Build the label from the frame that was passed in (not a notebook
    # global), vectorised so an empty frame is handled as well.
    above_upper = df["z-score"] > df["upper_threshold"]
    below_lower = df["z-score"] < df["lower_threshold"]
    df["label"] = np.select([above_upper, below_lower], [1, 0], default=-1)
    return df

# Attach rolling statistics and labels to the spread frame, then show the class counts.
spr = strategy_labeling(df=spr, interval=interval, threshold=threshold)
spr["label"].value_counts()


-1    2159
 1     575
 0     495
Name: label, dtype: int64

In [14]:
# Train/test 80/20 split by row order: the first 80% of rows train, the rest test.
split = round(0.8 * len(spr))
train, test = spr[:split], spr[split:]

# Targets are the labels; features are whatever remains once the label and the
# intermediate rolling/threshold columns are removed.
y_train, y_test = train["label"].copy(), test["label"].copy()
x_train, x_test = (
    part.drop(
        columns=["upper_threshold", "lower_threshold", "rolling_mean", "rolling_std", "label"]
    )
    for part in (train, test)
)

In [15]:
# Hyper-parameter grid: penalty type and inverse regularisation strength C.
param_grid = {'penalty': ['l1', 'l2'], 'C': [1e-3, 1e-2, 1e-1, 1, 10]}

# Base model. The liblinear solver is used because it accepts both the l1 and
# the l2 penalty that appear in the grid.
lr = LogisticRegression(solver="liblinear")
# Grid search over param_grid using GridSearchCV's default cross-validation
# and scoring; the base estimator is cloned for every candidate.
# NOTE(review): the train/test split is by row order, which suggests the rows
# are time-ordered; the default CV folds are not time-aware -- consider
# TimeSeriesSplit if look-ahead between folds matters.
model = GridSearchCV(estimator=lr, param_grid=param_grid)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)



In [16]:
# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test, y_pred))
# accuracy_score expects (y_true, y_pred); pass them in that order, matching
# the classification_report call above. The value itself is unchanged.
print(accuracy_score(y_test, y_pred))
# Hyper-parameters selected by the grid search.
print(model.best_params_)

              precision    recall  f1-score   support

          -1       0.81      0.84      0.82       437
           0       0.52      0.74      0.61        72
           1       0.76      0.49      0.60       137

    accuracy                           0.76       646
   macro avg       0.70      0.69      0.68       646
weighted avg       0.76      0.76      0.75       646

0.7554179566563467
{'C': 10, 'penalty': 'l1'}


In [10]:
# Persist the fitted grid-search object (not only its best estimator) to disk.
joblib.dump(value=model, filename='logreg2.pkl')

['logreg2.pkl']