In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import requests
import statsmodels
import matplotlib.pyplot as plt
import pickle

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('combined_features.csv',index_col=0)

In [3]:
data

Unnamed: 0_level_0,vwap,SMA(5),SMA(10),12dayEWM,rsi,MACD,mom,mfi,spread,forward_return,output
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-11-17 17:00:00,0.949281,0.911684,0.908723,0.910738,1.028341,0.291573,0.913092,1.458798,8.522631,-0.147019,-1
2020-11-18 10:00:00,0.949618,0.912712,0.910101,0.912506,1.154188,0.337144,1.264965,2.788685,7.923738,-0.195590,-1
2020-11-18 11:00:00,0.949919,0.914357,0.912081,0.915681,1.443286,0.416619,1.582164,2.204730,7.326875,-0.288873,-1
2020-11-18 12:00:00,0.950429,0.917156,0.914294,0.919506,1.567864,0.517426,2.864586,11.746214,6.784229,-0.396706,-1
2020-11-18 13:00:00,0.950870,0.920886,0.916310,0.922577,1.556466,0.605077,3.047215,13.277380,6.601599,-0.430397,-1
...,...,...,...,...,...,...,...,...,...,...,...
2022-10-28 13:00:00,0.948541,0.837141,0.838733,0.836955,0.914318,-0.267499,-1.990000,0.492807,14.850000,0.071380,1
2022-10-28 14:00:00,0.948539,0.836728,0.838468,0.837909,0.968452,-0.254640,0.225000,9.559848,14.535000,0.054696,1
2022-10-28 15:00:00,0.948534,0.837559,0.838659,0.838728,0.973852,-0.245484,-0.322900,15.006942,14.582900,0.050257,1
2022-10-28 16:00:00,0.948511,0.838191,0.838129,0.838419,0.953725,-0.257618,-0.860000,7.968697,14.860000,0.081427,1


In [4]:
def gen_labels(df,t,threshold):
    # Calculate % return on spread t hours later
    fw_ret = df['spread'].diff(periods=t)/df['spread']
    
    #If the return is more than x%, we should have bought, and hence the label is (1)
    #If return is less than x%, we should have sold, and hence label is (-1)
    #If in between, do nothing (0)
    output = np.select([fw_ret > threshold ,fw_ret < -threshold],[1,-1])
    return output

In [5]:
X = data.loc[:,data.columns != 'output'].copy()
X = (X-X.mean())/(X.max()-X.min())
X['spread'] = data['spread']
X

Unnamed: 0_level_0,vwap,SMA(5),SMA(10),12dayEWM,rsi,MACD,mom,mfi,spread,forward_return
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-11-17 17:00:00,-0.403716,0.014495,0.001265,0.010330,0.002848,0.118660,0.111258,0.017568,8.522631,-0.000941
2020-11-18 10:00:00,-0.392941,0.019031,0.007421,0.018152,0.038089,0.136703,0.153404,0.028337,7.923738,-0.001084
2020-11-18 11:00:00,-0.383316,0.026285,0.016265,0.032197,0.119046,0.168171,0.191397,0.023608,7.326875,-0.001357
2020-11-18 12:00:00,-0.367022,0.038633,0.026153,0.049120,0.153931,0.208084,0.344999,0.100878,6.784229,-0.001673
2020-11-18 13:00:00,-0.352925,0.055085,0.035163,0.062704,0.150740,0.242789,0.366874,0.113278,6.601599,-0.001771
...,...,...,...,...,...,...,...,...,...,...
2022-10-28 13:00:00,-0.427340,-0.314314,-0.311453,-0.316059,-0.029082,-0.102701,-0.236461,0.009745,14.850000,-0.000302
2022-10-28 14:00:00,-0.427405,-0.316133,-0.312636,-0.311837,-0.013922,-0.097609,0.028842,0.083173,14.535000,-0.000351
2022-10-28 15:00:00,-0.427568,-0.312471,-0.311782,-0.308216,-0.012410,-0.093984,-0.036783,0.127285,14.582900,-0.000364
2022-10-28 16:00:00,-0.428302,-0.309683,-0.314151,-0.309581,-0.018046,-0.098789,-0.101114,0.070287,14.860000,-0.000272


In [6]:
y = gen_labels(X,t=24,threshold=0.05)

In [7]:
np.unique(y,return_counts = True)

(array([-1,  0,  1]), array([1161, 1344, 1448], dtype=int64))

In [8]:
split = int(len(data) * 0.8)

X_train = X[:split]
y_train = y[:split]

X_test = X[split:]
y_test = y[split:]

### Preprocessing: Time Series Split
We do not have a lot of data to work with, and hence will need to cross-validate.

In [9]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV, TimeSeriesSplit

In [10]:
tscv = TimeSeriesSplit()
clf = SVC(decision_function_shape='ovo',class_weight='balanced')

In [11]:
cross_val_score(clf, X_train, y_train, cv=tscv, scoring='f1_weighted')

array([0.36346962, 0.36181435, 0.54665256, 0.33469337, 0.30117647])

In [12]:
params_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100]},
                    {'kernel': ['linear'], 'C': [1, 10, 100]}]

In [13]:
finder = GridSearchCV(clf, params_grid, cv=5, scoring='f1_weighted')
finder.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=SVC(class_weight='balanced',
                           decision_function_shape='ovo'),
             param_grid=[{'C': [1, 10, 100], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100], 'kernel': ['linear']}],
             scoring='f1_weighted')

In [14]:
finder.best_params_

{'C': 100, 'kernel': 'linear'}

In [15]:
finder.best_score_

0.6577026578715863

In [16]:
y_pred = finder.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.96      0.57      0.72       178
           0       0.60      0.96      0.74       386
           1       0.85      0.24      0.38       227

    accuracy                           0.67       791
   macro avg       0.80      0.59      0.61       791
weighted avg       0.75      0.67      0.63       791



In [17]:
with open('svm_model.pkl', 'wb') as f:
    pickle.dump(finder, f)