In [17]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report, f1_score, accuracy_score
import pandas as pd
import numpy as np

In [18]:
df = pd.read_csv('combined_features.csv',index_col=0)
df.sort_index(inplace=True)
df.dropna(inplace = True)
#generate label for training
def gen_labels(df,t,threshold):
    # Calculate % return on spread t hours later
    df['forward_return'] = df['spread'].diff(periods=t)/df['spread']
    
    #If the return is more than x%, we should have bought, and hence the label is (1)
    #If return is less than x%, we should have sold, and hence label is (-1)
    #If in between, do nothing (0)
    df['output'] = np.select([df['forward_return'] > threshold ,df['forward_return'] < -threshold],[1,-1])
    return df

df = gen_labels(df,t=24,threshold=0.05)

#train and test data
split = round(0.8*len(df))
train, test = df[:split],df[split:]
x_train = train[['vwap','SMA(5)','SMA(10)','12dayEWM','rsi','MACD','mom','mfi','spread']]
y_train = train[['output']]

x_test = test[['vwap','SMA(5)','SMA(10)','12dayEWM','rsi','MACD','mom','mfi','spread']]
y_test = test[['output']]

In [20]:
# get a stacking ensemble of models
def get_stacking():
	# define the base models
	level0 = list()
	level0.append(('lr', LogisticRegression(solver='newton-cg',penalty='none',C=0.001, max_iter=5000)))
	level0.append(('cart', RandomForestClassifier(bootstrap= True,
                             max_depth= 120,
                             max_features= 9,
                             min_samples_leaf= 5,
                             min_samples_split= 10,
                             n_estimators=100,
                             random_state = 42)))
	level0.append(('svm', SVC(C= 10, degree= 2, gamma= 'auto', kernel= 'poly')))
	# define meta learner model
	level1 = LogisticRegression()
	# define the stacking ensemble
	model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
	return model

In [7]:
# score = cross_val_score(get_stacking(),x_test,y_test,cv = 5,scoring = 'accuracy')
# print("The accuracy score of is:",score.mean())
# score = cross_val_score(get_stacking(),x_test,y_test,cv = 5,scoring = 'f1_weighted')
# print("The f1 score of is:",score.mean())

In [21]:
stacking_model = get_stacking().fit(x_train,y_train)
stacking_y_pred = stacking_model.predict(x_test)
print(classification_report(y_test,stacking_y_pred))
print(accuracy_score(stacking_y_pred, y_test))



              precision    recall  f1-score   support

          -1       0.67      0.84      0.75       178
           0       0.73      0.72      0.72       386
           1       0.80      0.67      0.73       227

    accuracy                           0.73       791
   macro avg       0.74      0.74      0.73       791
weighted avg       0.74      0.73      0.73       791

0.7307206068268015


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Method 2: Max-voting

In [22]:
from sklearn.ensemble import VotingClassifier
import pickle
lr = LogisticRegression(solver='newton-cg',penalty='none',C=0.001, max_iter=5000)
rf=  RandomForestClassifier(bootstrap= True,
                             max_depth= 120,
                             max_features= 9,
                             min_samples_leaf= 5,
                             min_samples_split= 10,
                             n_estimators=100,
                             random_state = 42)
svm =  SVC(C= 10, degree= 2, gamma= 'auto', kernel= 'poly')
model = VotingClassifier(estimators=[('lr', lr), ('rf', rf), ('svm',svm)], voting='hard')



In [23]:
model = model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test,y_pred))
print(accuracy_score(y_pred, y_test))

  return f(**kwargs)


              precision    recall  f1-score   support

          -1       0.91      0.49      0.64       178
           0       0.68      0.92      0.78       386
           1       0.87      0.67      0.76       227

    accuracy                           0.75       791
   macro avg       0.82      0.69      0.73       791
weighted avg       0.79      0.75      0.74       791

0.7509481668773704
