In [39]:
# Required Python Machine learning Packages
import pandas as pd
import numpy as np
# For preprocessing the data
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
# To split the dataset into train and test datasets
from sklearn.model_selection import train_test_split
# To model the Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score


In [40]:
data = pd.read_csv('data_files/Evaluation_DF_v3.csv', encoding='utf-8')
data.columns.values[0] = 'Date'
data = data.set_index('Date')
print(data.head())

            Direction           ADF  ExCloseMomentum20  ExCloseMomentum40  \
Date                                                                        
1991-01-02          1  4.770193e-13           0.000000           2.266590   
1991-01-03          0  1.869675e-13           1.240293           4.081633   
1991-01-07          0  1.107817e-13           2.518882           5.440829   
1991-01-15          1  9.861885e-13          -1.723744          -0.249600   
1991-01-17          1  7.075101e-12          -0.244706           0.493825   

            ExCloseMomentum60  Expectation Position     Error        Qt  \
Date                                                                      
1991-01-02           1.753978            1        L -0.204270  0.186566   
1991-01-03           1.240293            0        S  0.295770  0.183176   
1991-01-07           4.359385            1        L -0.219633  0.184186   
1991-01-15           2.835464            1        L -0.187286  0.186352   
1991-01-17

In [41]:
data = data.drop(columns=['Position', 'Expectation','-Qt'])
print (data.head())


            Direction           ADF  ExCloseMomentum20  ExCloseMomentum40  \
Date                                                                        
1991-01-02          1  4.770193e-13           0.000000           2.266590   
1991-01-03          0  1.869675e-13           1.240293           4.081633   
1991-01-07          0  1.107817e-13           2.518882           5.440829   
1991-01-15          1  9.861885e-13          -1.723744          -0.249600   
1991-01-17          1  7.075101e-12          -0.244706           0.493825   

            ExCloseMomentum60     Error        Qt  
Date                                               
1991-01-02           1.753978 -0.204270  0.186566  
1991-01-03           1.240293  0.295770  0.183176  
1991-01-07           4.359385 -0.219633  0.184186  
1991-01-15           2.835464 -0.187286  0.186352  
1991-01-17           2.518882  0.361066  0.186854  


In [42]:
#The Freedman-Diaconis rule is very robust and works well in practice. The bin-width is set to h=2×IQR×n−1/3. So the number of bins is (max−min)/h, where n is the number of observations, max is the maximum value and min is the minimum value.

n = 1124
cols_ = ['ADF', 'ExCloseMomentum20', 'ExCloseMomentum40', 'ExCloseMomentum60']
bins = []
for i in cols_:
    max = data[i].max()
    min = data[i].min()
    iqr = data[i].quantile(0.75) - data[i].quantile(0.25)
    number_of_bins = 2*(iqr/(n**(1/3)))
    bins.append(number_of_bins)
    
    

#data['ADF_binned'] = pd.cut(data['ADF'], 5, labels=[1,2,3,4,5])

print (bins)


[1.116533415785272e-12, 1.390048485935681, 1.7916341870975407, 2.2376058648561523]


In [43]:
print(data.describe())

         Direction           ADF  ExCloseMomentum20  ExCloseMomentum40  \
count  1276.000000  1.276000e+03        1276.000000        1276.000000   
mean      0.528997  1.729332e-01           0.546798           1.147698   
std       0.499354  1.431496e+00           5.671711           7.313923   
min       0.000000  6.755747e-26         -22.700000         -23.141934   
25%       0.000000  4.127901e-19          -3.295137          -3.679306   
50%       1.000000  9.144097e-16           0.757311           1.138731   
75%       1.000000  5.804487e-12           3.931264           5.634806   
max       1.000000  2.119792e+01          22.241546          28.639195   

       ExCloseMomentum60        Error           Qt  
count        1276.000000  1276.000000  1276.000000  
mean            1.681794    -0.007843     0.503658  
std             8.682522     0.932533     0.258030  
min           -24.752595    -5.194013     0.158484  
25%            -4.293936    -0.645785     0.340641  
50%            

In [44]:

bins = 5
label = [1,2,3,4,5]
data['ADF_binned'] = pd.cut(data['ADF'], bins, labels=label)
data['ExCloseMomentum20_binned'] = pd.cut(data['ExCloseMomentum20'], bins, labels=label)
data['ExCloseMomentum40_binned'] = pd.cut(data['ExCloseMomentum40'], bins, labels=label)
data['ExCloseMomentum60_binned'] = pd.cut(data['ExCloseMomentum60'], bins, labels=label)
data['Error_binned'] = pd.cut(data['Error'], bins, labels=label)
data['Qt_binned'] = pd.cut(data['Qt'], bins, labels=label)

data['ExCloseMomentum60_binned'].value_counts()

3    562
4    320
2    266
5     90
1     38
Name: ExCloseMomentum60_binned, dtype: int64

In [45]:
data = data.drop(columns=['ADF','ExCloseMomentum20','ExCloseMomentum40', 
                   'ExCloseMomentum60', 'Error', 'Qt'])


print (data.head())

            Direction ADF_binned ExCloseMomentum20_binned  \
Date                                                        
1991-01-02          1          1                        3   
1991-01-03          0          1                        3   
1991-01-07          0          1                        3   
1991-01-15          1          1                        3   
1991-01-17          1          1                        3   

           ExCloseMomentum40_binned ExCloseMomentum60_binned Error_binned  \
Date                                                                        
1991-01-02                        3                        3            3   
1991-01-03                        3                        3            3   
1991-01-07                        3                        3            3   
1991-01-15                        3                        3            3   
1991-01-17                        3                        3            3   

           Qt_binned  
Date     

In [46]:
import numpy as np
labels = np.array(data['Direction'])

data = data.drop('Direction', axis=1)

features_list = list(data.columns)

features = np.array(data)

In [47]:
x_train, x_test, y_train, y_test = train_test_split(features, 
                        labels, test_size = 192, shuffle=False, stratify=None)

In [48]:
print (x_train.shape)
print (x_test.shape)
print (y_train.shape)
print (y_test.shape)

(1084, 6)
(192, 6)
(1084,)
(192,)


In [51]:
clf = GaussianNB()
clf.fit(x_train, y_train)
target_pred = clf.predict(x_test)
print (target_pred)

[1 0 0 1 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 1
 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0]


In [52]:
accuracy_score(y_test, target_pred, normalize = True)

0.4635416666666667