In [194]:
import numpy as np
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import GridSearchCV
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score, precision_recall_curve, mean_squared_error

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report,f1_score

In [195]:
df = pd.read_csv('labeled_DIS.csv', index_col='timestamp')
df.head(1)

Unnamed: 0_level_0,Chaikin A/D,ADX,Aroon Down,Aroon Up,Real Lower Band,Real Middle Band,Real Upper Band,CCI,EMA,MACD,...,FastD,FastK,open,high,low,close,volume,buy,sell,hold
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1998-07-10,9495330.0,25.816,100.0,0.0,10.0417,93.966,177.8903,-654.2586,95.9199,-7.1406,...,66.6667,0.0,37.25,38.5,36.5,38.13,5284767,1,0,0


In [196]:
# samples
X = df.drop(['buy', 'sell', 'hold'], axis=1)
X.head(1)

Unnamed: 0_level_0,Chaikin A/D,ADX,Aroon Down,Aroon Up,Real Lower Band,Real Middle Band,Real Upper Band,CCI,EMA,MACD,...,SMA,SlowD,SlowK,FastD,FastK,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1998-07-10,9495330.0,25.816,100.0,0.0,10.0417,93.966,177.8903,-654.2586,95.9199,-7.1406,...,101.133,58.8071,60.4938,66.6667,0.0,37.25,38.5,36.5,38.13,5284767


In [197]:
X = df.drop([
    'buy','sell','hold',
    'Chaikin A/D', 'ADX', 'Aroon Down','Aroon Up', 
    'Real Lower Band', 'Real Middle Band', 'Real Upper Band',
#     'SlowK','FastD','FastK','SlowD',
    'CCI', 'OBV', 'MACD_Hist', 'MACD_Signal','MACD','EMA','SMA',
    'volume'
], axis=1)
X.head(1)

Unnamed: 0_level_0,RSI,SlowD,SlowK,FastD,FastK,open,high,low,close
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1998-07-10,12.9131,58.8071,60.4938,66.6667,0.0,37.25,38.5,36.5,38.13


In [211]:
# truth labels (pull just the sell label)
print('hold', df['hold'].sum())
print('buy', df['buy'].sum())
print('sell', df['sell'].sum())
Y = df.filter(['buy','sell','hold'], axis=1)
Y['label'] = np.where(Y['sell']==1, 1, 0)
Y = Y.filter(['label'], axis=1)['label'].values
Y

hold 2547
buy 1282
sell 1281


array([0, 0, 1, ..., 0, 1, 0])

In [199]:
from sklearn.model_selection import train_test_split
# 75%/25% train/test
X_train, X_test, Y_train, Y_test = train_test_split(
    X,Y
)
print('X_train',len(X_train),'examples')
print('X_test',len(X_test),'examples')
print('Y_train',len(Y_train),'examples')
print('Y_test',len(Y_test),'examples')
print('shape', X_train.shape)

X_train 3832 examples
X_test 1278 examples
Y_train 3832 examples
Y_test 1278 examples
shape (3832, 9)


In [200]:
scaler = StandardScaler()  
# Don't cheat - fit only on training data
scaler.fit(X_train)  
X_train = scaler.transform(X_train)  
# apply same transformation to test data
X_test = scaler.transform(X_test)

X_train[0]

array([-0.11244813,  1.5714341 ,  1.41772355,  1.15098914,  0.35408637,
       -0.33720579, -0.34213735, -0.35204559, -0.35576565])

In [201]:
def print_metrics(name, model, predictions, X_train, Y_train, Y_test):
    print(name+":")
    print("Training score: %f" % model.score(X_train, Y_train))
    print("Training loss: %f" % model.loss_)
    print("Accuracy: ", accuracy_score(predictions, Y_test))
    print("F1: ", 
        f1_score(predictions, Y_test, average=None), 
        f1_score(predictions, Y_test, average='micro')
    )
    print("R2: ", r2_score(predictions, Y_test))
    print("Precision: ", 
        precision_score(predictions, Y_test, average=None), 
        precision_score(predictions, Y_test, average='micro')
    )
    print("Recall: ", 
        precision_score(predictions, Y_test, average=None), 
        recall_score(predictions, Y_test, average='micro')
    )

In [202]:
# http://scikit-learn.org/stable/modules/neural_networks_supervised.html
mlp = MLPClassifier(
    alpha=1e-5,
    hidden_layer_sizes=(71, 71),
    learning_rate='constant',
    solver='lbfgs'
)

mlp.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(71, 71), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [203]:
print_metrics("Multi-Layer Perceptron", mlp, mlp.predict(X_test), X_train, Y_train, Y_test)
# probability score
X_train_probability = mlp.predict_proba(X_train) 
X_train_probability

Multi-Layer Perceptron:
Training score: 0.832203
Training loss: 0.323081
Accuracy:  0.737089201878
F1:  [ 0.82804504  0.44186047] 0.737089201878
R2:  -0.532740570466
Precision:  [ 0.84535005  0.41433022] 0.737089201878
Recall:  [ 0.84535005  0.41433022] 0.737089201878


array([[  9.99930535e-01,   6.94645901e-05],
       [  9.05708027e-01,   9.42919727e-02],
       [  4.00660164e-01,   5.99339836e-01],
       ..., 
       [  9.99808697e-01,   1.91303088e-04],
       [  2.53036250e-01,   7.46963750e-01],
       [  9.99739041e-01,   2.60958779e-04]])

In [204]:
param_grid = {
    'hidden_layer_sizes': [(71,), (71, 71), (71, 71, 3)],
    'tol': [1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
    'epsilon': [1e-3, 1e-7, 1e-8, 1e-9, 1e-8]
}
gridmlp = GridSearchCV(
    MLPClassifier(learning_rate='adaptive', learning_rate_init=1., early_stopping=True, shuffle=True),
    param_grid=param_grid, n_jobs=-1)
gridmlp.fit(X_train, Y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='adaptive',
       learning_rate_init=1.0, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'hidden_layer_sizes': [(71,), (71, 71), (71, 71, 3)], 'tol': [0.01, 0.001, 0.0001, 1e-05, 1e-06], 'epsilon': [0.001, 1e-07, 1e-08, 1e-09, 1e-08]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [205]:
gridmlp_predictions = gridmlp.predict(X_test)
print("Training set score: %f" % gridmlp.score(X_train, Y_train))
# print("Training set loss: %f" % gridmlp.loss_)
print("Results with Multi-Layer Perceptron:")
print("accuracy_score: ", accuracy_score(gridmlp_predictions, Y_test))
print("r2_score: ", r2_score(gridmlp_predictions, Y_test))
# print("recall: ", recall_score(gridmlp_predictions, Y_test))
print(classification_report(y_true=Y_test,y_pred=gridmlp_predictions))

Training set score: 0.748956
Results with Multi-Layer Perceptron:
accuracy_score:  0.731611893584
r2_score:  -4.99827586207
             precision    recall  f1-score   support

          0       0.75      0.96      0.84       957
          1       0.32      0.06      0.10       321

avg / total       0.64      0.73      0.66      1278



In [206]:
# https://github.com/scikit-learn/scikit-learn/blob/master/examples/neural_networks/plot_mlp_training_curves.py

In [207]:
# https://stats.stackexchange.com/questions/260736/multiclass-classification-having-class-imbalance-with-gradient-boosting-classifi
# https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

from sklearn.ensemble import GradientBoostingClassifier

# http://scikit-learn.org/stable/modules/neural_networks_supervised.html
gbc = GradientBoostingClassifier()

gbc.fit(X_train, Y_train)
gbc_predictions = gbc.predict(X_test)

print("Training set score: %f" % gbc.score(X_train, Y_train))
# print("Training set loss: %f" % gbc.loss_)

print("Results with Gradient Boost:")
print("accuracy: ", accuracy_score(gbc_predictions, Y_test))
print("r2: ", r2_score(gbc_predictions, Y_test))
# print("recall: ", recall_score(gbc_predictions, Y_test_single))
print(classification_report(y_true=Y_test,y_pred=gbc_predictions))

Training set score: 0.826461
Results with Gradient Boost:
accuracy:  0.747261345853
r2:  -0.930387205387
             precision    recall  f1-score   support

          0       0.79      0.90      0.84       957
          1       0.49      0.31      0.38       321

avg / total       0.72      0.75      0.72      1278



In [208]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100, min_samples_leaf=10)
rfc.fit(X_train, Y_train)
Y_pred = rfc.predict(X_test)
percent = np.mean(Y_pred == Y_test)*100
print("Accuracy:", percent)
print('MSE:', mean_squared_error(Y_pred, Y_test))
print('score:', rfc.score(X_test, Y_test))
print(classification_report(y_true=Y_test,y_pred=Y_pred))

Accuracy: 74.3348982786
MSE: 0.256651017214
score: 0.743348982786
             precision    recall  f1-score   support

          0       0.79      0.90      0.84       957
          1       0.48      0.27      0.35       321

avg / total       0.71      0.74      0.72      1278



In [209]:
gb=GradientBoostingClassifier(n_estimators=200,learning_rate=.01)
gb.fit(X_train,Y_train)

cross_val_score(estimator=gb,X=X_test,y=Y_test,scoring='precision_weighted',cv=5)
print(classification_report(y_true=Y_test,y_pred=gb.predict(X_test)))

             precision    recall  f1-score   support

          0       0.76      0.96      0.85       957
          1       0.47      0.10      0.16       321

avg / total       0.69      0.75      0.68      1278

