In [1]:
import math
import numpy as np
import pandas as pd
from pandas_datareader import data as web
import matplotlib.pyplot as plt
import datetime
import tables 
import tstables  
import os
from indicators import *

def write2excel( df, filename ):
    """Write ``df`` to ``../../datastore/<filename>.xlsx`` using the xlsxwriter engine.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export; written with the default ``to_excel`` options.
    filename : str
        File name without extension; ``.xlsx`` is appended. The path is
        relative to the current working directory.
    """
    filepath = os.path.join('..', '..', 'datastore', filename) + '.xlsx'
    # The context manager saves and closes the workbook on exit, including when
    # to_excel raises. ExcelWriter.save() no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(filepath, engine='xlsxwriter') as writer:
        df.to_excel(writer)


In [2]:
# --- Run configuration -------------------------------------------------------
symbol = 'USD_TRY'
account_type = 'practice'
granularity = 'S5'
start_datetime = datetime.datetime(2017,1,1,0,0,0)
end_datetime = datetime.datetime(2017,8,1,0,0,0)
decision_frequency = '1H'

# Build the path with os.path.join (as write2excel does) instead of hard-coded
# backslashes, so the notebook also runs on non-Windows machines.
# Layout: ../../datastore/_<account_type>/<symbol>/<granularity>.h5
file_path = os.path.join('..', '..', 'datastore',
                         '_{0}'.format(account_type),
                         symbol,
                         '{0}.h5'.format(granularity))

# --- Load the raw series -----------------------------------------------------
h5 = tables.open_file(file_path, 'r')
try:
    ts = h5.root.data._f_get_timeseries()
    raw = ts.read_range(start_datetime,end_datetime)
    raw = pd.DataFrame(raw)
finally:
    # Always release the HDF5 handle, even when the read fails; an open handle
    # keeps the file locked for the rest of the kernel session.
    h5.close()

# --- Aggregate the high frequency data to the decision frequency -------------
# One aggregation rule per column: OHLC semantics for ask and bid prices,
# summed volume.
ohlc_dict = {   'ask_o':'first', 'ask_h':'max', 'ask_l':'min', 'ask_c': 'last',
                'bid_o':'first', 'bid_h':'max', 'bid_l':'min', 'bid_c': 'last',
                'volume': 'sum' }

# Bars are labelled by their left edge; dropna() removes bars in which any
# aggregated column is NaN (e.g. periods with no ticks).
df = raw.resample(decision_frequency, closed='left', label='left').apply(ohlc_dict).dropna()

# add_indicators comes from the star-imported `indicators` module. As it is
# used here, it returns the augmented frame and the names of the added columns.
indicator_list = ['Wave', 'WaveAngle', 'MACD']
df, indicator_cols = add_indicators(df, indicator_list)

In [3]:
# Use the ask-side prices as the canonical OHLC series. A single non-inplace
# rename replaces five separate inplace calls. (The former leading `df.head()`
# was not the last expression of the cell, so it never displayed anything.)
df = df.rename(columns={'ask_o': 'Open',
                        'ask_h': 'High',
                        'ask_l': 'Low',
                        'ask_c': 'Close',
                        'volume': 'Volume'})

# `cols` is the ordered list of feature columns used by the model cells below.
cols = ['Open','High','Low','Close','Volume']
cols.extend(indicator_cols)

# Ratio of the current close to the close / high / low of i bars ago,
# for i = 1..lookback. Column order per lag is C_C<i>, C_H<i>, C_L<i>.
lookback = 12
for i in range(1,lookback+1):
    for prefix, source in (('C_C', 'Close'), ('C_H', 'High'), ('C_L', 'Low')):
        columnname = prefix + str(i)
        df[columnname] = df['Close'] / df[source].shift(i)
        cols.append(columnname)

# Keep only the feature columns. The explicit copy makes `df` an independent
# frame, so the column assignments in later cells do not write into a slice
# (which would raise SettingWithCopyWarning).
df = df[cols].copy()

In [4]:
# Forward log-return of the close over each horizon of 1..12 bars:
# log(Close[t + lag] / Close[t]). The last `lag` rows of each column are NaN
# because shift(-lag) has no future bar to read.
close_px = df['Close']
for lag in range(1, 13):
    target_name = 'future_return{}'.format(lag)
    df[target_name] = np.log(close_px.shift(-lag) / close_px)

def f(row,lag):
    """Binary direction label for one row.

    Returns 1 when ``row['future_return<lag>']`` is greater than or equal to
    0.0 and 0 otherwise. A NaN return also yields 0, because the comparison
    evaluates to False.
    """
    return 1 if row['future_return{}'.format(lag)] >= 0.0 else 0

# Direction label per horizon: 1 when the forward return is >= 0.0, else 0.
# The vectorized comparison gives the same values as applying f() row by row
# (NaN >= 0.0 is False, so NaN returns map to 0) without a Python-level call
# per row. int64 matches the dtype the row-wise apply produced.
for lag in range(1,13):
    df['Up{}'.format(lag)] = (df['future_return{}'.format(lag)] >= 0.0).astype('int64')


In [5]:
# Drop every row that has a NaN in any column: the leading rows whose
# shift(i) features have no history and the trailing rows whose
# future_return<lag> has no future bar.
df = df.dropna(axis=0, how='any')

Unnamed: 0,Open,High,Low,Close,Volume,WaveClose,WaveHigh,WaveLow,WaveAngle,macd,...,Up3,Up4,Up5,Up6,Up7,Up8,Up9,Up10,Up11,Up12
2017-07-28 04:00:00,3.53838,3.53855,3.53667,3.53686,5417.0,3.53947,3.542419,3.537122,-0.111293,-0.000615,...,0,0,0,1,0,1,0,0,0,0
2017-07-28 05:00:00,3.53686,3.53687,3.53443,3.53486,3690.0,3.539207,3.542102,3.536968,-0.133947,-0.00079,...,0,1,1,1,1,0,0,0,0,0
2017-07-28 06:00:00,3.53486,3.53733,3.53306,3.53506,8890.0,3.53897,3.541829,3.536744,-0.177753,-0.000902,...,1,1,1,1,0,0,0,0,0,0
2017-07-28 07:00:00,3.53506,3.53584,3.5287,3.53124,11700.0,3.538528,3.541487,3.536285,-0.255351,-0.001284,...,1,1,1,1,0,0,1,0,0,0
2017-07-28 08:00:00,3.53124,3.53602,3.53049,3.53388,10668.0,3.538262,3.541175,3.535954,-0.27184,-0.001359,...,1,1,1,0,0,0,0,0,0,0


In [6]:
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was removed in scikit-learn 0.20. model_selection is
# its replacement and is what the later cells of this notebook already import.
from sklearn.model_selection import cross_val_score

# The feature matrix does not depend on the horizon, so build it once.
X = df[cols].values

# Mean 10-fold cross-validated accuracy of a default LogisticRegression,
# keyed by prediction horizon (lag in bars).
# NOTE(review): the rows are time-ordered and the folds are not time-aware, so
# most training folds contain bars from after the test fold — confirm this is
# acceptable or switch to a forward-chaining splitter.
score_dict = {}
for lag in range(1,13):
    y = df['Up{}'.format(lag)].values

    lr = LogisticRegression()
    scores = cross_val_score(lr, X, y, cv=10, scoring='accuracy')

    score_dict[lag] = scores.mean()
    



In [7]:
# Mean cross-validated accuracy per prediction horizon (key = lag in bars).
score_dict

{1: 0.50413805796962285,
 2: 0.51942343486504394,
 3: 0.50667329231722802,
 4: 0.49818266116535792,
 5: 0.49278736905033221,
 6: 0.50099787405793905,
 7: 0.45287755083179082,
 8: 0.49110201613966697,
 9: 0.48090543616168402,
 10: 0.49873721611369853,
 11: 0.49531857230052478,
 12: 0.48626703881616373}

In [9]:
# Last rows of the feature columns that are fed to the models.
df[cols].tail()

Unnamed: 0,Open,High,Low,Close,Volume,WaveClose,WaveHigh,WaveLow,WaveAngle,macd,...,C_L9,C_C10,C_H10,C_L10,C_C11,C_H11,C_L11,C_C12,C_H12,C_L12
2017-07-28 04:00:00,3.53838,3.53855,3.53667,3.53686,5417.0,3.53947,3.542419,3.537122,-0.111293,-0.000615,...,1.000003,0.998859,0.998504,0.999508,0.99889,0.99789,1.00043,1.000387,0.999435,1.000642
2017-07-28 05:00:00,3.53686,3.53687,3.53443,3.53486,3690.0,3.539207,3.542102,3.536968,-0.133947,-0.00079,...,0.99955,0.998748,0.998167,0.999437,0.998294,0.997939,0.998943,0.998325,0.997325,0.999864
2017-07-28 06:00:00,3.53486,3.53733,3.53306,3.53506,8890.0,3.53897,3.541829,3.536744,-0.177753,-0.000902,...,0.999048,0.998768,0.9987,0.999607,0.998805,0.998224,0.999494,0.998351,0.997996,0.999
2017-07-28 07:00:00,3.53506,3.53584,3.5287,3.53124,11700.0,3.538528,3.541487,3.536285,-0.255351,-0.001284,...,0.998005,0.997695,0.997483,0.997968,0.997689,0.997621,0.998527,0.997726,0.997145,0.998414
2017-07-28 08:00:00,3.53124,3.53602,3.53049,3.53388,10668.0,3.538262,3.541175,3.535954,-0.27184,-0.001359,...,0.998581,0.998353,0.995942,0.998751,0.99844,0.998229,0.998714,0.998435,0.998367,0.999273


In [10]:
# Feature column names, in the order used to build X.
cols

['Open',
 'High',
 'Low',
 'Close',
 'Volume',
 'WaveClose',
 'WaveHigh',
 'WaveLow',
 'WaveAngle',
 'macd',
 'macdsignal',
 'macdhist',
 'C_C1',
 'C_H1',
 'C_L1',
 'C_C2',
 'C_H2',
 'C_L2',
 'C_C3',
 'C_H3',
 'C_L3',
 'C_C4',
 'C_H4',
 'C_L4',
 'C_C5',
 'C_H5',
 'C_L5',
 'C_C6',
 'C_H6',
 'C_L6',
 'C_C7',
 'C_H7',
 'C_L7',
 'C_C8',
 'C_H8',
 'C_L8',
 'C_C9',
 'C_H9',
 'C_L9',
 'C_C10',
 'C_H10',
 'C_L10',
 'C_C11',
 'C_H11',
 'C_L11',
 'C_C12',
 'C_H12',
 'C_L12']

In [None]:
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection.
from sklearn.model_selection import train_test_split

# The feature matrix is the same for every horizon (the columns added to df
# below are not part of `cols`), so build it once.
X = df[cols].values

# Hold-out accuracy per horizon, keyed by lag.
score = {}
for lag in [1,2,3,4,6,12]:

    y = df['Up{}'.format(lag)].values

    # Fixed random_state keeps the (shuffled) 80/20 split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

    lr = LogisticRegression()
    lr.fit(X_train, y_train)

    predicted = lr.predict(X_test)
    score[lag] = lr.score(X_test, y_test)

    # Store class probabilities and predictions for ALL rows. About 80% of
    # these rows were used to fit the model, so the columns are mostly
    # in-sample and must not be read as out-of-sample forecasts.
    probs = lr.predict_proba(X)
    df['probs0_{}'.format(lag)] = probs[:,0]
    df['probs1_{}'.format(lag)] = probs[:,1]
    df['predict{}'.format(lag)] = lr.predict(X)
    


In [None]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Evaluate the 6-bar horizon on a held-out 20% split.
X = df[cols].values
y = df['Up6'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Fit a model for THIS target. The `lr` left over from the previous cell is the
# last one fitted in its loop (lag 12), so reusing it here would score an Up12
# model against Up6 labels.
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_predicted = lr.predict(X_test)

print( metrics.accuracy_score(y_test,y_predicted) )
print( metrics.confusion_matrix(y_test,y_predicted) )


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

# Cross-validated accuracy of a small random forest. X and y are whatever the
# preceding cell left in the kernel (the `cols` features and the Up6 labels).
# cross_val_score runs with its default fold count and the estimator's default
# scorer.
clf = RandomForestClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)
scores = cross_val_score(clf, X, y)
scores.mean()


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

# Cross-validated accuracy of a 10-estimator AdaBoost classifier on the X / y
# left in the kernel by the earlier evaluation cell.
clf = AdaBoostClassifier(n_estimators=10)
scores = cross_val_score(estimator=clf, X=X, y=y)
scores.mean()

In [None]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
# Imported here so the cell does not rely on an earlier cell having run.
from sklearn.model_selection import cross_val_score

# learning_rate scales the contribution of every boosting stage. The previous
# value of 10 overshoots on each stage; 1.0 is the value used in the
# scikit-learn example this cell follows.
# NOTE(review): confirm 1.0 (or a smaller value such as 0.1) is the intended setting.
clf = GradientBoostingClassifier(n_estimators=10, learning_rate=1.0, max_depth=1, random_state=0)

# cross_val_score fits clones of `clf`, so the full-data fit below does not
# affect the scores; it only leaves `clf` fitted for later use.
clf.fit(X, y)
scores = cross_val_score(clf, X, y)
scores.mean()