In [1]:
import lucrum as lc
import lucrum.datareader as ldr
from IPython.display import display
from sklearn.utils import resample
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
hist_price = ldr.get_data_binance(symbols="XRPUSDT", 
                                  start="5 May, 2018", 
                                  end="18 Apr, 2019", 
                                  interval="15m",
                                  timezone="Europe/Malta")

display(hist_price.head())
display(hist_price.tail())
print(hist_price.shape)

Unnamed: 0,open_time,open,high,low,close,close_time,trades,volume
0,2018-05-05 02:00:00+02:00,0.8898,0.9,0.888,0.89766,2018-05-05 02:14:59.999000+02:00,274,159373.17
1,2018-05-05 02:15:00+02:00,0.89766,0.9046,0.89601,0.90388,2018-05-05 02:29:59.999000+02:00,290,243422.1
2,2018-05-05 02:30:00+02:00,0.90388,0.9046,0.8954,0.90391,2018-05-05 02:44:59.999000+02:00,206,126523.8
3,2018-05-05 02:45:00+02:00,0.9039,0.905,0.8952,0.89644,2018-05-05 02:59:59.999000+02:00,295,121757.09
4,2018-05-05 03:00:00+02:00,0.89643,0.89922,0.89218,0.8947,2018-05-05 03:14:59.999000+02:00,251,146653.0


Unnamed: 0,open_time,open,high,low,close,close_time,trades,volume
33261,2019-04-18 01:00:00+02:00,0.33577,0.33587,0.33377,0.3339,2019-04-18 01:14:59.999000+02:00,664,605290.1
33262,2019-04-18 01:15:00+02:00,0.33388,0.33406,0.3323,0.33344,2019-04-18 01:29:59.999000+02:00,805,712010.9
33263,2019-04-18 01:30:00+02:00,0.33345,0.33417,0.33288,0.33391,2019-04-18 01:44:59.999000+02:00,477,337987.8
33264,2019-04-18 01:45:00+02:00,0.33398,0.33455,0.33392,0.33436,2019-04-18 01:59:59.999000+02:00,502,766151.9
33265,2019-04-18 02:00:00+02:00,0.33435,0.33634,0.33411,0.33597,2019-04-18 02:14:59.999000+02:00,780,789653.1


(33266, 8)


In [3]:
# trying to predict crossover 
from lucrum.algo import pyta
import numpy as np

In [4]:
# creating features
ta_config = {
    "EMA":[("ema_3", {"timeperiod" : 3, "price" : "close"})],
    "SMA":[("sma_10", {"timeperiod" : 10, "price" : "close"})]  
}

ta_columns = pyta.apply_ta(hist_price, ta_config)

In [5]:
# add outcome 

# calculating crossovers 
ema_3 = hist_price["ema_3"].shift(1)
sma_10 = hist_price["sma_10"].shift(1)
hist_price["crossing"] = (((hist_price["ema_3"] < hist_price["sma_10"]) & (ema_3 >= sma_10))
            | ((hist_price["ema_3"] > hist_price["sma_10"]) & (ema_3 <= sma_10)))

# change to 1 and 0 
hist_price["crossing"] = hist_price["crossing"].map({True: 1, False: 0})

# drop NA 
hist_price = hist_price.dropna()
display(hist_price.head())
display(hist_price.tail(100))

Unnamed: 0,open_time,open,high,low,close,close_time,trades,volume,ema_3,sma_10,crossing
9,2018-05-05 04:15:00+02:00,0.90133,0.90334,0.89769,0.903,2018-05-05 04:29:59.999000+02:00,285,201916.2,0.902182,0.899894,0
10,2018-05-05 04:30:00+02:00,0.903,0.903,0.8971,0.89816,2018-05-05 04:44:59.999000+02:00,209,113327.48,0.900171,0.899944,0
11,2018-05-05 04:45:00+02:00,0.89999,0.901,0.898,0.9009,2018-05-05 04:59:59.999000+02:00,148,56226.42,0.900536,0.899646,0
12,2018-05-05 05:00:00+02:00,0.90094,0.90099,0.89831,0.89832,2018-05-05 05:14:59.999000+02:00,166,68384.99,0.899428,0.899087,0
13,2018-05-05 05:15:00+02:00,0.89832,0.90234,0.89832,0.90234,2018-05-05 05:29:59.999000+02:00,212,85895.89,0.900884,0.899677,0


Unnamed: 0,open_time,open,high,low,close,close_time,trades,volume,ema_3,sma_10,crossing
33166,2019-04-17 01:15:00+02:00,0.32263,0.32300,0.32236,0.32250,2019-04-17 01:29:59.999000+02:00,201,185006.0,0.322320,0.321598,0
33167,2019-04-17 01:30:00+02:00,0.32249,0.32479,0.32249,0.32449,2019-04-17 01:44:59.999000+02:00,640,950636.0,0.323405,0.321934,0
33168,2019-04-17 01:45:00+02:00,0.32448,0.32500,0.32379,0.32379,2019-04-17 01:59:59.999000+02:00,388,484795.4,0.323597,0.322163,0
33169,2019-04-17 02:00:00+02:00,0.32400,0.32540,0.32314,0.32382,2019-04-17 02:14:59.999000+02:00,699,921183.3,0.323709,0.322356,0
33170,2019-04-17 02:15:00+02:00,0.32378,0.32448,0.32322,0.32380,2019-04-17 02:29:59.999000+02:00,423,430358.5,0.323754,0.322621,0
33171,2019-04-17 02:30:00+02:00,0.32358,0.32402,0.32291,0.32332,2019-04-17 02:44:59.999000+02:00,334,360972.6,0.323537,0.322867,0
33172,2019-04-17 02:45:00+02:00,0.32332,0.32332,0.32187,0.32201,2019-04-17 02:59:59.999000+02:00,418,432049.8,0.322774,0.322971,1
33173,2019-04-17 03:00:00+02:00,0.32202,0.32258,0.32062,0.32111,2019-04-17 03:14:59.999000+02:00,543,928340.7,0.321942,0.322961,0
33174,2019-04-17 03:15:00+02:00,0.32093,0.32200,0.32050,0.32174,2019-04-17 03:29:59.999000+02:00,412,533990.4,0.321841,0.322924,0
33175,2019-04-17 03:30:00+02:00,0.32196,0.32249,0.32099,0.32212,2019-04-17 03:44:59.999000+02:00,502,607909.0,0.321980,0.322870,0


In [6]:
# pre-processing
crdt = hist_price[["open_time", "ema_3", "sma_10", "crossing"]]

def lag(dataframe, lag, column):
    for i in range(1, lag + 1):
        lagged = crdt[column].shift(i)
        col_name = column + '_lag_' + str(i)
        dataframe.insert(dataframe.shape[1] - 1, col_name, lagged)

lag(crdt, 5, "ema_3")
lag(crdt, 5, "sma_10")
display(crdt.tail(100))

Unnamed: 0,open_time,ema_3,sma_10,ema_3_lag_1,ema_3_lag_2,ema_3_lag_3,ema_3_lag_4,ema_3_lag_5,sma_10_lag_1,sma_10_lag_2,sma_10_lag_3,sma_10_lag_4,sma_10_lag_5,crossing
33166,2019-04-17 01:15:00+02:00,0.322320,0.321598,0.322140,0.321619,0.321129,0.321048,0.321125,0.321452,0.321367,0.321330,0.321318,0.321367,0
33167,2019-04-17 01:30:00+02:00,0.323405,0.321934,0.322320,0.322140,0.321619,0.321129,0.321048,0.321598,0.321452,0.321367,0.321330,0.321318,0
33168,2019-04-17 01:45:00+02:00,0.323597,0.322163,0.323405,0.322320,0.322140,0.321619,0.321129,0.321934,0.321598,0.321452,0.321367,0.321330,0
33169,2019-04-17 02:00:00+02:00,0.323709,0.322356,0.323597,0.323405,0.322320,0.322140,0.321619,0.322163,0.321934,0.321598,0.321452,0.321367,0
33170,2019-04-17 02:15:00+02:00,0.323754,0.322621,0.323709,0.323597,0.323405,0.322320,0.322140,0.322356,0.322163,0.321934,0.321598,0.321452,0
33171,2019-04-17 02:30:00+02:00,0.323537,0.322867,0.323754,0.323709,0.323597,0.323405,0.322320,0.322621,0.322356,0.322163,0.321934,0.321598,0
33172,2019-04-17 02:45:00+02:00,0.322774,0.322971,0.323537,0.323754,0.323709,0.323597,0.323405,0.322867,0.322621,0.322356,0.322163,0.321934,1
33173,2019-04-17 03:00:00+02:00,0.321942,0.322961,0.322774,0.323537,0.323754,0.323709,0.323597,0.322971,0.322867,0.322621,0.322356,0.322163,0
33174,2019-04-17 03:15:00+02:00,0.321841,0.322924,0.321942,0.322774,0.323537,0.323754,0.323709,0.322961,0.322971,0.322867,0.322621,0.322356,0
33175,2019-04-17 03:30:00+02:00,0.321980,0.322870,0.321841,0.321942,0.322774,0.323537,0.323754,0.322924,0.322961,0.322971,0.322867,0.322621,0


In [7]:
# # experiment 

# # calculate spread between sma 
# hist_price["sma_spread"] = (hist_price["sma_3"] - hist_price["sma_10"]).abs()

# # calculate signal 
# display(hist_price)
crdt = crdt.dropna()
test_col = crdt.columns[3:crdt.shape[1] - 1].values 
crdt[test_col] = crdt[test_col].div(crdt[test_col].sum(axis=1), axis=0)

display(crdt)

Unnamed: 0,open_time,ema_3,sma_10,ema_3_lag_1,ema_3_lag_2,ema_3_lag_3,ema_3_lag_4,ema_3_lag_5,sma_10_lag_1,sma_10_lag_2,sma_10_lag_3,sma_10_lag_4,sma_10_lag_5,crossing
14,2018-05-05 05:30:00+02:00,0.901407,0.900400,0.100082,0.099920,0.100043,0.100003,0.100226,0.099948,0.099882,0.099945,0.099978,0.099972,0
15,2018-05-05 05:45:00+02:00,0.901503,0.901300,0.100143,0.100085,0.099923,0.100046,0.100006,0.100031,0.099951,0.099885,0.099948,0.099981,0
16,2018-05-05 06:00:00+02:00,0.903502,0.901852,0.100124,0.100113,0.100055,0.099893,0.100017,0.100101,0.100001,0.099921,0.099856,0.099918,0
17,2018-05-05 06:15:00+02:00,0.904261,0.901809,0.100288,0.100067,0.100056,0.099998,0.099836,0.100105,0.100044,0.099944,0.099864,0.099798,0
18,2018-05-05 06:30:00+02:00,0.905795,0.902410,0.100289,0.100204,0.099983,0.099972,0.099914,0.100017,0.100021,0.099960,0.099860,0.099780,0
19,2018-05-05 06:45:00+02:00,0.905878,0.902706,0.100374,0.100204,0.100119,0.099898,0.099887,0.099998,0.099932,0.099937,0.099875,0.099776,0
20,2018-05-05 07:00:00+02:00,0.904689,0.903240,0.100307,0.100298,0.100128,0.100044,0.099823,0.099956,0.099923,0.099857,0.099862,0.099801,0
21,2018-05-05 07:15:00+02:00,0.902884,0.903258,0.100119,0.100251,0.100241,0.100072,0.099988,0.099959,0.099899,0.099867,0.099800,0.099805,1
22,2018-05-05 07:30:00+02:00,0.902192,0.903576,0.099911,0.100110,0.100242,0.100233,0.100063,0.099952,0.099950,0.099891,0.099858,0.099792,0
23,2018-05-05 07:45:00+02:00,0.903841,0.903891,0.099837,0.099914,0.100114,0.100245,0.100236,0.099990,0.099955,0.099953,0.099894,0.099861,0


In [8]:
print(crdt.crossing.value_counts())

0    28862
1     4390
Name: crossing, dtype: int64


In [9]:
# creating features 
for i in range(1, 6):
    crdt["x_" + str(i)] = crdt["sma_10_lag_"+ str(i)] - crdt["ema_3_lag_"+ str(i)] 

In [10]:
data = crdt[["open_time","x_1", "x_2", "x_3", "x_4", "x_5", "crossing"]].copy()
display(data.head())

data["crossing"] = data["crossing"].shift(-1)
data = data.dropna()
data["crossing"] = data["crossing"].astype("int8")
display(data.head())

Unnamed: 0,open_time,x_1,x_2,x_3,x_4,x_5,crossing
14,2018-05-05 05:30:00+02:00,-0.000134,-3.8e-05,-9.9e-05,-2.5e-05,-0.000254,0
15,2018-05-05 05:45:00+02:00,-0.000112,-0.000134,-3.8e-05,-9.9e-05,-2.5e-05,0
16,2018-05-05 06:00:00+02:00,-2.3e-05,-0.000112,-0.000134,-3.8e-05,-9.9e-05,0
17,2018-05-05 06:15:00+02:00,-0.000183,-2.3e-05,-0.000112,-0.000134,-3.8e-05,0
18,2018-05-05 06:30:00+02:00,-0.000272,-0.000183,-2.3e-05,-0.000112,-0.000134,0


Unnamed: 0,open_time,x_1,x_2,x_3,x_4,x_5,crossing
14,2018-05-05 05:30:00+02:00,-0.000134,-3.8e-05,-9.9e-05,-2.5e-05,-0.000254,0
15,2018-05-05 05:45:00+02:00,-0.000112,-0.000134,-3.8e-05,-9.9e-05,-2.5e-05,0
16,2018-05-05 06:00:00+02:00,-2.3e-05,-0.000112,-0.000134,-3.8e-05,-9.9e-05,0
17,2018-05-05 06:15:00+02:00,-0.000183,-2.3e-05,-0.000112,-0.000134,-3.8e-05,0
18,2018-05-05 06:30:00+02:00,-0.000272,-0.000183,-2.3e-05,-0.000112,-0.000134,0


In [11]:
print(data.crossing.value_counts()[1])

4390


In [12]:
# downsampling
# Separate majority and minority classes
data_majority = data[data["crossing"]==0]
data_minority = data[data["crossing"]==1]



# Downsample majority class
df_majority_downsampled = resample(data_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=data.crossing.value_counts()[1])     # reproducible results

# Combine minority class with downsampled majority class
data_downsampled = pd.concat([df_majority_downsampled, data_minority])
 
# Display new class counts
data_downsampled.crossing.value_counts()
display(data_downsampled)

Unnamed: 0,open_time,x_1,x_2,x_3,x_4,x_5,crossing
23627,2019-01-07 09:30:00+01:00,-0.000870,-0.001194,-0.000960,-0.000570,-0.000447,0
6272,2018-07-10 05:15:00+02:00,0.000510,0.000831,0.000637,0.000352,0.000161,0
9060,2018-08-08 06:15:00+02:00,0.000665,-0.000269,-0.000108,0.000130,0.000103,0
20195,2018-12-02 15:30:00+01:00,-0.000165,-0.000118,0.000040,0.000134,0.000011,0
32575,2019-04-10 21:30:00+02:00,-0.000240,-0.000344,-0.000327,-0.000268,-0.000284,0
5418,2018-07-01 00:15:00+02:00,0.000085,-0.000042,-0.000196,-0.000280,-0.000415,0
15921,2018-10-18 17:30:00+02:00,0.000458,0.000163,-0.000142,-0.000047,0.000184,0
21163,2018-12-12 17:30:00+01:00,-0.000279,-0.000410,-0.000203,-0.000188,-0.000113,0
23030,2019-01-01 04:15:00+01:00,0.000159,0.000339,0.000321,0.000224,0.000084,0
5850,2018-07-05 19:45:00+02:00,0.000664,0.000522,0.000301,0.000278,-0.000002,0


In [13]:

# print(y_train)
# print(y_test)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

X = data_downsampled[["x_1","x_2","x_3","x_4","x_5"]]
y = data_downsampled["crossing"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
probs_y = clf.predict_proba(X_test) 

y_pred_a = np.array([1 if x[1] >= 0.65 else 0 for x in probs_y])

print(X_test.size)

print(precision_score(y_test, y_pred, labels=[1,0], average='binary'))
print(y_pred[y_pred == 1].shape)
print(precision_score(y_test, y_pred_a, labels=[1,0], average='binary'))
print(y_pred_a[y_pred_a == 1].shape)


print(X_test.size)
print(y_pred.size)
y_pred_a = np.reshape(y_pred_a, (-1,1)) 
X_test.insert(column="outcome", loc=0,value = y_pred_a)
X_test.insert(column="true", loc=0,value = y_test)

X_test.sort_index(inplace=True)
print(X_test)



  from numpy.core.umath_tests import inner1d


13170
0.6368231046931407
(1385,)
0.69
(700,)
13170
2634
       true  outcome       x_1       x_2       x_3       x_4       x_5
32        1        0  0.000108  0.000139  0.000095  0.000086  0.000026
48        1        1  0.000169  0.000158  0.000110 -0.000171 -0.000255
67        1        0 -0.000106 -0.000186 -0.000424 -0.001003 -0.000443
78        0        0  0.000477  0.000457  0.000429  0.000346  0.000363
88        1        1  0.000261  0.000537  0.000576  0.000746  0.000620
94        1        1 -0.000127 -0.000321 -0.000360 -0.000508 -0.000259
100       0        0 -0.000177 -0.000020  0.000068  0.000204  0.000029
114       1        0 -0.000037 -0.000175 -0.000166 -0.000157 -0.000178
140       0        0 -0.000158 -0.000542 -0.000154  0.000068  0.000294
142       0        0  0.000187  0.000113 -0.000158 -0.000542 -0.000154
168       1        1 -0.000189 -0.000310 -0.000398 -0.000400 -0.000415
176       0        0  0.000518  0.000440  0.000311  0.000374  0.000339
183       1        1 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [15]:
print(crdt.iloc[33257])

IndexError: single positional indexer is out-of-bounds

In [None]:
# from sklearn.linear_model import LinearRegression
# reg = LinearRegression().fit(X_train, y_train)
# y_pred = reg.predict(X_test)
# print(y_pred)
# # probs_y = log.predict_proba(X_test) 

# #print(precision_score(y_test, y_pred,labels=[1,0], average='binary'))



In [None]:
# classifier = Sequential()
# #First Hidden Layer
# classifier.add(Dense(4, activation='relu', kernel_initializer='random_normal', input_dim=5))
# #Second  Hidden Layer
# classifier.add(Dense(4, activation='relu', kernel_initializer='random_normal'))
# #Output Layer
# classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))
# classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])
# classifier.fit(X_train,y_train, batch_size=10, epochs=100)

In [None]:
# y_pred = model.predict(X_test)
# y_pred = [int(round(x[0])) for x in y_pred]

# print(y_pred)
# print(precision_score(y_test, y_pred,labels=[1,0], average='binary'))