In [1]:
import lucrum as lc
import lucrum.datareader as ldr
from IPython.display import display
from sklearn.utils import resample
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
hist_price = ldr.get_data_binance(symbols="XRPUSDT", 
                                  start="5z May, 2018", 
                                  end="18 Apr, 2019", 
                                  interval="15m",
                                  timezone="Europe/Malta")

display(hist_price.head())
display(hist_price.tail())
print(hist_price.shape)

Unnamed: 0,open_time,open,high,low,close,close_time,trades,volume
0,2019-04-18 02:00:00+02:00,0.33435,0.33634,0.33411,0.33597,2019-04-18 02:14:59.999000+02:00,780,789653.1
1,2019-04-18 02:15:00+02:00,0.33597,0.336,0.33461,0.33539,2019-04-18 02:29:59.999000+02:00,572,372700.7
2,2019-04-18 02:30:00+02:00,0.33534,0.33761,0.33525,0.33679,2019-04-18 02:44:59.999000+02:00,901,611970.7
3,2019-04-18 02:45:00+02:00,0.33665,0.337,0.33589,0.33621,2019-04-18 02:59:59.999000+02:00,598,599738.8
4,2019-04-18 03:00:00+02:00,0.33626,0.3367,0.33555,0.33627,2019-04-18 03:14:59.999000+02:00,461,444363.7


Unnamed: 0,open_time,open,high,low,close,close_time,trades,volume
92,2019-04-19 01:00:00+02:00,0.33388,0.33422,0.33116,0.33416,2019-04-19 01:14:59.999000+02:00,1143,1632911.4
93,2019-04-19 01:15:00+02:00,0.33418,0.33561,0.33353,0.33496,2019-04-19 01:29:59.999000+02:00,1142,2185346.0
94,2019-04-19 01:30:00+02:00,0.33496,0.3355,0.33483,0.33487,2019-04-19 01:44:59.999000+02:00,365,435935.1
95,2019-04-19 01:45:00+02:00,0.33498,0.3354,0.33491,0.33533,2019-04-19 01:59:59.999000+02:00,262,264054.6
96,2019-04-19 02:00:00+02:00,0.33525,0.33537,0.33264,0.333,2019-04-19 02:14:59.999000+02:00,486,550946.1


(97, 8)


In [3]:
# trying to predict crossover 
from lucrum.algo import pyta
import numpy as np

In [4]:
# creating features
ta_config = {
    "ema":[("ema_3", {"timeperiod" : 3, "price" : "close"})],
    "sma":[("sma_10", {"timeperiod" : 10, "price" : "close"})]  
}

ta_columns = pyta.apply_ta(hist_price, ta_config)

In [5]:
# add outcome 

# calculating crossovers 
ema_3 = hist_price["ema_3"].shift(1)
sma_10 = hist_price["sma_10"].shift(1)
hist_price["crossing"] = (((hist_price["ema_3"] < hist_price["sma_10"]) & (ema_3 >= sma_10))
            | ((hist_price["ema_3"] > hist_price["sma_10"]) & (ema_3 <= sma_10)))

# change to 1 and 0 
hist_price["crossing"] = hist_price["crossing"].map({True: 1, False: 0})

# drop NA 
hist_price = hist_price.dropna()
display(hist_price.head())
display(hist_price.tail(100))

Unnamed: 0,open_time,open,high,low,close,close_time,trades,volume,ema_3,sma_10,crossing
9,2019-04-18 04:15:00+02:00,0.33693,0.33725,0.33606,0.3366,2019-04-18 04:29:59.999000+02:00,633,568641.7,0.336981,0.33682,0
10,2019-04-18 04:30:00+02:00,0.33652,0.33854,0.33652,0.33821,2019-04-18 04:44:59.999000+02:00,816,693216.8,0.337596,0.337044,0
11,2019-04-18 04:45:00+02:00,0.338,0.33803,0.33683,0.33735,2019-04-18 04:59:59.999000+02:00,864,694372.5,0.337473,0.33724,0
12,2019-04-18 05:00:00+02:00,0.33753,0.33917,0.33736,0.33883,2019-04-18 05:14:59.999000+02:00,792,432169.7,0.338151,0.337444,0
13,2019-04-18 05:15:00+02:00,0.33888,0.3399,0.33817,0.33966,2019-04-18 05:29:59.999000+02:00,799,438128.6,0.338906,0.337789,0


Unnamed: 0,open_time,open,high,low,close,close_time,trades,volume,ema_3,sma_10,crossing
9,2019-04-18 04:15:00+02:00,0.33693,0.33725,0.33606,0.33660,2019-04-18 04:29:59.999000+02:00,633,568641.7,0.336981,0.336820,0
10,2019-04-18 04:30:00+02:00,0.33652,0.33854,0.33652,0.33821,2019-04-18 04:44:59.999000+02:00,816,693216.8,0.337596,0.337044,0
11,2019-04-18 04:45:00+02:00,0.33800,0.33803,0.33683,0.33735,2019-04-18 04:59:59.999000+02:00,864,694372.5,0.337473,0.337240,0
12,2019-04-18 05:00:00+02:00,0.33753,0.33917,0.33736,0.33883,2019-04-18 05:14:59.999000+02:00,792,432169.7,0.338151,0.337444,0
13,2019-04-18 05:15:00+02:00,0.33888,0.33990,0.33817,0.33966,2019-04-18 05:29:59.999000+02:00,799,438128.6,0.338906,0.337789,0
14,2019-04-18 05:30:00+02:00,0.33966,0.34041,0.33908,0.34013,2019-04-18 05:44:59.999000+02:00,1044,956173.8,0.339518,0.338175,0
15,2019-04-18 05:45:00+02:00,0.34028,0.34500,0.34012,0.34400,2019-04-18 05:59:59.999000+02:00,3157,3514341.0,0.341759,0.338779,0
16,2019-04-18 06:00:00+02:00,0.34400,0.34671,0.34000,0.34292,2019-04-18 06:14:59.999000+02:00,3316,5164827.0,0.342339,0.339271,0
17,2019-04-18 06:15:00+02:00,0.34296,0.34345,0.34118,0.34287,2019-04-18 06:29:59.999000+02:00,1528,1686424.9,0.342605,0.339747,0
18,2019-04-18 06:30:00+02:00,0.34288,0.34439,0.34189,0.34234,2019-04-18 06:44:59.999000+02:00,1367,1554730.4,0.342472,0.340291,0


In [6]:
# pre-processing
crdt = hist_price[["open_time", "ema_3", "sma_10", "crossing"]]

def lag(dataframe, lag, column):
    for i in range(1, lag + 1):
        lagged = crdt[column].shift(i)
        col_name = column + '_lag_' + str(i)
        dataframe.insert(dataframe.shape[1] - 1, col_name, lagged)
        
lag(crdt, 5, "ema_3")
lag(crdt, 5, "sma_10")
display(crdt.tail(100))

Unnamed: 0,open_time,ema_3,sma_10,ema_3_lag_1,ema_3_lag_2,ema_3_lag_3,ema_3_lag_4,ema_3_lag_5,sma_10_lag_1,sma_10_lag_2,sma_10_lag_3,sma_10_lag_4,sma_10_lag_5,crossing
9,2019-04-18 04:15:00+02:00,0.336981,0.336820,,,,,,,,,,,0
10,2019-04-18 04:30:00+02:00,0.337596,0.337044,0.336981,,,,,0.336820,,,,,0
11,2019-04-18 04:45:00+02:00,0.337473,0.337240,0.337596,0.336981,,,,0.337044,0.336820,,,,0
12,2019-04-18 05:00:00+02:00,0.338151,0.337444,0.337473,0.337596,0.336981,,,0.337240,0.337044,0.336820,,,0
13,2019-04-18 05:15:00+02:00,0.338906,0.337789,0.338151,0.337473,0.337596,0.336981,,0.337444,0.337240,0.337044,0.336820,,0
14,2019-04-18 05:30:00+02:00,0.339518,0.338175,0.338906,0.338151,0.337473,0.337596,0.336981,0.337789,0.337444,0.337240,0.337044,0.336820,0
15,2019-04-18 05:45:00+02:00,0.341759,0.338779,0.339518,0.338906,0.338151,0.337473,0.337596,0.338175,0.337789,0.337444,0.337240,0.337044,0
16,2019-04-18 06:00:00+02:00,0.342339,0.339271,0.341759,0.339518,0.338906,0.338151,0.337473,0.338779,0.338175,0.337789,0.337444,0.337240,0
17,2019-04-18 06:15:00+02:00,0.342605,0.339747,0.342339,0.341759,0.339518,0.338906,0.338151,0.339271,0.338779,0.338175,0.337789,0.337444,0
18,2019-04-18 06:30:00+02:00,0.342472,0.340291,0.342605,0.342339,0.341759,0.339518,0.338906,0.339747,0.339271,0.338779,0.338175,0.337789,0


In [7]:
# # experiment 

# # calculate spread between sma 
# hist_price["sma_spread"] = (hist_price["sma_3"] - hist_price["sma_10"]).abs()

# # calculate signal 
# display(hist_price)
crdt = crdt.dropna()
test_col = crdt.columns[3:crdt.shape[1] - 1].values 
crdt[test_col] = crdt[test_col].div(crdt[test_col].sum(axis=1), axis=0)

display(crdt)

Unnamed: 0,open_time,ema_3,sma_10,ema_3_lag_1,ema_3_lag_2,ema_3_lag_3,ema_3_lag_4,ema_3_lag_5,sma_10_lag_1,sma_10_lag_2,sma_10_lag_3,sma_10_lag_4,sma_10_lag_5,crossing
14,2019-04-18 05:30:00+02:00,0.339518,0.338175,0.100403,0.100180,0.099979,0.100015,0.099833,0.100072,0.099970,0.099910,0.099852,0.099785,0
15,2019-04-18 05:45:00+02:00,0.341759,0.338779,0.100469,0.100288,0.100064,0.099864,0.099900,0.100071,0.099957,0.099855,0.099795,0.099737,0
16,2019-04-18 06:00:00+02:00,0.342339,0.339271,0.100956,0.100294,0.100113,0.099890,0.099690,0.100076,0.099897,0.099783,0.099681,0.099621,0
17,2019-04-18 06:15:00+02:00,0.342605,0.339747,0.100922,0.100750,0.100090,0.099909,0.099687,0.100017,0.099872,0.099694,0.099580,0.099478,0
18,2019-04-18 06:30:00+02:00,0.342472,0.340291,0.100799,0.100721,0.100550,0.099891,0.099711,0.099958,0.099818,0.099673,0.099496,0.099382,0
19,2019-04-18 06:45:00+02:00,0.342551,0.340894,0.100581,0.100619,0.100542,0.100371,0.099713,0.099940,0.099780,0.099640,0.099496,0.099318,0
20,2019-04-18 07:00:00+02:00,0.341716,0.341161,0.100434,0.100411,0.100450,0.100372,0.100202,0.099948,0.099771,0.099612,0.099472,0.099328,0
21,2019-04-18 07:15:00+02:00,0.341888,0.341632,0.100120,0.100365,0.100342,0.100381,0.100303,0.099958,0.099880,0.099703,0.099544,0.099404,0
22,2019-04-18 07:30:00+02:00,0.341414,0.341843,0.100115,0.100064,0.100309,0.100286,0.100325,0.100040,0.099902,0.099824,0.099647,0.099488,1
23,2019-04-18 07:45:00+02:00,0.340812,0.341898,0.099950,0.100088,0.100038,0.100283,0.100259,0.100075,0.100013,0.099876,0.099797,0.099621,0


In [8]:
print(crdt.crossing.value_counts())

0    70
1    13
Name: crossing, dtype: int64


In [9]:
# creating features 
for i in range(1, 6):
    crdt["x_" + str(i)] = crdt["sma_10_lag_"+ str(i)] - crdt["ema_3_lag_"+ str(i)] 

In [10]:
data = crdt[["open_time","x_1", "x_2", "x_3", "x_4", "x_5", "crossing"]].copy()
display(data.head())

data["crossing"] = data["crossing"].shift(-1)
data = data.dropna()
data["crossing"] = data["crossing"].astype("int8")
display(data.head())

Unnamed: 0,open_time,x_1,x_2,x_3,x_4,x_5,crossing
14,2019-04-18 05:30:00+02:00,-0.000331,-0.00021,-6.9e-05,-0.000163,-4.8e-05,0
15,2019-04-18 05:45:00+02:00,-0.000397,-0.00033,-0.000209,-6.9e-05,-0.000163,0
16,2019-04-18 06:00:00+02:00,-0.00088,-0.000397,-0.00033,-0.000209,-6.9e-05,0
17,2019-04-18 06:15:00+02:00,-0.000905,-0.000878,-0.000396,-0.000329,-0.000209,0
18,2019-04-18 06:30:00+02:00,-0.000841,-0.000903,-0.000877,-0.000395,-0.000329,0


Unnamed: 0,open_time,x_1,x_2,x_3,x_4,x_5,crossing
14,2019-04-18 05:30:00+02:00,-0.000331,-0.00021,-6.9e-05,-0.000163,-4.8e-05,0
15,2019-04-18 05:45:00+02:00,-0.000397,-0.00033,-0.000209,-6.9e-05,-0.000163,0
16,2019-04-18 06:00:00+02:00,-0.00088,-0.000397,-0.00033,-0.000209,-6.9e-05,0
17,2019-04-18 06:15:00+02:00,-0.000905,-0.000878,-0.000396,-0.000329,-0.000209,0
18,2019-04-18 06:30:00+02:00,-0.000841,-0.000903,-0.000877,-0.000395,-0.000329,0


In [11]:
print(data.crossing.value_counts()[1])

13


In [12]:
# downsampling
# Separate majority and minority classes
data_majority = data[data["crossing"]==0]
data_minority = data[data["crossing"]==1]

# Downsample majority class
df_majority_downsampled = resample(data_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=data.crossing.value_counts()[1])     # reproducible results

# Combine minority class with downsampled majority class
data_downsampled = pd.concat([df_majority_downsampled, data_minority])
 
# Display new class counts
data_downsampled.crossing.value_counts()
display(data_downsampled)

Unnamed: 0,open_time,x_1,x_2,x_3,x_4,x_5,crossing
92,2019-04-19 01:00:00+02:00,0.0002867042,0.000139,4.7e-05,3.5e-05,9.4e-05,0
20,2019-04-18 07:00:00+02:00,-0.0004858764,-0.00064,-0.000838,-0.0009,-0.000874,0
91,2019-04-19 00:45:00+02:00,0.0001388716,4.7e-05,3.5e-05,9.4e-05,0.000164,0
19,2019-04-18 06:45:00+02:00,-0.0006406443,-0.000839,-0.000901,-0.000875,-0.000394,0
39,2019-04-18 11:45:00+02:00,0.0005161209,0.000574,0.000765,0.000535,0.000249,0
85,2019-04-18 23:15:00+02:00,0.0001741156,0.000111,5.5e-05,-0.000128,-0.000142,0
61,2019-04-18 17:15:00+02:00,0.0002173237,0.00025,0.000327,0.000255,-5e-06,0
67,2019-04-18 18:45:00+02:00,-0.0002744127,-8.7e-05,0.000124,0.000164,0.000137,0
36,2019-04-18 11:00:00+02:00,0.0005323747,0.000248,-0.000104,-0.000229,-0.000203,0
51,2019-04-18 14:45:00+02:00,-3.614675e-07,3.5e-05,-1.1e-05,0.000131,0.000233,0


In [13]:

# print(y_train)
# print(y_test)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

X = data_downsampled[["x_1","x_2","x_3","x_4","x_5"]]
y = data_downsampled["crossing"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
probs_y = clf.predict_proba(X_test) 

y_pred_a = np.array([1 if x[1] >= 0.65 else 0 for x in probs_y])

print(X_test.size)

print(precision_score(y_test, y_pred, labels=[1,0], average='binary'))
print(y_pred[y_pred == 1].shape)
print(precision_score(y_test, y_pred_a, labels=[1,0], average='binary'))
print(y_pred_a[y_pred_a == 1].shape)


print(X_test.size)
print(y_pred.size)
y_pred_a = np.reshape(y_pred_a, (-1,1)) 
X_test.insert(column="outcome", loc=0,value = y_pred_a)
X_test.insert(column="true", loc=0,value = y_test)

X_test.sort_index(inplace=True)
print(X_test)



40
1.0
(3,)
1.0
(1,)
40
8
    true  outcome       x_1       x_2       x_3       x_4       x_5
18     0        0 -0.000841 -0.000903 -0.000877 -0.000395 -0.000329
19     0        0 -0.000641 -0.000839 -0.000901 -0.000875 -0.000394
21     1        0 -0.000162 -0.000486 -0.000639 -0.000837 -0.000899
30     1        1  0.000116  0.000204  0.000168  0.000314  0.000530
36     0        0  0.000532  0.000248 -0.000104 -0.000229 -0.000203
56     1        0 -0.000170 -0.000282 -0.000351 -0.000062 -0.000118
64     1        0  0.000164  0.000137  0.000157  0.000217  0.000251
91     0        0  0.000139  0.000047  0.000035  0.000094  0.000164


  from numpy.core.umath_tests import inner1d
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [15]:
print(crdt.iloc[33257])

IndexError: single positional indexer is out-of-bounds

In [None]:
# from sklearn.linear_model import LinearRegression
# reg = LinearRegression().fit(X_train, y_train)
# y_pred = reg.predict(X_test)
# print(y_pred)
# # probs_y = log.predict_proba(X_test) 

# #print(precision_score(y_test, y_pred,labels=[1,0], average='binary'))



In [None]:
# classifier = Sequential()
# #First Hidden Layer
# classifier.add(Dense(4, activation='relu', kernel_initializer='random_normal', input_dim=5))
# #Second  Hidden Layer
# classifier.add(Dense(4, activation='relu', kernel_initializer='random_normal'))
# #Output Layer
# classifier.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))
# classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])
# classifier.fit(X_train,y_train, batch_size=10, epochs=100)

In [None]:
# y_pred = model.predict(X_test)
# y_pred = [int(round(x[0])) for x in y_pred]

# print(y_pred)
# print(precision_score(y_test, y_pred,labels=[1,0], average='binary'))