In [1]:
# Data preparation for Ph.D thesis
# @author: Andres L. Suarez-Cetrulo
%config IPCompleter.greedy=True

In [2]:
# Imports
import pandas as pd
import numpy as np
import datetime

# Using TALib abstract API to create a dictionary of technical indicators to iterate later.
from talib import abstract

In [3]:
def get_datediff(start, end): 
    """ 
    Obtains amount of days between starting and end date. 
    @param: start: start date
    @param: end: end date
    @return: amount of days between both input dates
    """
    d1 = datetime.datetime.strptime(start.replace('-', ':')+':00:00:00', '%Y:%m:%d:%H:%M:%S')
    d2 = datetime.datetime.strptime(end.replace('-', ':')+':23:59:59', '%Y:%m:%d:%H:%M:%S')
    return round((d2 - d1).total_seconds() / 3600 / 24, 0)

In [22]:
# Declaring date ranges

# test and train
# start_date='2018-07-01'
# end_date='2018-09-30'
#start_date='2018-07-01'
#end_date='2019-06-30'

# devset
#start_date='2018-05-01'
#end_date='2018-06-30'

# test predictions
start_date='2019-07-01'
end_date='2019-07-31'

# Crypto currencies to take into account 
CURRENCIES_AND_SYMBOLS = {
    'BITCOIN': 'BTC',
    'DASH': 'DASH',
    'ETHEREUM': 'ETH',
    'LITECOIN': 'LTC',
    'MONERO': 'XMR',
    'RIPPLE': 'XRP'
}
SELECTED = 5 # Selected cryptocurrency, from dictionary above (0-INDEXED)

# Paths
DATA_PATH='C:\\Users\\suare\\OneDrive\\Escritorio\\data\\out\\'
# SOURCE_PATH=DATA_PATH+'crypto_compare\\'
SOURCE_PATH='C:\\Users\\suare\\MEGA\\PhD\\1 Code and data\\data\\raw\\crypto_compare\\'
RESULT_PATH=DATA_PATH+'analysis\\CRYPTO\\'

# select crypto currency to be processed
CURRENCY=list(CURRENCIES_AND_SYMBOLS.keys())[SELECTED]
SYMBOL=CURRENCIES_AND_SYMBOLS[CURRENCY]
INPUT_FILE_EXTENSION='.csv.gz'
print(SYMBOL)

# List of dates for files to be loaded ( +1 in periods is extra day in next month so the labels are complete)
FILES = pd.date_range(start_date, periods=(get_datediff(start_date, end_date) + 1), freq='D')
RESULT_FILEPATH_PREPROCESSED=RESULT_PATH+CURRENCY+'_['+str(FILES[0])[:10]+'_to_'+str(FILES[len(FILES)-1])[:10]+'].csv'
RESULT_FILEPATH_PROCESSED=RESULT_PATH+CURRENCY+'_('+start_date+'_to_'+end_date+')_indicators.csv'

# Creating a dictionary of technical indicators using TALib abstract API
indicator = {}

# Add as many indicators as necessary: see all indicators in https://mrjbq7.github.io/ta-lib/
indicator['sma']=abstract.Function('sma') # Simple Moving Average
indicator['ema']=abstract.Function('ema') # Exponential Moving Average
indicator['mom']=abstract.Function('mom') # Momentum
indicator['stoch']=abstract.Function('stoch') # Stochastic (returns K and D)
indicator['macd']=abstract.Function('macd') # Moving Average Convergence/Divergence
indicator['rsi']=abstract.Function('rsi') # Relative Strength Index
indicator['willr']=abstract.Function('willr') # Williams' %R
indicator['adosc']=abstract.Function('adosc') # Chaikin A/D Oscillator
indicator['cci']=abstract.Function('cci') # Commodity Channel Index

# other indicators
#indicator['adx']=abstract.Function('adx') # Average Directional Movement Index
#indicator['aroon']=abstract.Function('aroon') # Aroon
#indicator['bbands']=abstract.Function('bbands') # Bollinger Bands
#indicator['obv']=abstract.Function('obv') # On Balance Volume
#indicator['trima']=abstract.Function('trima') # Triangular Moving Average
#indicator['roc']=abstract.Function('roc') # Rate of change : ((price/prevPrice)-1)*100
#indicator['rocr']=abstract.Function('rocr') # Rate of change ratio: (price/prevPrice)
#indicator['stochf']=abstract.Function('stochf') # Stochastic fast (returns K and D)
#indicator['adosc']=abstract.Function('adosc') # Chaikin A/D Oscillator
#indicator['medprice']=abstract.Function('medprice') # Median Price
#indicator['typprice']=abstract.Function('typprice') # Typical Price
#indicator['wclprice']=abstract.Function('wclprice') # Weighted Close Price
#indicator['atr']=abstract.Function('atr') # Average True Range
#indicator['macdfix']=abstract.Function('macdfix') # #Moving Average Convergence/Divergence Fix 12/26
#indicator['mfi']=abstract.Function('mfi') # Money Flow Index


XRP


In [23]:
# In the current datasets:

##############################
#
# “Volume From” and “To” are the volumes of the respective currency pair.
#
# For example, for the BTC-USD pair  
#
# “Volume From” is the number of Bitcoins traded for US dollars 
# “Volume To” is the number of dollars traded (for the period) for Bitcoins.
#
#
# In summary:
# -----------
# "volumeto" means the volume in the currency that is being traded
# "volumefrom" means the volume in the base currency that things are traded into.
#
# As all our cryptocurrencies are Currency/USD, then we use volumeto (volume in dollars) as volume
# This is consistent with the other columns ohcl, that are in dollars also.
#
##############################

# picking volume_to as actual volume
time_column='timestamp'
columns = ['close','high','low','open','time','volumefrom','volume',time_column] 

In [24]:
# Create list of DFs
dataframes=[]
aux_df= pd.DataFrame(columns=columns)

# Iterating through files
for file in FILES:
    file = str(file)[:10] # exclude time (00:00:00)
    file_path = CURRENCY+'/'+SYMBOL+'_'+file+INPUT_FILE_EXTENSION
    csv_path=SOURCE_PATH+file_path
    new_df=pd.read_csv(csv_path, compression='gzip', sep=';', \
                       parse_dates=True,infer_datetime_format=True, \
                       index_col = 0)
    new_df.columns = columns
    #dataframes.append(new_df)
    
    # BEGIN-NEW!
    # Removing from overall dataframe all rows captured the next day 
    #  This works as a drop_duplicates, but it will also apply corrections in prices/volumes 
    dups=pd.merge(new_df.reset_index(drop=True),aux_df.reset_index(drop=True), on=[time_column])[time_column]
    if len(dups)>1: aux_df=aux_df[~aux_df.timestamp.isin(dups)]
    new_df.index=new_df[time_column]
    new_df.index=pd.to_datetime(new_df.index)
    aux_df.index=aux_df[time_column]
    aux_df.index=pd.to_datetime(aux_df.index)
    dataframes=[]
    dataframes.append(aux_df) # only add clean DF
    dataframes.append(new_df) # add new data
    aux_df=pd.concat(dataframes) # update dataframe to check for dups
    # END-NEW!

# concat all dataframes in a single one
#df=pd.concat(dataframes) # this shouldn't be used anymore as it would include the dups removed above
#df=df.drop_duplicates(['close','high','low','open','time','volumefrom','volume','timestamp']) # 21087 rows without duplicates
df = aux_df # dropping duplicates and selecting second occurrence of a row if there is a correction (done in the loop above)


# Index dataframe by its actual readable timestamp
df.index=df['timestamp']
df.index=pd.to_datetime(df.index)
df=df.sort_index()

# Save raw concatenated file in analysis
df.to_csv(RESULT_FILEPATH_PREPROCESSED, sep=';', encoding='utf-8', index = True)
print(df.head())

                      close    high     low    open        time  volumefrom  \
timestamp                                                                     
2019-06-30 11:10:00  0.4104  0.4107  0.4103  0.4107  1561893000    24719.80   
2019-06-30 11:11:00  0.4103  0.4104  0.4102  0.4104  1561893060     9990.55   
2019-06-30 11:12:00  0.4110  0.4111  0.4103  0.4103  1561893120    57557.76   
2019-06-30 11:13:00  0.4107  0.4111  0.4105  0.4110  1561893180    38364.98   
2019-06-30 11:14:00  0.4109  0.4109  0.4107  0.4107  1561893240    13912.16   

                       volume            timestamp  
timestamp                                           
2019-06-30 11:10:00  10156.44  2019-06-30 11:10:00  
2019-06-30 11:11:00   4102.09  2019-06-30 11:11:00  
2019-06-30 11:12:00  23677.55  2019-06-30 11:12:00  
2019-06-30 11:13:00  15749.10  2019-06-30 11:13:00  
2019-06-30 11:14:00   5713.67  2019-06-30 11:14:00  


In [25]:
print(start_date)
print(end_date)
print(get_datediff(start_date, end_date)*60*24)
print(start_date+' 00:00:00')
start_time = datetime.datetime.strptime(start_date+' 00:00:00', '%Y-%m-%d %H:%M:%S')
end_time = datetime.datetime.strptime(end_date+' 23:59:59', '%Y-%m-%d %H:%M:%S')
print(len(df.reset_index(drop=True)[(df.reset_index(drop=True)['timestamp'] >= start_date+' 00:00:00') & (df.reset_index(drop=True)['timestamp'] <= end_date+" 23:59:59")]))
# Testing the dataset (TO-DO: use a proper unit testing library)

2019-07-01
2019-07-31
44640.0
2019-07-01 00:00:00
44640


In [26]:
#First check to see if there are any gaps or duplicates.
#The number of rows must be diff*60*24 in df[(df['timestamp'] > start_date) & (df['timestamp'] <= end_date+" 23:59:59")
test1=(get_datediff(start_date, end_date)*60*24) == len(df.reset_index(drop=True)[(df.reset_index(drop=True)['timestamp'] >= start_date+' 00:00:00') & (df.reset_index(drop=True)['timestamp'] <= end_date+" 23:59:59")])
print ('TEST 1: '+ str(test1) + ' - The amount of records is correct?')

# Second check to see if there are any gaps
ohlc_dict = {'open':'first','high':'max','low':'min','close':'last','volume':'sum'}
test=df.resample('1min').agg(ohlc_dict) # This will create rows with NaN values if there are any minutes missing in the timerange
test[test['open'].isnull()] # only displays gaps. if empty, happy days!
test2=len(test[test['open'].isnull()]) == 0
print ('TEST 2: '+str(test2)+' - Is the dataset complete? (no gaps)')

test3=len(df)==len(test)
print ('TEST 3: '+str(test3)+' - Are the resampled and the original dataset of the same size? (so # rows is ok)')

test4= df.reset_index(drop=True).groupby(['timestamp']).count()['high']
test4=(len(test4[test4 > 1])==0)
print('TEST 4: '+str(test4)+' - Is the dataset free from any duplicates?')

# If there are any dups (Test 4 == False), check the given row (change timestamp below)
# example=df[(df['timestamp'] == '2018-07-01 20:33:00')]
# example

print('')
print('ONLY CONTINUE IF ALL ARE TRUE!!')
print('--------------------------------')
if not (test1 and test2 and test3 and test4):
    df = [] 
    indicators = []
    print("OOOOOOOPS!! Your next thing won't work! :-)")
    print("Fix this mess first!")
else: print("All true. Please, continue. :-)")

#If there were any gaps (TEST 2 = False), then fill them:
#test['volume'] = test['volume'].fillna(0) 
#test['close'] = test['close'].ffill()     
#test['open'] = test['open'].fillna(test['close']) 
#test['low'] = test['low'].fillna(test['close']) 
#test['high'] = test['high'].fillna(test['close'])
#df = test

TEST 1: True - The amount of records is correct?
TEST 2: True - Is the dataset complete? (no gaps)
TEST 3: True - Are the resampled and the original dataset of the same size? (so # rows is ok)
TEST 4: True - Is the dataset free from any duplicates?

ONLY CONTINUE IF ALL ARE TRUE!!
--------------------------------
All true. Please, continue. :-)


In [27]:
ohlc_dict = {'open':'first','high':'max','low':'min','close':'last','volume':'sum'}

for resampling_lvl in ['1min','5min','10min','15min','30min']:
    aux = df.resample(resampling_lvl).agg(ohlc_dict)
    aux.reset_index(inplace=True)
    aux.rename(columns = {'timestamp':'datetime'}, inplace = True)
    print(aux)
    aux.to_csv(RESULT_FILEPATH_PREPROCESSED.replace('.csv','')+'_'+resampling_lvl+'.csv', sep=';', encoding='utf-8', index = False)


                 datetime    open    high     low   close     volume
0     2019-06-30 11:10:00  0.4107  0.4107  0.4103  0.4104   10156.44
1     2019-06-30 11:11:00  0.4104  0.4104  0.4102  0.4103    4102.09
2     2019-06-30 11:12:00  0.4103  0.4111  0.4103  0.4110   23677.55
3     2019-06-30 11:13:00  0.4110  0.4111  0.4105  0.4107   15749.10
4     2019-06-30 11:14:00  0.4107  0.4109  0.4107  0.4109    5713.67
5     2019-06-30 11:15:00  0.4109  0.4112  0.4108  0.4112   42070.60
6     2019-06-30 11:16:00  0.4112  0.4113  0.4109  0.4111   14370.80
7     2019-06-30 11:17:00  0.4111  0.4112  0.4110  0.4110    2586.49
8     2019-06-30 11:18:00  0.4110  0.4111  0.4109  0.4109   13862.16
9     2019-06-30 11:19:00  0.4109  0.4111  0.4108  0.4108    3322.67
10    2019-06-30 11:20:00  0.4108  0.4109  0.4104  0.4104   46934.70
11    2019-06-30 11:21:00  0.4104  0.4108  0.4102  0.4103   27684.38
12    2019-06-30 11:22:00  0.4103  0.4105  0.4101  0.4105   15894.33
13    2019-06-30 11:23:00  0.4105 

                datetime    open    high     low   close      volume
0    2019-06-30 11:00:00  0.4107  0.4111  0.4102  0.4109    59398.85
1    2019-06-30 11:15:00  0.4109  0.4116  0.4101  0.4113   324275.63
2    2019-06-30 11:30:00  0.4113  0.4114  0.4086  0.4104   257730.67
3    2019-06-30 11:45:00  0.4104  0.4118  0.4101  0.4106   380329.56
4    2019-06-30 12:00:00  0.4106  0.4117  0.4090  0.4115   348415.71
5    2019-06-30 12:15:00  0.4115  0.4115  0.4099  0.4104   138438.60
6    2019-06-30 12:30:00  0.4104  0.4117  0.4101  0.4104   129268.53
7    2019-06-30 12:45:00  0.4104  0.4117  0.4103  0.4108   176306.16
8    2019-06-30 13:00:00  0.4108  0.4116  0.4101  0.4110   125589.84
9    2019-06-30 13:15:00  0.4110  0.4111  0.4078  0.4079   327561.11
10   2019-06-30 13:30:00  0.4079  0.4098  0.4078  0.4094   247467.30
11   2019-06-30 13:45:00  0.4094  0.4094  0.4002  0.4005  2339029.97
12   2019-06-30 14:00:00  0.4005  0.4021  0.3987  0.4010  1352395.81
13   2019-06-30 14:15:00  0.4010  

In [12]:
RESULT_FILEPATH_PREPROCESSED

'C:\\Users\\suare\\OneDrive\\Escritorio\\data\\out\\analysis\\CRYPTO\\RIPPLE_[2019-06-01_to_2019-07-01].csv'

In [10]:
# all the numbers here and below assume a default time period for ta params of 10 mins averages
# change the other numbers (params of 5min and 20min and theremoval of 20 first mins of the day below) if the mins number is changed.
# in that case change as well the time tag '_10' with the corresponding one.
default_timerange=10
# Set extra timeranges for moving averages
extra_timeranges=[default_timerange/2, default_timerange, default_timerange*2, default_timerange*3, default_timerange*6]

# ###########################################
# Iterate and run list of indicators selected
# All of them produced for 25 prior mins of data 
# ###########################################
for key in list(indicator.keys()):
    # For indicators that only return one column (this will need to be modified depending on the selection of indicators)
    if key in ['ema','sma','trima']:
        for timerange in extra_timeranges:
            df[key+'_'+str(int(timerange))]=indicator[key](df, timeperiod=timerange) 
    elif key not in ['bbands','aroon','stoch','macd','macdfix','stochf']:
        df[key+'_'+str(int(default_timerange))]=indicator[key](df, timeperiod=(default_timerange))
    # Otherwise check the list of columns and return all
    else: 
        key_output=indicator[key](df, timeperiod=(default_timerange)) #, price='close')
        for j in range(0,len(list(key_output.columns))):
            df[key+'_'+key_output.columns[int(j)]]=key_output[key_output.columns[j]]
            
df.head()

Unnamed: 0_level_0,close,high,low,open,time,volumefrom,volume,timestamp,sma_5,sma_10,...,mom_10,stoch_slowk,stoch_slowd,macd_macd,macd_macdsignal,macd_macdhist,rsi_10,willr_10,adosc_10,cci_10
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-06-30 11:10:00,11559.43,11559.43,11549.69,11549.95,1561893000,15.89,183479.03,2019-06-30 11:10:00,,,...,,,,,,,,,,
2019-06-30 11:11:00,11571.06,11571.06,11558.96,11559.43,1561893060,76.36,884602.67,2019-06-30 11:11:00,,,...,,,,,,,,,,
2019-06-30 11:12:00,11580.12,11586.99,11570.81,11571.06,1561893120,44.75,517760.27,2019-06-30 11:12:00,,,...,,,,,,,,,,
2019-06-30 11:13:00,11571.63,11581.74,11568.48,11580.12,1561893180,11.68,135232.23,2019-06-30 11:13:00,,,...,,,,,,,,,,
2019-06-30 11:14:00,11566.94,11571.63,11566.94,11571.63,1561893240,5.766,66653.29,2019-06-30 11:14:00,11569.836,,...,,,,,,,,,,


In [11]:
def set_label(row):
    if row['close_t+1'] > row['close']: return 1
    else: return 0

# One minute ahead closing price
df['close_t+1']=df['close'].shift(-1)

# Creating label/y to be predicted / independent (predicted) feature 'y' 
df['label'] = df.apply(set_label, axis=1)
df.head()

Unnamed: 0_level_0,close,high,low,open,time,volumefrom,volume,timestamp,sma_5,sma_10,...,stoch_slowd,macd_macd,macd_macdsignal,macd_macdhist,rsi_10,willr_10,adosc_10,cci_10,close_t+1,label
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-06-30 11:10:00,11559.43,11559.43,11549.69,11549.95,1561893000,15.89,183479.03,2019-06-30 11:10:00,,,...,,,,,,,,,11571.06,1
2019-06-30 11:11:00,11571.06,11571.06,11558.96,11559.43,1561893060,76.36,884602.67,2019-06-30 11:11:00,,,...,,,,,,,,,11580.12,1
2019-06-30 11:12:00,11580.12,11586.99,11570.81,11571.06,1561893120,44.75,517760.27,2019-06-30 11:12:00,,,...,,,,,,,,,11571.63,0
2019-06-30 11:13:00,11571.63,11581.74,11568.48,11580.12,1561893180,11.68,135232.23,2019-06-30 11:13:00,,,...,,,,,,,,,11566.94,0
2019-06-30 11:14:00,11566.94,11571.63,11566.94,11571.63,1561893240,5.766,66653.29,2019-06-30 11:14:00,11569.836,,...,,,,,,,,,11565.92,0


In [12]:
# Filter out irrelevant dates once the dataset is complete.
df=df[(df['timestamp'] > start_date) & (df['timestamp'] <= end_date+" 23:59:59")]

In [13]:
# Select columns for output
columns_selected=['timestamp',
                 'rsi_10','willr_10','adosc_10','macd_macd' ,'cci_10','mom_10',
                 'stoch_slowk','stoch_slowd',
                 'ema_5','ema_10','ema_20','ema_30','ema_60',
                 'sma_5','sma_10','sma_20','sma_30','sma_60',
                 'label']

# Export processed data
output = pd.DataFrame(df, columns=columns_selected)
output.to_csv(RESULT_FILEPATH_PROCESSED, sep=';', encoding='utf-8', index = False)

# Printing classes distributions
print("In crypto: "+CURRENCY)
label_zero=len(df[df['label'] == 0])
label_one=len(df[df['label'] == 1])
#print(label_zero)
#print(label_one)
print("0 in "+str(float(label_zero)/(label_one+label_zero))+"%")
print("1 in "+str(float(label_one)/(label_one+label_zero))+"%")

In crypto: BITCOIN
0 in 0.5022849462365592%
1 in 0.4977150537634409%


In [14]:
# Class distribution 2 months before Q3 2018

#In crypto: LITECOIN
#0 in 0.5770947176684882%
#1 in 0.42290528233151187%
##
#In crypto: BITCOIN
#0 in 0.5248178506375227%
#1 in 0.4751821493624772%

#In crypto: DASH
#0 in 0.5934767759562841%
#1 in 0.4065232240437158%

# 
#In crypto: ETHEREUM
#0 in 0.511976320582878%
#1 in 0.48802367941712205%
# 
#In crypto: RIPPLE
#0 in 0.5984631147540984%
#1 in 0.40153688524590164%
# 
#In crypto: MONERO
#0 in 0.5995901639344262%
#1 in 0.4004098360655738%

# ########################################

# Class distribution in Q3 2018 to Q2 2019 (included)
#In crypto: MONERO
#0 in 0.7447983257229832%
#1 in 0.25520167427701673%
# 
#In crypto: RIPPLE
#0 in 0.6268797564687976%
#1 in 0.37312024353120243%
# 
#In crypto: ETHEREUM
#0 in 0.5466057838660578%
#1 in 0.45339421613394215%
# 
#In crypto: DASH
#0 in 0.7087347792998477%
#1 in 0.2912652207001522%
# 
#In crypto: BITCOIN
#0 in 0.4984931506849315%
#1 in 0.5015068493150685%
# 
#In crypto: LITECOIN
#0 in 0.6055745814307458%
#1 in 0.39442541856925417%


# Class distribution in Q3 2018
# In crypto: MONERO
# 0 in 0.689779589372%
# 1 in 0.310220410628%
# 
# In crypto: RIPPLE
# 0 in 0.641070350242%
# 1 in 0.358929649758%
# 
# In crypto: ETHEREUM
# 0 in 0.535235507246%
# 1 in 0.464764492754%
# 
# In crypto: DASH
# 0 in 0.639447463768%
# 1 in 0.360552536232%
# 
# In crypto: BITCOIN
# 0 in 0.503985507246%
# 1 in 0.496014492754%
# 
# In crypto: LITECOIN
# 0 in 0.62444897343%
# 1 in 0.37555102657%

# class distribution in July 2019

In crypto: LITECOIN
0 in 0.5386200716845878%
1 in 0.4613799283154122%

In crypto: RIPPLE
0 in 0.613978494623656%
1 in 0.3860215053763441%

In crypto: MONERO
0 in 0.6600806451612903%
1 in 0.33991935483870966%

In crypto: ETHEREUM
0 in 0.5233870967741936%
1 in 0.47661290322580646%

In crypto: DASH
0 in 0.6907482078853047%
1 in 0.30925179211469533%

In crypto: BITCOIN
0 in 0.5022849462365592%
1 in 0.4977150537634409%

SyntaxError: invalid syntax (<ipython-input-14-24113922dfae>, line 83)