In [10]:
# Data preparation for Ph.D thesis
# @author: Andres L. Suarez-Cetrulo
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime

#Alpha Vantage API to download 15 days of minute data (only if required)
from alpha_vantage.timeseries import TimeSeries
# SECURITY: credentials should not be stored in a notebook. The key is taken
# from the ALPHAVANTAGE_API_KEY environment variable when it is set; the literal
# is kept only as a backward-compatible fallback so the cell still runs on
# machines without that variable. TODO: rotate this key and delete the fallback.
import os
apikey=os.environ.get('ALPHAVANTAGE_API_KEY', 'E8HE93TKWNLCACEU')
ts = TimeSeries(key=apikey)

# Using TALib abstract API to create a dictionary of technical indicators to iterate later.
from talib import abstract

In [11]:
# Date tags of the raw SPY CSV files to load; each tag is spliced into the file
# name "SPY_<tag>.csv" below.
files=['2017-10-01','2017-10-07','2017-10-16',
       '2017-10-21','2017-10-28','2017-11-06','2017-11-15','2017-11-18',
       '2017-12-03','2017-12-09','2017-12-16','2017-12-25',
       '2017-12-31','2018-01-09','2018-01-15','2018-01-16','2018-01-22',
       '2018-01-30','2018-02-01','2018-02-03','2018-02-10','2018-02-17',
       '2018-02-18','2018-02-24','2018-02-25','2018-03-03','2018-03-04',
       '2018-03-10','2018-03-11','2018-03-17','2018-03-18','2018-03-24',
       '2018-03-25','2018-03-31','2018-04-01']
files.insert(8, '2017-11-25')  # keeps the original order: ..., '2017-11-18', '2017-11-25', '2017-12-03', ...

# Directory that holds the raw per-date CSV files (kept in one place so it is
# easy to repoint; the value is the same absolute path used before).
raw_dir="/home/cetrulin/Desktop/Andres/data/raw/google_finance/S&P500/spy_eft/"

# Read every file into its own DataFrame with a uniform six-column layout.
# `infer_datetime_format` was removed: it is deprecated since pandas 2.0 and
# only ever affected parsing speed. `parse_dates=True` is kept as-is; with no
# index column it only targets the index, so 'date' stays text
# (later cells slice it with `.str[...]`, so it must remain a string).
dataframes=[]
for file in files:
    csv_path=raw_dir+"SPY_"+str(file)+".csv"
    new_df=pd.read_csv(csv_path, sep=';', parse_dates=True)
    new_df.columns = ['date','open','high','low','close','volume']
    dataframes.append(new_df)
    #print(csv_path) # print paths added to list of dataframes
#len(dataframes) # check length of list of DFs

# Stack the per-file frames into a single table.
# (author's note at the time of writing: 57375 rows until 2017-11-18)
df = pd.concat(dataframes)

# Rows repeated across files are collapsed; a row counts as a duplicate only
# when all six fields match.
# (author's note at the time of writing: 21087 rows without duplicates)
df = df.drop_duplicates(subset=['date','open','high','low','close','volume'])
#df

# Persist the raw concatenated table; the file name records the first and the
# last date tag that were loaded.
raw_concat_path = '/home/cetrulin/Desktop/Andres/data/analysis/S&P500/SPY_Q1_2018_[{}_to_{}].csv'.format(files[0], files[-1])
df.to_csv(raw_concat_path, sep=';', encoding='utf-8')

# df.drop removes the selected columns (axis=1) or rows (axis=0)
#df = df.drop(df.columns[[0]], 1)
#df = df.drop('symbol', 1)

In [12]:
# Columns that the (currently disabled) lagging step below would expand into
# lagged copies.
timeseries=['low','close','open','high','volume']
# Number of values kept per series, counting the current one:
# e.g. 1 -> t / 2 -> t,t-1 / 3 -> t,t-1,t-2 / 4 -> t,t-1,t-2,t-3
length = 4

# The triple-quoted block below is a plain string expression, not executed code:
# it disables the lagging loop, and because it is the last expression in the
# cell its text is what the cell displays as output.
'''
# Add lagged times
for column in timeseries:
    #df[column+'_t']=df[column]
    for i in range(1,length):
        df[column+'_t-'+str(i)]=df[column].shift(i) #it could also be sorted and group by if needed
    #del drops the delected df column
    #del df[column]
'''

"\n# Add lagged times\nfor column in timeseries:\n    #df[column+'_t']=df[column]\n    for i in range(1,length):\n        df[column+'_t-'+str(i)]=df[column].shift(i) #it could also be sorted and group by if needed\n    #del drops the delected df column\n    #del df[column]\n"

In [13]:
# Lookup table of TA-Lib indicator functions, keyed by their short name and
# built through the TA-Lib abstract API.
# Add as many indicators as necessary: see all indicators in https://mrjbq7.github.io/ta-lib/
selected_indicators = (
    'sma',    # Simple Moving Average
    'ema',    # Exponential Moving Average
    'mom',    # Momentum
    'stoch',  # Stochastic (returns K and D)
    'macd',   # Moving Average Convergence/Divergence
    'rsi',    # Relative Strength Index
    'willr',  # Williams' %R
    'adosc',  # Chaikin A/D Oscillator
    'cci',    # Commodity Channel Index
)
indicator = {name: abstract.Function(name) for name in selected_indicators}

# --- --- 

# extra columns
#indicator['adx']=abstract.Function('adx') # Average Directional Movement Index
#indicator['aroon']=abstract.Function('aroon') # Aroon
#indicator['bbands']=abstract.Function('bbands') # Bollinger Bands
#indicator['obv']=abstract.Function('obv') # On Balance Volume
# EXTENDED
#indicator['trima']=abstract.Function('trima') # Triangular Moving Average
#indicator['roc']=abstract.Function('roc') # Rate of change : ((price/prevPrice)-1)*100
#indicator['rocr']=abstract.Function('rocr') # Rate of change ratio: (price/prevPrice)
#indicator['stochf']=abstract.Function('stochf') # Stochastic fast (returns K and D)
#indicator['adosc']=abstract.Function('adosc') # Chaikin A/D Oscillator
#indicator['medprice']=abstract.Function('medprice') # Median Price
#indicator['typprice']=abstract.Function('typprice') # Typical Price
#indicator['wclprice']=abstract.Function('wclprice') # Weighted Close Price
#indicator['atr']=abstract.Function('atr') # Average True Range
#indicator['macdfix']=abstract.Function('macdfix') # #Moving Average Convergence/Divergence Fix 12/26
#indicator['mfi']=abstract.Function('mfi') # Money Flow Index

# Show the descriptions
#for ind in list(indicator.values()):
#    print('===============================')
#    print(ind)
#    print('===============================')
#    print()


In [14]:
# Add SMA, EMA and TRIMA for extra times
#df['sma_5']=indicator['sma'](df, timeperiod=5) 
#df['ema_5']=indicator['ema'](df, timeperiod=5)
#df['trima_5']=indicator['trima'](df, timeperiod=5) 
#df['sma_20']=indicator['sma'](df, timeperiod=20) 
#df['ema_20']=indicator['ema'](df, timeperiod=20) 
#df['trima_20']=indicator['trima'](df, timeperiod=20) 

In [15]:
# Also keep in the dataset the one below
#opening
#closing
#highest
#lowest

In [16]:
# Not entered as it didn't work at the first try
#df['mom']=indicator['mom'](df['close'], timeperiod=10) # default
# Momentum extra
#df['mom_op']=indicator['mom'](df['open'], timeperiod=10)
#df['mom_hi']=indicator['mom'](df['high'], timeperiod=10) 
#df['mom_lo']=indicator['mom'](df['low'], timeperiod=10) 

#df=df.loc[(df['date'] >= '2018-01-01 00:00:00')]
# UNCOMMENT IF APPLYING ONLY Q1. TO-DO

# df.loc[(df['date'] >= '2018-03-10 00:00:00')] exploration of start of summertime in the data

In [17]:

# Every window below derives from one base period (10 rows, i.e. 10 minutes on
# the 1-minute data). If it is changed, also revisit the half/double periods
# used for the moving averages, the removal of the first minutes of each day in
# the filtering cell further down, and the '_10' style column suffixes.
default_timerange=10

# ###########################################
# Iterate and run list of indicators selected
# ###########################################
for key in list(indicator.keys()):
    # Moving averages: one column each for half, base and double the period.
    # Floor division keeps the period an int on Python 3 as well, so the column
    # is named e.g. 'ema_5' (not 'ema_5.0') and TA-Lib receives an integer.
    if key in ['ema','sma','trima']:
        for period in (default_timerange//2, default_timerange, default_timerange*2):
            df[key+'_'+str(period)]=indicator[key](df, timeperiod=period)
    # Remaining single-output indicators: one column at the base period
    # (this list will need to be modified depending on the selection of indicators).
    elif key not in ['bbands','aroon','stoch','macd','macdfix','stochf']:
        df[key+'_'+str(default_timerange)]=indicator[key](df, timeperiod=(default_timerange))
    # Multi-output indicators: add one column per output, named '<key>_<output>'.
    # NOTE(review): stoch/macd do not appear to take a `timeperiod` argument, so
    # they presumably run with their TA-Lib defaults - confirm.
    else:
        key_output=indicator[key](df, timeperiod=(default_timerange)) #, price='close')
        # Select outputs by column name; the previous integer-label lookup
        # (`key_output[[j]]`) raises KeyError on current pandas because the
        # output columns are named, not numbered.
        for output_name in key_output.columns:
            df[key+'_'+output_name]=key_output[output_name]

df

Unnamed: 0,date,open,high,low,close,volume,ema_5,ema_10,ema_20,rsi_10,...,macd_macd,macd_macdsignal,macd_macdhist,cci_10,mom_10,stoch_slowk,stoch_slowd,sma_5,sma_10,sma_20
0,2017-09-11 13:30:00,248.040,248.050,248.035,248.040,195320.0,,,,,...,,,,,,,,,,
1,2017-09-11 13:31:00,248.100,248.170,248.030,248.040,191201.0,,,,,...,,,,,,,,,,
2,2017-09-11 13:32:00,248.160,248.180,248.090,248.110,76710.0,,,,,...,,,,,,,,,,
3,2017-09-11 13:33:00,248.220,248.240,248.150,248.160,151905.0,,,,,...,,,,,,,,,,
4,2017-09-11 13:34:00,248.310,248.350,248.230,248.240,168466.0,248.118000,,,,...,,,,,,,,248.1180,,
5,2017-09-11 13:35:00,248.340,248.340,248.280,248.300,85173.0,248.178667,,,,...,,,,,,,,248.1700,,
6,2017-09-11 13:36:00,248.300,248.390,248.300,248.350,104646.0,248.235778,,,,...,,,,,,,,248.2320,,
7,2017-09-11 13:37:00,248.300,248.340,248.280,248.295,60156.0,248.255519,,,,...,,,,,,,,248.2690,,
8,2017-09-11 13:38:00,248.300,248.350,248.290,248.310,55077.0,248.273679,,,,...,,,,,,65.694444,73.912037,248.2990,,
9,2017-09-11 13:39:00,248.370,248.390,248.280,248.310,55475.0,248.285786,248.215500,,,...,,,,65.173116,,45.896465,62.914562,248.3130,248.2155,


In [18]:
# Convert timestamp from GMT to US time (not working, left disabled)
#import pytz
#eastern = pytz.timezone('US/Eastern')
#df['date']=df['date'].tz_localize(pytz.utc).tz_convert(eastern)
#print(df)

# Creating label/y to be predicted

# The value to predict is built from the closing price of the NEXT row
# (one step ahead); the ten-steps-ahead variant is kept below, disabled.
# shift(-1) leaves NaN in the last row, which has no following row.
#df['close_t+10']=df['close'].shift(-10)
df['close_t+1']=df['close'].shift(-1)

def func(row):
    """Binary direction label for one row.

    Returns 1 when row['close_t+1'] is strictly greater than row['close'],
    and 0 otherwise (equal, lower, or not comparable such as NaN).
    """
    next_close = row['close_t+1']
    current_close = row['close']
    return 1 if next_close > current_close else 0

# Label: 1 when the next close is strictly above the current close, else 0.
# Vectorized equivalent of applying `func` row by row (df.apply(func, axis=1)):
# a NaN on either side compares False and therefore yields 0, exactly as `func`
# does, but without a Python-level call per row.
df['label'] = (df['close_t+1'] > df['close']).astype('int64')
df

Unnamed: 0,date,open,high,low,close,volume,ema_5,ema_10,ema_20,rsi_10,...,macd_macdhist,cci_10,mom_10,stoch_slowk,stoch_slowd,sma_5,sma_10,sma_20,close_t+1,label
0,2017-09-11 13:30:00,248.040,248.050,248.035,248.040,195320.0,,,,,...,,,,,,,,,248.040,0
1,2017-09-11 13:31:00,248.100,248.170,248.030,248.040,191201.0,,,,,...,,,,,,,,,248.110,1
2,2017-09-11 13:32:00,248.160,248.180,248.090,248.110,76710.0,,,,,...,,,,,,,,,248.160,1
3,2017-09-11 13:33:00,248.220,248.240,248.150,248.160,151905.0,,,,,...,,,,,,,,,248.240,1
4,2017-09-11 13:34:00,248.310,248.350,248.230,248.240,168466.0,248.118000,,,,...,,,,,,248.1180,,,248.300,1
5,2017-09-11 13:35:00,248.340,248.340,248.280,248.300,85173.0,248.178667,,,,...,,,,,,248.1700,,,248.350,1
6,2017-09-11 13:36:00,248.300,248.390,248.300,248.350,104646.0,248.235778,,,,...,,,,,,248.2320,,,248.295,0
7,2017-09-11 13:37:00,248.300,248.340,248.280,248.295,60156.0,248.255519,,,,...,,,,,,248.2690,,,248.310,1
8,2017-09-11 13:38:00,248.300,248.350,248.290,248.310,55077.0,248.273679,,,,...,,,,65.694444,73.912037,248.2990,,,248.310,0
9,2017-09-11 13:39:00,248.370,248.390,248.280,248.310,55475.0,248.285786,248.215500,,,...,,65.173116,,45.896465,62.914562,248.3130,248.2155,,248.370,1


In [19]:
# Filter out the first minutes and the last minute of every session.
# With the lists below, the rows removed are 13:30-14:02 and 20:00 (GMT) in the
# summer-time date range, and 14:30-15:02 and 21:00 (GMT) otherwise.
# Per the author's notes these rows lack either enough prior data for the
# indicators (macd starts being populated at 14:03) or a usable 1-minute-ahead label.
# The last hour is not normally good for buying, as nothing should be kept open by end of business.
# The first half hour may not be great for training either, as many factors coming from the
# European and Asian markets affect the results more than during the rest of the day.

#df['date'].between_time('09:00','10:00') #didn't work/check

# #############################################
# PART 1: Specifying times to remove
# #############################################

# Earlier local-time variant, kept for reference:
# excluded_times=['09:3','09:4','15:5','16:0']

# 'HH:M' prefixes drop whole ten-minute buckets; 'HH:MM' values drop single minutes.

# Before 6/11/2017 (summer time)
excluded_times_summer=['13:3','13:4','13:5'] # for GMT
excluded_times_summer_mins=['14:00','14:01','14:02','20:00'] # for GMT

# After 6/11/2017 (winter time)
excluded_times=['14:3','14:4','14:5'] # for GMT
excluded_times_mins=['15:00','15:01','15:02','21:00'] # for GMT

# #############################################
# PART 2: Row masks for the summer-time and winter-time date ranges
# #############################################

# Summer time data range obtained by data analysis.
# 'date' is compared as text; presumably 'YYYY-MM-DD HH:MM:SS', so that the
# lexicographic order matches chronological order - confirm.
mask_summertime_dates = ((df['date'] < '2017-11-06 00:00:00') | (df['date'] >= '2018-03-10 00:00:00'))
mask_after_dates = ((df['date'] >= '2017-11-06 00:00:00') & (df['date'] < '2018-03-10 00:00:00'))

# #############################################
# PART 3: Removal of the first minutes and the last minute of the day
# #############################################

# Characters 11-14 of the timestamp are 'HH:M', characters 11-15 are 'HH:MM'.
time_ten_min_bucket = df['date'].str[11:15]
time_minute = df['date'].str[11:16]

# Rows to discard under each regime.
drop_in_summer = time_ten_min_bucket.isin(excluded_times_summer) | time_minute.isin(excluded_times_summer_mins)
drop_in_winter = time_ten_min_bucket.isin(excluded_times) | time_minute.isin(excluded_times_mins)

# Rows to keep: inside the regime's date range and not in its excluded times.
keep_summer = mask_summertime_dates & ~drop_in_summer
keep_after = mask_after_dates & ~drop_in_winter

# Per-regime views, same contents as before (kept for any later use).
df_summer=df.loc[keep_summer]
df_after=df.loc[keep_after]

# Keep both sets together. A single combined mask is used instead of
# pd.concat([df_summer, df_after]) so the rows stay in their original order;
# concatenating placed the 2018 summer-time rows before the winter rows and
# broke the chronological order of the series.
df=df.loc[keep_summer | keep_after]


In [20]:
# Preview of the filtered frame: the bare `df` is the cell's last expression,
# so the notebook renders it as the cell output.
df

Unnamed: 0,date,open,high,low,close,volume,ema_5,ema_10,ema_20,rsi_10,...,macd_macdhist,cci_10,mom_10,stoch_slowk,stoch_slowd,sma_5,sma_10,sma_20,close_t+1,label
33,2017-09-11 14:03:00,248.650,248.670,248.630,248.645,49595.0,248.588890,248.572046,248.517690,73.451530,...,-0.019845,183.333333,0.135,38.744589,29.480519,248.579,248.5815,248.54575,248.645,0
34,2017-09-11 14:04:00,248.645,248.690,248.610,248.645,82126.0,248.607593,248.585311,248.529815,73.451530,...,-0.016183,128.325509,0.085,54.369589,37.603716,248.590,248.5900,248.55250,248.640,0
35,2017-09-11 14:05:00,248.650,248.660,248.640,248.640,33210.0,248.618396,248.595254,248.540309,71.975408,...,-0.014298,96.453901,0.075,73.561508,55.558562,248.606,248.5975,248.55700,248.650,1
36,2017-09-11 14:06:00,248.690,248.700,248.650,248.650,51806.0,248.628930,248.605208,248.550755,73.173454,...,-0.012636,106.356589,0.065,69.097222,65.676106,248.624,248.6040,248.56250,248.690,1
37,2017-09-11 14:07:00,248.740,248.740,248.690,248.690,34735.0,248.649287,248.620625,248.564017,77.456669,...,-0.009257,131.102850,0.090,65.651709,69.436813,248.654,248.6130,248.57300,248.740,1
38,2017-09-11 14:08:00,248.730,248.780,248.720,248.740,87616.0,248.679525,248.642329,248.580777,81.548397,...,-0.004321,142.469754,0.130,68.919683,67.889538,248.673,248.6260,248.58600,248.740,0
39,2017-09-11 14:09:00,248.710,248.745,248.690,248.740,82146.0,248.699683,248.660088,248.595941,81.548397,...,-0.001847,100.825728,0.150,69.812540,68.127978,248.692,248.6410,248.59900,248.720,0
40,2017-09-11 14:10:00,248.720,248.730,248.710,248.720,15522.0,248.706455,248.670981,248.607756,74.840303,...,-0.002319,76.288660,0.160,67.248438,68.660220,248.708,248.6570,248.60950,248.730,1
41,2017-09-11 14:11:00,248.730,248.730,248.690,248.730,24546.0,248.714304,248.681712,248.619399,75.939839,...,-0.002693,56.885457,0.170,56.573057,64.544678,248.724,248.6740,248.61950,248.730,0
42,2017-09-11 14:12:00,248.720,248.760,248.715,248.730,81581.0,248.719536,248.690491,248.629932,75.939839,...,-0.003618,74.712644,0.190,47.578348,57.133281,248.732,248.6930,248.63050,248.720,0


In [21]:
# Export processed data.
# The output file name depends on the symbol and embeds today's date.
symbol='SPY'
if symbol=='SPY':
    output_csv_path="/home/cetrulin/Desktop/Andres/data/analysis/S&P500/S&P500_Q12018_"+str(datetime.date.today())+"_indicators.csv"
elif symbol=='S': 
    output_csv_path="/home/cetrulin/Desktop/Andres/data/analysis/S/S_"+str(datetime.date.today())+"_indicators.csv"
else:
    # Fail loudly instead of hitting a NameError below or, worse, silently
    # reusing an `output_csv_path` left in the kernel by an earlier run.
    raise ValueError("Unsupported symbol %r: no export path is configured" % (symbol,))

df.to_csv(output_csv_path, sep=';', encoding='utf-8')
