## Load useful libraries

In [1]:
import json
import requests
import datetime
import time
import pytz
import pandas as pd

import pprint as pp

## User settings

When this notebook is run with papermill, the settings below can be overridden with command-line arguments:

In [2]:
# This cell is tagged `parameters`
# JSON config file; the 'server', 'token' and 'oanda_date_time_format' keys are read from it.
config_file = '/home/emily/Desktop/projects/test/badass-data-science/badassdatascience/forex/data/DEVELOPMENT.json'
# Number of candles requested per API call (sent as `count=` in the request URL).
count = 5000 
# Candle granularity sent as `granularity=` in the request URL.
granularity = 'H1'
# Path of the stored candles without extension; '.parquet' is appended when loading prior data.
output_file_sans_extension = 'output/H1'
# NOTE(review): `now` is not honored anywhere visible; the end-date cell always uses the current time.
now = True
# Comma-separated instrument names; split into `instrument_list` in the next cell.
instruments = 'EUR_USD,USD_CAD,USD_JPY,USD_CHF,AUD_USD,GBP_USD,NZD_USD'
# Sent as `price=` in the request URL.
price_types = 'BAM'
# Seconds to sleep before retrying a failed request.
error_retry_interval = 5
# Name of the pytz timezone used to derive each candle's time_iso / weekday / hour.
timezone_to_use = 'America/Toronto'   # Don't change this!

# NOTE(review): overwritten with the current time by the end-date cell, so a value set here has no effect.
end_date = None

In [3]:
# Turn the comma-separated `instruments` parameter into a list of instrument names.
instrument_list = instruments.split(',')

In [4]:
# Echo the request size and candle granularity in effect for this run.
print(count)
print(granularity)

5000
H1


In [5]:
#
# Load the candles saved by a previous run, if there are any.
#
# Only a missing file is treated as "no prior data". Any other failure
# (corrupt parquet, missing 'complete' column, ...) is allowed to surface
# instead of being silently mistaken for a first run.
#
have_prior_data = True
try:
    df = pd.read_parquet(output_file_sans_extension + '.parquet')
except FileNotFoundError:
    have_prior_data = False
    # keep `df` defined so that later cells which reference it still run
    df = pd.DataFrame()
else:
    # keep only candles that were flagged complete when they were stored
    df = df[df['complete']].copy()

In [6]:
def get_min_max_times(df):
    """Return the earliest and latest candle time of every instrument.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'instrument' column and a 'time' column.

    Returns
    -------
    tuple of (dict, dict)
        ``(dict_instrument_to_min_time, dict_instrument_to_max_time)``, each
        shaped like ``{'EUR_USD': {'time': 1744977600}, ...}``.

    Side effects
    ------------
    Pretty-prints both dictionaries, then prints the min and max time of one
    sample instrument converted with ``datetime.datetime.fromtimestamp``.
    The sample is 'AUD_USD' when present, otherwise the first instrument
    found; nothing is printed for the sample when `df` has no rows.
    """
    times_by_instrument = df[['instrument', 'time']].groupby(['instrument'])['time']

    # to_frame() keeps the nested {'time': value} layout of the result
    dict_instrument_to_min_time = times_by_instrument.min().to_frame().to_dict(orient = 'index')
    dict_instrument_to_max_time = times_by_instrument.max().to_frame().to_dict(orient = 'index')

    pp.pprint(dict_instrument_to_min_time)
    print()
    pp.pprint(dict_instrument_to_max_time)
    print()

    # Show one instrument's range as a readable sanity check. 'AUD_USD' is no
    # longer assumed to exist, so a different instrument list (or an empty
    # frame) does not raise KeyError here.
    if dict_instrument_to_min_time:
        if 'AUD_USD' in dict_instrument_to_min_time:
            sample_instrument = 'AUD_USD'
        else:
            sample_instrument = next(iter(dict_instrument_to_min_time))
        print(datetime.datetime.fromtimestamp(dict_instrument_to_min_time[sample_instrument]['time']))
        print(datetime.datetime.fromtimestamp(dict_instrument_to_max_time[sample_instrument]['time']))

    return dict_instrument_to_min_time, dict_instrument_to_max_time

In [7]:
# Summarize the saved candles only when a prior parquet file was loaded;
# otherwise these two dictionaries are left undefined.
if have_prior_data:
    dict_instrument_to_min_time, dict_instrument_to_max_time = get_min_max_times(df)

{'AUD_USD': {'time': 1020715200},
 'EUR_USD': {'time': 1020715200},
 'GBP_USD': {'time': 1020715200},
 'NZD_USD': {'time': 1032811200},
 'USD_CAD': {'time': 1020801600},
 'USD_CHF': {'time': 1020715200},
 'USD_JPY': {'time': 1020715200}}

{'AUD_USD': {'time': 1744977600},
 'EUR_USD': {'time': 1744977600},
 'GBP_USD': {'time': 1744977600},
 'NZD_USD': {'time': 1744977600},
 'USD_CAD': {'time': 1744977600},
 'USD_CHF': {'time': 1744977600},
 'USD_JPY': {'time': 1744977600}}

2002-05-06 13:00:00
2025-04-18 05:00:00


In [8]:
#
# define fixed values
#
# pytz timezone built from the `timezone_to_use` parameter; it is the tz used
# when candle timestamps are converted to time_iso / weekday / hour.
timezone = pytz.timezone(timezone_to_use)

In [9]:
#
# Upper bound of the download window, in whole epoch seconds.
#
# The `now` / `end_date` parameters are not honored here: the window always
# ends at the current time.
#
# time.time() is already seconds since the epoch. The previous
# mktime(now().timetuple()) round trip went through naive local time, which
# is ambiguous around daylight-saving transitions.
#
end_date_original = int(time.time())
end_date = end_date_original

In [10]:
#
# load config
#
# read the whole JSON document as text, then decode it into `config`
with open(config_file) as f:
    config = json.loads(f.read())

In [11]:
#
# specify headers
#
# HTTP headers sent with every candlestick request; the bearer token and the
# datetime format both come from the loaded config.
headers = {
    'Content-Type' : 'application/json',
    'Authorization' : 'Bearer ' + config['token'],
    'Accept-Datetime-Format' : config['oanda_date_time_format'],
}

In [12]:
#
# send a request to Oanda for historical candlestick values
#
def get_instrument_candlesticks(instrument, count, price_types, granularity, end_date, timeout = 30):
    """Fetch one page of candlesticks for `instrument` and return the decoded JSON.

    Parameters
    ----------
    instrument : str
        Instrument name placed in the URL path.
    count : int
        Sent as the `count` query parameter.
    price_types : str
        Sent as the `price` query parameter.
    granularity : str
        Sent as the `granularity` query parameter.
    end_date : int or float
        Sent as the `to` query parameter.
    timeout : float, optional
        Seconds to wait for the server before the attempt counts as failed
        and is retried.

    Returns
    -------
    dict
        The JSON body of the response.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status. Such answers are not
        retried, because repeating e.g. an unauthorized request cannot succeed.

    Notes
    -----
    Connection-level failures (including timeouts) are retried indefinitely,
    sleeping `error_retry_interval` seconds between attempts. Only
    `requests` exceptions are caught, so Ctrl-C / kernel interrupt still
    stops the loop.
    """
    url = config['server'] + '/v3/instruments/' + instrument + '/candles'

    # let requests build (and escape) the query string
    params = {
        'count' : count,
        'price' : price_types,
        'granularity' : granularity,
        'to' : end_date,
    }

    while True:
        try:
            r = requests.get(url, headers = headers, params = params, timeout = timeout)
            break
        except requests.exceptions.RequestException:
            time.sleep(error_retry_interval)

    # surface HTTP errors here rather than as a missing 'candles' key later
    r.raise_for_status()
    return r.json()

In [13]:
#
# alters the dictionary in place; not my favorite design idiom
#
def deal_with_candlestick_format_and_time(candle):
    """Normalize one candle dictionary in place.

    * ``candle['time']`` (a numeric string) becomes an ``int``.
    * ``time_iso``, ``weekday`` and ``hour`` are added, computed from that
      time in the module-level ``timezone``.
    * Each nested price dictionary that is present ('bid', 'mid', 'ask') is
      flattened into float entries named ``<type>_<component>``, plus
      ``<type>_return`` (close - open) and ``<type>_volatility`` (high - low).
      The nested dictionary is then removed.

    Price types missing from the candle are skipped, so the function also
    works when fewer than all three price types were requested.

    Parameters
    ----------
    candle : dict
        Candle as decoded from the API response; modified in place.

    Returns
    -------
    None
    """
    candle['time'] = int(float(candle['time']))
    time_dt = datetime.datetime.fromtimestamp(candle['time'], tz = timezone)
    candle['time_iso'] = time_dt.isoformat()
    candle['weekday'] = time_dt.weekday()
    candle['hour'] = time_dt.hour

    #
    # deal with prices that are currently string values but need to be float
    #
    # and reorganize them
    #
    for price_type in ['bid', 'mid', 'ask']:
        if price_type not in candle:
            continue

        for candlestick_component, value in candle[price_type].items():
            candle[price_type + '_' + candlestick_component] = float(value)
        candle[price_type + '_return'] = candle[price_type + '_c'] - candle[price_type + '_o']
        candle[price_type + '_volatility'] = candle[price_type + '_h'] - candle[price_type + '_l']

        # the nested strings are now redundant
        del candle[price_type]

    return None

In [14]:
#
# iterate through the instruments
#
# For each instrument, page backwards in time from `end_date_original`,
# `count` candles per request, until either the server has nothing older or
# the page reaches back past `start_time`.
#

# Lower bound used when an instrument has no stored candles (or there is no
# prior data at all): 2010-01-01 UTC, expressed as epoch seconds so that it is
# comparable with the integer candle times.
default_start_time = int(datetime.datetime(2010, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc).timestamp())

insert_many_list = []
for instrument in instrument_list:

    # resume from the newest stored candle when there is one
    if have_prior_data and instrument in dict_instrument_to_max_time:
        start_time = dict_instrument_to_max_time[instrument]['time']
    else:
        start_time = default_start_time

    # initialize per instrument
    finished = False
    end_date = end_date_original

    # loop through the timestamp ranges for each set of n=count values
    while not finished:

        # retrieve the instrument candlesticks from the Oanda server
        rj = get_instrument_candlesticks(instrument, count, price_types, granularity, end_date)

        if 'candles' not in rj:
            raise RuntimeError('Unexpected response for ' + instrument + ': ' + str(rj))
        candlesticks = rj['candles']

        # an empty page means there is nothing older to fetch
        if not candlesticks:
            break

        # deal with timestamps and time-related content
        date_list = []
        for candle in candlesticks:
            deal_with_candlestick_format_and_time(candle)
            date_list.append(candle['time'])

        rj['timestamp_int_min'] = min(date_list)
        rj['timestamp_int_max'] = max(date_list)

        insert_many_list.append(rj)

        # Are we done with the current instrument?
        # (short page = server ran out of history; or we overlap stored data)
        if len(date_list) < count or rj['timestamp_int_min'] < start_time:
            finished = True

        # prepare for the next iteration: end just before the oldest candle seen
        end_date = rj['timestamp_int_min'] - 0.1

In [15]:
#pp.pprint(insert_many_list)

In [16]:
#
# Flatten the downloaded pages into one list of candle dictionaries, tagging
# each candle with the instrument and granularity reported by its page.
#
# The page values are held in their own names so that the `granularity`
# notebook parameter is not overwritten by this cell.
#
candlestick_dict_list = []
for item in insert_many_list:
    item_instrument = item['instrument']
    item_granularity = item['granularity']

    for candle in item['candles']:
        candle['instrument'] = item_instrument
        candle['granularity'] = item_granularity

        # we check this again later in the final assembly
        if candle['complete']:
            candlestick_dict_list.append(candle)

In [17]:
#
# make dataframe
#
# one row per candle dictionary collected above
df_new = pd.DataFrame(candlestick_dict_list)
# keep only rows flagged complete; .copy() yields an independent frame
df_new = df_new[df_new['complete']].copy()

In [18]:
# Preview the first rows of the candles downloaded in this run.
df_new.head()

Unnamed: 0,complete,volume,time,time_iso,weekday,hour,bid_o,bid_h,bid_l,bid_c,...,mid_return,mid_volatility,ask_o,ask_h,ask_l,ask_c,ask_return,ask_volatility,instrument,granularity
0,True,8045,1719842400,2024-07-01T10:00:00-04:00,0,10,1.07553,1.07692,1.07278,1.07294,...,-0.00259,0.00414,1.07567,1.07716,1.07293,1.07308,-0.00259,0.00423,EUR_USD,H1
1,True,4910,1719846000,2024-07-01T11:00:00-04:00,0,11,1.07295,1.07298,1.07189,1.07262,...,-0.00033,0.00109,1.07309,1.07312,1.07203,1.07276,-0.00033,0.00109,EUR_USD,H1
2,True,2600,1719849600,2024-07-01T12:00:00-04:00,0,12,1.07263,1.07324,1.07235,1.07307,...,0.00045,0.0009,1.07278,1.0734,1.0725,1.07323,0.00045,0.0009,EUR_USD,H1
3,True,2099,1719853200,2024-07-01T13:00:00-04:00,0,13,1.07308,1.07328,1.07272,1.07286,...,-0.00022,0.00055,1.07323,1.07343,1.07288,1.07301,-0.00022,0.00055,EUR_USD,H1
4,True,2151,1719856800,2024-07-01T14:00:00-04:00,0,14,1.07287,1.07295,1.07234,1.07289,...,1e-05,0.0006,1.07303,1.07309,1.0725,1.07303,0.0,0.00059,EUR_USD,H1


In [19]:
#{'AUD_USD': {'time': 1020715200},
# 'EUR_USD': {'time': 1020715200},
# 'GBP_USD': {'time': 1020715200},
# 'NZD_USD': {'time': 1032811200},
# 'USD_CAD': {'time': 1020801600},
# 'USD_CHF': {'time': 1020715200},
# 'USD_JPY': {'time': 1020715200}}

#{'AUD_USD': {'time': 1744977600},
# 'EUR_USD': {'time': 1744977600},
# 'GBP_USD': {'time': 1744977600},
# 'NZD_USD': {'time': 1744977600},
# 'USD_CAD': {'time': 1744977600},
# 'USD_CHF': {'time': 1744977600},
# 'USD_JPY': {'time': 1744977600}}

# 2002-05-06 13:00:00
# 2025-04-18 05:00:00

In [20]:
# Time range of the freshly downloaded candles. This reuses (and therefore
# overwrites) the dictionary names that held the prior-data range earlier.
dict_instrument_to_min_time, dict_instrument_to_max_time = get_min_max_times(df_new)

{'AUD_USD': {'time': 1719842400},
 'EUR_USD': {'time': 1719842400},
 'GBP_USD': {'time': 1719842400},
 'NZD_USD': {'time': 1719842400},
 'USD_CAD': {'time': 1719842400},
 'USD_CHF': {'time': 1719842400},
 'USD_JPY': {'time': 1719842400}}

{'AUD_USD': {'time': 1745265600},
 'EUR_USD': {'time': 1745265600},
 'GBP_USD': {'time': 1745265600},
 'NZD_USD': {'time': 1745265600},
 'USD_CAD': {'time': 1745265600},
 'USD_CHF': {'time': 1745265600},
 'USD_JPY': {'time': 1745265600}}

2024-07-01 07:00:00
2025-04-21 13:00:00


In [39]:
# Row counts: previously stored candles, then candles downloaded in this run.
print(len(df.index))
print(len(df_new.index))

907007
34993


In [22]:
# Preview the first rows of the candles loaded from the parquet file.
df.head()

Unnamed: 0,complete,volume,time,time_iso,weekday,hour,bid_o,bid_h,bid_l,bid_c,...,mid_return,mid_volatility,ask_o,ask_h,ask_l,ask_c,ask_return,ask_volatility,instrument,granularity
0,True,2921,1719554400,2024-06-28T02:00:00-04:00,4,2,1.06898,1.06976,1.06891,1.06963,...,0.00065,0.00085,1.06912,1.0699,1.06906,1.06978,0.00066,0.00084,EUR_USD,H1
1,True,4242,1719558000,2024-06-28T03:00:00-04:00,4,3,1.06965,1.07032,1.06908,1.06963,...,-2e-05,0.00124,1.06979,1.07047,1.06924,1.06977,-2e-05,0.00123,EUR_USD,H1
2,True,3510,1719561600,2024-06-28T04:00:00-04:00,4,4,1.06961,1.06994,1.06914,1.06971,...,9e-05,0.00079,1.06977,1.07008,1.0693,1.06986,9e-05,0.00078,EUR_USD,H1
3,True,3072,1719565200,2024-06-28T05:00:00-04:00,4,5,1.06969,1.07087,1.06926,1.07087,...,0.00117,0.00161,1.06985,1.07103,1.06941,1.07101,0.00116,0.00162,EUR_USD,H1
4,True,2862,1719568800,2024-06-28T06:00:00-04:00,4,6,1.07085,1.07097,1.07037,1.07089,...,4e-05,0.0006,1.07099,1.07112,1.07051,1.07103,4e-05,0.00061,EUR_USD,H1


In [23]:
# Preview the first rows of the newly downloaded candles, for comparison with the stored ones.
df_new.head()

Unnamed: 0,complete,volume,time,time_iso,weekday,hour,bid_o,bid_h,bid_l,bid_c,...,mid_return,mid_volatility,ask_o,ask_h,ask_l,ask_c,ask_return,ask_volatility,instrument,granularity
0,True,8045,1719842400,2024-07-01T10:00:00-04:00,0,10,1.07553,1.07692,1.07278,1.07294,...,-0.00259,0.00414,1.07567,1.07716,1.07293,1.07308,-0.00259,0.00423,EUR_USD,H1
1,True,4910,1719846000,2024-07-01T11:00:00-04:00,0,11,1.07295,1.07298,1.07189,1.07262,...,-0.00033,0.00109,1.07309,1.07312,1.07203,1.07276,-0.00033,0.00109,EUR_USD,H1
2,True,2600,1719849600,2024-07-01T12:00:00-04:00,0,12,1.07263,1.07324,1.07235,1.07307,...,0.00045,0.0009,1.07278,1.0734,1.0725,1.07323,0.00045,0.0009,EUR_USD,H1
3,True,2099,1719853200,2024-07-01T13:00:00-04:00,0,13,1.07308,1.07328,1.07272,1.07286,...,-0.00022,0.00055,1.07323,1.07343,1.07288,1.07301,-0.00022,0.00055,EUR_USD,H1
4,True,2151,1719856800,2024-07-01T14:00:00-04:00,0,14,1.07287,1.07295,1.07234,1.07289,...,1e-05,0.0006,1.07303,1.07309,1.0725,1.07303,0.0,0.00059,EUR_USD,H1


In [37]:
# Merge stored and newly downloaded candles, ordered by time then instrument,
# dropping rows that are exact duplicates (the overlap between the two sets).
# When no prior data was loaded, the new candles are the full data set.
frames_to_merge = [df, df_new] if have_prior_data else [df_new]
df_full = pd.concat(frames_to_merge).sort_values(['time', 'instrument']).drop_duplicates()

# Sanity check: after de-duplication every instrument should have at most one
# row per timestamp, so the total row count must equal the summed number of
# distinct times. A dedicated name is used for the tally so that the `count`
# notebook parameter (request page size) is not overwritten.
unique_time_total = sum(
    len(df_full.loc[df_full['instrument'] == instrument, 'time'].unique())
    for instrument in instrument_list
)

print(len(df_full.index) == unique_time_total)

True


In [38]:
# Compare sizes: sum of the two inputs, their plain concatenation, and the
# de-duplicated result (the difference is the number of overlapping rows removed).
print(len(df.index) + len(df_new.index))
print(len(pd.concat([df, df_new]).sort_values(['time', 'instrument']).index))
print(len(df_full.index))

942000
942000
907231
