## Import useful libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
import pytz

from airflow.providers.mysql.hooks.mysql import MySqlHook

## User settings

In [2]:
# Commented-out notebook-level settings; the same three values are the
# defaults used by CandlestickDataFrame further down.
#list_of_price_types = ['bid', 'mid', 'ask']
#instrument_name = 'EUR/USD'
#interval_name = 'Day'

# Passed to MySqlHook as the Airflow connection id (mysql_conn_id).
database_name = 'django'

## Connect to database

In [3]:
# Module-level hook; CandlestickDataFrame.create_initial_dataframe uses it to
# run the candlestick query and load the result into pandas.
mysql_hook = MySqlHook(mysql_conn_id = database_name)

## Define a function to pull candlestick data

This only works for a single (price type, instrument, interval) tuple.

In [4]:
def get_candlestick_pull_query():
    """Return the SQL template that pulls candlesticks for one series.

    The template joins the candlestick table to its instrument, interval,
    price type, volume and timestamp tables, and orders rows by timestamp.
    It contains three quoted ``'%s'`` placeholders which, in order, take the
    price type name, the instrument name and the interval name.
    """
    return """SELECT

    ts.timestamp, inst.name AS instrument_name, cs.o, cs.l, cs.h, cs.c, v.volume

    FROM

    timeseries_candlestick cs, timeseries_instrument inst,
    timeseries_interval iv, timeseries_pricetype pt,
    timeseries_volume v, timeseries_timestamp ts

    WHERE

    cs.instrument_id = inst.id
    AND cs.interval_id = iv.id
    AND cs.price_type_id = pt.id
    AND cs.volume_id = v.id
    AND cs.timestamp_id = ts.id

    AND pt.name = '%s'
    AND inst.name = '%s'
    AND iv.name = '%s'

    ORDER BY timestamp
    ;
    """

In [5]:
class CandlestickDataFrame():
    """Assemble and sanity-check a candlestick table for one instrument/interval.

    One query is run per price type; the per-price-type frames are merged on
    ``timestamp`` and ``instrument_name`` so that each row carries the
    open/low/high/close (``o``/``l``/``h``/``c``) values of every price type,
    prefixed with the price type name (e.g. ``bid_o``).

    Attributes set by the individual steps:
        df: the merged frame (``create_initial_dataframe``).
        volume_test: whether all price types report the same volume.
        are_there_nans: whether any row of ``df`` contains a missing value.
        dict_max_abs_diff_in_mid_check: per-field max |stored mid - computed mid|.
        df_mid_test: the columns used for the mid-price check.
    """

    # Candle fields that exist once per price type.
    _CANDLE_FIELDS = ('o', 'l', 'h', 'c')

    def __init__(
        self,
        instrument_name='EUR/USD',
        interval_name='Day',
        list_of_price_types=None,
        timezone_to_use='US/Eastern',
    ):
        """Store the settings; no database access happens here.

        Args:
            instrument_name: instrument name matched against the database.
            interval_name: interval name matched against the database.
            list_of_price_types: price types to pull; ``None`` means
                ``['bid', 'mid', 'ask']``.
            timezone_to_use: timezone name handed to ``pytz.timezone``.
        """
        # A list literal as the default would be shared by every instance,
        # so the default is created (and the argument copied) per instance.
        if list_of_price_types is None:
            list_of_price_types = ['bid', 'mid', 'ask']

        self.list_of_price_types = list(list_of_price_types)
        self.instrument_name = instrument_name
        self.interval_name = interval_name
        self.timezone_to_use = timezone_to_use

    def create_initial_dataframe(self):
        """Query each price type and merge the results into ``self.df``."""
        # The template quotes its placeholders for string interpolation.
        # Strip the quotes so the values can be passed as query parameters
        # and escaped by the driver instead of being pasted into the SQL.
        sql_to_run = get_candlestick_pull_query().replace("'%s'", "%s")

        list_of_dataframes = []
        for pt in self.list_of_price_types:
            df_pt = mysql_hook.get_pandas_df(
                sql_to_run,
                parameters=(pt, self.instrument_name, self.interval_name),
            )

            # Prefix the value columns so the price types can sit side by side.
            column_name_map = {
                col_name: pt + '_' + col_name
                for col_name in ['o', 'l', 'h', 'c', 'volume']
            }
            df_pt.rename(columns=column_name_map, inplace=True)

            list_of_dataframes.append(df_pt)

        # Left merge: the first price type determines which rows are kept.
        df = list_of_dataframes[0]
        for df_i in list_of_dataframes[1:]:
            df = pd.merge(df, df_i, how='left', on=['timestamp', 'instrument_name'])

        self.df = df

    def add_date_and_time_related_features(self):
        """Add a localized datetime plus weekday and hour columns.

        ``timestamp`` values are interpreted by ``datetime.fromtimestamp`` and
        localized to ``self.timezone_to_use``. The column names keep the
        ``_Eastern`` suffix whatever timezone is configured.
        """
        tz = pytz.timezone(self.timezone_to_use)
        localized = [
            datetime.datetime.fromtimestamp(x, tz=tz) for x in self.df['timestamp']
        ]
        self.df['timestamp_dt_Eastern'] = localized
        self.df['weekday_Eastern'] = [x.weekday() for x in localized]
        self.df['hour_Eastern'] = [x.hour for x in localized]

    def conduct_volume_test(self):
        """Check that every price type reports the same volume on every row.

        When they agree, the per-price-type volume columns are replaced by a
        single ``volume`` column and ``self.volume_test`` is set to True;
        otherwise the frame is left untouched and ``self.volume_test`` is
        False. A frame with no rows counts as agreeing. Rows with a missing
        volume never compare equal, so they make the test fail.
        """
        volume_columns = [pt + '_volume' for pt in self.list_of_price_types]

        # Keep the mid volume as the reference when it is available; fall
        # back to the first configured price type otherwise.
        if 'mid_volume' in volume_columns:
            reference_column = 'mid_volume'
        else:
            reference_column = volume_columns[0]
        reference = self.df[reference_column].to_numpy()

        volumes_agree = all(
            bool(np.all(self.df[col].to_numpy() == reference))
            for col in volume_columns
        )

        if volumes_agree:
            self.df['volume'] = self.df[reference_column]
            self.df.drop(columns=volume_columns, inplace=True)
            self.volume_test = True
        else:
            self.volume_test = False

    def conduct_nan_test(self):
        """Set ``self.are_there_nans`` to True if any row has a missing value."""
        self.are_there_nans = (len(self.df.index) != len(self.df.dropna().index))

    # https://www.investopedia.com/terms/s/spread.asp#toc-5-forex-spreads
    def compute_spread(self):
        """Add ``spread_<field>`` columns: ask minus bid for each candle field."""
        for item in self._CANDLE_FIELDS:
            self.df['spread_' + item] = self.df['ask_' + item] - self.df['bid_' + item]

    def is_mid_really_the_mid(self):
        """Compare the stored mid prices with (bid + ask) / 2.

        For each candle field the midpoint is computed, rounded to 5 decimals
        and compared with the stored mid. The comparison columns are copied to
        ``self.df_mid_test`` and then removed from ``self.df``; the largest
        absolute difference per field goes to
        ``self.dict_max_abs_diff_in_mid_check``.
        """
        self.dict_max_abs_diff_in_mid_check = {}
        columns_used_here = []
        temporary_columns = []

        for item in self._CANDLE_FIELDS:
            stored = 'mid_' + item
            computed = 'mid_computed_' + item
            compare = 'mid_compare_' + item
            diff = 'mid_diff_' + item

            self.df[computed] = np.round(
                (self.df['bid_' + item] + self.df['ask_' + item]) / 2., 5
            )
            self.df[compare] = self.df[stored] == self.df[computed]
            self.df[diff] = np.abs(self.df[stored] - self.df[computed])

            self.dict_max_abs_diff_in_mid_check[item] = np.max(self.df[diff])

            columns_used_here.extend([stored, computed, compare, diff])
            temporary_columns.extend([computed, compare, diff])

        self.df_mid_test = self.df[columns_used_here].copy()

        # The helper columns only serve the check; keep self.df uncluttered.
        self.df.drop(columns=temporary_columns, inplace=True)

    def fit(self):
        """Run the whole pipeline: load, volume test, time features, checks."""
        self.create_initial_dataframe()
        self.conduct_volume_test()
        self.add_date_and_time_related_features()
        self.conduct_nan_test()
        self.compute_spread()
        self.is_mid_really_the_mid()

In [6]:
# Build the frame with the default settings and run every pipeline step;
# the results are stored as attributes on cdf (df, volume_test, ...).
cdf = CandlestickDataFrame()
cdf.fit()

[2025-03-30T12:59:54.709-0700] {base.py:84} INFO - Retrieving connection 'django'


In [7]:
# Column labels of the merged frame after all pipeline steps.
cdf.df.columns

Index(['timestamp', 'instrument_name', 'bid_o', 'bid_l', 'bid_h', 'bid_c',
       'mid_o', 'mid_l', 'mid_h', 'mid_c', 'ask_o', 'ask_l', 'ask_h', 'ask_c',
       'volume', 'timestamp_dt_Eastern', 'weekday_Eastern', 'hour_Eastern',
       'spread_o', 'spread_l', 'spread_h', 'spread_c'],
      dtype='object')

In [8]:
# True when dropping rows with missing values would shorten the frame.
cdf.are_there_nans

False

In [9]:
# True when the bid, mid and ask volumes were equal on every row.
cdf.volume_test

True

In [10]:
import pprint as pp
# Largest absolute gap between the stored mid and the rounded (bid + ask) / 2,
# keyed by candle field (o, l, h, c).
pp.pprint(cdf.dict_max_abs_diff_in_mid_check)

{'c': 1.0000000000065512e-05,
 'h': 0.00034000000000000696,
 'l': 0.0009200000000000319,
 'o': 1.0000000000065512e-05}


In [11]:
# Stored mid, computed mid, equality flag and absolute difference per field.
cdf.df_mid_test

Unnamed: 0,mid_o,mid_computed_o,mid_compare_o,mid_diff_o,mid_l,mid_computed_l,mid_compare_l,mid_diff_l,mid_h,mid_computed_h,mid_compare_h,mid_diff_h,mid_c,mid_computed_c,mid_compare_c,mid_diff_c
0,0.91535,0.91535,True,0.0,0.91535,0.91535,True,0.0,0.91535,0.91535,True,0.00000,0.91535,0.91535,True,0.0
1,0.90435,0.90435,True,0.0,0.90435,0.90435,True,0.0,0.90435,0.90435,True,0.00000,0.90435,0.90435,True,0.0
2,0.90935,0.90935,True,0.0,0.90935,0.90935,True,0.0,0.90935,0.90935,True,0.00000,0.90935,0.90935,True,0.0
3,0.91400,0.91400,True,0.0,0.91400,0.91400,True,0.0,0.91400,0.91400,True,0.00000,0.91400,0.91400,True,0.0
4,0.91410,0.91410,True,0.0,0.91410,0.91410,True,0.0,0.91410,0.91410,True,0.00000,0.91410,0.91410,True,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6764,1.08156,1.08156,True,0.0,1.07816,1.07816,True,0.0,1.08583,1.08583,True,0.00000,1.08012,1.08012,True,0.0
6765,1.08013,1.08013,True,0.0,1.07770,1.07770,True,0.0,1.08302,1.08303,False,0.00001,1.07918,1.07918,True,0.0
6766,1.07940,1.07940,True,0.0,1.07439,1.07439,True,0.0,1.08029,1.08029,True,0.00000,1.07528,1.07528,True,0.0
6767,1.07578,1.07578,True,0.0,1.07330,1.07330,True,0.0,1.08212,1.08213,False,0.00001,1.08020,1.08020,True,0.0


In [12]:
# 
# Forex spreads are the differences between the bid price (the price at which you can sell a currency pair) and the ask price (the price at which you can buy a currency pair). This spread is essentially the cost of trading and the primary way that forex brokers make money. The size of the spread depends on market liquidity, volatility, and the specific currency pair being traded.

# Major currency pairs like EUR/USD typically have tighter spreads because of high liquidity, while exotic pairs may have wider spreads.

# For traders, especially those engaged in short-term strategies like day trading or scalping, the spread is a crucial consideration as it directly affects the profitability of each trade. Wider spreads mean a trade needs to move further in the trader's favor just to break even. Some brokers offer fixed spreads, while others provide variable spreads that fluctuate with market conditions.