## Import useful libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
import pytz
import matplotlib.pyplot as plt

from airflow.providers.mysql.hooks.mysql import MySqlHook

## User settings

In [2]:
# Earlier notebook-level settings, kept for reference; the same values appear
# as the CandlestickDataFrame constructor defaults further down.
#list_of_price_types = ['bid', 'mid', 'ask']
#instrument_name = 'EUR/USD'
#interval_name = 'Day'

# Passed to MySqlHook as `mysql_conn_id` when the database hook is created.
database_name = 'django'

## Connect to database

In [3]:
# Module-level hook that CandlestickDataFrame.create_initial_dataframe uses to
# run its queries; `database_name` is supplied as the connection id.
mysql_hook = MySqlHook(mysql_conn_id = database_name)

## Define a function to pull candlestick data

This only works for a single (price type, instrument, interval) tuple.

In [4]:
def get_candlestick_pull_query():
    """Return the SQL template used to pull candlesticks for one series.

    The template joins the candlestick table to its instrument, interval,
    price-type, volume and timestamp tables and orders the rows by timestamp.

    It contains three quoted ``'%s'`` placeholders, to be filled (in this
    order) with the price type name, the instrument name and the interval
    name, e.g. ``get_candlestick_pull_query() % ('bid', 'EUR/USD', 'Day')``.

    Returns:
        str: the query text, still containing the three placeholders.
    """
    return """SELECT

    ts.timestamp, inst.name AS instrument_name, cs.o, cs.l, cs.h, cs.c, v.volume

    FROM

    timeseries_candlestick cs, timeseries_instrument inst,
    timeseries_interval iv, timeseries_pricetype pt,
    timeseries_volume v, timeseries_timestamp ts

    WHERE

    cs.instrument_id = inst.id
    AND cs.interval_id = iv.id
    AND cs.price_type_id = pt.id
    AND cs.volume_id = v.id
    AND cs.timestamp_id = ts.id

    AND pt.name = '%s'
    AND inst.name = '%s'
    AND iv.name = '%s'

    ORDER BY timestamp
    ;
    """

In [None]:
class CandlestickDataFrame:
    """Build and sanity-check a combined bid/mid/ask candlestick table.

    One query is run per price type for a single (instrument, interval) pair.
    The per-price-type results are merged on ``timestamp`` and
    ``instrument_name`` and then enriched by the steps that ``fit`` runs.

    Two module-level names defined elsewhere in this notebook are required by
    ``create_initial_dataframe``: ``mysql_hook`` (runs the query) and
    ``get_candlestick_pull_query`` (returns the SQL template).

    Attributes populated by ``fit``:
        df: the working DataFrame.
        volume_test: True when ask, mid and bid volumes agree on every row.
        are_there_nans: True when any cell of ``df`` is missing.
        dict_max_abs_diff_in_mid_check: per price field ('o', 'l', 'h', 'c'),
            the largest absolute gap between the stored mid and the mid
            recomputed from bid and ask.
        df_mid_test: copy of the columns used by the mid check.
    """

    # Candlestick price fields: open, low, high, close.
    _OHLC_FIELDS = ('o', 'l', 'h', 'c')

    def __init__(
        self,
        instrument_name = 'EUR/USD',
        interval_name = 'Day',
        cut_before_year = 2014,
        list_of_price_types = ('bid', 'mid', 'ask'),  # Don't change this!
        timezone_to_use = 'US/Eastern',  # Don't change this!
    ):
        """Store the settings; no data is loaded until ``fit`` is called.

        Args:
            instrument_name: instrument to pull, matched against the
                instrument name in the query.
            interval_name: candle interval to pull, matched against the
                interval name in the query.
            cut_before_year: rows whose year (in ``timezone_to_use``) is
                earlier than this are dropped.
            list_of_price_types: price types to pull, one query each. Several
                methods refer to 'bid', 'mid' and 'ask' columns by name.
            timezone_to_use: timezone name handed to ``pytz.timezone``. The
                derived columns are always suffixed ``_Eastern``.
        """
        # Copy into a fresh list so instances never share (or mutate) the
        # default value or a list owned by the caller.
        self.list_of_price_types = list(list_of_price_types)
        self.instrument_name = instrument_name
        self.interval_name = interval_name
        self.timezone_to_use = timezone_to_use
        self.cut_before_year = cut_before_year

    def create_initial_dataframe(self):
        """Query each price type and merge the results into ``self.df``.

        The o/l/h/c/volume columns of each result are prefixed with the price
        type (e.g. ``bid_o``). Results are left-merged onto the first price
        type's rows, keyed on ``timestamp`` and ``instrument_name``.
        """
        frames = []

        for pt in self.list_of_price_types:
            # NOTE(review): the values are interpolated straight into the SQL
            # text, so they must come from trusted notebook settings only.
            sql_to_run = get_candlestick_pull_query() % (pt, self.instrument_name, self.interval_name)
            df_pt = mysql_hook.get_pandas_df(sql_to_run)

            prefixed_names = {
                col_name: pt + '_' + col_name
                for col_name in ['o', 'l', 'h', 'c', 'volume']
            }
            frames.append(df_pt.rename(columns = prefixed_names))

        merged = frames[0]
        for other in frames[1:]:
            merged = pd.merge(merged, other, how = 'left', on = ['timestamp', 'instrument_name'])

        self.df = merged

    def add_date_and_time_related_features(self):
        """Add timezone-aware datetime, weekday, hour and year columns.

        ``timestamp`` is interpreted as a POSIX timestamp and converted to
        ``self.timezone_to_use``.
        """
        tz = pytz.timezone(self.timezone_to_use)
        self.df['timestamp_dt_Eastern'] = [datetime.datetime.fromtimestamp(x, tz = tz) for x in self.df['timestamp']]
        self.df['weekday_Eastern'] = [x.weekday() for x in self.df['timestamp_dt_Eastern']]
        self.df['hour_Eastern'] = [x.hour for x in self.df['timestamp_dt_Eastern']]
        self.df['year_Eastern'] = [x.year for x in self.df['timestamp_dt_Eastern']]

    def filter_by_date_and_time_related_features(self):
        """Keep only rows whose ``year_Eastern`` is at least ``cut_before_year``."""
        recent_enough = self.df['year_Eastern'] >= self.cut_before_year
        # Copy so later column assignments do not write into a view.
        self.df = self.df[recent_enough].copy()

    def conduct_volume_test(self):
        """Check that ask, mid and bid volumes agree on every row.

        When they do, the three per-price-type volume columns are replaced by
        a single ``volume`` column. The outcome is stored in
        ``self.volume_test``.
        """
        ask_volume = self.df['ask_volume'].values
        mid_volume = self.df['mid_volume'].values
        bid_volume = self.df['bid_volume'].values

        # np.all is vacuously True for an empty table; taking a minimum over
        # an empty comparison would raise ValueError instead.
        self.volume_test = bool(np.all((ask_volume == mid_volume) & (mid_volume == bid_volume)))

        if self.volume_test:
            self.df['volume'] = self.df['mid_volume']
            self.df.drop(columns = [x + '_volume' for x in self.list_of_price_types], inplace = True)

    def conduct_nan_test(self):
        """Record in ``self.are_there_nans`` whether any cell of ``df`` is missing."""
        # Equivalent to comparing row counts before and after dropna(), but
        # without building the filtered copy.
        self.are_there_nans = bool(self.df.isna().to_numpy().any())

    # https://www.investopedia.com/terms/s/spread.asp#toc-5-forex-spreads
    def compute_spread(self):
        """Add ``spread_<field>`` = ask minus bid for each of o, l, h, c."""
        for item in self._OHLC_FIELDS:
            self.df['spread_' + item] = self.df['ask_' + item] - self.df['bid_' + item]

    def is_mid_really_the_mid(self):
        """Compare the stored mid prices with (bid + ask) / 2.

        For each of o, l, h, c the midpoint is recomputed (rounded to 5
        decimals), compared with the stored mid, and the largest absolute gap
        is stored in ``self.dict_max_abs_diff_in_mid_check``. The columns
        involved are copied to ``self.df_mid_test``; the temporary columns are
        then removed from ``self.df``.
        """
        self.dict_max_abs_diff_in_mid_check = {}
        columns_used_here = []
        temporary_columns = []

        for item in self._OHLC_FIELDS:
            stored = 'mid_' + item
            computed = 'mid_computed_' + item
            compare = 'mid_compare_' + item
            diff = 'mid_diff_' + item

            self.df[computed] = ((self.df['bid_' + item] + self.df['ask_' + item]) / 2.).round(5)
            self.df[compare] = self.df[stored] == self.df[computed]
            self.df[diff] = (self.df[stored] - self.df[computed]).abs()

            self.dict_max_abs_diff_in_mid_check[item] = self.df[diff].max()

            columns_used_here.extend([stored, computed, compare, diff])
            temporary_columns.extend([computed, compare, diff])

        self.df_mid_test = self.df[columns_used_here].copy()

        # The stored mid columns stay in df; only the helper columns go.
        self.df.drop(columns = temporary_columns, inplace = True)

    def compute_return_and_volatility(self):
        """Add per-price-type ``_return`` (c - o) and ``_volatility`` (h - l)."""
        for pt in self.list_of_price_types:
            self.df[pt + '_return'] = self.df[pt + '_c'] - self.df[pt + '_o']
            self.df[pt + '_volatility'] = self.df[pt + '_h'] - self.df[pt + '_l']

    def fit(self):
        """Run the whole pipeline: load, test, enrich, filter and derive.

        The volume test runs on the unfiltered table; the NaN test and all
        derived columns are computed after the year filter.
        """
        self.create_initial_dataframe()
        self.conduct_volume_test()
        self.add_date_and_time_related_features()
        self.filter_by_date_and_time_related_features()
        self.conduct_nan_test()
        self.compute_spread()
        self.is_mid_really_the_mid()
        self.compute_return_and_volatility()

In [None]:
# Build the table with the constructor defaults and run every pipeline step.
cdf = CandlestickDataFrame()
cdf.fit()

In [None]:
# Display the working DataFrame produced by fit().
cdf.df

In [None]:
# True when dropping rows with missing values would shorten cdf.df.
cdf.are_there_nans

In [None]:
# True when the ask, mid and bid volumes matched on every row.
cdf.volume_test

In [None]:
# Largest absolute gap between the stored mid and the recomputed (bid + ask) / 2,
# per price field.
import pprint as pp
pp.pprint(cdf.dict_max_abs_diff_in_mid_check)

In [None]:
# Row-level detail behind the mid check: stored mid, recomputed mid, equality
# flag and absolute difference for each price field.
cdf.df_mid_test

In [None]:
# Mid-price close minus open for each candle, as a NumPy array.
cdf.df['mid_return'].values

In [None]:
# Disabled alternative: count of rows per year.
#(
#    cdf.df
#    .groupby('year_Eastern')
#    ['timestamp']
#    .agg('count')
#)

# Distribution of volume within each year, one box per year.
cdf.df.boxplot(column = 'volume', by = 'year_Eastern')
plt.xticks(rotation = 80)
plt.show()
plt.close()

In [None]:
def plot_something_and_diff(df, column):
    """Plot a column against time, with its first difference underneath.

    Args:
        df: DataFrame holding ``column`` and a ``timestamp_dt_Eastern`` column
            used for the x axis.
        column: name of the column to plot.

    The figure has two stacked panels: the raw values on top and, below, each
    value minus its predecessor (plotted against the later timestamp). The
    figure is shown and then closed; nothing is returned.
    """
    plt.figure()

    # Top panel: the series itself.
    series = df[column].values
    plt.subplot(2, 1, 1)
    plt.plot(df['timestamp_dt_Eastern'].values, series)
    plt.tight_layout()

    # Bottom panel: consecutive differences, one point fewer than the series.
    later_values = df[column].values[1:]
    earlier_values = df[column].values[0:-1]
    plt.subplot(2, 1, 2)
    plt.plot(df['timestamp_dt_Eastern'].values[1:], later_values - earlier_values)
    plt.tight_layout()

    plt.show()
    plt.close()

# Mid-price close-minus-open over time, together with its first difference.
plot_something_and_diff(cdf.df, 'mid_return')

In [None]:
# Calendar date of each candle, taken from the timezone-aware datetime column.
list_of_dates = [x.date() for x in cdf.df['timestamp_dt_Eastern']]

# Two-column frame with the 'ds' / 'y' names that is passed to Prophet.fit below.
df_to_fb_prophet = pd.DataFrame({'ds' : list_of_dates, 'y' : cdf.df['mid_return']})
df_to_fb_prophet

In [None]:
# Fit a Prophet model with its default settings on the mid-return series.
from prophet import Prophet
m = Prophet()
m.fit(df_to_fb_prophet)

# Extend the fitted dates by 10 periods and predict over the whole range.
future = m.make_future_dataframe(periods = 10)
forecast = m.predict(future)
# Show the point forecast together with its lower and upper bounds.
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

In [None]:
# Prophet.plot creates and returns its own figure, so opening one beforehand
# would only leave an extra, empty figure behind.
forecast_figure = m.plot(forecast)
plt.show()
# Close exactly the figure Prophet created.
plt.close(forecast_figure)

In [None]:
# 
# Forex spreads are the differences between the bid price (the price at which you can sell a currency pair) and the ask price (the price at which you can buy a currency pair). This spread is essentially the cost of trading and the primary way that forex brokers make money. The size of the spread depends on market liquidity, volatility, and the specific currency pair being traded.

# Major currency pairs like EUR/USD typically have tighter spreads because of high liquidity, while exotic pairs may have wider spreads.

#For traders, especially those engaged in short-term strategies like day trading or scalping, the spread is a crucial consideration as it directly affects the profitability of each trade. Wider spreads mean a trade needs to move further in the trader's favor just to break even. Some brokers offer fixed spreads, while others provide variable spreads that fluctuate with market conditions. 

In [None]:
# First and last calendar dates covered by the candles.
start = np.min(cdf.df['timestamp_dt_Eastern']).date()
end = np.max(cdf.df['timestamp_dt_Eastern']).date() #+ datetime.timedelta(days = 1)
# Every calendar day between them, including days that have no candle.
date_range = pd.date_range(start = start, end = end, freq = 'D')

# 'dummy' is a placeholder column; it is dropped again after the merge below.
df_all_dates = pd.DataFrame({'ds' : date_range, 'dummy' : 1})

# Rebuild the Prophet input and convert 'ds' to datetimes so it can be merged
# with the date_range values.
df_to_fb_prophet = pd.DataFrame({'ds' : list_of_dates, 'y' : cdf.df['mid_return']})
df_to_fb_prophet['ds'] = pd.to_datetime(df_to_fb_prophet['ds'])



In [None]:
# Small sample array; it is not referenced again in the visible cells.
q = np.array([1, 2, 3, 3, 3, 5, 7, 8, 8, 8, 9])
# Window length passed to window_mean_std and used to trim df_booger below.
window_size = 3

def window_mean_std(an_array, window_size = 10):
    """Return NaN-aware moving mean and standard deviation of ``an_array``.

    Args:
        an_array: sliceable sequence of numbers (NaNs are ignored inside each
            window via ``np.nanmean`` / ``np.nanstd``).
        window_size: number of consecutive elements per window.

    Returns:
        tuple of two NumPy arrays ``(means, stds)``. Each starts with
        ``window_size - 2`` NaNs, followed by one value per full window
        (windows taken in order from the start of the input), followed by a
        single trailing NaN. For ``2 <= window_size <= len(an_array)`` the
        outputs have the same length as the input, and entry ``j`` describes
        ``an_array[j - window_size + 2 : j + 2]``.
    """
    # One slice per full window, identified by its exclusive end position.
    windows = [
        an_array[(stop - window_size):stop]
        for stop in range(window_size, len(an_array) + 1)
    ]

    leading_gap = [np.nan] * (window_size - 2)
    trailing_gap = [np.nan]

    means = leading_gap + [np.nanmean(chunk) for chunk in windows] + trailing_gap
    stds = leading_gap + [np.nanstd(chunk) for chunk in windows] + trailing_gap

    return np.array(means), np.array(stds)

# Left-merge the returns onto the full calendar so every day has a row; days
# without a matching return get a missing 'y'. The placeholder column is dropped.
df_booger = pd.merge(
    df_all_dates,
    df_to_fb_prophet,
    how = 'left',
    on = 'ds',
).drop(columns = ['dummy'])

# Moving mean ('m') and standard deviation ('s') of the returns.
df_booger['m'], df_booger['s'] = (window_mean_std(df_booger['y'], window_size = window_size))
# Discard the first (window_size - 1) rows.
df_booger = df_booger.iloc[(window_size - 1):,:].copy()
# Keep the raw return, then replace 'y' with its standardised value.
df_booger['y_original'] = df_booger['y']
df_booger['y'] = (df_booger['y'] - df_booger['m']) / df_booger['s']


df_booger

In [None]:
# Standardised return over time.
plt.figure()
plt.plot(df_booger['ds'], df_booger['y'])
plt.show()
plt.close()