In [2]:
from random import randint, randrange
import string
from time import time
from tkinter.tix import Tree
from turtle import width
import pandas as pd
from plotly.subplots import make_subplots
import numpy as np
import matplotlib.pyplot as plt
from scipy import rand
from tqdm import tqdm
import mplfinance as mplf
import plotly.graph_objects as go

pd.set_option('float_format', '{:f}'.format)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


In [53]:
# In this script we add noise to candles. There are different ways of
# adding noise to candles and all the different methods are in the Noise
# class.
def getCandles(fileLocation : string) -> pd.DataFrame():
    # This fuunction reads a dataframe from a csv file
    return pd.read_csv(fileLocation)

def plotDataset(df:pd.DataFrame, method = "mplfinance") -> None:
    """
    The function to plot candlestick data.
    
    Arguments
    ---------
    df: pd.DataFrame: The dataframe containing open, high, close and low of candles in each column. 
        The dataframe has to contain a "date" column representing the open time of each cnadle. the
        dataframe can contain "close_time" and "volume" column but it is not necessary to provide 
        them.
    method: string: The method to plot the data. (Choose between "mplfinance" or "plotly")

    Returns
    -------
    None

    """
    
    if method == "mplfinance":
        import mplfinance as mplf
        
        # Setting the date column as index
        df.index = pd.DatetimeIndex(df['open_time'] if df.columns.tolist().__contains__("open_time") else df['date'])

        mplf.plot(
            df,
            style="charles",
            type="candle",
            volume=False if "volume" not in df.columns.tolist() else True, 
            title={"title": "Candlestick data"},
            tight_layout=True,
        )

    elif method == "plotly":
        import plotly.graph_objects as go  

        if df.columns.tolist().__contains__("open_time"):
            df["open_time"] = pd.to_datetime(df["open_time"], unit = "s")
        else:
            df["date"] = pd.to_datetime(df["date"], unit = "s") 
        
        noisedChart = go.Candlestick(
                x=df['open_time'] if df.columns.tolist().__contains__("open_time") else df['date'],
                open=df['open'],
                high=df['high'],
                low=df['low'],
                close=df['close'],
                name = "Candlestick data",
                # increasing={'line': {'color': 'blue'}},
                # decreasing={'line': {'color': 'purple'}},
                )

        layout = go.Layout(
            title = 'Overview',
            xaxis = go.XAxis(
                rangeslider_visible = False,
                showticklabels=False),
            yaxis = go.YAxis(
                title = 'Price',
                fixedrange = False
            ),
            yaxis2 = dict(range=[0, df["volume"].max()]) # Setting the range of volume bar
        )

        # fig = go.Figure(data=[noisedChart, noisedChar2t], layout = layout)
        fig = go.Figure(layout = layout)
        fig = make_subplots(rows=2, cols=1, shared_xaxes=True, 
                    vertical_spacing=0.03, subplot_titles=('', ''), 
                    row_width=[0.2, 0.7])

        fig.add_trace(noisedChart, row = 1, col = 1)

        #َ Add volume
        if df.columns.tolist().__contains__("volume"):
            fig.add_trace(go.Bar(x=df['open_time'] if df.columns.tolist().__contains__("open_time") else df['date'],
                        y=df['volume'], showlegend = True, name = "Volume"), row = 2, col = 1)

        fig.update_layout(layout)
        fig.show()

    else:
        print("Error, Method only mplfinance or plotly are allowed")

def saveDataset(df:pd.DataFrame, location:string, saveImage = False, method = "mplfinance") -> None:
    """
    The function to save the candlestick data to a pickle file.

    Arguments
    ---------
    df: pd.DataFrame: The dataframe containing open, high, close and low of candles in each column. 
        The dataframe has to contain a "date" column representing the open time of each cnadle. the
        dataframe can contain "close_time" and "volume" column but it is not necessary to provide 
        them.    
    location: string: The location of the file to save the data. It should contain file's name and 
    also it's extention (pkl for pickle files).
    saveImage: boolean: If true the candlestick plot is saved as an image.
    method: string: The method to plot the data. (Choose between "mplfinance" or "plotly")

    Returns
    -------
    None

    Note
    ----
    * To use plotly as a method of saving images, you need to have kaleido package installed.

    """

    if saveImage:
        if method == "mplfinance":    
            import mplfinance as mplf
            
            # Setting the date column as index
            df.index = pd.DatetimeIndex(df['open_time'] if df.columns.tolist().__contains__("open_time") else df['date'])

            mplf.plot(
                df,
                style="charles",
                type="candle",
                volume=False if "volume" not in df.columns.tolist() else True, 
                title={"title": "Candlestick data"},
                tight_layout=True,
                savefig = location.replace(".pkl",".png")
            )
        elif method == "plotly":
            import plotly.graph_objects as go  

            if df.columns.tolist().__contains__("open_time"):
                df["open_time"] = pd.to_datetime(df["open_time"], unit = "s")
            else:
                df["date"] = pd.to_datetime(df["date"], unit = "s") 
            
            noisedChart = go.Candlestick(
                    x=df['open_time'] if df.columns.tolist().__contains__("open_time") else df['date'],
                    open=df['open'],
                    high=df['high'],
                    low=df['low'],
                    close=df['close'],
                    name = "Candlestick data",
                    # increasing={'line': {'color': 'blue'}},
                    # decreasing={'line': {'color': 'purple'}},
                    )

            layout = go.Layout(
                title = 'Overview',
                xaxis = go.XAxis(
                    rangeslider_visible = False,
                    showticklabels=False),
                yaxis = go.YAxis(
                    title = 'Price',
                    fixedrange = False
                ),
                yaxis2 = dict(range=[0, df["volume"].max()]) # Setting the range of volume bar
            )

            # fig = go.Figure(data=[noisedChart, noisedChar2t], layout = layout)
            fig = go.Figure(layout = layout)
            fig = make_subplots(rows=2, cols=1, shared_xaxes=True, 
                        vertical_spacing=0.03, subplot_titles=('', ''), 
                        row_width=[0.2, 0.7])

            fig.add_trace(noisedChart, row = 1, col = 1)

            #َ Add volume
            if df.columns.tolist().__contains__("volume"):
                fig.add_trace(go.Bar(x=df['open_time'] if df.columns.tolist().__contains__("open_time") else df['date'],
                            y=df['volume'], showlegend = True, name = "Volume"), row = 2, col = 1)

            fig.update_layout(layout)
            fig.write_image(location.replace(".pkl",".png"), width=800, height=400)

    df.to_pickle(location)




In [4]:
class Noise():
    """
    The master class that contains the different methods of adding noise
    to dataset.
    """

    def __init__(self) -> None:
        """
        Initialize the class
        """
        pass

    def GaussianSeries(self, dataset: pd.Series, scale) -> pd.Series(dtype = "float64"):
        """
        Gets the gaussian noise for dataframe series. The mean value for calculation is
        the value of each row. The calculation formula of noise for a mean(u) and standard
        deviation (sigma) is stated below

        Noise(z) = exp(-(z-u)^2/2/sigma^2)/sigma/sqrt(2*pi)
        
        Arguments
        ---------
            dataset: a pandas series to add noise to
            scale: The standard deviation of the noise that will be added (In percentage units)

        
        Returns
        -------
            A pandas dataframe containing past data + Noise
        """
        scalePercent = scale / 100

        dataset = dataset.apply(lambda x: x + np.random.normal(loc = 0, scale = x * scalePercent, size = 1).item())

        return dataset

    def GaussianCnadles(self,
        time:pd.Series,
        open:pd.Series, 
        high:pd.Series, 
        close:pd.Series, 
        low:pd.Series, 
        volume:pd.Series, 
        close_time:pd.Series, 
        scale, 
        method: int
        ) -> pd.DataFrame(dtype="float64"):
        """
        Method that will add gaussian noise to a candlesstick dataset. There are two methods supported for now

        method1: The noise will be added to all the parameters of each candle. this will cause the candles to be
        non-continuous, i.e. The close of previous candle will not be equal to open of the current candle. (provided
        that the input candles are continous)

        method2: Adding the noise to high, low and the body of the candle. In this method open price of the first
        candle in dataset will be the same but the open of other candles will be cahnged with respect to the noise of 
        the candle's body (but the candles will stay continous).
        
        Arguments
        ---------
            open, high low, close: pd.Series: all pandas dataframes (or series)
            scale: [float]: The standard deviation of the noise that will be added respectively
            to body, upper wick and lower wick of the candles. if only one variable is passed 
            inside the array, it will be assigend to all the parts of the candle. (The scale has
            to be in percentage units)
        
        Returns
        -------
            A pandas dataframe with columns = ["time","open","high","low","close"]
        """
        _open, _high, _close, _low = None, None, None, None
        shape = time.shape[0]
        
        # Assigning the scales of noise in each part of candle
        if   len(scale) == 1:
            scale_body, scale_high, scale_low = 1 + scale[0] / 100, 1 + scale[0] / 100, 1 + scale[0] / 100
        elif len(scale) == 2:
            scale_body, scale_high, scale_low = 1 + scale[0] / 100, 1 + scale[1] / 100, 1 + scale[1] / 100
        elif len(scale) == 3:
            scale_body, scale_high, scale_low = 1 + scale[0] / 100, 1 + scale[1] / 100, 1 + scale[2] / 100

        # Convert series to dataframe
        frame = {"time":time,"open":open,"high":high,"low":low,"close":close}
        df = pd.DataFrame(frame)

        # We added this method to see if the prices contain negative value (which are not acceptable)
        # Only after the end of data generation we can know this, so we use a while loop 
        _continue = True

        while _continue:
            if method == 1:
                # The logic: In thsi method we do not care about not generating gaps. so we add noise
                # to the body of each candle but we keep the open price the same. this will avoid the 
                # noise to be accumulated and chart looks very much like the original (Without noise)
                
                df["Date"] = time
                df["open"] = open
                df["close"] = close
                df["high"] = high
                df["low"] = low
                df["volume"] = volume

                shape = time.shape[0]
                pert_body = np.random.normal(loc = 0, scale = 1, size = shape)
                pert_body /= max(pert_body)

                pert_upperWick = np.random.normal(loc = 0, scale = 1, size = shape)
                pert_upperWick /= max(pert_upperWick)

                pert_lowerWick = np.random.normal(loc = 0, scale = 1, size = shape)
                pert_lowerWick /= max(pert_lowerWick)

                temp = pd.DataFrame()
                temp["u1"] = df["high"] - df["close"]
                temp["u2"] = df["high"] - df["open"]
                temp["l1"] = df["close"] - df["low"]
                temp["l2"] = df["open"] - df["low"]

                _upperWickLen = temp[["u1","u2"]].max(axis=1)
                _lowerWickLen = temp[["u1","u2"]].max(axis=1)
                _bodyLen = close - open

                _body = _bodyLen * (1 + pert_body) * scale_body
                _high = _upperWickLen * (1 + pert_upperWick) * scale_high
                _low  = _lowerWickLen * (1 + pert_lowerWick) * scale_low
                _volume = volume * abs((_high - _low)/(high - low))

                df["_open"] = open
                df["_close"] = open + _body
                df["_high"] =  df[["_open","_close"]].max(axis=1) +_high
                df["_low"] =  df[["_open","_close"]].min(axis=1) +_low
                df["_volume"] = _volume

                frame = {
                    "date":time,
                    "open":df["_open"],
                    "high":df["_high"],
                    "low":df["_low"],
                    "close":df["_close"],
                    "volume":df["_volume"],
                    "close_time":close_time
                }

                returnDF = pd.DataFrame(frame)

                # Generate the data untill there are no negative values
                if not (returnDF.iloc[:,1:4] < 0).any().any():
                    _continue = False

            elif method == 2:
                # The logic: to keep the candles continouse and avoid making gamps, we use cumsum 
                # of the body + noise adn assign the value to the candle's close. and then we assign
                # the close of the current candle to open of the next candle
                
                df["Date"] = time
                df["open"] = open
                df["close"] = close
                df["high"] = high
                df["low"] = low
                df["volume"] = volume

                shape = time.shape[0]
                pert_body = np.random.normal(loc = 0, scale = 1, size = shape)
                pert_body /= max(pert_body)

                pert_upperWick = np.random.normal(loc = 0, scale = 1, size = shape)
                pert_upperWick /= max(pert_upperWick)

                pert_lowerWick = np.random.normal(loc = 0, scale = 1, size = shape)
                pert_lowerWick /= max(pert_lowerWick)

                temp = pd.DataFrame()
                temp["u1"] = df["high"] - df["close"]
                temp["u2"] = df["high"] - df["open"]
                temp["l1"] = df["close"] - df["low"]
                temp["l2"] = df["open"] - df["low"]

                _upperWickLen = temp[["u1","u2"]].max(axis=1)
                _lowerWickLen = temp[["u1","u2"]].max(axis=1)
                _bodyLen = close - open

                _body = _bodyLen * (1 + pert_body) * scale_body
                _high = _upperWickLen * (1 + pert_upperWick) * scale_high
                _low  = _lowerWickLen * (1 + pert_lowerWick) * scale_low
                _volume = volume * abs((_high - _low)/(high - low))

                _close = np.cumsum(_body) + close.iloc[0]
                df["_close"] = _close


                df["_open"] = df["_close"].shift(1)
                _k = df.columns.get_loc("_open")
                k = df.columns.get_loc("open")
                df.iloc[0,_k] = df.iloc[0,k]

                df["_high"] = df[["_open","_close"]].max(axis=1) + _high
                df["_low"]  = df[["_open","_close"]].min(axis=1) - _low

                df["_volume"] = _volume


                frame = {
                    "date":time,
                    "open":df["_open"],
                    "high":df["_high"],
                    "low":df["_low"],
                    "close":df["_close"],
                    "volume":df["_volume"],
                    "close_time":close_time
                }

                returnDF = pd.DataFrame(frame)

                
                # Generate the data untill there are no negative values
                if not (returnDF.iloc[:,1:4] < 0).any().any():
                    _continue = False

            else:
                print("Error, wrong method number provided!")
                
                _continue = False
        
        return returnDF

In [81]:
class GenerateCandles(Noise):
    """
    This class is for making random stock candlestick charts with different methods
    """

    def __init__(self) -> None:
        
        pass

    def BrownianMotion(self, maxMargin:float, dataset: pd.DataFrame()) -> pd.DataFrame:
        """
        Generates a brownian motion (Also know ans wiener process) with the given parameters.
        Note that the brownian motion is used to generate random stock prices as a time series 
        representable in a line chart. But in some startegies we need candlestick data (open, 
        high, low, close) to be able to run the backtest. We have chosen the random variables 
        generated by wiener process as the closing price of each candle. Also the oppening of
        the next candle is the closing of the previous candle. The only challange is to generate
        the high and low of the candle. we have used a normal distribution to generate theese 
        values but the maximum limit for the shadows will be a percentage of the cnalde body.
        This value will be a parameter of the function.

        Note that adding volume to the generated data is optional. For addign the volume a tuple 
        has to be passed to it. The first variable is the dataset to get the volume basis from.
        The volume basis is the average of the volume for each (High - low) in candle. later for 
        each candle generated, the difference between high and low will be multiplied by this basis 
        to get the volume for it.
        
        Updates:
        --------
        1. Major changes occured. Now the function, requires the user to provide a dataset containing
        the date, open, high, low, close and volume of the asset. Also the generated candle's body is 
        in percentage units instead of dollar units of the underlying asset. 



        Parameters
        ----------
        maxMargin: float: The maximum margin of the entire chart from $0
        dataset: pd.DataFrame: The dataset containing the date, open, high, low, close and volume


        To do:
        ------

        Returns
        -------
        df: pandas.DataFrame: The generated candles containing the date, open, high, low, close, 
        close_time and volume. (date is the oppening time of the candle)
        """
        dataset.reset_index(inplace=True, drop=True)

        # Because of shifting, we lose the last candle, so we add one to the candlesCount
        candlesCount = dataset.shape[0]
        startingPrice = dataset["open"].iloc[0]

        # Initiating the dataframe to be returned
        df = pd.DataFrame(columns = ["open_time","open","high","low","close","volume"])
        df["open_time"] = dataset["open_time"]

        # Calculate the volume basis and candle's basis
        tempBody = (dataset["close"] - dataset["open"]) / dataset[["close", "open"]].min(axis=1)
        tempUpperWick = (dataset["high"] - dataset[["close", "open"]].max(axis = 1) )
        tempLowerWick = (dataset[["close", "open"]].min(axis = 1) - dataset["low"])

        bodyMean = tempBody.mean()
        bodyStd = tempBody.std()

        tempLen = dataset["high"] - dataset["low"]
        volumeBasis = (dataset["volume"].rolling(10, min_periods = 1).mean()/tempLen).replace(to_replace = np.inf, value = 0)
        upperShadowBasis = tempUpperWick.rolling(10, min_periods = 1).mean()
        lowerShadowBasis = tempLowerWick.rolling(10, min_periods = 1).mean()

        upperShadowBasis.fillna(0, inplace=True)
        lowerShadowBasis.fillna(0, inplace=True)
        
        random = np.random.normal(loc = 0, scale = bodyStd, size = candlesCount)
        
        df["open"] = (1 + random.cumsum() * 1.1) * startingPrice 
        df["close"] = df["open"].shift(-1)
        
        _body = df["close"] - df["open"]
        
        lstUpperShadow = []
        lstLowerShadow = []
        # The amount of noise for generating the high and low wicks
        for i in range(candlesCount):
            lstUpperShadow.append(abs(np.random.normal(loc = upperShadowBasis[i], scale = upperShadowBasis[i], size = 1))[0])
            lstLowerShadow.append(abs(np.random.normal(loc = lowerShadowBasis[i], scale = upperShadowBasis[i], size = 1))[0])
        
        _upperWick = pd.Series(lstUpperShadow)
        _lowerWick = pd.Series(lstLowerShadow)

        up = df[["open","close"]].max(axis=1)
        down = df[["open","close"]].min(axis=1)

        df["low"] = down - tempLowerWick * 1.2
        df["high"] = up + tempUpperWick * 1.2
        df["volume"] = abs(_body * volumeBasis)
        df["close_time"] = df["open_time"].shift(-1)

        # Shifting the data to make negative prices positive
        lowestNum = df["low"].min()

        if lowestNum < 0:
            shift = randint(1,maxMargin)

            df["open"] += shift -lowestNum
            df["close"] += shift -lowestNum
            df["low"] += shift -lowestNum
            df["high"] += shift -lowestNum

        return df.iloc[:-2,:]

In [None]:
import datetime

candles = pd.read_pickle("../../../Data/HistoricalCandles/HistoricalCandles_BTCUSDT_5m_1502942400_1656156900.pkl")
candles["open_time_date"] = pd.to_datetime(candles["open_time"], unit = "s")

tempCandles = candles[
(datetime.datetime.strptime("2018-05-01 00:00:00", '%Y-%m-%d %H:%M:%S') <= candles["open_time_date"]) & 
(candles["open_time_date"] < datetime.datetime.strptime("2019-06-01 00:00:00", '%Y-%m-%d %H:%M:%S'))
]

print(tempCandles.shape[0])
cls = GenerateCandles()

generatedCandles = cls.BrownianMotion(
    maxMargin = 2000,
    dataset = tempCandles
)
plotDataset(generatedCandles, method = "plotly")


In [84]:
import datetime
candles = pd.read_pickle("../../../Data/HistoricalCandles/HistoricalCandles_BTCUSDT_5m_1502942400_1656156900.pkl")
candles["open_time_date"] = pd.to_datetime(candles["open_time"], unit = "s")

date_time_obj = datetime.datetime.strptime("2018-01-01 00:00:00", '%Y-%m-%d %H:%M:%S')
date_time_obj.timestamp()

dates = [
    ["2017-05-01 00:00:00" , str(candles["open_time_date"].iloc[-1])],
    ["2017-05-01 00:00:00" , "2021-01-01 00:00:00"],
    ["2021-01-01 00:00:00" , "2021-07-21 00:00:00"],
    ["2021-07-21 00:00:00" , "2021-10-31 00:00:00"],
    ["2021-10-31 00:00:00" , str(candles["open_time_date"].iloc[-1])],
    ]

cls = GenerateCandles()

for dateCouple in dates:
    for i in range(3):
        if dateCouple[1] == "inf":
            dateCouple[1] = "2030-01-01 00:00:00"
        tempCandles = candles[
            (datetime.datetime.strptime(dateCouple[0], '%Y-%m-%d %H:%M:%S') <= candles["open_time_date"]) & 
            (candles["open_time_date"] < datetime.datetime.strptime(dateCouple[1], '%Y-%m-%d %H:%M:%S'))
            ]


        generatedCandles = cls.BrownianMotion(
            maxMargin = 2000,
            dataset = tempCandles
        )
        fileName = "Generated_" + str(dateCouple[0].replace(":","-")) +"_"+ str(dateCouple[1].replace(":","-")) +f"({i}).pkl"
        # plotDataset(generatedCandles, method = "plotly")
        saveDataset(generatedCandles, location = fileName, saveImage = False)
        print(fileName, generatedCandles["open"].iloc[-1])


Generated_2017-05-01 00-00-00_2022-06-25 11-35-00(0).pkl 2694.0858312173905
Generated_2017-05-01 00-00-00_2022-06-25 11-35-00(1).pkl 1857.216311689317
Generated_2017-05-01 00-00-00_2022-06-25 11-35-00(2).pkl 15397.921170486094
Generated_2017-05-01 00-00-00_2021-01-01 00-00-00(0).pkl 8189.92352459458
Generated_2017-05-01 00-00-00_2021-01-01 00-00-00(1).pkl 15206.517965653664
Generated_2017-05-01 00-00-00_2021-01-01 00-00-00(2).pkl 14549.608263028673
Generated_2021-01-01 00-00-00_2021-07-21 00-00-00(0).pkl 8885.720992724222
Generated_2021-01-01 00-00-00_2021-07-21 00-00-00(1).pkl 12672.421470972473
Generated_2021-01-01 00-00-00_2021-07-21 00-00-00(2).pkl 21308.51974659966
Generated_2021-07-21 00-00-00_2021-10-31 00-00-00(0).pkl 58176.5937821371
Generated_2021-07-21 00-00-00_2021-10-31 00-00-00(1).pkl 21411.429654590665
Generated_2021-07-21 00-00-00_2021-10-31 00-00-00(2).pkl 29174.49859577
Generated_2021-10-31 00-00-00_2022-06-25 11-35-00(0).pkl 76665.35561104357
Generated_2021-10-31 00-

In [80]:
k = pd.read_pickle("Generated_2021-07-21 00-00-00_2021-10-31 00-00-00(2).pkl")
plotDataset(k, method = "plotly")


plotly.graph_objs.XAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.XAxis
  - plotly.graph_objs.layout.scene.XAxis



plotly.graph_objs.YAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.YAxis
  - plotly.graph_objs.layout.scene.YAxis




In [8]:
import datetime
date_time_obj = datetime.datetime.strptime("2018-01-01 00:00:00", '%Y-%m-%d %H:%M:%S')
date_time_obj.timestamp()

1514752200.0