In [1]:
# Parameters
initial_return_thresh = 0.1
upstream = {
    "data_saving": {
        "data_train": "C:\\Users\\berkayg\\Desktop\\Coding env\\crypto-prediction-project\\products\\data\\raw_train_data.csv",
        "data_validation": "C:\\Users\\berkayg\\Desktop\\Coding env\\crypto-prediction-project\\products\\data\\raw_validation_data.csv",
    }
}
product = {
    "nb": "C:\\Users\\berkayg\\Desktop\\Coding env\\crypto-prediction-project\\products\\notebooks\\process_data.ipynb",
    "data_train": "C:\\Users\\berkayg\\Desktop\\Coding env\\crypto-prediction-project\\products\\data\\processed_train_data.csv",
    "data_validation": "C:\\Users\\berkayg\\Desktop\\Coding env\\crypto-prediction-project\\products\\data\\processed_validation_data.csv",
}


In [2]:
import pandas as pd
import numpy as np

from scipy.signal import find_peaks

from datetime import datetime
import math
from sklearn.linear_model import LinearRegression
import plotly.express as px
from plotly import graph_objects as go

#import talib
import ta

In [3]:
columns = ['Close', 'Volume', 'Price', 'trend_ema_fast', 'trend_ema_slow', 'trend_sma_fast', 'trend_sma_slow']

In [4]:
def read_data(path):
    df = pd.read_csv(path, header=[0, 1],index_col=[0], parse_dates=[0])
    df = df.droplevel(0, axis=1)
    return df

In [5]:
df_train = read_data(upstream["data_saving"]["data_train"])
df_validation = read_data(upstream["data_saving"]["data_validation"])
df_total = pd.concat([df_validation, df_train])

# Final version

In [6]:
def _action(x):
    if not pd.isnull(x["loc_min"]):
        return "buy"
    elif not pd.isnull(x["loc_max"]):
        return "sell"
    else:
        return "neutral"

In [7]:
def find_extremas(dataframe, **kwargs):
    """Finds local maximas/minimas in the series
    Returns: List of extremas indices and dataframe with extremas added
    """
    df = dataframe.copy()
    df["returns"] = df["Close"].pct_change().fillna(0) * 100
    df["returns"] = df["returns"].shift(-1)
    df = df.iloc[:-1]
    
    df['loc_max'] = df.iloc[find_peaks(df.reset_index(drop=True)['Close'].to_numpy(), **kwargs)[0]]['Close']
    df['loc_min'] = df.iloc[find_peaks(df.reset_index(drop=True)['Close'].to_numpy() * -1, **kwargs)[0]]['Close']
    
    idx_with_mins = np.where(df['loc_min'] > 0)[0]
    idx_with_maxs = np.where(df['loc_max'] > 0)[0]
    idx_concat = np.concatenate([idx_with_mins, idx_with_maxs])
    
    first_idx_bool = idx_with_mins.min() > idx_with_maxs.min()
    if first_idx_bool and 0 not in idx_concat:
        idx_with_mins = np.append(0, idx_with_mins)
        df.iloc[0, df.columns.get_loc("loc_min")] = df.iloc[0, df.columns.get_loc("Close")]
    
    elif not first_idx_bool and 0 not in idx_concat:
        idx_with_maxs = np.append(0, idx_with_maxs)
        df.iloc[0, df.columns.get_loc("loc_max")] = df.iloc[0, df.columns.get_loc("Close")]
    
    return idx_with_maxs, idx_with_mins, idx_concat, df

In [8]:
def optimize_extremas(df, idx_concat,thresh=1):
    """Remove unsignificant peaks and dips
    Returns: Dataframe
    """
    df["idx"] = ((df["loc_min"].isnull()) & (df["loc_max"].isnull())).cumsum()
    df["idx"] = np.arange(0, df.shape[0])
    df["count"] = np.arange(0, df.shape[0])
    df["count"] = (df["count"].isin(idx_concat)).cumsum()
    df['cum_return'] = df.sort_index(ascending=False).groupby('count')['returns'].transform(np.cumsum)#.shift()
    df.loc[(df["loc_min"].isnull() == False) & (abs(df["cum_return"]) < thresh), "loc_min"] = np.nan
    df.loc[(df["loc_max"].isnull() == False) & (abs(df["cum_return"]) < thresh), "loc_max"] = np.nan
    df["action"] = df.apply(_action, axis=1)
    return df

In [9]:
def plot_series(df):
    hover_data = {"Close": True, "Date": True, "returns":True, "cum_return":True, "idx":True, "count":True}
    fig_obj = px.line(x="Date", y="Close", data_frame=df.reset_index(), hover_data=hover_data)

    extrema_min = px.scatter(x="Date", y="loc_min", data_frame=df.reset_index(), hover_data=hover_data)
    extrema_min.update_traces(marker=dict(color='green'))

    extrema_max = px.scatter(x="Date", y="loc_max", data_frame=df.reset_index(), hover_data=hover_data)
    extrema_max.update_traces(marker=dict(color='red'))

    fig = fig_obj.data + extrema_min.data + extrema_max.data
    return go.Figure(fig)

In [10]:
def prepare_data(df, symbol, prominence=0.8, thresh=0.5):
    print(f"CONFIG: Symbol: {symbol} - Prominence: {prominence} - Thresh: {thresh}")
    try:
        df = df[[symbol]].droplevel(axis=1, level=0)
    except:
        pass
    _, _, idx_concat, df = find_extremas(df, prominence=prominence)
    df = optimize_extremas(df, idx_concat, thresh=thresh)
    fig = plot_series(df)
    print(df.action.value_counts())
    return idx_concat, df[["Open", "High", "Low", "Close", "Volume", "action"]], fig
    

In [11]:
def linear_regression(x, y):
    """
    performs linear regression given x and y. outputs regression coefficient
    """
    #fit linear regression
    lr = LinearRegression()
    lr.fit(x, y)
    
    return lr.coef_[0][0]

def n_day_regression(n, df, idxs):
    """
    n day regression.
    """
    #variable
    _varname_ = f'{n}_reg'
    df[_varname_] = np.nan

    for idx in idxs:
        if idx > n:
            
            y = df['Close'][idx - n: idx].to_numpy()
            x = np.arange(0, n)
            #reshape
            y = y.reshape(y.shape[0], 1)
            x = x.reshape(x.shape[0], 1)
            #calculate regression coefficient 
            coef = linear_regression(x, y)
            df.iloc[idx, df.columns.get_loc(_varname_)] = coef #add the new value
            
    return df


In [12]:
def Supertrend(df, atr_period, multiplier):
    
    high = df['High']
    low = df['Low']
    close = df['Close']
    center = df['center']
    
    # calculate ATR
    price_diffs = [high - low, 
                   high - close.shift(), 
                   close.shift() - low]
    true_range = pd.concat(price_diffs, axis=1)
    true_range = true_range.abs().max(axis=1)
    # default ATR calculation in supertrend indicator
    atr = true_range.ewm(alpha=1/atr_period,min_periods=atr_period).mean() 
    # df['atr'] = df['tr'].rolling(atr_period).mean()
    
    # HL2 is simply the average of high and low prices
    hl2 = (high + low) / 2
    # upperband and lowerband calculation
    # notice that final bands are set to be equal to the respective bands
    # final_upperband = upperband = hl2 + (multiplier * atr)
    # final_lowerband = lowerband = hl2 - (multiplier * atr)

    final_upperband = upperband = center + (multiplier * atr)
    final_lowerband = lowerband = center - (multiplier * atr)

    
    # initialize Supertrend column to True
    supertrend = [True] * len(df)
    
    for i in range(1, len(df.index)):
        curr, prev = i, i-1
        
        # if current close price crosses above upperband
        if close[curr] > final_upperband[prev]:
            supertrend[curr] = True
        # if current close price crosses below lowerband
        elif close[curr] < final_lowerband[prev]:
            supertrend[curr] = False
        # else, the trend continues
        else:
            supertrend[curr] = supertrend[prev]
            
            # adjustment to the final bands
            if supertrend[curr] == True and final_lowerband[curr] < final_lowerband[prev]:
                final_lowerband[curr] = final_lowerband[prev]
            if supertrend[curr] == False and final_upperband[curr] > final_upperband[prev]:
                final_upperband[curr] = final_upperband[prev]

        # to remove bands according to the trend direction
        if supertrend[curr] == True:
            final_upperband[curr] = np.nan
        else:
            final_lowerband[curr] = np.nan
    
    return pd.DataFrame({
        'Supertrend': supertrend,
        'Final Lowerband': final_lowerband,
        'Final Upperband': final_upperband
    }, index=df.index)

In [13]:
def supertrend_processing(df):
    if "action" in df.columns:
        df = df.drop(columns=["action"])
    

    df["ph_arg"] = df["High"].rolling(2).apply(lambda x: np.argmax(x))
    df["ph_arg_rev"] = df["High"].sort_index(ascending=False).rolling(2).apply(lambda x: np.argmax(x))
    df["ph"] = (df["ph_arg"] == 1) & (df["ph_arg_rev"] == 1)
    df["ph"] = df.apply(lambda x: x["High"] if x["ph"] == 1 else np.nan, axis=1)

    df["pl_arg"] = df["Low"].rolling(2).apply(lambda x: np.argmin(x))
    df["pl_arg_rev"] = df["Low"].sort_index(ascending=False).rolling(2).apply(lambda x: np.argmin(x))
    df["pl"] = (df["pl_arg"] == 1) & (df["pl_arg_rev"] == 1)
    df["pl"] = df.apply(lambda x: x["Low"] if x["pl"] == 1 else np.nan, axis=1)

    df.drop(columns=["ph_arg_rev", "ph_arg", "pl_arg_rev", "pl_arg"], inplace=True)
    df = extract_pivot_centers(df)
    df["ph"] = np.where(df["ph"].isnull(), 0, 1)
    df["pl"] = np.where(df["pl"].isnull(), 0, 1)

    atr_period = 3
    high = df['High']
    low = df['Low']
    close = df['Close']
    center = df['center']
    # calculate ATR
    price_diffs = [high - low, 
                   high - close.shift(), 
                   close.shift() - low]
    true_range = pd.concat(price_diffs, axis=1)
    true_range = true_range.abs().max(axis=1)
    # default ATR calculation in supertrend indicator
    atr = true_range.ewm(alpha=1/atr_period,min_periods=atr_period).mean() 
    df["atr"] = atr
    df.dropna(inplace=True)
    
    return df

In [14]:
def extract_pivot_centers(df):
    df["center"] = np.nan
    center = np.nan
    for i, k in enumerate(df.iterrows()):
        ph = k[1]["ph"]
        pl = k[1]["pl"]
        if not pd.isnull(ph):
            lastpp = ph
            center = lastpp
        elif not pd.isnull(pl):
            lastpp = pl
            center = lastpp
        else:
            k[1]["center"] = center
            continue

        if pd.isnull(center):
            center = (center * 2 + lastpp) / 3

        k[1]["center"] = center

    return df

In [15]:
def process_data(df_source):
    df = ta.add_all_ta_features(df_source.drop(columns=["action"]), open="Open", high="High", low="Low", close="Close", volume="Volume")
    
    epsilon = 10e-10
    high = df["High"] - df["Low"]
    close = df["Close"] - df["Low"]
    df["Price"] = close/(high + epsilon)

    
    df = df[columns]
    df = n_day_regression(5, df, np.arange(0, df.shape[0]))
    df = n_day_regression(10, df, np.arange(0, df.shape[0]))
    df = n_day_regression(50, df, np.arange(0, df.shape[0]))

    df['action'] = df_source['action'].map(lambda x: 0 if x=="neutral" else 1)
    #df_processed['action'] = df_symbol['action'].map(action_dictionary)
    df["ema"] = df["trend_ema_fast"] / df["trend_ema_slow"]
    df["sma"] = df["trend_sma_fast"] / df["trend_sma_slow"]


    # import talib
    # df["RSI"] = talib.RSI(df_source["Close"])
    df["Stock_RSI"] = ta.momentum.StochRSIIndicator(df["Close"], window=5).stochrsi()
    df = df.dropna().drop(columns=["trend_ema_fast", "trend_ema_slow", "trend_sma_slow", "trend_sma_fast"])
    return df

In [16]:
df_supertrend = supertrend_processing(df_total)

In [17]:
custom_sum = lambda x: max(0, x.sum() - 1)
look_back_pivots = df_supertrend.rolling(5).agg({"ph": custom_sum, "pl": custom_sum}).add_prefix('5_step_lookback_').fillna(0)
df_supertrend = pd.concat([df_supertrend, look_back_pivots], axis=1)

In [18]:
idx_concat, df_symbol, fig_symbol = prepare_data(df_train, "VIDTUSDT", prominence=None, thresh=initial_return_thresh)

CONFIG: Symbol: VIDTUSDT - Prominence: None - Thresh: 0.1


neutral    13422
buy         3228
sell        3205
Name: action, dtype: int64


In [19]:
idx_concat_valid, df_symbol_valid, fig_symbol_valid = prepare_data(df_validation, "VIDTUSDT", prominence=None, thresh=initial_return_thresh)

CONFIG: Symbol: VIDTUSDT - Prominence: None - Thresh: 0.1


neutral    293
buy        107
sell        99
Name: action, dtype: int64


In [20]:
action_dictionary = {
    "buy": 1,
    "neutral": 0,
    "sell": 2
}

In [21]:
df_processed_valid = process_data(df_symbol_valid)
df_processed_valid.head()


invalid value encountered in double_scalars


invalid value encountered in double_scalars



Unnamed: 0_level_0,Close,Volume,Price,5_reg,10_reg,50_reg,action,ema,sma,Stock_RSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-03-01 04:15:00,43179.03,270.75635,0.348515,38.685,16.969273,-9.02423,1,0.999809,1.000316,0.134123
2022-03-01 04:20:00,43257.2,140.1357,0.994782,22.799,18.017818,-8.623711,0,1.000064,1.00063,0.513794
2022-03-01 04:25:00,43226.14,53.94089,0.153345,36.165,23.814727,-8.157106,1,1.000205,1.000886,0.255868
2022-03-01 04:30:00,43284.34,147.51626,0.813199,22.386,20.116242,-8.015828,0,1.00042,1.001172,0.497679
2022-03-01 04:35:00,43280.01,61.06185,0.788574,5.623,21.849394,-7.699331,0,1.000577,1.001504,0.89241


In [22]:
df_processed = process_data(df_symbol)
df_processed.head()


invalid value encountered in double_scalars


invalid value encountered in double_scalars



Unnamed: 0_level_0,Close,Volume,Price,5_reg,10_reg,50_reg,action,ema,sma,Stock_RSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-03-02 21:55:00,44068.55,319.6649,0.866738,24.348,8.514242,1.252123,0,1.000938,1.000815,1.0
2022-03-02 22:00:00,44148.73,227.98166,0.758602,62.846,19.645636,2.491081,1,1.001401,1.001168,1.0
2022-03-02 22:05:00,43998.89,351.73024,0.0,90.114,30.519758,3.481627,0,1.001476,1.001339,0.0
2022-03-02 22:10:00,43999.99,94.95091,0.476972,58.718,34.832364,4.021403,0,1.001519,1.001592,0.004056
2022-03-02 22:15:00,43981.67,99.67554,0.858934,12.386,35.864061,4.859007,1,1.001503,1.001626,0.0


In [23]:
df_processed = pd.concat([df_processed, df_supertrend[["ph", "pl", "center", "5_step_lookback_ph", "5_step_lookback_pl", "atr"]]], axis=1).dropna()

In [24]:
df_processed.head()

Unnamed: 0_level_0,Close,Volume,Price,5_reg,10_reg,50_reg,action,ema,sma,Stock_RSI,ph,pl,center,5_step_lookback_ph,5_step_lookback_pl,atr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2022-03-02 21:55:00,44068.55,319.6649,0.866738,24.348,8.514242,1.252123,0.0,1.000938,1.000815,1.0,0,0,43796.28,0.0,0.0,116.996647
2022-03-02 22:00:00,44148.73,227.98166,0.758602,62.846,19.645636,2.491081,1.0,1.001401,1.001168,1.0,0,0,43796.28,0.0,0.0,121.204431
2022-03-02 22:05:00,43998.89,351.73024,0.0,90.114,30.519758,3.481627,0.0,1.001476,1.001339,0.0,1,0,44380.0,0.0,0.0,207.839621
2022-03-02 22:10:00,43999.99,94.95091,0.476972,58.718,34.832364,4.021403,0.0,1.001519,1.001592,0.004056,0,0,44380.0,0.0,0.0,171.056414
2022-03-02 22:15:00,43981.67,99.67554,0.858934,12.386,35.864061,4.859007,1.0,1.001503,1.001626,0.0,0,1,43870.0,0.0,0.0,157.374276


In [25]:
df_processed_valid = pd.concat([df_processed_valid, df_supertrend[["ph", "pl", "center", "5_step_lookback_ph", "5_step_lookback_pl", "atr"]]], axis=1).dropna()

In [26]:
df_processed_valid.head()

Unnamed: 0_level_0,Close,Volume,Price,5_reg,10_reg,50_reg,action,ema,sma,Stock_RSI,ph,pl,center,5_step_lookback_ph,5_step_lookback_pl,atr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2022-03-01 04:15:00,43179.03,270.75635,0.348515,38.685,16.969273,-9.02423,1.0,0.999809,1.000316,0.134123,0,0,43288.11,1.0,1.0,136.635784
2022-03-01 04:20:00,43257.2,140.1357,0.994782,22.799,18.017818,-8.623711,0.0,1.000064,1.00063,0.513794,0,0,43288.11,0.0,0.0,117.283856
2022-03-01 04:25:00,43226.14,53.94089,0.153345,36.165,23.814727,-8.157106,1.0,1.000205,1.000886,0.255868,0,0,43288.11,0.0,0.0,91.492571
2022-03-01 04:30:00,43284.34,147.51626,0.813199,22.386,20.116242,-8.015828,0.0,1.00042,1.001172,0.497679,1,1,43307.72,1.0,1.0,102.715047
2022-03-01 04:35:00,43280.01,61.06185,0.788574,5.623,21.849394,-7.699331,0.0,1.000577,1.001504,0.89241,0,0,43307.72,0.0,0.0,88.373365


In [27]:
df_processed.to_csv(product["data_train"])
df_processed_valid.to_csv(product["data_validation"])