In [1]:
import yfinance as yf
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
# from backtesting import Backtest, Strategy
import itertools
import talib
from plotly.subplots import make_subplots
import numpy as np
from sklearn.cluster import KMeans
import holidays 

import warnings
warnings.filterwarnings("ignore")

In [2]:
TICKER: str = "SOL-USD"

data_period = {
    "start": "2024-01-01", 
    "end": "2024-12-31",
}

interval = '1d'

lags_count: int  = 7 # Количество лагов (lags)
stride: int = 1 # Интервалы между лагами (stride)

stat_len: int = 7 # Среднее (mean): скользящее среднее цен акций за последние N цен

diff_count: int  = 7 # Количество diff

pct_change_count: int  = 7 # Количество PCT

prev_tick_trend_sum_count: int = 7 


last_n_ticker_cluster_count: int = 7



output_file_name: str = f"{TICKER}_{interval}_{data_period["start"]}-{data_period["end"]}"


### Load data

In [3]:
# Загрузка данных

def load_data(_ticker, _start, _end, _interval):
    stock_data: pd.DataFrame = yf.download(_ticker, start=_start, end=_end, interval=_interval)
    try:
        stock_data: pd.DataFrame = stock_data.drop(columns=['Adj Close'])
    except KeyError:
        ...
    stock_data = stock_data.reset_index()
    stock_data = stock_data.droplevel("Ticker", axis=1)
    stock_data.columns = stock_data.columns.str.lower()

    if "datetime" in stock_data.columns:
        stock_data["datetime"] = pd.to_datetime(stock_data["datetime"])
    else:
        stock_data["datetime"] = pd.to_datetime(stock_data["date"])
    stock_data.columns.name = None
    stock_data.set_index("datetime", inplace=True)

    stock_data["date"] = stock_data.index
    return stock_data

stock_data = load_data(TICKER, data_period["start"], data_period["end"], interval)

[*********************100%***********************]  1 of 1 completed


### Feature Generators


In [4]:
def gen_dates(ohlc: pd.DataFrame) -> pd.DataFrame:
    """Generate features from date."""
    df = ohlc.copy()    
    df['year']=pd.to_numeric(df['date'].dt.strftime('%Y'))
    df['month']=pd.to_numeric(df['date'].dt.strftime('%m'))
    df['day']=pd.to_numeric(df['date'].dt.strftime('%d'))
    # df['year_month']=df['date'].dt.strftime('%Y_%m')
    df['hour']=pd.to_numeric(df['date'].dt.strftime('%H'))
    df['minute']=pd.to_numeric(df['date'].dt.strftime('%M'))
    return df

In [5]:
def lag_shift_close_price(ohlc: pd.DataFrame) -> pd.DataFrame:
    """Generate features from price lags."""
    df = ohlc.copy()

    for i in range(1, lags_count+1, stride):
        name: str = f"shift_{i}"
        df[name] = df["close"].shift(i)

    return df

In [6]:
def stat_base(ohlc: pd.DataFrame) -> pd.DataFrame:
    """Generate features based on statistics."""
    df = ohlc.copy()
    df["mean"] = df["close"].rolling(window=stat_len).mean()
    df["median"] = df["close"].rolling(window=stat_len).median()
    df["sum"] = df["close"].rolling(window=stat_len).sum()
    df["min"] = df["close"].rolling(window=stat_len).min()
    df["max"] = df["close"].rolling(window=stat_len).max()
    df["std"] = df["close"].rolling(window=stat_len).std()

    return df

In [7]:
def diff_close_price(ohlc: pd.DataFrame) -> pd.DataFrame:
    """Generate features based on diff price."""
    df = ohlc.copy()
    for i in range(1, diff_count + 1):
        df[f"diff_{i}"] =  df['close'].diff(i)
    return df

def pct_close_price(ohlc: pd.DataFrame) -> pd.DataFrame:
    df = ohlc.copy()
    for i in range(1, pct_change_count + 1):
        df[f"pct_{i}"] =  df['close'].pct_change(i)
    return df

In [8]:
def last_ticker_trend(ohlc: pd.DataFrame) -> pd.DataFrame:
    df = ohlc.copy()
    df["prev_tick_trend"] = df["close"].diff(1)
    df["prev_tick_trend"] = np.where(df['prev_tick_trend'] > 0 , 1, -1)
    for i in range(2, prev_tick_trend_sum_count+1):
        df[f"prev_tick_trend_sum_{i}"] = df["prev_tick_trend"].rolling(window=i).sum()    

    return df

In [9]:
def close_related_to_others(ohlc: pd.DataFrame) -> pd.DataFrame:
    df = ohlc.copy()
    for rel in ("open", "high", "low"):
        name: str = f"close_{rel}_rel"
        df[name] = df["close"] / df[rel]

    return df

def spread_between_prices(ohlc: pd.DataFrame) -> pd.DataFrame:
    df = ohlc.copy()
    for l, r in ( ("open", "close"), ("high", "low") ):
        name: str = f"{l}_{r}_spread"
        df[name] = df[l] / df[r]

    return df

In [10]:
def price_mutation(ohlc: pd.DataFrame) -> pd.DataFrame:
    df = ohlc.copy()
    df["ohlc_mean"] = ( df["open"] + df["high"] + df["low"] + df["close"] ) / 4
    return df

In [11]:
def camarilla(ohlc: pd.DataFrame) -> pd.DataFrame:
    """Camarilla lines."""
    df = ohlc.copy()
    prefix_name: str = "camarilla"

    df[f"{prefix_name}_h1"] = df["close"] + (df["high"]-df["low"]) * 1.1 / 12
    df[f"{prefix_name}_h2"] = df["close"] + (df["high"]-df["low"]) * 1.1 / 6
    df[f"{prefix_name}_h3"] = df["close"] + (df["high"]-df["low"]) * 1.1 / 4
    df[f"{prefix_name}_h4"] = df["close"] + (df["high"]-df["low"]) * 1.1 / 2
    df[f"{prefix_name}_h5"] = (df["high"] / df["low"]) * df["close"]

    df[f"{prefix_name}_L1"] = df["close"] - (df["high"]-df["low"]) *1.1 /12
    df[f"{prefix_name}_L2"] = df["close"] - (df["high"]-df["low"]) *1.1 /6
    df[f"{prefix_name}_L3"] = df["close"] - (df["high"]-df["low"]) *1.1 /4
    df[f"{prefix_name}_L4"] = df["close"] - (df["high"]-df["low"]) *1.1 /2
    df[f"{prefix_name}_L5"] = df["close"] - (df[f"{prefix_name}_h5"] - df["close"])

    return df

In [12]:
def volume_diff(ohlc: pd.DataFrame) -> pd.DataFrame:
    df = ohlc.copy()
    for i in range(1, diff_count + 1):
        df[f"diff_volume_{i}"] =  df['volume'].diff(i)
    return df

In [13]:
def market_turnover(ohlc: pd.DataFrame) -> pd.DataFrame:
    df = ohlc.copy()
    df["market_turnover"] = df["close"] * df["volume"]

    for i in range(1, diff_count + 1):
        df[f"diff_market_turnover_{i}"] =  df['market_turnover'].diff(i)

    for i in range(1, pct_change_count + 1):
        df[f"market_turnover_pct_{i}"] =  df['market_turnover'].pct_change(i)

    return df


In [14]:
def apply_log(ohlc: pd.DataFrame) -> pd.DataFrame:
    df = ohlc.copy()
    for column in ("close", "high", "low", "open", "volume", "market_turnover"):
        df[f"log_{column}"] = df[column].apply(np.log1p)
    return df

In [15]:
def set_holidays(ohlc: pd.DataFrame) -> pd.DataFrame:
    df = ohlc.copy()
    max_year =  df["year"].max()
    min_year = df["year"].min()
    
    all_holidays: dict[str, set] = {}
    holidays_func: dict[str, object] = {
        "us": holidays.US,
        "uk": holidays.UK,
        "de": holidays.DE,
        "jp": holidays.JP,
    }
    for name, fn in holidays_func.items():
        all_holidays[name] = set()
        for i in range(min_year, max_year+1):
            for d, _ in fn(years = i).items():
                all_holidays[name].add(d)

    for name, _ in holidays_func.items():
        country_holidays = all_holidays[name]
        df[f"{name}_holiday"] = np.where( df['date'].isin(country_holidays ), 1, 0)

    return df

In [16]:
def tech_indicators(ohlc: pd.DataFrame) -> pd.DataFrame:
    df = ohlc.copy()
    df["ema"] = talib.EMA(df["close"])    
    df["rsi"] = talib.RSI(df["close"])
    df["atr"] = talib.ATR(high=df["close"], low=df["low"], close=df["close"])
    df["bollinger_upper"], df["bollinger_middle"], df["bollinger_lower"] = talib.BBANDS(df["close"], timeperiod=20)
    
    df["momentum"] = talib.MOM(df["close"])
    for i in range(1, lags_count+1, stride):
        name: str = f"momentum_shift_{i}"
        df[name] = df["momentum"].shift(i)

    df["obv"] = talib.OBV(df["close"], df["volume"])

    df["MACD"], df["MACDsignal"], df["MACDhist"] = talib.MACD(df["close"], fastperiod=12, slowperiod=26, signalperiod=9)
    

    return df

In [17]:
def tech_relation_with_price(ohlc: pd.DataFrame) -> pd.DataFrame:
    df = ohlc.copy()
    df["ema_close_up_down"] = np.where(df['ema'] < df["close"], 1, -1)
    df["distance_close_ema"] = df["close"] - df["ema"]

    # ToDo Remove rsi_line numbers to settings
    df["rsi_line"] = 0
    df["rsi_line"]  = np.where(df['rsi'] >= 70, 1, df["rsi_line"])
    df["rsi_line"]  = np.where(df['rsi'] <= 30, -1, df["rsi_line"])


    return df

In [18]:
def cluster(ohlc: pd.DataFrame) -> pd.DataFrame:
    df = ohlc.copy()
    kmeans = KMeans(n_clusters=6)

    # ToDo Подумать что включить для кластеризации
    feature_for_cluster = [
        'diff_1', 'pct_1', 'prev_tick_trend', 'prev_tick_trend_sum_2', 
        'close_open_rel', 'close_high_rel', 'close_low_rel', 
    ]
    df["cluster"] = kmeans.fit_predict(df[feature_for_cluster])
    df["cluster"] = df["cluster"].astype("category")



    
    feature_for_close_cluster = ["close"]
    for i in range(1, lags_count+1, stride):
        name: str = f"shift_{i}"
        feature_for_close_cluster.append(name)

    kmeans = KMeans(n_clusters=last_n_ticker_cluster_count)
    df["last_n_ticker_cluster"] = kmeans.fit_predict(df[feature_for_close_cluster])
    df["last_n_ticker_cluster"] = df["last_n_ticker_cluster"].astype("category")


    return df

In [19]:
def drop_nn(ohlc: pd.DataFrame) -> pd.DataFrame:
    df = ohlc.copy()
    df = df.dropna()
    return df

### Features

In [20]:
features: dict = {
    # 0. обработка даты
    "dates": gen_dates,

    # 1. Lags
    "lag_shift_close_price": lag_shift_close_price,

    # 2. На основе статистики
    "stat_base": stat_base,    

    # 3. Diff
    "diff_close_price": diff_close_price,
    "pct_close_price": pct_close_price,
    
    # 4. Trend based
    "last_ticker_trend": last_ticker_trend,

    # 5. Relations
    "close_related_to_others": close_related_to_others,
    "spread_between_prices": spread_between_prices,
    "price_mutation": price_mutation,

    # 6. Volume based
    "volume_diff": volume_diff,

    # 7. Support and resistance lines
    "camarilla": camarilla,
    
    # 8. Market turnover
    "market_turnover": market_turnover,

    # 9. LOG
    "log": apply_log,

    # 10. Extra data
    "set_holidays": set_holidays,

    # Tech. indicators``
    "tech": tech_indicators,
    "tech_relation_with_price": tech_relation_with_price,

    # 12. Cluster
    "drop_non": drop_nn,
    "cluster": cluster,

}

In [21]:
stock_data

Unnamed: 0_level_0,date,close,high,low,open,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-01,2024-01-01,109.508682,109.508682,101.512810,101.512810,2157671990
2024-01-02,2024-01-02,106.611221,116.882492,106.388985,109.549065,3782057553
2024-01-03,2024-01-03,98.594437,109.848091,92.923668,106.616531,5472216595
2024-01-04,2024-01-04,105.077538,107.982468,97.108330,98.586975,3272723247
2024-01-05,2024-01-05,99.978790,105.235428,95.942726,105.052010,3022127826
...,...,...,...,...,...,...
2024-12-26,2024-12-26,188.218109,199.397980,186.855591,197.475159,2864701219
2024-12-27,2024-12-27,183.828476,193.753357,182.947128,188.216553,2788285851
2024-12-28,2024-12-28,195.013535,195.435440,183.833328,183.833389,2158687217
2024-12-29,2024-12-29,189.744263,197.181076,188.341003,195.022446,2165753102


In [22]:
exp = stock_data.copy()

In [23]:
# Generate features

for name, func in features.items():
    print(f"Generate features { name }")
    exp = func(exp)


Generate features dates
Generate features lag_shift_close_price
Generate features stat_base
Generate features diff_close_price
Generate features pct_close_price
Generate features last_ticker_trend
Generate features close_related_to_others
Generate features spread_between_prices
Generate features price_mutation
Generate features volume_diff
Generate features camarilla
Generate features market_turnover
Generate features log
Generate features set_holidays
Generate features tech
Generate features tech_relation_with_price
Generate features drop_non
Generate features cluster


### Target

In [24]:
def target_next_trend(ohlc: pd.DataFrame) -> pd.DataFrame:
    """Target is next price up or down."""
    df = ohlc.copy()

    df["y_next_trend"] = ohlc['close'].shift(-1)
    df["y_next_trend"] = np.where(df['y_next_trend'] > df['close'] , 1, -1)

    return df



In [25]:
exp = target_next_trend(exp)

In [26]:
exp

Unnamed: 0_level_0,date,close,high,low,open,volume,year,month,day,hour,...,obv,MACD,MACDsignal,MACDhist,ema_close_up_down,distance_close_ema,rsi_line,cluster,last_n_ticker_cluster,y_next_trend
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-02-03,2024-02-03,97.879982,101.058983,96.812576,100.451134,1349320819,2024,2,3,0,...,-1.107911e+09,-0.119864,-1.650858,1.530994,1,1.383946,0,2,3,-1
2024-02-04,2024-02-04,95.476959,98.457588,95.266373,97.885063,1304106726,2024,2,4,0,...,-2.412018e+09,-0.213808,-1.363448,1.149640,-1,-0.953330,0,2,3,1
2024-02-05,2024-02-05,95.548660,98.544044,94.292526,95.471466,1635327054,2024,2,5,0,...,-7.766906e+08,-0.279254,-1.146609,0.867355,-1,-0.824750,0,0,3,1
2024-02-06,2024-02-06,96.861076,97.790787,93.280739,95.549614,1699997346,2024,2,6,0,...,9.233067e+08,-0.222653,-0.961818,0.739165,1,0.456204,0,0,3,1
2024-02-07,2024-02-07,100.979279,101.236603,94.720222,96.861893,1733607189,2024,2,7,0,...,2.656914e+09,0.152747,-0.738905,0.891652,1,4.279283,0,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-26,2024-12-26,188.218109,199.397980,186.855591,197.475159,2864701219,2024,12,26,0,...,-9.042196e+09,-9.620007,-7.259207,-2.360800,-1,-21.315721,0,5,5,-1
2024-12-27,2024-12-27,183.828476,193.753357,182.947128,188.216553,2788285851,2024,12,27,0,...,-1.183048e+10,-10.110684,-7.829502,-2.281182,-1,-24.046945,0,5,5,1
2024-12-28,2024-12-28,195.013535,195.435440,183.833328,183.833389,2158687217,2024,12,28,0,...,-9.671794e+09,-9.487640,-8.161130,-1.326511,-1,-12.032087,0,4,5,-1
2024-12-29,2024-12-29,189.744263,197.181076,188.341003,195.022446,2165753102,2024,12,29,0,...,-1.183755e+10,-9.311720,-8.391248,-0.920472,-1,-16.185142,0,5,5,1


### Result

In [27]:
exp = exp.dropna()
del exp["date"]

In [28]:
exp.columns

Index(['close', 'high', 'low', 'open', 'volume', 'year', 'month', 'day',
       'hour', 'minute',
       ...
       'obv', 'MACD', 'MACDsignal', 'MACDhist', 'ema_close_up_down',
       'distance_close_ema', 'rsi_line', 'cluster', 'last_n_ticker_cluster',
       'y_next_trend'],
      dtype='object', length=116)

In [29]:
len(exp.columns)

116

In [30]:
exp.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 332 entries, 2024-02-03 to 2024-12-30
Columns: 116 entries, close to y_next_trend
dtypes: category(2), float64(100), int64(14)
memory usage: 299.3 KB


In [31]:
feature_count: int = len(exp.columns)
exp.to_csv(f"data/{output_file_name}_{feature_count}_feature")