In [9]:
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.schedulers.background import BackgroundScheduler
from tzlocal import get_localzone
from datetime import datetime
import plotly.graph_objects as go
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
tqdm.pandas()

import train_model as tm
import data_processing as dp
from data_loader import load_data_at_start_date, load_data_period
from features import FeatureEngineering

In [10]:
# Look-back settings handed to FeatureEngineering. The keys presumably name the
# fast / medium / slow EMA windows, the RSI window and the MACD
# (fast, slow, signal) spans -- TODO confirm against features.FeatureEngineering.
fe_params = dict(
    emaf=20,
    emam=100,
    emas=150,
    rsi=14,
    macd=[12, 26, 9],
)

# Single shared feature-engineering helper; get_trend_data() reads it as a global.
fe = FeatureEngineering(fe_params)

In [11]:
def train_model(params):
    """Run the training/prediction process for every ticker in ``params``.

    For each ticker the stored frame is loaded with ``dp.get_data``, narrowed
    to that ticker's trend features plus the shared indicator and target
    columns, and handed to ``tm.predict_process``.

    Args:
        params: Run configuration (dict). Keys read here: ``data_store_dir``,
            ``file_compress``, ``tickers``, ``trend_features`` (a list of
            ``{ticker: [feature names]}`` mappings, as returned by
            ``feature_generation``), ``trend_indicators`` and
            ``target_columns``.

    Side effects:
        ``params['features']`` is overwritten with the current ticker's
        feature list before each ``tm.predict_process`` call, so after the
        function returns it holds the last ticker's features.

    Raises:
        IndexError: If a ticker has no entry in ``params['trend_features']``.
            The lookup happens before the ticker's data is loaded and the
            message names the ticker (the exception type is unchanged from the
            previous bare "list index out of range").
    """
    data_store = params['data_store_dir']
    file_compress = params['file_compress']
    tickers = params['tickers']
    trend_features = params['trend_features']

    not_found = object()  # sentinel: distinguishes "no entry" from any stored value

    for ticker in tqdm(tickers):
        # First matching entry wins, mirroring the previous `[...][0]` lookup.
        ticker_features = next(
            (entry[ticker] for entry in trend_features if ticker in entry),
            not_found,
        )
        if ticker_features is not_found:
            raise IndexError(
                f"No trend features found for ticker {ticker!r} in "
                "params['trend_features']; run feature_generation() for it first."
            )

        # Stored data already contains the trend features and TA indicators.
        data_with_trend = dp.get_data(data_store, ticker, compress=file_compress)
        print(f'Train data length: {len(data_with_trend)}')

        features = ticker_features + params['trend_indicators']
        data_with_features = data_with_trend[features + params['target_columns']]

        # tm.predict_process receives the feature list through params.
        params['features'] = features
        tm.predict_process(data_with_features, params)

    print('=== Finish Train models ===')

In [12]:
def get_trend_data(data, params):
    """Run one ticker's raw data through the feature-engineering pipeline.

    Every step delegates to the module-level ``fe`` helper, in this order:
    enrich with indicators -> add target -> clear invalid targets ->
    validate outliers on ``'Close'`` -> create trend features.

    Args:
        data: Raw data for one ticker; passed straight to
            ``fe.enrich_with_indicators``.
        params: Run configuration. Keys read here: ``OHLCV``, ``lag_periods``,
            ``min_outliers`` and ``max_outliers``.

    Returns:
        Whatever ``fe.create_trend_features`` returns; ``feature_generation``
        unpacks it into two values.
    """
    base_columns = params['OHLCV']
    lags = params['lag_periods']
    lower_bound = params['min_outliers']
    upper_bound = params['max_outliers']

    enriched = fe.enrich_with_indicators(data)
    # Alternative target definition kept for reference: fe.add_target2(enriched)
    labelled = fe.add_target(enriched, lags)
    cleaned = fe.clear_invalid_targets(labelled)
    checked = fe.validate_outliers(cleaned, 'Close', lower_bound, upper_bound)

    return fe.create_trend_features(checked, base_columns, lags)

In [13]:
def feature_generation(params):
    """Build and persist trend features for every configured ticker.

    For each ticker: read the newly downloaded raw data, drop rows with
    missing values, run ``get_trend_data`` on it, hand the result to
    ``dp.merge_and_store_new_data`` and optionally plot it.

    Args:
        params: Run configuration. Keys read here: ``tickers``,
            ``data_store_dir``, ``new_data_dir``, ``file_compress`` and
            ``is_plot`` (plus whatever ``get_trend_data`` reads).

    Returns:
        list: One single-key dict ``{ticker: <second value returned by
        get_trend_data>}`` per ticker, in ``params['tickers']`` order.
    """
    print('=== Start feature generation ===')

    tickers = params['tickers']
    store_dir = params['data_store_dir']
    raw_dir = params['new_data_dir']
    compress_store = params['file_compress']

    features_per_ticker = []
    for symbol in tqdm(tickers):
        print(f'=== Add feature, ticker: {symbol} ===')

        # Newly downloaded raw data is always read with compress=False.
        raw_frame = dp.get_data(raw_dir, symbol, compress=False)
        raw_frame.dropna(inplace=True)

        trend_frame, trend_columns = get_trend_data(raw_frame, params)

        # Per the original author's note: merges with previously stored data
        # and checks for duplicated rows.
        dp.merge_and_store_new_data(trend_frame, symbol, store_dir, compress=compress_store)

        features_per_ticker.append({symbol: trend_columns})

        if params['is_plot']:
            dp.plot_data(trend_frame)

    print('=== Finish feature generation ===')
    return features_per_ticker

In [14]:
def get_models():
    """Return a fresh list of the ``tm.ModelFunc`` entries that are enabled.

    Currently enabled, in list order: DECISION_TREE_CLASS, KNN_CLASS.

    Disabled alternatives on ``tm.ModelFunc`` (add them to the list to
    enable): LOGISTIC_REG, LINEAR_REG, KNN_REG, DECISION_TREE_REG,
    RANDOM_FOREST_REG, CATBOOST_REG, XGBOOST_REG, XGBOOST_CLASS,
    CATBOOST_CLASS, RANDOM_FOREST_CLASS.
    """
    return [
        tm.ModelFunc.DECISION_TREE_CLASS,
        tm.ModelFunc.KNN_CLASS,
    ]

### Executing job scheduler

In [15]:
# Run configuration shared by the pipeline functions in this notebook.
history_start = datetime(2019, 1, 1)
# Evaluated once, when this cell runs (not re-computed on later start_job calls).
days_since_start = (datetime.now() - history_start).days

params = {
    # -- Raw data download and storage (read by start_job / feature_generation) --
    'new_data_dir': 'crypto_data',
    'data_store_dir': '_data_store',
    'time_interval': '1d',
    'period': -days_since_start,  # negative number of days back to history_start
    'tickers': ['BTC-USD'],
    # 'tickers': ['BTC-USD', 'ETH-USD', 'SOL-USD', 'XRP-USD'],

    # -- Feature engineering (read by get_trend_data / train_model) --
    'lag_periods': 3,  # alternative tried: 7
    'min_outliers': 0.23,  # passed with max_outliers to fe.validate_outliers on 'Close'
    'max_outliers': 0.77,
    'trend_indicators': ['emaf', 'emam', 'emas', 'rsi', 'macd', 'adx'],
    'target_columns': ['Target'],
    'OHLCV': ['Open', 'High', 'Low', 'Close', 'Volume'],

    # -- Split sizes; not read in this notebook, presumably consumed inside
    #    train_model.py -- TODO confirm. Other pairs tried: (120, 60), (180, 90).
    'max_train_size': 90,
    'test_size': 30,

    # -- Storage, ensembling and run switches --
    'file_compress': True,
    'use_stacking': True,
    'use_blending': False,
    'model_funcs': get_models(),
    'train_func': train_model,
    'is_train': True,   # start_job trains only when this is truthy
    'is_plot': False,   # feature_generation plots each ticker when truthy
}

In [16]:
def start_job(params):
    """Run one end-to-end job: download data, build features, optionally train.

    Args:
        params: Run configuration. Keys read here: ``new_data_dir``,
            ``tickers``, ``period``, ``time_interval`` and ``is_train``
            (plus whatever ``feature_generation`` / ``train_model`` read).

    Side effects:
        When ``params['is_train']`` is truthy, ``params['trend_features']`` is
        set to the list returned by ``feature_generation`` before
        ``train_model`` is called. Start and finish timestamps are printed.
    """
    new_data_dir = params['new_data_dir']
    download_args = (
        params['tickers'],
        params['period'],
        params['time_interval'],
        new_data_dir,
    )

    print(f'=== Start job: {datetime.now()} ===')

    # Refresh the raw data; the loader's return value is not used here.
    # Fixed-window alternative:
    #   load_data_period(tickers, datetime(2019, 1, 1), datetime(2024, 12, 31), time_interval)
    load_data_at_start_date(*download_args)

    generated_features = feature_generation(params)

    if params['is_train']:
        params['trend_features'] = generated_features
        train_model(params)

    print(f'=== Finish job: {datetime.now()} ===')

In [17]:
# Run the whole pipeline once, immediately, with the configuration above.
start_job(params)

# Optional scheduling (disabled). To enable it, uncomment exactly ONE of the two
# scheduler constructors below (if both are active the second assignment
# replaces the first), then one add_job line, then scheduler.start().
# Note: the cron variant does not pass args=[params]; start_job requires it.
# scheduler = BlockingScheduler(job_defaults={'misfire_grace_time': 15*60})
# scheduler = BackgroundScheduler(job_defaults={'misfire_grace_time': 15*60})
# # scheduler.add_job(start_job, 'cron', day_of_week='mon-fri', hour='*/4', minute=5, jitter=120, timezone=get_localzone())
# scheduler.add_job(start_job, 'interval', seconds=15, args=[params])
# scheduler.start()

=== Start job: 2025-01-11 14:36:55.117992 ===
Start load data, tickers ['BTC-USD'], interval: 1d, from: -2202


[*********************100%***********************]  1 of 1 completed


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2203 entries, 2019-01-01 to 2025-01-11
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   (BTC-USD, Open)       2203 non-null   float64
 1   (BTC-USD, High)       2203 non-null   float64
 2   (BTC-USD, Low)        2203 non-null   float64
 3   (BTC-USD, Close)      2203 non-null   float64
 4   (BTC-USD, Adj Close)  2203 non-null   float64
 5   (BTC-USD, Volume)     2203 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 120.5 KB
Download data completed
=== Start feature generation ===


  0%|          | 0/1 [00:00<?, ?it/s]

=== Add feature, ticker: BTC-USD ===
Outliers detected: 0


100%|██████████| 1/1 [00:00<00:00,  1.58it/s]


=== Finish feature generation ===


  0%|          | 0/1 [00:00<?, ?it/s]

Train data length: 2048
=== Start Train models:
 [<function xgboost_classifier_model at 0x7f0731f5eb60>, <function catboot_classifier_model at 0x7f0731f5e660>, <function decision_tree_classifier_model at 0x7f0731f5e8e0>, <function knn_classifier_model at 0x7f0731f5ea20>] ===
     Train size: 4095, Val size: 1755, Test size: 1950
Pred Train size: 4095, Val size: 1755, Test size: 1950
=== Train sample metrics ===
ROC AUC: 1.0000
   Cutoff   Precision      Recall   Accuracy    F1-Score
0    50.0   99.953789  100.000000   99.97558   99.976889
1    60.0  100.000000  100.000000  100.00000  100.000000
2    70.0  100.000000  100.000000  100.00000  100.000000
3    80.0  100.000000   99.907536   99.95116   99.953747
=== Val sample metrics ===
ROC AUC: 0.4863
   Cutoff  Precision     Recall   Accuracy   F1-Score
0    50.0  51.239669  53.621622  48.660969  52.403592
1    60.0  50.991189  50.054054  48.319088  50.518276
2    70.0  50.835322  46.054054  48.091168  48.326716
3    80.0  50.913838  42.

100%|██████████| 1/1 [32:22<00:00, 1942.36s/it]

   Cutoff  Precision     Recall   Accuracy   F1-Score
0    50.0  52.194211  54.061896  49.384615  53.111639
1    60.0  52.234359  50.870406  49.282051  51.543361
2    70.0  51.991389  46.711799  48.871795  49.210392
3    80.0  51.847437  42.069632  48.564103  46.449546
=== Finish Train models ===
=== Finish job: 2025-01-11 15:09:20.610264 ===





In [59]:
# `scheduler` exists only if the (currently commented-out) scheduling lines
# were uncommented and run. Look it up defensively so this cell does not raise
# NameError on a fresh "Restart & Run All", and skip the call when the
# scheduler is not running (APScheduler raises in that case).
active_scheduler = globals().get('scheduler')
if active_scheduler is not None and active_scheduler.running:
    active_scheduler.shutdown(wait=False)