In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost
import yfinance as yf
from sklearn.model_selection import TimeSeriesSplit
import pandas_ta as ta
from sqlalchemy import create_engine

In [None]:
# SQLite database used to store the hyperparameter-search results per ticker.
# Replace the placeholder below with the path of your destination database file.
db_filepath = "place here directory for your destination database"
engine = create_engine("sqlite:///" + db_filepath)

In [None]:

# Ticker symbols to run the hyperparameter search for.
# Replace the placeholder entry with the symbols you want to check.
ticker_list = ['insert tickers you want to check']

In [None]:
# Hyperparameter grid: every value of 'min_child_weight' is combined with
# every value of 'reg_alpha' by the search loop further down.
params = dict(
    min_child_weight=list(range(6)),
    reg_alpha=list(range(16)),
)


In [None]:
def download_data(ticker, startdate):
    """Download the price history of one ticker and prepare it for feature creation.

    The data is fetched with ``yf.download`` (yfinance, data from Yahoo Finance).
    The 'Adj Close' and 'Volume' columns are discarded when present and the
    index is moved into a regular column.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download``.
    startdate : str
        Start of the requested history, passed as the second positional
        argument of ``yf.download``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a default integer index; the former index becomes a
        column (the rest of the notebook expects it to be named 'Date').

    Raises
    ------
    ValueError
        If the download returned no rows.
    """
    df = yf.download(ticker, startdate)

    # Fail loudly instead of letting an empty frame flow into training.
    if df is None or df.empty:
        raise ValueError(f"No price data returned for ticker {ticker!r}")

    # NOTE(review): recent yfinance releases appear to return two-level
    # (field, ticker) columns even for a single symbol - confirm against the
    # installed version. Flattening keeps data['Close'] a Series downstream.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = list(df.columns.get_level_values(0))

    # errors='ignore': 'Adj Close' is not guaranteed to be present, and a
    # missing column to discard should not abort the whole ticker.
    return df.drop(columns=['Adj Close', 'Volume'], errors='ignore').reset_index()

In [None]:
def create_features(data):
    """Add the model's feature columns to ``data`` in place and return it.

    Columns are appended in this order:

    * ``previous_<field>``: the 'Close', 'Open', 'High' and 'Low' columns
      shifted by one row.
    * ``<field>_lag_<n>``: ``previous_<field>`` shifted by ``n`` rows, divided
      by ``previous_<field>``, minus 1.
    * ``ema_<span>``: ``ta.ema`` of ``previous_close`` for each span.
    * ``cross_ema_<slow>_<fast>``: 1 where ``ema_<slow>`` is greater than
      ``ema_<fast>``, otherwise 0 (rows where the comparison is not true,
      including rows with missing EMA values, get 0).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'Close', 'Open', 'High' and 'Low' columns. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the feature columns added.
    """
    price_fields = ('close', 'open', 'high', 'low')
    full_lags = (1, 7, 10, 14, 21, 31, 50, 100)
    # NOTE(review): unlike the other price fields, no 'low_lag_10' column is
    # created; that is kept exactly as it was - confirm whether it is intended.
    lag_steps = {
        'close': full_lags,
        'open': full_lags,
        'high': full_lags,
        'low': tuple(step for step in full_lags if step != 10),
    }
    ema_spans = (3, 5, 7, 10, 14, 21, 32, 50, 100)

    # Previous-row value of every price field.
    for field in price_fields:
        data[f'previous_{field}'] = data[field.capitalize()].shift(1)

    # Relative difference between an older previous-row price and the latest one.
    for field, steps in lag_steps.items():
        reference = data[f'previous_{field}']
        for step in steps:
            data[f'{field}_lag_{step}'] = reference.shift(step) / reference - 1

    # Exponential moving averages of the previous close.
    for span in ema_spans:
        data[f'ema_{span}'] = ta.ema(data['previous_close'], span)

    # One flag per (slower, faster) EMA pair, grouped by the faster span.
    for position, fast in enumerate(ema_spans):
        for slow in ema_spans[position + 1:]:
            slower_is_above = data[f'ema_{slow}'] > data[f'ema_{fast}']
            data[f'cross_ema_{slow}_{fast}'] = np.where(slower_is_above, 1, 0)

    return data

In [None]:
def create_model(train, test, FEATURES, TARGET, min_child_weight, reg_alpha,
                 n_estimators=2500, device='cuda'):
    """Fit an XGBoost regressor on ``train`` with early stopping on ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the ``FEATURES`` and ``TARGET`` columns.
    FEATURES : list of str
        Names of the input columns.
    TARGET : list of str
        Name(s) of the column(s) to predict.
    min_child_weight, reg_alpha
        Hyperparameters forwarded unchanged to ``xgboost.XGBRegressor``.
    n_estimators : int, default 2500
        Maximum number of boosting rounds. Early stopping patience is set to
        10% of this value (at least one round).
    device : str, default 'cuda'
        Forwarded to ``xgboost.XGBRegressor``; pass 'cpu' on a machine
        without a CUDA GPU.

    Returns
    -------
    xgboost.XGBRegressor
        The fitted model.

    Notes
    -----
    ``test`` is the evaluation set that drives early stopping. The notebook's
    search loop scores the model on that same frame, so the recorded scores
    are not from data the model selection never saw.
    """
    X_train = train[FEATURES]
    y_train = train[TARGET]

    X_test = test[FEATURES]
    y_test = test[TARGET]

    # Integer patience: the former `n_estimators * 0.1` produced a float.
    early_stopping_rounds = max(1, n_estimators // 10)

    xgb_model = xgboost.XGBRegressor(
        learning_rate=0.01,
        device=device,
        n_estimators=n_estimators,
        random_state=42,  # fixed seed so repeated runs are comparable
        early_stopping_rounds=early_stopping_rounds,
        min_child_weight=min_child_weight,
        reg_alpha=reg_alpha,
        # NOTE(review): 0 is documented by XGBoost as "no depth limit" -
        # confirm this is intended, deep trees are memory hungry.
        max_depth=0,
    )
    xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

    return xgb_model

In [None]:
def create_prediction(test_set, xgb_model, FEATURES):
    """Score a fitted model on ``test_set``.

    Two measures are computed from the model's predictions:

    * the mean absolute percentage difference between the actual close and
      the predicted close, ``abs(Close / pred - 1) * 100``;
    * the share of rows where the predicted position matches the actual one.
      A row counts as "long" when the (actual or predicted) close is above
      the open, otherwise as "short".

    Parameters
    ----------
    test_set : pandas.DataFrame
        Frame with 'Close', 'Open' and the ``FEATURES`` columns. It is not
        modified.
    xgb_model
        Fitted model exposing ``predict``.
    FEATURES : list of str
        Names of the input columns passed to ``predict``.

    Returns
    -------
    tuple
        ``(mean_error, mean_accuracy_position)``.
    """
    # Keep predictions in a local Series aligned to the test rows instead of
    # writing a 'pred' column into the caller's frame.
    raw_prediction = np.asarray(xgb_model.predict(test_set[FEATURES])).ravel()
    pred = pd.Series(raw_prediction, index=test_set.index)

    close = test_set['Close']
    open_price = test_set['Open']

    abs_pct_error = ((close / pred - 1) * 100).abs()

    # True means "long" (close above open), False means "short".
    actual_long = close > open_price
    predicted_long = pred > open_price

    mean_error = abs_pct_error.mean()
    mean_accuracy_position = (actual_long == predicted_long).mean()

    return mean_error, mean_accuracy_position

In [None]:
# Hyperparameter search.
# For every ticker, each (min_child_weight, reg_alpha) pair from `params` is
# trained and scored on N_FOLDS walk-forward folds. Each fold tests on
# FOLD_SIZE consecutive rows and trains on every row before them; the last
# fold tests on the most recent rows. The same split could be produced with
# sklearn's TimeSeriesSplit. One result row per fit is appended to the
# RESULTS_TABLE table so that partial progress survives an interrupted run.
N_FOLDS = 7
FOLD_SIZE = 10
NON_FEATURE_COLUMNS = ('Close', 'Open', 'High', 'Low', 'Date')
RESULTS_TABLE = 'hyper_xgboost_all_tickers'
TARGET = ['Close']

for ticker in ticker_list:
    try:
        df = download_data(ticker, '2015-01-01')
        data_w_features = create_features(df)

        # Everything except the raw prices and the date is a model input.
        FEATURES = [column for column in data_w_features.columns
                    if column not in NON_FEATURE_COLUMNS]

        n_rows = len(data_w_features)
        if n_rows <= N_FOLDS * FOLD_SIZE:
            raise ValueError(
                f"only {n_rows} rows available, more than {N_FOLDS * FOLD_SIZE} are needed"
            )

        train_test_dict = {}
        for fold in range(1, N_FOLDS + 1):
            test_start = n_rows - (N_FOLDS - fold + 1) * FOLD_SIZE
            train = data_w_features.iloc[:test_start]
            # reset_index(drop=True) returns a fresh frame whose index starts at 0.
            test = data_w_features.iloc[test_start:test_start + FOLD_SIZE].reset_index(drop=True)
            train_test_dict[fold] = [train, test]

        for min_child_weight in params['min_child_weight']:
            for reg_alpha in params['reg_alpha']:
                for fold, (train, test) in train_test_dict.items():
                    xgb_model = create_model(train, test, FEATURES, TARGET, min_child_weight, reg_alpha)

                    mean_error, mean_accuracy_position = create_prediction(test, xgb_model, FEATURES)

                    average_scores = pd.DataFrame([{
                        'ticker': ticker,
                        'train_fold': fold,
                        'mean_error': mean_error,
                        'mean_accuracy_position': mean_accuracy_position,
                        'min_child_weight': min_child_weight,
                        'reg_alpha': reg_alpha,
                    }])
                    average_scores.to_sql(RESULTS_TABLE, engine, if_exists='append')
    except Exception as exc:
        # Best effort: a failing ticker must not stop the others, but the
        # reason is reported instead of being silently discarded.
        # KeyboardInterrupt is deliberately not caught so the run can be stopped.
        print(f"Skipping ticker {ticker!r}: {type(exc).__name__}: {exc}")

