# Model training
This notebook contains the preparation of the training, the training itself and the analysis of the training results.

In [1]:
# Imports
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

## Training preparation

In [2]:
df = pd.read_csv("./../data_works/train_set_pipeline/stock_data.csv", low_memory=False, index_col=0)
df.head()

Unnamed: 0,meme_stock,ticker,price_2020-12-01,wsv_2020-12-01,price_2020-12-16,wsv_2020-12-16,price_2021-01-04,wsv_2021-01-04,price_2021-01-20,wsv_2021-01-20,...,volume,fiftyTwoWeekHigh,fiveYearAvgDividendYield,fiftyTwoWeekLow,bid,tradeable,dividendYield,bidSize,dayHigh,fax
0,True,GME,4.2775,86,3.49,20,4.75,91,9.3425,597,...,,,,,,,,,,
1,True,AMC,4.43,1,2.87,1,2.2,2,3.29,44,...,,,,,,,,,,
2,True,BBBY,21.290001,0,19.43,0,17.969999,3,25.110001,2,...,,,,,,,,,,
3,True,FIZZ,43.345825,0,38.819767,0,40.557332,0,44.121933,1,...,304714.0,57.65,,38.1,47.31,False,,800.0,53.98,
4,True,BB,8.36,44,8.35,4,6.7,3,13.23,430,...,,,,,,,,,,


In [3]:
# df[df.ticker.apply(lambda x: not x.isnumeric())]
cols_to_drop = ["city", "state", "country", "financialCurrency", "shortName", "longName", "exchangeTimezoneShortName",\
    "isEsgPopulated", "quoteType", "market", "morningStarRiskRating", "fundInceptionDate", "lastFiscalYearEnd", "mostRecentQuarter",\
        "legalType", "lastDividendDate", "startDate", "fax", "zip"]
cols_to_encode = ["sector", "industry", "recommendationKey", "exchange", "fundFamily", "tradeable"]
# These have overfitting potential: industry
dummies = pd.get_dummies(df[cols_to_encode])
df = pd.concat([df, dummies], axis=1)

In [4]:
# These feature are potentially needed later on, but not for training
y = df["meme_stock"]
cols_to_drop += ["logo_url", "ticker", "meme_stock"]
df.drop(cols_to_drop+cols_to_encode, axis=1, inplace=True)
X = df.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

## Hyperparameter tuning

In [5]:
xgb.config_context(verbosity=0)

<contextlib._GeneratorContextManager at 0x12fa22fc430>

In [11]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Adjust function from hyperopt package to return integers instead of floats (error in the package)
from hyperopt.pyll import scope
from hyperopt.pyll_utils import validate_label, validate_distribution_range

@validate_label
@validate_distribution_range
def quniform(label, *args, **kwargs):
    return scope.int(scope.hyperopt_param(label, scope.quniform(*args, **kwargs)))

space = {
    'max_depth': hp.quniform("max_depth", 5, 15, 1),
    'learning_rate': hp.uniform("learning_rate", 0.01, 0.5),
    'grow_policy': hp.choice("grow_policy", ["depthwise", "lossguide"]),
    'seed': hp.randint("seed", 100),
    'objective': 'binary:logistic',
}
evallist = [(dtrain, 'train'), (dtest, 'eval')]

In [12]:
def hyperparameter_tuning(space):
    """ Perform hyperparameter tuning with given hyperparameter space. """

    evals_result = {}
    model = xgb.train(space, dtrain, 10, evallist, evals_result=evals_result)

    return {'loss': evals_result["eval"]["logloss"][-1], 'status': STATUS_OK, 'model': model}

In [13]:
# Run Optimization
trials = Trials()
best_params = fmin(fn=hyperparameter_tuning, space=space, algo=tpe.suggest, max_evals=150, trials=trials)
best_params

[0]	train-logloss:0.62766	eval-logloss:0.64155         
[1]	train-logloss:0.58319	eval-logloss:0.59968         
[2]	train-logloss:0.54342	eval-logloss:0.56943         
[3]	train-logloss:0.50685	eval-logloss:0.53805         
[4]	train-logloss:0.47614	eval-logloss:0.51512         
[5]	train-logloss:0.44998	eval-logloss:0.49648         
[6]	train-logloss:0.42764	eval-logloss:0.47937         
[7]	train-logloss:0.40909	eval-logloss:0.46288         
[8]	train-logloss:0.38437	eval-logloss:0.44399         
[9]	train-logloss:0.36497	eval-logloss:0.43295         
[0]	train-logloss:0.62043	eval-logloss:0.63595                                   
[1]	train-logloss:0.57218	eval-logloss:0.59067                                   
[2]	train-logloss:0.52980	eval-logloss:0.55893                                   
[3]	train-logloss:0.48826	eval-logloss:0.52219                                   
                                                                                 




[4]	train-logloss:0.45345	eval-logloss:0.49568
[5]	train-logloss:0.42544	eval-logloss:0.47258                                   
[6]	train-logloss:0.40389	eval-logloss:0.46010                                   
[7]	train-logloss:0.38474	eval-logloss:0.44693                                   
[8]	train-logloss:0.36339	eval-logloss:0.43475                                   
[9]	train-logloss:0.34979	eval-logloss:0.42617                                   
[0]	train-logloss:0.57675	eval-logloss:0.60252                                    
[1]	train-logloss:0.51011	eval-logloss:0.54151                                    
[2]	train-logloss:0.44840	eval-logloss:0.49366                                    
[3]	train-logloss:0.40730	eval-logloss:0.45984                                    
[4]	train-logloss:0.37665	eval-logloss:0.44323                                    
[5]	train-logloss:0.34309	eval-logloss:0.42498                                    
[6]	train-logloss:0.31410	eval-logloss:0.3922

{'grow_policy': 0,
 'learning_rate': 0.3941418458845585,
 'max_depth': 6.0,
 'seed': 48}

## Model training
Training a final XGB model with best hyperparameters from hyperparameter tuning.

In [9]:
# Set hyperparameters
param = {
    'max_depth': 10, # Huge number of features require complex model
    'objective': 'binary:logistic',
}

In [14]:
evals_result = {}
best_params["grow_policy"] = ["depthwise", "lossguide"][best_params["grow_policy"]] if isinstance(best_params["grow_policy"], np.int64) else best_params["grow_policy"]
best_params["max_depth"] = int(best_params["max_depth"])
best_params["objective"] = "binary:logistic"
model = xgb.train(best_params, dtrain, 10, evallist, evals_result=evals_result)

[0]	train-logloss:0.49748	eval-logloss:0.54442
[1]	train-logloss:0.41529	eval-logloss:0.47457
[2]	train-logloss:0.36281	eval-logloss:0.44121
[3]	train-logloss:0.31098	eval-logloss:0.38599
[4]	train-logloss:0.26747	eval-logloss:0.33519
[5]	train-logloss:0.23565	eval-logloss:0.31086
[6]	train-logloss:0.21565	eval-logloss:0.28816
[7]	train-logloss:0.19592	eval-logloss:0.25827
[8]	train-logloss:0.17900	eval-logloss:0.25693
[9]	train-logloss:0.16610	eval-logloss:0.24646


## Inference

## Model analysis
Analysing the feature importance to better understand dependencies in the data and to condense the dataset down to the relevant features for performant handling in production later on.