# Model training
This notebook prepares the training data, trains the model, and analyzes the training results.

In [91]:
# Imports
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

## Training preparation

In [92]:
# Load the stock data CSV from the train_set_pipeline folder; the first CSV
# column becomes the row index.
df = pd.read_csv(
    "./../data_works/train_set_pipeline/stock_data.csv",
    index_col=0,
    low_memory=False,  # parse the file in one pass instead of chunk-wise
)
df.head()

Unnamed: 0,meme_stock,ticker,price_2020-12-01,wsv_2020-12-01,price_2020-12-16,wsv_2020-12-16,price_2021-01-04,wsv_2021-01-04,price_2021-01-20,wsv_2021-01-20,...,volume,fiftyTwoWeekHigh,fiveYearAvgDividendYield,fiftyTwoWeekLow,bid,tradeable,dividendYield,bidSize,dayHigh,fax
0,True,GME,4.2775,86,3.49,20,4.75,91,9.3425,597,...,,,,,,,,,,
1,True,AMC,4.43,1,2.87,1,2.2,2,3.29,44,...,,,,,,,,,,
2,True,BBBY,21.290001,0,19.43,0,17.969999,3,25.110001,2,...,,,,,,,,,,
3,True,FIZZ,43.345825,0,38.819767,0,40.557332,0,44.121933,1,...,304714.0,57.65,,38.1,47.31,False,,800.0,53.98,
4,True,BB,8.36,44,8.35,4,6.7,3,13.23,430,...,,,,,,,,,,


In [93]:
# Columns that are removed before the feature matrix is built.
cols_to_drop = [
    "city", "state", "country", "financialCurrency", "shortName", "longName",
    "exchangeTimezoneShortName", "isEsgPopulated", "quoteType", "market",
    "morningStarRiskRating", "fundInceptionDate", "lastFiscalYearEnd",
    "mostRecentQuarter", "legalType", "lastDividendDate", "startDate", "fax", "zip",
]

# Categorical columns that are replaced by indicator (dummy) columns.
# Note: "industry" has overfitting potential.
cols_to_encode = [
    "sector", "industry", "recommendationKey", "exchange", "fundFamily", "tradeable",
]

# Append the indicator columns to the frame; the original categorical columns
# are still present after this cell.
dummies = pd.get_dummies(df[cols_to_encode])
df = pd.concat([df, dummies], axis=1)

In [94]:
# Pull out the label, then remove every column that is not a model input.
# These features are potentially needed later on, but not for training.
# NOTE(review): this cell mutates `df` and `cols_to_drop` in place, so it cannot
# be re-run on its own (`df["meme_stock"]` is gone after the first run).
y = df["meme_stock"]
cols_to_drop.extend(["logo_url", "ticker", "meme_stock"])
df.drop(columns=cols_to_drop + cols_to_encode, inplace=True)

# Hold out 20% of the rows (fixed seed) and wrap both parts as XGBoost DMatrix objects.
X = df.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=41
)
dtrain, dtest = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

## Model training

In [95]:
# Booster hyperparameters.
param = dict(
    max_depth=10,  # the huge number of features requires a complex model
    objective='binary:logistic',  # binary classification objective
)

# Data sets paired with the names used to label their metric in the training log.
evallist = [
    (dtrain, 'train'),
    (dtest, 'eval'),
]

In [96]:
# Train the booster for `num_round` boosting rounds; the metric for every data
# set in `evallist` is reported after each round.
num_round = 10
# `evals` is passed by keyword: current XGBoost signatures declare it
# keyword-only and deprecate passing it positionally.
model = xgb.train(param, dtrain, num_boost_round=num_round, evals=evallist)

[0]	train-logloss:0.53725	eval-logloss:0.57307
[1]	train-logloss:0.46015	eval-logloss:0.50463
[2]	train-logloss:0.39301	eval-logloss:0.45916
[3]	train-logloss:0.34707	eval-logloss:0.42346
[4]	train-logloss:0.31284	eval-logloss:0.40676
[5]	train-logloss:0.27735	eval-logloss:0.39097
[6]	train-logloss:0.25314	eval-logloss:0.36115
[7]	train-logloss:0.23054	eval-logloss:0.33207
[8]	train-logloss:0.21044	eval-logloss:0.33050
[9]	train-logloss:0.19609	eval-logloss:0.33158


