In [None]:
import pandas as pd # load and manipulate data and for One-Hot Encoding
import numpy as np # calculate the mean and standard deviation
import xgboost as xgb # XGBoost stuff
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split # split  data into training and testing sets
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer # for scoring during cross validation
from sklearn.model_selection import GridSearchCV # cross validation
from sklearn.metrics import confusion_matrix # creates a confusion matrix
from sklearn.metrics import plot_confusion_matrix # draws a confusion matrix

In [None]:
# Load the dataset from 'data.csv' (path is relative to the working directory).
df = pd.read_csv('data.csv')
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# sort_values returns a new frame instead of sorting in place, so the result
# must be assigned back; otherwise df keeps its original row order for every
# later cell. Ascending order is the default.
df = df.sort_values('date')
# Preview only the first rows rather than dumping the whole frame.
df.head()

## Make sure that we have good looking data.

We want to ensure that the types of the columns are correct.
The following shows that every column has the type `float64` or `int64`. However, it does not guarantee that no `NaN` (or `None`, if the columns were objects) is present anywhere.

> So make sure that the data at this point contains neither `None` nor null values.


In [None]:
# dtypes alone cannot reveal missing values (a float64 column may still hold
# NaN), so show the per-column missing-value count next to each dtype.
pd.DataFrame({'dtype': df.dtypes, 'missing': df.isna().sum()})

## Split the data into what we use to predict and what we want to predict.

`X` is the data we use to predict.
`y` is the data we want to predict.

In our case, `y` = `adjusted_close` and `X` is everything else except `date`.

In [None]:
# Feature matrix: an independent copy of df without the target column
# ('adjusted_close') and without the 'date' column.
X = df.drop(columns=['adjusted_close', 'date']).copy()
# Preview the first feature rows.
X.head()

In [None]:
# Target vector: an independent copy of the 'adjusted_close' column.
y = df.loc[:, 'adjusted_close'].copy()
# Preview the first target values.
y.head()

## Format X to be suitable for XGBoost (One-Hot Encoding)

In [None]:
# List the dtype of every feature column in X.
# NOTE(review): the section heading mentions One-Hot Encoding, but no encoding
# step is performed in the visible cells — confirm none is needed.
X.dtypes

In [None]:
# Summary statistics for df (the full frame, including the target column),
# not just the feature matrix X.
df.describe()

In [None]:
# Wrap ALL of X and y (no train/test split) in an XGBoost DMatrix; this is the
# input used by the xgb.cv and xgb.train calls further down.
data_dmatrix = xgb.DMatrix(data=X, label=y)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Preview the training features; .head() keeps the output short and matches
# the previews used for df, X and y above.
X_train.head()

In [None]:
# Configure a squared-error regressor with 10 estimators of maximum depth 1.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=1,
    alpha=10,
    n_estimators=10,
)

In [None]:
# Fit the regressor on the training split only; the test split stays unseen.
xg_reg.fit(X_train,y_train)

In [None]:
# Predict the target for the held-out test rows.
preds = xg_reg.predict(X_test)
# Score the predictions against the true hold-out values; without this the
# train/test split is never actually used for evaluation.
rmse = np.sqrt(np.mean((np.asarray(y_test) - preds) ** 2))
print(f"RMSE: {rmse:f}")
# Show only the first few predictions instead of the whole array.
preds[:5]

In [None]:
# Booster parameters for the native XGBoost API (xgb.cv / xgb.train); the
# values mirror the XGBRegressor configuration used earlier.
params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 1,
    'alpha': 10,
}

In [None]:
# 3-fold cross-validation on the full DMatrix: up to 50 boosting rounds with
# early stopping after 10 rounds, tracking RMSE, returned as a DataFrame.
cv_results = xgb.cv(
    params=params,
    dtrain=data_dmatrix,
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics="rmse",
    as_pandas=True,
    seed=123,
)

In [None]:
# Preview the first rows of the cross-validation results.
cv_results.head()

In [None]:
# Print the mean test RMSE from the last row of the CV results.
print(cv_results["test-rmse-mean"].iloc[-1:])

In [None]:
# Train with the native API on the full DMatrix for 10 boosting rounds.
# NOTE(review): this rebinds `xg_reg`, which previously held the XGBRegressor
# fitted on the training split, to the object returned by xgb.train; earlier
# cells that call xg_reg.predict(X_test) may not behave the same if re-run
# after this cell — consider a distinct name.
xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)

In [None]:
# figure.figsize is only read when a figure is created, so it must be set
# BEFORE plot_tree draws; setting it afterwards leaves the first run with the
# default size and only affects later figures.
plt.rcParams['figure.figsize'] = [100, 10]
# Draw the first tree (index 0) of the trained model.
xgb.plot_tree(xg_reg,num_trees=0)
plt.show()