In [None]:
import pandas as pd # load and manipulate data and for One-Hot Encoding
import numpy as np # calculate the mean and standard deviation
import xgboost as xgb # XGBoost stuff
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split # split  data into training and testing sets
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer # for scoring during cross validation
from sklearn.model_selection import GridSearchCV # cross validation
from sklearn.metrics import confusion_matrix # creates a confusion matrix
from sklearn.metrics import plot_confusion_matrix # draws a confusion matrix

In [None]:
# Read the dataset from the CSV file next to the notebook and preview the first rows.
df = pd.read_csv(filepath_or_buffer='data.csv')
df.head(n=5)

> Ensure that the rows are sorted by `date` in ascending order!

## Ensure that we have good looking data.

We want to ensure that the types of the columns are correct.
The following shows that every column has the type float64 or int64. However, it does not guarantee that no `NaN` (or `None`, if the columns were objects) occurs anywhere.

> So make sure that the data at this point contains neither None nor Null values.


In [None]:
# Show the dtype of every column; dtypes alone do not reveal missing values.
df.dtypes

In [None]:
# True when the 'date' column has no missing values.
bool(df['date'].notna().all())

In [None]:
# True when the 'high' column has no missing values.
bool(df['high'].notna().all())

In [None]:
# True when the 'low' column has no missing values.
bool(df['low'].notna().all())

In [None]:
# True when the 'adjusted_close' column has no missing values.
bool(df['adjusted_close'].notna().all())

In [None]:
# True when the 'volume' column has no missing values.
bool(df['volume'].notna().all())

In [None]:
# True when the 'SMA_20' column has no missing values.
bool(df['SMA_20'].notna().all())

In [None]:
# True when the 'SMA_20_gt_EMA21' column has no missing values.
bool(df['SMA_20_gt_EMA21'].notna().all())

In [None]:
# True when the 'RSI_14' column has no missing values.
bool(df['RSI_14'].notna().all())

In [None]:
# True when the 'OBV' column has no missing values.
bool(df['OBV'].notna().all())

In [None]:
# True when the 'ROC_1' column has no missing values.
bool(df['ROC_1'].notna().all())

In [None]:
# True when the 'ROC_BOOL' column has no missing values.
bool(df['ROC_BOOL'].notna().all())

All above should return `True`.

## Add a column with the ROC_BOOL shifted by -1

Since we want to target the `shifted_ROC_BOOL`, we need a way to tell XGBoost if it did a good job at regression. Therefore we create a copy of that column and shift it by -1.
 
You can see that `df.iloc[1].ROC_BOOL == df.iloc[0].shifted_ROC_BOOL`.  


In [None]:
# Build the prediction target: row i receives the ROC_BOOL value of row i+1,
# so the last row has no successor and becomes NaN.
# The guard makes the cell safe to re-run; inserting a column name that
# already exists would otherwise raise.
if 'shifted_ROC_BOOL' not in df.columns:
    df.insert(5, 'shifted_ROC_BOOL', df['ROC_BOOL'].shift(-1))

In [None]:
# Preview the last rows, where the shift leaves its NaN.
df.tail(n=5)

### Since the shifting creates a `NaN` in the last row (the most recent entry when `date` is ascending), delete that line.

In [None]:
# Number of rows before dropping incomplete rows.
df.shape[0]

In [None]:
# Remove every row that contains a NaN in any column, modifying df in place.
df.dropna(inplace=True)

In [None]:
# Number of rows after dropping incomplete rows.
df.shape[0]

The `len(df.index)` should now be one less than the previous `len(df.index)`.

## Split data between what we use to predict and what we want to predict.

`X` is the data we use to predict.
`y` is the data we want to predict.

In our case, `y`=`shifted_ROC_BOOL` and `X` is what remains after also dropping `open`, `high`, `low`, `ROC_1`, `ROC_BOOL` and `date`.

In [None]:
# Feature matrix: everything except the target, the column the target was
# shifted from, and the open/high/low/ROC_1/date columns.
X = df.drop(
    columns=['open', 'high', 'low', 'shifted_ROC_BOOL', 'ROC_1', 'ROC_BOOL', 'date']
).copy()
X.head(n=5)

In [None]:
# Target vector: the next row's ROC_BOOL value.
y = df.loc[:, 'shifted_ROC_BOOL'].copy()
y.head(n=5)

## Format X to be suitable for XGBoost (One-Hot Encoding)

In [None]:
# Check the dtypes of the feature columns before handing them to XGBoost.
X.dtypes

In [None]:
# Summary statistics for the full frame (not only the feature columns in X).
df.describe()

### Define data matrix

In [None]:
# Wrap the features and the target in an XGBoost DMatrix.
# NOTE(review): data_dmatrix is not referenced by any later cell visible here.
data_dmatrix = xgb.DMatrix(data=X, label=y)

### We are going to use 80% of the data for training and 20% (`0.2`) for testing.

In [None]:
# The rows are expected in ascending date order and the target is the NEXT
# row's value. Shuffling before the split would put rows from the future into
# the training set, which inflates the test score. shuffle=False keeps the
# first 80% of rows for training and the most recent 20% for testing.
# random_state is dropped because it has no effect without shuffling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

### Then we create a model and define some parameters

In [None]:
# Gradient-boosted regressor; its continuous output is thresholded at 0.5
# further down to obtain a 0/1 prediction.
# NOTE(review): the target looks binary, yet a squared-error regression
# objective is used, and max_depth=100 is very deep — confirm both are intended.
model = xgb.XGBRegressor(
    n_estimators=1000,
    max_depth=100,
    learning_rate=0.1,
    colsample_bytree=0.3,
    alpha=10,  # presumably the L1 regularisation strength — confirm in the xgboost docs
    objective='reg:squarederror',
)

### Then it is time to fit the data to our model.

In [None]:
# Train the model on the training split only.
model.fit(X_train,y_train)

### Next we use the remaining data to create test predictions.

In [None]:
# Raw model output for the held-out test rows; it is converted to 0/1 with a
# 0.5 threshold further down.
preds = model.predict(X_test)
preds

In [None]:
# Raw prediction for the last row of the test split.
preds[-1]

### Decision Tree

In [None]:
# Draw the first tree of the ensemble on an explicitly sized figure.
# Setting plt.rcParams['figure.figsize'] after the plot call (as before) did
# not resize this figure; it only changed the default for every later plot,
# and to a size (1000 x 100 inches) that is likely too large to render.
# Adjust figsize here if the tree is hard to read.
fig, ax = plt.subplots(figsize=(40, 10))
xgb.plot_tree(model, num_trees=0, ax=ax)
plt.show()

### Confusion Matrix

In [None]:
# Zeroed tallies for the four outcome buckets of the confusion matrix.
cm = dict.fromkeys(("tp", "fp", "fn", "tn"), 0)

In [None]:
# Re-display the raw predictions before they are thresholded.
preds

In [None]:
# Turn the continuous predictions into 0/1 labels: 1 where the value exceeds 0.5.
preds_bool = np.greater(preds, 0.5).astype(int)
preds_bool

In [None]:
# True target values of the test split, for comparison with the predictions.
y_test

In [None]:
# Integer NumPy copy of the true labels, indexable by position like preds_bool.
ytest2 = y_test.astype(int).to_numpy()
ytest2

In [None]:
# Number of thresholded predictions (should equal the number of test rows).
preds_bool.shape[0]

In [None]:
# Start from zero every time so that re-running this cell does not add the
# same test rows to the tallies a second time.
cm = {"tp": 0, "fp": 0, "fn": 0, "tn": 0}

# NOTE(review): the key names do not follow the usual definitions:
#   'tp' = actual 1, predicted 1
#   'fp' = actual 1, predicted 0  (normally called a false negative)
#   'tn' = actual 0, predicted 1  (normally called a false positive)
#   'fn' = actual 0, predicted 0  (normally called a true negative)
# The mapping is kept as-is because the cells that build cm_draw place these
# keys by position (rows = actual 1/0, columns = predicted 1/0).
for actual, predicted in zip(ytest2, preds_bool):
    if actual:
        cm['tp' if predicted else 'fp'] += 1
    else:
        cm['tn' if predicted else 'fn'] += 1

In [None]:
# 2x2 table of absolute counts. The first row holds the rows whose true label
# is 1 and the first column the rows predicted as 1.
# NOTE(review): the labels therefore assume class 1 means "Fall" — confirm
# against how ROC_BOOL is defined.
cm_draw = [[cm[key] for key in row] for row in (('tp', 'fp'), ('tn', 'fn'))]
pd.DataFrame(data=cm_draw, index=["Fall", "Rise"], columns=["Fall", "Rise"])

In [None]:
def percent(number, l):
    """Format `number` as a whole-number percentage of `l`, e.g. "42 %".

    The fractional part is truncated, not rounded. The multiplication is done
    before the division so that integer counts are not hit by float artefacts:
    (29 / 100) * 100 evaluates to 28.999999999999996 and used to be reported
    as "28 %".

    Args:
        number: The part (e.g. a count).
        l: The whole that `number` is a share of.

    Returns:
        A string of the form "<int> %".

    Raises:
        ZeroDivisionError: If `l` is 0 (for plain Python numbers).
    """
    return f"{int(number * 100 / l)} %"

In [None]:
# Same 2x2 layout as the count table, with each cell expressed as a share of
# all test rows.
# NOTE(review): the labels assume class 1 means "Fall" — confirm against how
# ROC_BOOL is defined.
cm_draw = [
    [percent(cm[key], len(ytest2)) for key in row]
    for row in (('tp', 'fp'), ('tn', 'fn'))
]
pd.DataFrame(data=cm_draw, index=["Fall", "Rise"], columns=["Fall", "Rise"])

- Insample (rerun on data that used to train, should have high %)  
- Out of sample (~65%, ideal 80-90%)
- 5 years data range  
- close instead of adjusted_close

