In [None]:
import pandas as pd # load and manipulate data and for One-Hot Encoding
import numpy as np # calculate the mean and standard deviation
import xgboost as xgb # XGBoost stuff
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split # split  data into training and testing sets
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer # for scoring during cross validation
from sklearn.model_selection import GridSearchCV # cross validation
from sklearn.metrics import confusion_matrix # creates a confusion matrix
from sklearn.metrics import plot_confusion_matrix # draws a confusion matrix
from sklearn.metrics import mean_squared_error

In [None]:
# Load the raw dataset; the path is resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
# Regression target: each row receives the 'adjusted_close' of the row below it
# (shift by -1), so the final row gets NaN. Being the newest column, it is also
# the last column of the frame.
# NOTE(review): presumably rows are in chronological order, making this the
# next period's close -- confirm against the data.
df['shifted_adjusted_close'] = df['adjusted_close'].shift(periods=-1)

In [None]:
# Drop, in place, every row that has a NaN in any column. This always removes
# the final row, whose shifted target is NaN.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Remove the 'date' column in place; it is not used as a model feature below.
# Gotcha: the cell is not re-runnable -- a second run raises KeyError because
# the column is already gone.
df.drop(columns=['date'], inplace=True)

In [None]:
# Inspect the prepared frame: the loaded columns minus 'date', with the shifted
# target 'shifted_adjusted_close' as the last column.
df

In [None]:
def train_test_split2(data, perc):
    """
        Split a DataFrame into a leading (train) and a trailing (test) block of rows.

        data: DataFrame, converted to an array with `.values`
              (the callers lay the columns out as [X, y])
        perc: fraction of the rows (e.g. 0.2) reserved for the trailing test block
        returns (train_rows, test_rows) as arrays; row order is preserved, nothing
                is shuffled, and the train size is truncated to whole rows
    """
    rows = data.values
    n_train = int(len(rows) * (1 - perc))
    train_rows, test_rows = rows[:n_train], rows[n_train:]
    return train_rows, test_rows

In [None]:
# Hold out the trailing fraction (0.2) of the rows as the test block; the
# leading rows form the train block.
train, test = train_test_split2(data=df, perc=0.2)

In [None]:
# Sanity check on the split: row counts of the full frame, the train block and
# the test block, one per line.
print(len(df), len(train), len(test), sep='\n')

In [None]:
def xgb_predict(train, y_test_0):
    """
        Fit an XGBoost regressor on `train` and predict a single sample.

        train:    array-like of rows [...X, y]; every column except the last is
                  a feature, the last column is the regression target
        y_test_0: feature values of the one sample to predict, given either as
                  a flat vector of n_features values or as a (1, n_features)
                  row (despite its name it holds features, not a target)
        returns the predicted target for that sample (a scalar)
        raises ValueError if the sample's feature count differs from `train`
    """
    train = np.asarray(train)
    X, y = train[:, :-1], train[:, -1]

    # predict() works on a 2-D (n_samples, n_features) batch, so present the
    # single sample as a one-row matrix instead of a flat vector.
    sample = np.asarray(y_test_0).reshape(1, -1)
    if sample.shape[1] != X.shape[1]:
        raise ValueError(
            "sample has %d feature(s) but the training data has %d"
            % (sample.shape[1], X.shape[1])
        )

    # A fresh model is fitted on every call; nothing is cached between calls.
    model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=1000)
    model.fit(X, y)

    # The batch has exactly one row, so the first prediction is the answer.
    return model.predict(sample)[0]

In [None]:
# Smoke test: fit on the train block and predict the target of the first
# held-out row. xgb_predict returns a single scalar, so display it directly
# (calling len() on it raises TypeError). The feature row (all columns but the
# last) is passed as a one-row 2-D batch, the shape the estimator's predict()
# expects.
xgb_predict(train, test[0, :-1].reshape(1, -1))

In [None]:
def validate(data, perc):
    """
        Walk-forward validation of the XGBoost one-step prediction.

        The trailing `perc` fraction of the rows is held out. For each held-out
        row, in order, a model is fitted on everything seen so far and asked to
        predict that row's target; the row is then added to the history. One
        model is fitted per held-out row, so this is slow on large test blocks.

        data: DataFrame of rows [X, y] (last column is the target)
        perc: fraction of the rows to use as test data
        returns (rmse, actual_targets, predictions)
        raises ValueError if the split leaves no rows to test on
    """
    train, test = train_test_split2(data, perc)
    if len(test) == 0:
        raise ValueError("perc=%r leaves no rows to validate on" % (perc,))

    history = list(train)
    predictions = []

    for row in test:
        # Hand over the complete feature row (every column but the target) as a
        # one-row 2-D batch -- not just its first value.
        features = row[:-1].reshape(1, -1)
        predictions.append(xgb_predict(history, features))

        # The true row becomes available for training the next step.
        history.append(row)

    actual = test[:, -1]
    # RMSE as sqrt(MSE): works across scikit-learn versions, unlike the
    # `squared=False` flag, which newer releases deprecated and then removed.
    error = float(np.sqrt(mean_squared_error(actual, predictions)))

    return error, actual, predictions

In [None]:
%%time
# Run the validation with the trailing fraction (0.2) of the rows held out, then
# print the resulting error. `y` holds the actual targets, `pred` the predictions.
rmse, y, pred = validate(data=df, perc=0.2)
print(rmse)