This code uses XGBoost for regression. <br>
The algorithm is retrained at every step to predict the next test observation. <br>
It uses **only lagged values** as features. <br>
If we have enough (at least 20) points from a new concept, we drop the old training data and retrain the algorithm only on the points from the new concept.

In [14]:
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
import time
import json
import functions

Function for calculating the *Symmetric Mean Absolute Percentage Error* (SMAPE).

In [15]:
def smape(predictions, actual):
    """Return the Symmetric Mean Absolute Percentage Error of the predictions.

    Computed as ``mean(|predictions - actual| / (|actual| + |predictions|))``,
    so the result lies in [0, 1] (no factor of 2 or 100 is applied).

    Args:
        predictions: array-like of predicted values.
        actual: array-like of true values, same shape as ``predictions``.

    Returns:
        The mean of the per-point symmetric errors.
    """
    predictions = np.asarray(predictions, dtype=float)
    actual = np.asarray(actual, dtype=float)

    difference = np.abs(predictions - actual)
    summation = np.abs(actual) + np.abs(predictions)

    #the denominator is zero only when both values are zero, i.e. the
    #prediction is exact; count that point as zero error instead of 0/0 = NaN
    ratio = np.divide(
        difference,
        summation,
        out=np.zeros_like(difference),
        where=summation != 0,
    )
    return np.mean(ratio)

In [16]:
def xgboost_forecast(train, test_X):
    """Fit a fresh XGBoost regressor on ``train`` and predict one observation.

    Args:
        train: DataFrame whose first column is the target and whose remaining
            columns are the features.
        test_X: feature rows to predict; only the first prediction is returned.

    Returns:
        The model's prediction for the first row of ``test_X``.
    """
    #first column is the target, everything after it is a feature
    target = train.iloc[:, 0]
    features = train.iloc[:, 1:]

    regressor = XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=40)
    regressor.fit(features, target)

    return regressor.predict(test_X)[0]

Since we can no longer use *ada_preprocessing()* to extract the lagged values and the concept (it could potentially interfere with breakpoint detection), we need a separate function for this.

In [17]:
def manual_preprocessing(values, n_lags=5):
    """Build a one-row DataFrame holding the newest value and its lags.

    Args:
        values: sequence of observations up to and including the test point.
        n_lags: number of lagged values to extract (default 5, which yields
            the columns "t", "t-1", ..., "t-5").

    Returns:
        A single-row DataFrame with columns "t", "t-1", ..., "t-<n_lags>",
        where "t" is the last element of ``values`` and "t-k" is the element
        k positions before it.

    Raises:
        IndexError: if ``values`` has fewer than ``n_lags + 1`` elements.
    """
    needed = n_lags + 1
    if len(values) < needed:
        #same exception type an out-of-range index would raise, but with a
        #message that says what is actually wrong
        raise IndexError(
            "need at least {} values to build {} lags, got {}".format(
                needed, n_lags, len(values)
            )
        )

    columns = ["t"] + ["t-{}".format(lag) for lag in range(1, needed)]

    #retrieve the newest value followed by its lagged values
    data = [values[-offset] for offset in range(1, needed + 1)]
    return pd.DataFrame(columns=columns, data=[data])

In [18]:
def is_enough(data):
    """Count (and print) the rows that belong to the newest concept.

    The newest concept is taken to be the largest value in the "concept"
    column. Despite the name, the count itself is returned, not a boolean.
    """
    concept_column = data["concept"]
    latest_concept = max(concept_column)

    belongs_to_latest = concept_column == latest_concept
    count = sum(belongs_to_latest)

    print(count)
    return count

In [19]:
#datasets to evaluate: the two worst performing ones from xgboost with retrain
list_of_names = [
    "nonlinear1_abrupt",
    "nonlinear1_inc",
]

#full set of available datasets, kept for reference:
# list_of_names = ["linear1_abrupt", "linear2_abrupt", "linear3_abrupt",
#                 "nonlinear1_abrupt", "nonlinear2_abrupt", "nonlinear3_abrupt",
#                 "linear1_inc", "linear2_inc", "linear3_inc",
#                 "nonlinear1_inc", "nonlinear2_inc", "nonlinear3_inc"]

In [20]:
#dictionary to store the overall error
smape_dict = {}

#store the error for every step for both datasets
#NOTE: error1 collects "nonlinear1_abrupt"; every other dataset goes to error2
error1 = []
error2 = []

#minimum number of points a new concept needs before the older data is dropped
MIN_CONCEPT_POINTS = 20

for name in list_of_names:
    start = time.perf_counter()
    
    #loading the data
    file_path = "data/"+name+"_series"
    data = pd.read_csv(file_path).iloc[:,0].to_list()
    
    #70/30 train/test split
    split = int(0.7*len(data))
    train, test = data[:split], data[split:]
    
    #get breakpoints for train set
    history = functions.ada_preprocessing(train)
    
    #note the last concept that appeared
    last_num_concepts = max(list(history["concept"]))
    
    #number of most recent rows to train on; None means "use all rows".
    #Without remembering this, the trimmed training set would be replaced by
    #the full history on the very next step and used for one prediction only.
    kept_rows = None
    
    predictions = []
    ground_truth = []
    
    for observation in test:
        #add new test observation to train series
        train.append(observation)
        
        #pass all the values available in series up to and including the new test point
        test_df = manual_preprocessing(train)
        
        ground_truth.append(observation)
        
        #training data = history (built before this observation was appended)
        prediction = xgboost_forecast(history.loc[:,"t":"t-5"], test_df.loc[:,"t-1":"t-5"])
        predictions.append(prediction)
        
        #breakpoint detection always runs on the full series, including the
        #observation that was just appended
        full_history = functions.ada_preprocessing(train)
        
        #note the real concept for the test observation
        new_num_concepts = max(list(full_history["concept"]))
        
        #an active window grows by the observation that was just appended
        #assumes ada_preprocessing returns one more row per appended value - TODO confirm
        if kept_rows is not None:
            kept_rows += 1
        
        #if the number of concepts change, check if we have enough datapoints for new concept
        if new_num_concepts>last_num_concepts:
            #if we have at least 20 points for new concept, keep them and drop the rest of the data
            points = is_enough(full_history)
            if points>=MIN_CONCEPT_POINTS:
                kept_rows = points
                last_num_concepts = new_num_concepts
            #otherwise just keep using the same dataset
        
        #training set for the next step: full history until the first drop,
        #afterwards only the rows since the most recent drop
        if kept_rows is None:
            history = full_history
        else:
            history = full_history.tail(kept_rows)
        
        #error at every step by daniel's request :D
        step_error = smape(np.asarray(predictions), np.asarray(ground_truth))
        if name == "nonlinear1_abrupt":
            error1.append(step_error)
        else:
            error2.append(step_error)
            
    end = time.perf_counter()
    print("Time wasted: {:.2f}m".format((end-start)/60))
    
    error = smape(np.asarray(predictions), np.asarray(ground_truth))
    smape_dict[name] = error
    print("SMAPE: {:.4f}".format(error))
    plt.plot(ground_truth, label = "Expected", color = "black")
    plt.plot(predictions, label = "Predicted", color = "red")
    plt.legend()
    plt.title("{} without concept".format(name))    
    plt.show()

23
24
25
26


KeyboardInterrupt: 

In [None]:
error1

In [None]:
error2

In [13]:
max(list(history["concept"]))

3