In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from dateutil.relativedelta import relativedelta
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)

In [52]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from dateutil.relativedelta import relativedelta
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)
class XGBTimeSeries:
    '''
    Recursive multi-step time-series forecaster built on XGBRegressor.

    The source dataset must contain a date column and a value (target) column.
    Every other column is treated as an additional attribute: its lagged values
    are used as features, and its own future values are forecast by auxiliary
    models so that the recursion can continue past the end of the history.

    Rows are assumed to be in chronological order with exactly one row per
    period of `agg_level`; lag features are built with positional shifts and
    forecasting looks past values up by exact date.

    Arguments:
    dataset -- pd.DataFrame - source data
    datecol_name -- str - name of the date column
    valuecol_name -- str - name of the column to forecast
    agg_level -- str - one of 'H', 'D', 'M', 'Y' (hour, day, month, year)
    num_lags -- int - number of lagged periods used as features
    Raises:
    KeyError - if agg_level is not one of the supported levels
    '''

    # Weekdays that get their own indicator column; Sunday is the implicit baseline.
    _WEEKDAYS = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday')
    _WORK_HOURS = (9, 10, 11, 12, 13, 14, 15, 16, 17)
    # Keyword of pd.DateOffset that represents one period of each aggregation level.
    _OFFSET_UNITS = {'H': 'hours', 'D': 'days', 'M': 'months', 'Y': 'years'}

    def __init__(self, dataset, datecol_name, valuecol_name, agg_level,
                 num_lags):
        self.src_dataset = dataset
        self.datecol_name = datecol_name
        self.valuecol_name = valuecol_name
        if agg_level not in ['H', 'D', 'M', 'Y']:
            raise KeyError('Invalid aggregation level - must be in (H, D, M,Y)')
        self.agg_level = agg_level
        self.num_lags = num_lags
        # Everything that is neither the date nor the target is an extra attribute.
        self.add_attributes = list(self.src_dataset.columns)
        self.add_attributes.remove(self.valuecol_name)
        self.add_attributes.remove(self.datecol_name)
        self.dataset = self._preprocess_dataset()
        self.X, self.y = self._feature_target_split()
        self.model = None
        # Auxiliary model of the first additional attribute (kept for
        # backward compatibility); all auxiliary models live in _add_models.
        self.model_add1 = None
        self._add_models = {}

    def _offset(self, n):
        '''Return a date offset of n periods at the configured aggregation level.'''
        unit = self._OFFSET_UNITS.get(self.agg_level, 'years')
        return pd.DateOffset(**{unit: n})

    def _lag_sources(self):
        '''
        Return (lag column name, source column, lag) triples in the order the
        lag columns appear in the preprocessed dataset: target lags first, then
        the lags of each additional attribute, attribute by attribute.
        '''
        lag_range = range(1, self.num_lags + 1)
        sources = [(f'LAG_{i}', self.valuecol_name, i) for i in lag_range]
        sources += [(f'LAG_{col}_{i}', col, i)
                    for col in self.add_attributes for i in lag_range]
        return sources

    def _preprocess_dataset(self):
        '''
        Build the modelling frame: parse dates, add calendar features that fit
        the aggregation level, add lag columns, and drop rows with missing values
        (which includes the first num_lags rows).
        '''
        df = self.src_dataset.copy()
        df[self.datecol_name] = pd.to_datetime(df[self.datecol_name])
        dates = df[self.datecol_name].dt

        df['Year'] = dates.year
        if self.agg_level in ('H', 'D', 'M'):
            df['Month'] = dates.month
        if self.agg_level in ('H', 'D'):
            # day_name() is always English, unlike the locale-dependent strftime('%A').
            day_names = dates.day_name()
            df['IsWeekend'] = day_names.isin(['Saturday', 'Sunday']).astype(int)
            if self.agg_level == 'H':
                df['Hour'] = dates.hour
                df['IsWorkTime'] = df['Hour'].isin(self._WORK_HOURS).astype(int)
            # Explicit indicators instead of get_dummies: every column exists
            # even when a weekday never occurs in the data.
            for day in self._WEEKDAYS:
                df[f'Is{day}'] = (day_names == day).astype(int)

        # Lagged copies of the target and of every additional attribute.
        for name, col, lag in self._lag_sources():
            df[name] = df[col].shift(lag)
        return df.dropna()

    def _feature_target_split(self):
        '''Split the preprocessed frame into model features X and target y.'''
        # The date and the current-period attribute values are not features:
        # only calendar columns and lags are known at forecast time.
        X = self.dataset.drop(
            [self.datecol_name, self.valuecol_name] + self.add_attributes, axis=1)
        y = self.dataset[self.valuecol_name]
        return X, y

    def _agg_equals_expression(self, date, i):
        '''Return the date that lies i periods before `date`.'''
        return date - self._offset(i)

    def _get_column_list(self):
        '''Return the calendar feature names for the aggregation level, in dataset order.'''
        columns = ['Year']
        if self.agg_level in ('H', 'D', 'M'):
            columns.append('Month')
        if self.agg_level in ('H', 'D'):
            columns.append('IsWeekend')
            if self.agg_level == 'H':
                columns += ['Hour', 'IsWorkTime']
            columns += [f'Is{day}' for day in self._WEEKDAYS]
        return columns

    @staticmethod
    def _one_hot_weekdays(day):
        '''Return the Monday..Saturday indicator list for a weekday name (all zeros otherwise).'''
        return [1 if day == name else 0 for name in XGBTimeSeries._WEEKDAYS]

    def _get_attributes(self, date):
        '''Return the calendar feature values of `date`, aligned with _get_column_list().'''
        attributes = [date.year]
        if self.agg_level in ('H', 'D', 'M'):
            attributes.append(date.month)
        if self.agg_level in ('H', 'D'):
            day_name = pd.Timestamp(date).day_name()
            attributes.append(1 if day_name in ('Saturday', 'Sunday') else 0)
            if self.agg_level == 'H':
                attributes.append(date.hour)
                attributes.append(1 if date.hour in self._WORK_HOURS else 0)
            attributes.extend(self._one_hot_weekdays(day_name))
        return attributes

    def _get_prediction_from_to_dates(self, data, n_periods):
        '''Return the first and the last date of an n_periods forecast following `data`.'''
        last_date = data[self.datecol_name].iloc[-1]
        return last_date + self._offset(1), last_date + self._offset(n_periods)

    def _lookup(self, data, date, col):
        '''
        Return the value of `col` in the row of `data` dated `date`.
        Raises:
        IndexError - if `data` has no row for that date (gap in the series)
        '''
        matches = data.loc[data[self.datecol_name] == date, col].values
        if len(matches) == 0:
            raise IndexError(
                f'No observation dated {date} - the series must contain one '
                f'row per period without gaps')
        return matches[0]

    def _feature_row(self, data, date):
        '''Return a dict with every calendar and lag feature for forecasting `date`.'''
        row = dict(zip(self._get_column_list(), self._get_attributes(date)))
        for name, col, lag in self._lag_sources():
            row[name] = self._lookup(data, self._agg_equals_expression(date, lag), col)
        return row

    def _main_feature_names(self):
        '''Feature names of the main model, in training (self.X) column order.'''
        return self._get_column_list() + [name for name, _, _ in self._lag_sources()]

    def _add_feature_names(self):
        '''Feature names of the auxiliary models: calendar columns plus attribute lags.'''
        return self._get_column_list() + [
            name for name, col, _ in self._lag_sources() if col != self.valuecol_name]

    @staticmethod
    def _predict_row(model, row, names):
        '''Run `model` on the single observation `row`, restricted and ordered by `names`.'''
        features = pd.DataFrame([[row[name] for name in names]],
                                columns=names, dtype=float)
        return model.predict(features)[0]

    def _get_add_model(self, col):
        '''Return the auxiliary model forecasting attribute `col`, fitting it on first use.'''
        model = self._add_models.get(col)
        if model is None:
            # The training data never changes after construction, so each
            # auxiliary model is fitted once and then reused for every step.
            model = XGBRegressor()
            model.fit(self.dataset[self._add_feature_names()], self.dataset[col])
            self._add_models[col] = model
            if col == self.add_attributes[0]:
                self.model_add1 = model
        return model

    def add_feature_sigle_predict(self, data, date, col=None):
        '''
        Forecast the value of an additional attribute for `date`.
        Arguments:
        data -- pd.DataFrame - history containing the lagged periods of `date`
        date -- pd.Timestamp - period to forecast
        col -- str - attribute to forecast; defaults to the first additional attribute
        Returns:
        the predicted attribute value
        '''
        if col is None:
            col = self.add_attributes[0]
        row = self._feature_row(data, date)
        return self._predict_row(self._get_add_model(col), row,
                                 self._add_feature_names())

    def _single_predict(self, data, date):
        '''Forecast the target for `date` from the history in `data`.'''
        row = self._feature_row(data, date)
        return self._predict_row(self.model, row, self._main_feature_names())

    def fit(self, X=None, y=None, hyperparameters=None):
        '''
        Public method, fits the main forecasting model.
        Arguments:
        X -- features; defaults to the features derived from the dataset
        y -- target; defaults to the target derived from the dataset
        hyperparameters -- dict - keyword arguments passed to XGBRegressor
        '''
        if X is None:
            X = self.X
        if y is None:
            y = self.y
        model = XGBRegressor(**(hyperparameters or {}))
        model.fit(X, y)
        self.model = model

    def predict(self, n_periods):
        '''
        Public method, used make predictions for a given number of periods.
        Forecasts are produced recursively: each predicted period (target and
        additional attributes) is appended to the history and feeds the lags
        of the following periods.
        Arguments:
        n_periods -- int - number of periods to forecast in the future
        Returns:
        list of dicts - <Date: Prediction> pairs (empty when n_periods < 1)
        Raises:
        RuntimeError - if fit() has not been called
        '''
        if self.model is None:
            raise RuntimeError('Model is not fitted - call fit() before predict()')
        if n_periods < 1:
            return []

        data = self.dataset.copy()
        last_date = data[self.datecol_name].iloc[-1]
        main_names = self._main_feature_names()
        predictions = []
        for step in range(1, n_periods + 1):
            # Offsetting from the last observed date keeps forecast dates on
            # the same day/time pattern as the history for every level.
            date = last_date + self._offset(step)
            row = self._feature_row(data, date)
            spred = self._predict_row(self.model, row, main_names)

            # The history row is keyed by column name so each lag lands in
            # its own column regardless of how many attributes there are.
            new_history = dict(row)
            new_history[self.datecol_name] = date
            new_history[self.valuecol_name] = spred
            for col in self.add_attributes:
                new_history[col] = self.add_feature_sigle_predict(data, date, col)

            data = pd.concat(
                [data, pd.DataFrame([new_history], columns=data.columns)],
                ignore_index=True)
            predictions.append({'Date': date, 'Prediction': spred})
        return predictions

In [53]:
# Load the series; the '-01' suffix added below assumes the date column
# holds 'YYYY-MM' strings - TODO confirm against dataset.csv.
df = pd.read_csv('./dataset.csv')
df.columns = ['Date', 'Passengers']

# Synthetic additional attribute. The generator is seeded so that
# Restart & Run All reproduces the same forecasts.
rng = np.random.default_rng(42)
df['Temp'] = rng.integers(0, 38, size=len(df))

# Hold out the last 12 rows for evaluation. Explicit copies avoid writing
# into slices of df (SettingWithCopy) when the date column is converted.
train_set, test_set = df[:-12].copy(), df[-12:].copy()

# pd.to_datetime replaces astype('datetime64'): unit-less datetime64 casts
# raise a TypeError on pandas >= 2.0.
train_set['Date'] = pd.to_datetime(train_set['Date'] + '-01')

model = XGBTimeSeries(
    dataset=train_set,
    datecol_name='Date',
    valuecol_name='Passengers',
    agg_level='M',
    num_lags=12
)

model.fit(hyperparameters={'n_estimators': 2000})
preds = model.predict(12)

In [54]:
preds

[{'Date': Timestamp('1960-01-01 00:00:00'), 'Prediction': 397.97696},
 {'Date': Timestamp('1960-02-01 00:00:00'), 'Prediction': 379.43735},
 {'Date': Timestamp('1960-03-01 00:00:00'), 'Prediction': 441.46313},
 {'Date': Timestamp('1960-04-01 00:00:00'), 'Prediction': 453.36874},
 {'Date': Timestamp('1960-05-01 00:00:00'), 'Prediction': 469.70456},
 {'Date': Timestamp('1960-06-01 00:00:00'), 'Prediction': 501.3307},
 {'Date': Timestamp('1960-07-01 00:00:00'), 'Prediction': 545.7566},
 {'Date': Timestamp('1960-08-01 00:00:00'), 'Prediction': 558.99603},
 {'Date': Timestamp('1960-09-01 00:00:00'), 'Prediction': 495.79346},
 {'Date': Timestamp('1960-10-01 00:00:00'), 'Prediction': 467.99545},
 {'Date': Timestamp('1960-11-01 00:00:00'), 'Prediction': 429.5537},
 {'Date': Timestamp('1960-12-01 00:00:00'), 'Prediction': 468.0182}]

In [None]:
# Turn the list of {'Date', 'Prediction'} dicts into a frame.
preds = pd.DataFrame(preds)

# Side-by-side comparison of the held-out values and the forecasts.
# Predictions are attached positionally (.values), so they follow the
# row order of test_set rather than its index labels.
eval_df = pd.DataFrame(dict(
    Date=test_set['Date'],
    Actual=test_set['Passengers'],
    Predicted=preds['Prediction'].values,
))

# Absolute error, and the same error as a percentage of the actual value.
abs_error = (eval_df['Actual'] - eval_df['Predicted']).abs()
eval_df['AbsDiff'] = abs_error
eval_df['PctDiff'] = (abs_error / eval_df['Actual']) * 100

In [None]:
from sklearn.metrics import mean_squared_error


def rmse(act, pred):
    '''Return the root mean squared error between actual and predicted values.'''
    squared_error = mean_squared_error(act, pred)
    return np.sqrt(squared_error)


rmse(eval_df['Actual'], eval_df['Predicted'])
# The author's run reported 36.45489072358621, i.e. a typical forecast
# error of roughly 36 passengers per month in either direction.

In [None]:
import matplotlib.pyplot as plt

# Actual and forecast series on one set of axes, plotted against eval_df's index.
fig, ax = plt.subplots(figsize=(12, 7))
for series_name in ('Actual', 'Predicted'):
    ax.plot(eval_df[series_name], label=series_name)
ax.set_title('Airline Passengers - Actual vs Predicted', size=20)
ax.legend();