In [12]:
import pandas as pd
import numpy as np
import xgboost
from xgboost import XGBRegressor
from tqdm import tqdm
# make sure that you have xgboost installed:
# pip install xgboost

In [13]:
import numpy as np
import xgboost
from xgboost import XGBRegressor
from tqdm import tqdm

# make sure that you have xgboost installed:
# pip install xgboost

def add_lag(X, y, lag):
    '''
    Args:
    - X:    [ len, n_attr ]
    - y:    [ len ]
    Returns:
    - X_lag [ len - lag, (n_attr + 1) * lag ]
    - y_lag [ len - lag ]
    '''
    y_lag = y[lag:]                             # [ len - lag ]
    X_lag = [ np.concatenate([X[i-lag:i], y[i-lag:i][:, None]], 1)  for i in range(lag, len(X), 1) ] # each [ lag, n_attr + 1 ]
    X_lag = [ x.reshape(-1) for x in X_lag ]    # each [ lag * n_attr + 1 ]    
    X_lag = np.stack(X_lag, 0)                  # [ len - lag, (n_attr + 1) * lag ]
    return X_lag, y_lag

def add_lag_X(X, lag):
    '''
    Args:
    - X:    [ len, n_attr ]
    Returns:
    - X_lag [ len, n_attr * lag ]
    '''
    X_lag = [ X[i-lag:i]  for i in range(lag, len(X), 1) ] # each [ lag, n_attr + 1 ]
    X_lag = [ x.reshape(-1) for x in X_lag ]    # each [ lag * n_attr + 1 ]    
    X_lag = np.stack(X_lag, 0)                  # [ len - lag, (n_attr + 1) * lag ]
    X_lag = np.concatenate([np.zeros([lag, X_lag.shape[1]]),X_lag],0)
    return X_lag
    
class XGBoost_AR:
    '''Autoregressive adaptation of XGBoost'''
    def __init__(self, kwds, lag):
        self.model   = XGBRegressor(**kwds)
        self.lag = lag

    def fit(self, Xtr, ytr):
        '''
        Args:
        - Xtr:    [ len, n_attr ]
        - ytr:    [ len ]
        '''
        Xtr_, ytr_ = add_lag(Xtr, ytr, self.lag)
        self.model.fit(Xtr_, ytr_)

    def predict(self, Xte):
        '''
        Args:
        - Xte:  [ len_te, n_attr ]
        Returns:
        - yte:  [ len_te ]
        '''
        yte = np.ones(self.lag)
        for i in tqdm(range(self.lag + 1, len(Xte), 1), desc = 'Predicting'):
            Xte_, _ = add_lag(Xte[:i], yte[:i], self.lag)
            yte = self.model.predict(Xte_)
            yte = np.concatenate([np.ones(self.lag) * np.mean(yte), yte], 0)
        return yte
    
class XGBoost_AR_Half:
    '''A more computationally efficient version than its full version, but may be less powerful'''
    def __init__(self, kwds, lag):
        self.model   = XGBRegressor(**kwds)
        self.lag = lag

    def fit(self, Xtr, ytr):
        '''
        Args:
        - Xtr:    [ len, n_attr ]
        - ytr:    [ len ]
        '''
        Xtr_ = add_lag_X(Xtr, self.lag)
        self.model.fit(Xtr_, ytr)

    def predict(self, Xte):
        '''
        Args:
        - Xte:  [ len_te, n_attr ]
        Returns:
        - yte:  [ len_te ]
        '''
        Xte_ = add_lag_X(Xte, self.lag)
        yte = self.model.predict(Xte_)
        return yte

In [14]:
class MultiresBoosting:

    def __init__(self, df):
        # df = pd.read_excel(training_path)
        self.df = df
        time_metadata = df.iloc[:, :4]
        self.y = df.iloc[:, 4].values
        self.X = df.iloc[:, 5:].values
        # index to date
        time_metadata['Date'] = time_metadata['Year'].astype('str') + '_' + time_metadata['Month'].astype('str') + '_' + time_metadata['Day'].astype('str')
        self.P_date = pd.get_dummies(time_metadata['Date']).values
        # index to fake month
        time_metadata['Fake Month'] = time_metadata['Year'].astype('str') + '_' + time_metadata['Month'].astype('str')
        self.P_fmonth = pd.get_dummies(time_metadata['Fake Month']).values
        # index to month
        self.P_month = pd.get_dummies(time_metadata['Month']).values    # [ num_data, num_months ]
        # index to hour
        self.P_hour  = pd.get_dummies(time_metadata['Hour']).values    # [ num_data, num_hours ]
        kwds = {
        'objective': 'reg:squarederror',
        'max_depth': 10,
        'learning_rate': 1e-1,
        'n_estimators': 100
        }

        self.models = [ XGBoost_AR_Half(kwds, lag=11) for _ in range(3) ]
        
    def fit(self):
        # Step 1: de-trend the data 
        self.avg_month  = self.df.groupby('Month').mean()['Load'].values    # [ 12 ]
        self.avg_hour   = self.df.groupby('Hour').mean()['Load'].values     # [ 24 ]
        self.y = self.y - self.avg_month @ self.P_month.T - self.avg_hour @ self.P_hour.T # [ num_data ]

        # Step 2: fitting first level model
        P_fmonth = self.P_fmonth / self.P_fmonth.sum(0)
        X_fmonth = P_fmonth.T @ self.X  # [ num_fake_months, num_covs ]
        y_fmonth = P_fmonth.T @ self.y  # [ num_fake_months ]
        self.models[0].fit(X_fmonth, y_fmonth)   # TODO
        y_fit_fmonth = self.models[0].predict(X_fmonth)

        # Step 3: fitting second level model
        P_date = self.P_date / self.P_date.sum(0) # [ num_data, num_dates ]
        X_date = P_date.T @ self.X  # [ num_date, num_covs ]
        y_date = P_date.T @ self.y - P_date.T @ self.P_fmonth @ y_fit_fmonth # [ num_date ]
        self.models[1].fit(X_date, y_date)   # TODO
        y_fit_date = self.models[1].predict(X_date)

        # Step 3: fitting third level model
        X = self.X
        y = self.y - self.P_date @ y_fit_date - self.P_fmonth @ y_fit_fmonth
        self.models[2].fit(X, y)    # TODO

    def predict(self, df):
        '''
        df: the test dataframe
        '''
        time_metadata = df.iloc[:, :4]
        X = df.iloc[:, 5:].values
        # index to date
        time_metadata['Date'] = time_metadata['Year'].astype('str') + '_' + time_metadata['Month'].astype('str') + '_' + time_metadata['Day'].astype('str')
        P_date = pd.get_dummies(time_metadata['Date']).values
        # index to fake month
        time_metadata['Fake Month'] = time_metadata['Year'].astype('str') + '_' + time_metadata['Month'].astype('str')
        P_fmonth = pd.get_dummies(time_metadata['Fake Month']).values
        # index to month
        P_month = pd.get_dummies(time_metadata['Month']).values    # [ num_data, num_months ]
        # index to hour
        P_hour  = pd.get_dummies(time_metadata['Hour']).values    # [ num_data, num_hours ]

        # TODO: input X and get y
        y =         self.models[2].predict(X)
        P_date_ =    P_date / P_date.sum(0) # [ num_data, num_dates ]
        X_date =    P_date_.T @ X  # [ num_date, num_covs ]
        y_date =    self.models[1].predict(X_date)           
        P_fmonth_ =  P_fmonth / P_fmonth.sum(0)
        X_fmonth =  P_fmonth_.T @ X  # [ num_fake_months, num_covs ]
        y_fmonth =  self.models[0].predict(X_fmonth)
        output =    y + y_fmonth @ P_fmonth.T + y_date @ P_date.T + self.avg_month @ P_month.T + self.avg_hour @ P_hour.T
        return output   # [ num_test_data ]

In [15]:
train = pd.read_excel('data/training.xlsx')
test = pd.read_excel("data/testing.xlsx")

In [16]:
final_model = MultiresBoosting(train)
final_model.fit()
y_pred = final_model.predict(test)
y_pred

array([2019.99830945, 1954.11322053, 1919.79584707, ..., 2244.85536549,
       2093.41008037, 1988.1158298 ])

In [18]:
results = pd.DataFrame({'Load':y_pred})
results.head()

Unnamed: 0,Load
0,2019.998309
1,1954.113221
2,1919.795847
3,1928.367666
4,1990.887502


In [19]:
results.to_excel('Results_CMUGridElfs.xlsx',index = False)