# Jobathon Nov 2022 Time series Model

## Table of Contents
### 1. [Read Train and Test ](#read)
### 2. [Feature Generation](#feature)
### 3. [Train and Validation Split](#split)
### 4. [Model Evaluation using Unobserved Components](#model_eval)
### 5. [Model Finalization for Test Prediction](#model_final)

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jobathon-nov-2022/sample_submission.csv
/kaggle/input/jobathon-nov-2022/train.csv
/kaggle/input/jobathon-nov-2022/test.csv


In [2]:
pd.options.display.max_columns=500
pd.options.display.max_rows=500


In [3]:
from pandas.tseries.holiday import *

In [4]:
!pip install pmdarima

Collecting pmdarima
  Downloading pmdarima-2.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pmdarima
Successfully installed pmdarima-2.0.1
[0m

<a id='read'></a>
## Read Train and Test Data

In [5]:
train=pd.read_csv('/kaggle/input/jobathon-nov-2022/train.csv')
print(train.shape)

(94992, 3)


In [6]:
test=pd.read_csv('/kaggle/input/jobathon-nov-2022/test.csv')
print(test.shape)

(26304, 2)


In [7]:
train.head()

Unnamed: 0,row_id,datetime,energy
0,1,2008-03-01 00:00:00,1259.985563
1,2,2008-03-01 01:00:00,1095.5415
2,3,2008-03-01 02:00:00,1056.2475
3,4,2008-03-01 03:00:00,1034.742
4,5,2008-03-01 04:00:00,1026.3345


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94992 entries, 0 to 94991
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   row_id    94992 non-null  int64  
 1   datetime  94992 non-null  object 
 2   energy    93092 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.2+ MB


In [9]:
train['energy'].describe()
targetcol='energy'

In [10]:
train['datetime']=pd.to_datetime(train['datetime'],infer_datetime_format=True)
test['datetime']=pd.to_datetime(test['datetime'],infer_datetime_format=True)

In [11]:
train['datetime'].min(),train['datetime'].max()

(Timestamp('2008-03-01 00:00:00'), Timestamp('2018-12-31 23:00:00'))

<a id='feature'></a>
## Feature Generation

In [12]:
#create hour map based on business, non - business , sleeping hours etc.
hour_map={0:0,1:0,2:0,3:0,4:0,5:0,
         6:1,7:1,8:1,
         9:2,10:2,11:2,
         12:3,13:3,14:3,15:3,
         16:4,17:4,
         18:5,19:5,20:5,
         21:6,22:6,23:6}
 
#Monday and Sunday as group 1, Saturday as group 2, Otherdays as group 3
dayofweek_map = {0:1,6:1,
                5:2,
                1:3,2:3,3:3,4:3}

create basic date related features

In [13]:
def gen_datefeats(data):
    data['year']=data['datetime'].dt.year
    data['month']=data['datetime'].dt.month
    data['day']=data['datetime'].dt.day
    data['hour']=data['datetime'].dt.hour
    data['weekofyear']=data['datetime'].dt.isocalendar().week
    data['dayofweek']=data['datetime'].dt.dayofweek
    data['dayofweek_grp']=data['dayofweek'].replace(dayofweek_map)
    data['quarter']=data['datetime'].dt.quarter
    data['is_weekend']=data['datetime'].dt.dayofweek > 4
    data['day_part']=data['hour'].replace(hour_map)

In [14]:
gen_datefeats(train)
gen_datefeats(test)

create holiday features with special holiday denoting christmas long holidays

In [15]:
#generate holidays feature
def gen_holiday_feat(data,start,end):
    cal = USFederalHolidayCalendar()
    holiday_dates = cal.holidays(start=start, end=end)
    data['is_holiday'] = False
    mask = data['datetime'].dt.date.astype('datetime64').isin(holiday_dates)
    data.loc[mask,'is_holiday']=True   
    
    data['special_holiday']=False
    mask= ((data['datetime'].dt.month==12) & (data['datetime'].dt.day>=24))  \
           | ((data['datetime'].dt.month==1) & (data['datetime'].dt.day<3)) 
    data.loc[mask,'special_holiday']=True   
        
    return holiday_dates

In [16]:
holidays= gen_holiday_feat(train,train['datetime'].dt.date.min(),train['datetime'].dt.date.max())
print(holidays)
print(train['special_holiday'].value_counts())
train['is_holiday'].value_counts()

DatetimeIndex(['2008-05-26', '2008-07-04', '2008-09-01', '2008-10-13',
               '2008-11-11', '2008-11-27', '2008-12-25', '2009-01-01',
               '2009-01-19', '2009-02-16',
               ...
               '2018-01-01', '2018-01-15', '2018-02-19', '2018-05-28',
               '2018-07-04', '2018-09-03', '2018-10-08', '2018-11-12',
               '2018-11-22', '2018-12-25'],
              dtype='datetime64[ns]', length=107, freq=None)
False    92400
True      2592
Name: special_holiday, dtype: int64


False    92424
True      2568
Name: is_holiday, dtype: int64

In [17]:
train.head()

Unnamed: 0,row_id,datetime,energy,year,month,day,hour,weekofyear,dayofweek,dayofweek_grp,quarter,is_weekend,day_part,is_holiday,special_holiday
0,1,2008-03-01 00:00:00,1259.985563,2008,3,1,0,9,5,2,1,True,0,False,False
1,2,2008-03-01 01:00:00,1095.5415,2008,3,1,1,9,5,2,1,True,0,False,False
2,3,2008-03-01 02:00:00,1056.2475,2008,3,1,2,9,5,2,1,True,0,False,False
3,4,2008-03-01 03:00:00,1034.742,2008,3,1,3,9,5,2,1,True,0,False,False
4,5,2008-03-01 04:00:00,1026.3345,2008,3,1,4,9,5,2,1,True,0,False,False


generate hour aggregate features month-wise, quarter-wise, week of year-wise and week day group wise

In [18]:
def gen_mean_feats(train,test,cols,newcolname):
    grouped=train.groupby(cols)[targetcol].mean().reset_index()
    grouped.columns=cols+[newcolname]
    train=train.merge(grouped,on=cols)    
    test=test.merge(grouped,on=cols)   
    return train,test
    
def gen_mean_feats_all(train,test):
    train,test=gen_mean_feats(train,test,['month','hour'],'month_hour_mean')
    train,test=gen_mean_feats(train,test,['quarter','hour'],'quarter_hour_mean')
    train,test=gen_mean_feats(train,test,['weekofyear','hour'],'weekofyear_mean')
    train,test=gen_mean_feats(train,test,['dayofweek_grp','hour'],'dayofweek_grp_mean')  
    
    train.sort_values('datetime',inplace=True)
    train.reset_index(drop=True,inplace=True)
    test.sort_values('datetime',inplace=True)
    test.reset_index(drop=True,inplace=True)
    return train,test

In [19]:
train,test=gen_mean_feats_all(train,test)

In [20]:
train.head(10)

Unnamed: 0,row_id,datetime,energy,year,month,day,hour,weekofyear,dayofweek,dayofweek_grp,quarter,is_weekend,day_part,is_holiday,special_holiday,month_hour_mean,quarter_hour_mean,weekofyear_mean,dayofweek_grp_mean
0,1,2008-03-01 00:00:00,1259.985563,2008,3,1,0,9,5,2,1,True,0,False,False,1397.227508,1504.829986,1392.384333,1654.027936
1,2,2008-03-01 01:00:00,1095.5415,2008,3,1,1,9,5,2,1,True,0,False,False,1326.027668,1412.376876,1306.621744,1566.045064
2,3,2008-03-01 02:00:00,1056.2475,2008,3,1,2,9,5,2,1,True,0,False,False,1287.670058,1352.371501,1259.495448,1511.553768
3,4,2008-03-01 03:00:00,1034.742,2008,3,1,3,9,5,2,1,True,0,False,False,1262.746667,1315.962924,1230.416947,1478.16859
4,5,2008-03-01 04:00:00,1026.3345,2008,3,1,4,9,5,2,1,True,0,False,False,1260.754616,1298.825422,1221.394201,1473.652068
5,6,2008-03-01 05:00:00,1033.7685,2008,3,1,5,9,5,2,1,True,0,False,False,1275.482187,1305.534563,1230.498917,1493.890931
6,7,2008-03-01 06:00:00,1086.78,2008,3,1,6,9,5,2,1,True,1,False,False,1337.872697,1358.662339,1295.098037,1566.537865
7,8,2008-03-01 07:00:00,1211.742,2008,3,1,7,9,5,2,1,True,1,False,False,1464.527791,1470.300187,1412.6619,1697.378282
8,9,2008-03-01 08:00:00,1293.693,2008,3,1,8,9,5,2,1,True,1,False,False,1556.391201,1543.597804,1493.338281,1781.379368
9,10,2008-03-01 09:00:00,1318.9155,2008,3,1,9,9,5,2,1,True,2,False,False,1577.528044,1589.814926,1517.786396,1804.679326


In [21]:
test.head()

Unnamed: 0,row_id,datetime,year,month,day,hour,weekofyear,dayofweek,dayofweek_grp,quarter,is_weekend,day_part,month_hour_mean,quarter_hour_mean,weekofyear_mean,dayofweek_grp_mean
0,94993,2019-01-01 00:00:00,2019,1,1,0,1,1,3,1,False,0,1661.002682,1504.829986,1711.103516,1625.815769
1,94994,2019-01-01 01:00:00,2019,1,1,1,1,1,3,1,False,0,1543.596544,1412.376876,1592.733254,1543.548695
2,94995,2019-01-01 02:00:00,2019,1,1,2,1,1,3,1,False,0,1462.390875,1352.371501,1506.964956,1493.577402
3,94996,2019-01-01 03:00:00,2019,1,1,3,1,1,3,1,False,0,1410.076914,1315.962924,1439.720612,1465.82546
4,94997,2019-01-01 04:00:00,2019,1,1,4,1,1,3,1,False,0,1378.289974,1298.825422,1415.453093,1459.997669


In [22]:
train['year'].unique()

array([2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018])

In [23]:
test['datetime'].min(),test['datetime'].max()

(Timestamp('2019-01-01 00:00:00'), Timestamp('2021-12-31 23:00:00'))

In [24]:
holidays= gen_holiday_feat(test,test['datetime'].dt.date.min(),test['datetime'].dt.date.max())
print(len(holidays))
print(holidays)
print(test['special_holiday'].value_counts())
test['is_holiday'].value_counts()

31
DatetimeIndex(['2019-01-01', '2019-01-21', '2019-02-18', '2019-05-27',
               '2019-07-04', '2019-09-02', '2019-10-14', '2019-11-11',
               '2019-11-28', '2019-12-25', '2020-01-01', '2020-01-20',
               '2020-02-17', '2020-05-25', '2020-07-03', '2020-09-07',
               '2020-10-12', '2020-11-11', '2020-11-26', '2020-12-25',
               '2021-01-01', '2021-01-18', '2021-02-15', '2021-05-31',
               '2021-07-05', '2021-09-06', '2021-10-11', '2021-11-11',
               '2021-11-25', '2021-12-24', '2021-12-31'],
              dtype='datetime64[ns]', freq=None)
False    25584
True       720
Name: special_holiday, dtype: int64


False    25560
True       744
Name: is_holiday, dtype: int64

In [25]:
targetcol = 'energy'

In [27]:
# train[targetcol].fillna(train[targetcol].mean(),inplace=True)
train[targetcol].fillna(method='ffill',inplace=True)

Create Lag Features

In [28]:
def create_lag(data,lagno_list):
    res = pd.DataFrame()
    for i in lagno_list:
        shifted = data.shift(i)
        res=pd.concat([res,shifted],axis=1)

#     res=pd.concat([data.shift(i) for i in lagno_list],axis=1)
    res.columns=[f'lag_{i}' for i in lagno_list]
    return res

In [29]:
test.shape

(26304, 18)

In [30]:
train['istrain']=1
test['istrain']=0
combined = pd.concat([train,test],axis=0) 
#lag 1 year, 3 year, quarter, month,week
lag_df = create_lag(combined[targetcol],[24,168,720,2160,8760,26304])
combined=pd.concat([combined,lag_df],axis=1)
train=combined[combined['istrain']==1]
test=combined[combined['istrain']==0]

del combined,train['istrain'],test['istrain'],test[targetcol]
print(train.shape,test.shape)

(94992, 25) (26304, 24)


In [31]:
train.head()

Unnamed: 0,row_id,datetime,energy,year,month,day,hour,weekofyear,dayofweek,dayofweek_grp,quarter,is_weekend,day_part,is_holiday,special_holiday,month_hour_mean,quarter_hour_mean,weekofyear_mean,dayofweek_grp_mean,lag_24,lag_168,lag_720,lag_2160,lag_8760,lag_26304
0,1,2008-03-01 00:00:00,1259.985563,2008,3,1,0,9,5,2,1,True,0,False,False,1397.227508,1504.829986,1392.384333,1654.027936,,,,,,
1,2,2008-03-01 01:00:00,1095.5415,2008,3,1,1,9,5,2,1,True,0,False,False,1326.027668,1412.376876,1306.621744,1566.045064,,,,,,
2,3,2008-03-01 02:00:00,1056.2475,2008,3,1,2,9,5,2,1,True,0,False,False,1287.670058,1352.371501,1259.495448,1511.553768,,,,,,
3,4,2008-03-01 03:00:00,1034.742,2008,3,1,3,9,5,2,1,True,0,False,False,1262.746667,1315.962924,1230.416947,1478.16859,,,,,,
4,5,2008-03-01 04:00:00,1026.3345,2008,3,1,4,9,5,2,1,True,0,False,False,1260.754616,1298.825422,1221.394201,1473.652068,,,,,,


In [32]:
train.columns

Index(['row_id', 'datetime', 'energy', 'year', 'month', 'day', 'hour',
       'weekofyear', 'dayofweek', 'dayofweek_grp', 'quarter', 'is_weekend',
       'day_part', 'is_holiday', 'special_holiday', 'month_hour_mean',
       'quarter_hour_mean', 'weekofyear_mean', 'dayofweek_grp_mean', 'lag_24',
       'lag_168', 'lag_720', 'lag_2160', 'lag_8760', 'lag_26304'],
      dtype='object')

Fill null values in train data using previous hour values

In [33]:
cols = [col for col in train.columns if col.startswith('lag_')]
target_mean = train[targetcol].mean()
for col in cols:
    train[col].fillna(0,inplace=True)

<a id='split'></a>
## Train and Validation Split

Validation Set from 2016 to 2018 <br>
Train Set from 2008 to 2015

In [35]:
import datetime 
train_start = datetime.datetime(year=2008,month=1,day=1,hour=0)
val_start = datetime.datetime(year=2016,month=1,day=1,hour=0)
val_end = datetime.datetime(year=2018,month=12,day=31,hour=23)

X_val= train[(train['datetime']>=val_start) & (train['datetime']<=val_end)].copy()
X_train= train[(train['datetime']>=train_start) & (train['datetime']<val_start)].copy()
print(X_train.shape)
print(X_val.shape)
X_val.head()           

(68688, 25)
(26304, 25)


Unnamed: 0,row_id,datetime,energy,year,month,day,hour,weekofyear,dayofweek,dayofweek_grp,quarter,is_weekend,day_part,is_holiday,special_holiday,month_hour_mean,quarter_hour_mean,weekofyear_mean,dayofweek_grp_mean,lag_24,lag_168,lag_720,lag_2160,lag_8760,lag_26304
68688,68689,2016-01-01 00:00:00,2002.247,2016,1,1,0,53,4,3,1,False,0,True,True,1661.002682,1504.829986,1874.91005,1625.815769,2045.017,2274.753,1983.0616,1878.0918,2287.2788,1479.1658
68689,68690,2016-01-01 01:00:00,1843.387,2016,1,1,1,53,4,3,1,False,0,True,True,1543.596544,1412.376876,1734.62185,1543.548695,1881.1468,2118.4592,1827.3788,1878.0918,2120.5124,1375.2627
68690,68691,2016-01-01 02:00:00,1735.1178,2016,1,1,2,53,4,3,1,False,0,True,True,1462.390875,1352.371501,1633.81205,1493.577402,1781.7982,2020.4548,1719.965,1627.704,2015.9704,1307.8261
68691,68692,2016-01-01 03:00:00,1666.197,2016,1,1,3,53,4,3,1,False,0,True,True,1410.076914,1315.962924,1578.1656,1465.82546,1719.5984,1936.5034,1648.2336,1564.5266,1939.974,1261.5295
68692,68693,2016-01-01 04:00:00,1630.3924,2016,1,1,4,53,4,3,1,False,0,True,True,1378.289974,1298.825422,1542.4391,1459.997669,1657.1542,1895.8108,1603.264,1541.553,1885.3868,1228.8682


In [36]:
val_target = X_val[targetcol]
print(X_val[targetcol].isnull().sum())

0


Create simple validation prediction baseline using train mean value 

In [38]:
from sklearn.metrics import mean_squared_error

In [39]:
#compute baseline error by predicting train energy mean as the energy for all time
val_preds_baseline = np.full(len(X_val),train[targetcol].mean())

In [42]:
#compute error score on baseline predictions
val_score = mean_squared_error(val_target,val_preds_baseline,squared=False)
print('valid score:',val_score)

valid score: 431.7300102305795


<a id='model_eval'></a>
## Model Evaluation using Unobserved Components

In [43]:
X_val.shape
X_train.shape

(68688, 25)

In [44]:
import statsmodels.api as sm

In [46]:
def get_exog(data):
    exog_data =data[[
#                     'is_weekend',
                     'hour',
                     'month_hour_mean','quarter_hour_mean','weekofyear_mean',
                      'dayofweek_grp_mean',  
                    ]]
#     exog_data =data[['is_weekend','month','hour','is_holiday','special_holiday']]
    cols = ['hour']
    exog_data = pd.get_dummies(exog_data,prefix=cols,columns=cols)
    return exog_data.astype('int')

In [47]:
X_val.shape

(26304, 25)

In [48]:
X_train.index

Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
       ...
       68678, 68679, 68680, 68681, 68682, 68683, 68684, 68685, 68686, 68687],
      dtype='object', length=68688)

In [50]:
# exog_train = get_exog(X_train)
# exog_test = get_exog(X_val)
# print(exog_train.columns)

# y_train = X_train[targetcol].copy()
# y_test = X_val[targetcol].copy()

In [51]:
exog_train = X_train[['month_hour_mean','quarter_hour_mean','weekofyear_mean','dayofweek_grp_mean']].astype('int')
exog_test = X_val[['month_hour_mean','quarter_hour_mean','weekofyear_mean','dayofweek_grp_mean']].astype('int')

y_train = X_train[targetcol].copy()
y_test = X_val[targetcol].copy()


Create UCM model with 3 seasonalities and set other parameters as determined during EDA

In [52]:
%%time
#splitting time series to train and test subsets

#Unobserved Components model definition
model_UC1 = sm.tsa.UnobservedComponents(y_train,
                                        autoregressive=2,
                                        level='lldtrend',
                                        exog=exog_train,
                                        cycle=False,
                                        irregular=False,
                                        stochastic_level = False,
                                        stochastic_trend = False,
                                        stochastic_freq_seasonal = [False,False,True],
                                        freq_seasonal=[{'period': 24, 'harmonics': 1},
                                                       {'period': 168, 'harmonics': 1},
                                                       {'period': 8766, 'harmonics': 2}
                                                      ]
                                       )
#fitting model to train data
model_UC1res = model_UC1.fit()

#printing statsmodels summary for model
print(model_UC1res.summary())

print("")
#calculating mean absolute error and root mean squared error for in-sample prediction of model
print(f"In-sample mean absolute error (MAE): {'%.0f' % model_UC1res.mae}, In-sample root mean squared error (RMSE): {'%.0f' % np.sqrt(model_UC1res.mse)}")

#model forecast

  self._init_dates(dates, freq)
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           10     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.60858D+00    |proj g|=  2.06045D-02

At iterate    5    f=  6.58712D+00    |proj g|=  3.30647D-03

At iterate   10    f=  6.44908D+00    |proj g|=  3.46181D-02

At iterate   15    f=  5.54640D+00    |proj g|=  1.41002D-01

At iterate   20    f=  5.20858D+00    |proj g|=  5.39570D-02

At iterate   25    f=  5.10238D+00    |proj g|=  1.91475D-02

At iterate   30    f=  4.78196D+00    |proj g|=  4.72114D-01

At iterate   35    f=  4.65346D+00    |proj g|=  4.26132D-02

At iterate   40    f=  4.65243D+00    |proj g|=  5.99829D-03

At iterate   45    f=  4.64664D+00    |proj g|=  3.93707D-02

At iterate   50    f=  4.64117D+00    |proj g|=  1.61984D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cau



                                 Unobserved Components Results                                 
Dep. Variable:                                  energy   No. Observations:                68688
Model:                local linear deterministic trend   Log Likelihood             -318792.995
                                + freq_seasonal(24(1))   AIC                         637605.989
                               + freq_seasonal(168(1))   BIC                         637697.361
                   + stochastic freq_seasonal(8766(2))   HQIC                        637634.195
                                               + AR(2)                                         
Date:                                 Sun, 20 Nov 2022                                         
Time:                                         17:45:19                                         
Sample:                                     01-01-1970                                         
                                        

Predict validation data set using the above trained model

In [53]:
forecast_UC1 = model_UC1res.forecast(steps=26304,exog=exog_test.reset_index(drop=True),
#                                      signal_only='abcd'
                                    )
#calculating root mean squared error
RMSE_UC1 = mean_squared_error(y_test,forecast_UC1,squared=False)
print('valid score:',RMSE_UC1)

valid score: 210.14759367685278


In [57]:
import joblib

Save the model file and validation set and validation predictions

In [58]:
np.save('val_forecast_UC.npy',forecast_UC1)
joblib.dump(model_UC1res,'UC_model_val.pkl',compress=True)
X_val.to_csv('X_val.csv',index=False)

In [59]:
pd.Series(forecast_UC1.describe())

count    26304.000000
mean      1889.893129
std        239.034764
min       1265.702516
25%       1719.378875
50%       1876.667720
75%       2071.568487
max       2500.430957
Name: predicted_mean, dtype: float64

In [64]:
import matplotlib.pyplot as plt

<a id='model_final'></a>
## Model Finalization for Test Prediction

Train the model using complete data and perform test prediction using this model

In [66]:
exog_train_full = train[['month_hour_mean','quarter_hour_mean','weekofyear_mean','dayofweek_grp_mean']].astype('int')
exog_test_full = test[['month_hour_mean','quarter_hour_mean','weekofyear_mean','dayofweek_grp_mean']].astype('int')


Use the same model configuration as that of model evaluation phase, but fit the model on full train data

In [67]:
%%time
#splitting time series to train and test subsets
y_train_full = train[targetcol].copy()

#Unobserved Components model definition
model_UC_full = sm.tsa.UnobservedComponents(y_train_full,
                                        autoregressive=2,
                                        level='lldtrend',
                                        exog=exog_train_full,
                                        cycle=False,
                                        irregular=False,
                                        stochastic_level = False,
                                        stochastic_trend = False,
                                        stochastic_freq_seasonal = [False,False,True],
                                        freq_seasonal=[{'period': 24, 'harmonics': 1},
                                                       {'period': 168, 'harmonics': 1},
                                                       {'period': 8766, 'harmonics': 2}
                                                      ]
                                       )
#fitting model to train data
model_UC_full_res = model_UC_full.fit()

#printing statsmodels summary for model
print(model_UC_full_res.summary())

print("")
#calculating mean absolute error and root mean squared error for in-sample prediction of model
print(f"In-sample mean absolute error (MAE): {'%.0f' % model_UC_full_res.mae}, In-sample root mean squared error (RMSE): {'%.0f' % np.sqrt(model_UC_full_res.mse)}")

  self._init_dates(dates, freq)


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           10     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.74571D+00    |proj g|=  1.65115D-02


 This problem is unconstrained.



At iterate    5    f=  6.72746D+00    |proj g|=  2.89559D-03

At iterate   10    f=  6.60047D+00    |proj g|=  2.28351D-02

At iterate   15    f=  5.59100D+00    |proj g|=  1.85184D-01

At iterate   20    f=  5.23756D+00    |proj g|=  5.52484D-02

At iterate   25    f=  5.15310D+00    |proj g|=  7.38187D-02

At iterate   30    f=  4.81392D+00    |proj g|=  1.17701D-01

At iterate   35    f=  4.72962D+00    |proj g|=  1.20128D-02

At iterate   40    f=  4.72134D+00    |proj g|=  3.74185D-02

At iterate   45    f=  4.71795D+00    |proj g|=  2.72116D-03





At iterate   50    f=  4.71793D+00    |proj g|=  7.27483D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
   10     50     63      1     0     0   7.275D-05   4.718D+00
  F =   4.7179349548425202     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
                                 Unobserved Components Results                                 
Dep. Variable:                                  energy   No. Observations:                94992
Model:                local linear deterministic trend   Log Likelihood             -448166.077
                                + freq_seasonal(24(1))   AIC             

In [68]:
#Predict Test Set
forecast_UC_test = model_UC_full_res.forecast(steps=26304,exog=exog_test_full)

In [69]:
joblib.dump(model_UC_full_res,'UC_model_full.pkl',compress=True)

['UC_model_full.pkl']

In [70]:
print(len(test),len(forecast_UC_test))

26304 26304


In [71]:
forecast_UC_test.index

DatetimeIndex(['1970-01-01 00:00:00.000094992',
               '1970-01-01 00:00:00.000094993',
               '1970-01-01 00:00:00.000094994',
               '1970-01-01 00:00:00.000094995',
               '1970-01-01 00:00:00.000094996',
               '1970-01-01 00:00:00.000094997',
               '1970-01-01 00:00:00.000094998',
               '1970-01-01 00:00:00.000094999',
                  '1970-01-01 00:00:00.000095',
               '1970-01-01 00:00:00.000095001',
               ...
               '1970-01-01 00:00:00.000121286',
               '1970-01-01 00:00:00.000121287',
               '1970-01-01 00:00:00.000121288',
               '1970-01-01 00:00:00.000121289',
               '1970-01-01 00:00:00.000121290',
               '1970-01-01 00:00:00.000121291',
               '1970-01-01 00:00:00.000121292',
               '1970-01-01 00:00:00.000121293',
               '1970-01-01 00:00:00.000121294',
               '1970-01-01 00:00:00.000121295'],
              dtype=

In [72]:
pd.Series(forecast_UC_test).describe()

count    26304.000000
mean      2175.961016
std        253.653072
min       1531.577046
25%       1992.191260
50%       2166.163701
75%       2367.162571
max       2824.182355
Name: predicted_mean, dtype: float64

In [73]:
subm = pd.DataFrame()
subm['row_id'] = test['row_id']
subm[targetcol]=forecast_UC_test.values
subm.to_csv('submission.csv',index=False)

In [74]:
#check saved submission data
pd.read_csv('submission.csv').head()

Unnamed: 0,row_id,energy
0,94993,2043.411609
1,94994,1926.74625
2,94995,1846.524746
3,94996,1790.194676
4,94997,1767.701016
