In [1]:
import pandas as pd

In [2]:
# Load the train and test splits of the daily Delhi climate data from the
# local data/ directory (paths are relative to the notebook's working directory)
df_train = pd.read_csv('data/DailyDelhiClimateTrain.csv')
df_test = pd.read_csv('data/DailyDelhiClimateTest.csv')

In [3]:
# (rows, columns) of the training frame
df_train.shape

(1462, 5)

In [4]:
# (rows, columns) of the test frame
df_test.shape

(114, 5)

In [5]:
# Preview the first five rows of the training frame
df_train.head()

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2013-01-01,10.0,84.5,0.0,1015.666667
1,2013-01-02,7.4,92.0,2.98,1017.8
2,2013-01-03,7.166667,87.0,4.633333,1018.666667
3,2013-01-04,8.666667,71.333333,1.233333,1017.166667
4,2013-01-05,6.0,86.833333,3.7,1016.5


In [6]:
# Preview the first five rows of the test frame
df_test.head()

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2017-01-01,15.913043,85.869565,2.743478,59.0
1,2017-01-02,18.5,77.222222,2.894444,1018.277778
2,2017-01-03,17.111111,81.888889,4.016667,1018.333333
3,2017-01-04,18.7,70.05,4.545,1015.7
4,2017-01-05,18.388889,74.944444,3.3,1014.333333


In [7]:
# Import the adfuller function from the statsmodels module
from statsmodels.tsa.stattools import adfuller

# Create a function to apply the Augmented Dickey Fuller test
def adf_test(timeseries):
    """Run the Augmented Dickey-Fuller test on a series and print a summary.

    The summary lists the test statistic, p-value, number of lags used and
    number of observations used, followed by one critical value per
    significance level reported by ``adfuller``. Nothing is returned; the
    report is written to stdout only.

    Parameters
    ----------
    timeseries : array-like
        One-dimensional series passed straight to ``adfuller``; the lag
        order is selected with ``autolag='AIC'``.
    """
    print ('Results of Dickey-Fuller Test:')
    outcome = adfuller(timeseries, autolag='AIC')

    # The first four entries of the result tuple are the headline figures
    stat_labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    report = dict(zip(stat_labels, outcome[0:4]))

    # The fifth entry maps each significance level to its critical value
    report.update(
        ('Critical Value (%s)'%level, cutoff) for level, cutoff in outcome[4].items()
    )
    print (pd.Series(report))

In [9]:
# Every column except the date stamp gets its own stationarity test;
# take the labels straight from the column index rather than copying the frame
columns_to_test = df_train.columns.drop('date')

for column in columns_to_test:
    # Header line so each ADF report can be matched to its column
    print(f'ADF Test Results for {column}:')
    adf_test(df_train[column])
    print('\n')

ADF Test Results for meantemp:
Results of Dickey-Fuller Test:
Test Statistic                   -2.021069
p-value                           0.277412
#Lags Used                       10.000000
Number of Observations Used    1451.000000
Critical Value (1%)              -3.434865
Critical Value (5%)              -2.863534
Critical Value (10%)             -2.567832
dtype: float64


ADF Test Results for humidity:
Results of Dickey-Fuller Test:
Test Statistic                   -3.675577
p-value                           0.004470
#Lags Used                       15.000000
Number of Observations Used    1446.000000
Critical Value (1%)              -3.434880
Critical Value (5%)              -2.863541
Critical Value (10%)             -2.567835
dtype: float64


ADF Test Results for wind_speed:
Results of Dickey-Fuller Test:
Test Statistic                   -3.838097
p-value                           0.002541
#Lags Used                       24.000000
Number of Observations Used    1437.000000
Crit

# Interpreting results

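Excluding meantemp, the features are stationary: for each of them the ADF test statistic is lower than the 1% critical value, so we can reject the null hypothesis of a unit root at the 1% significance level. For meantemp the test statistic (-2.02) is above even the 10% critical value (p-value ≈ 0.28), so we cannot reject non-stationarity.
In order to meet the assumptions for Bayesian forecasting, we will difference meantemp to try to make it stationary.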

In [10]:
# Function to difference series
def difference(dataset):
    """Return the first difference of a one-dimensional sequence.

    Element ``k`` of the result is the value at position ``k + 1`` minus the
    value at position ``k``, so the output is one element shorter than the
    input.

    Parameters
    ----------
    dataset : pandas.Series or other one-dimensional sequence
        Values to difference. Any index carried by the input is ignored;
        the subtraction is purely positional.

    Returns
    -------
    pandas.Series
        The differences, with a fresh default integer index. Empty when the
        input has fewer than two elements.
    """
    # Subtract on the raw values so the operation is positional. Label lookups
    # such as dataset[i] raise KeyError (or pick the wrong rows) when a Series
    # does not carry a default 0..n-1 index, e.g. after filtering or re-indexing.
    values = pd.Series(dataset).to_numpy()
    return pd.Series(values[1:] - values[:-1])

# First-difference the 'meantemp' series; the result has one fewer element
# than df_train['meantemp']
diff_meantemp = difference(df_train['meantemp'])

# Print the differenced series to inspect it
print(diff_meantemp)

0      -2.600000
1      -0.233333
2       1.500000
3      -2.666667
4       1.000000
          ...   
1456    0.367391
1457   -1.979296
1458   -1.142857
1459    0.957393
1460   -5.052632
Length: 1461, dtype: float64


In [11]:
# The differenced series is one observation shorter than df_train, so skip
# the first row and swap the differenced values into 'meantemp' in one step.
# df_train itself is left untouched.
df_train_diff = df_train.iloc[1:].assign(meantemp=diff_meantemp.values)

# Re-run the ADF test to check whether differencing made 'meantemp' stationary
adf_test(df_train_diff['meantemp'])

Results of Dickey-Fuller Test:
Test Statistic                -1.637872e+01
p-value                        2.764863e-29
#Lags Used                     9.000000e+00
Number of Observations Used    1.451000e+03
Critical Value (1%)           -3.434865e+00
Critical Value (5%)           -2.863534e+00
Critical Value (10%)          -2.567832e+00
dtype: float64


# Success!
After first-differencing meantemp, all of our features look to be stationary. For now, we won't check the other assumptions of Bayesian forecasting:

- No serial correlation
- Homoscedasticity
- No endogeneity

Now we can begin applying BVAR via pymc3.