# NYC EMS Incidents 2013-2017

https://data.cityofnewyork.us/Public-Safety/EMS-Incident-Dispatch-Data/76xm-jjuj

Use Case: Predict number of EMS calls incidents in order to be adequately prepared to handle all of them. 

In [1]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime
import itertools
import warnings
import sklearn

from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.seasonal import seasonal_decompose
from pandas import Grouper
from matplotlib.pylab import rcParams
from sklearn.model_selection import TimeSeriesSplit
from fbprophet import Prophet

warnings.filterwarnings('ignore')
plt.style.use('ggplot')

In [2]:
coerced_data = pd.read_csv('../data/ems_datetime_fixed.csv')

In [3]:
coerced_data = coerced_data.drop(['Unnamed: 0', 'INCIDENT_DATETIME'], axis=1).set_index('proper_time')

In [4]:
coerced_data.head()

Unnamed: 0_level_0,INITIAL_CALL_TYPE,INITIAL_SEVERITY_LEVEL_CODE,FINAL_CALL_TYPE,FINAL_SEVERITY_LEVEL_CODE,VALID_DISPATCH_RSPNS_TIME_INDC,DISPATCH_RESPONSE_SECONDS_QY,VALID_INCIDENT_RSPNS_TIME_INDC,INCIDENT_RESPONSE_SECONDS_QY,HELD_INDICATOR,INCIDENT_DISPOSITION_CODE,BOROUGH,ZIPCODE,POLICEPRECINCT,STANDBY_INDICATOR,Change_In_Severity
proper_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-01-01 00:00:04,RESPIR,4,RESPIR,4,Y,101,Y,797.0,N,82.0,BRONX,10472.0,43.0,N,0
2013-01-01 00:00:19,CARD,3,CARD,3,Y,59,Y,851.0,N,93.0,BRONX,10454.0,40.0,N,0
2013-01-01 00:01:04,ARREST,1,ARREST,1,Y,29,Y,429.0,N,83.0,QUEENS,11418.0,102.0,N,0
2013-01-01 00:01:16,SICK,6,SICK,6,Y,56,Y,828.0,N,82.0,BRONX,10453.0,46.0,N,0
2013-01-01 00:01:26,INJURY,5,INJURY,5,Y,32,Y,856.0,N,82.0,BRONX,10457.0,48.0,N,0


One of our goals is to measure the frequency of calls over different time periods, so we need a way to tally calls when we call the "resample" method. Here we'll add a column where we assign a simple value of 1 to every call, and soon we'll use it to tally.

In [5]:
coerced_data['count'] = 1

In [6]:
coerced_data.head()

Unnamed: 0_level_0,INITIAL_CALL_TYPE,INITIAL_SEVERITY_LEVEL_CODE,FINAL_CALL_TYPE,FINAL_SEVERITY_LEVEL_CODE,VALID_DISPATCH_RSPNS_TIME_INDC,DISPATCH_RESPONSE_SECONDS_QY,VALID_INCIDENT_RSPNS_TIME_INDC,INCIDENT_RESPONSE_SECONDS_QY,HELD_INDICATOR,INCIDENT_DISPOSITION_CODE,BOROUGH,ZIPCODE,POLICEPRECINCT,STANDBY_INDICATOR,Change_In_Severity,count
proper_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2013-01-01 00:00:04,RESPIR,4,RESPIR,4,Y,101,Y,797.0,N,82.0,BRONX,10472.0,43.0,N,0,1
2013-01-01 00:00:19,CARD,3,CARD,3,Y,59,Y,851.0,N,93.0,BRONX,10454.0,40.0,N,0,1
2013-01-01 00:01:04,ARREST,1,ARREST,1,Y,29,Y,429.0,N,83.0,QUEENS,11418.0,102.0,N,0,1
2013-01-01 00:01:16,SICK,6,SICK,6,Y,56,Y,828.0,N,82.0,BRONX,10453.0,46.0,N,0,1
2013-01-01 00:01:26,INJURY,5,INJURY,5,Y,32,Y,856.0,N,82.0,BRONX,10457.0,48.0,N,0,1


In [8]:
coerced_data.index = pd.to_datetime(coerced_data.index)

In [9]:
df = coerced_data

In [21]:
burns = df[(df['INITIAL_CALL_TYPE'] == 'BURNMA')]['BOROUGH']

In [28]:
plt.plot(burns.value_counts(), )

TypeError: bar() missing 1 required positional argument: 'height'

### Other Exogenous Variables

In [None]:
weather_data = pd.read_csv('weather_data.csv')

In [None]:
weather_data.head()

In [None]:
weather_data.set_index('Date', inplace=True)

In [None]:
weather_data.index = pd.to_datetime(weather_data.index)

In [None]:
weather_data.index

In [None]:
weekly_average_temperature_data = pd.DataFrame(weather_data['Avg Temp'].resample('W').mean())

In [None]:
weekly_sum_precipitation = pd.DataFrame(weather_data['Precipitation Water Equiv'].resample('W').sum())

In [None]:
weekly_sum_snowfall = pd.DataFrame(weather_data['Snowfall'].resample('W').sum())

In [None]:
holiday_data = pd.read_csv('holiday_data.csv')

In [None]:
holiday_data.head()

In [None]:
holiday_data.set_index('Date', inplace=True)

In [None]:
holiday_data.index = pd.to_datetime(holiday_data.index)

In [None]:
holiday_data.index

In [None]:
weekly_sum_holidays = holiday_data.resample('W').sum()

In [None]:
weekly_sum_holidays.head()

### Now we have a time series to play with!

In [None]:
len(coerced_data['count'].resample('W').sum()) - 26

In [None]:
weekly_call_volume = coerced_data['count'].resample('W').sum()

In [None]:
daily_call_volume = coerced_data['count'].resample('D').sum()

In [None]:
weekly_average_response_time = coerced_data['INCIDENT_RESPONSE_SECONDS_QY'].resample('W').mean()

In [None]:
weekly_average_response_time = weekly_average_response_time / 60

In [None]:
daily_average_response_time = coerced_data['INCIDENT_RESPONSE_SECONDS_QY'].resample('D').mean()

In [None]:
daily_average_response_time = daily_average_response_time / 60

In [None]:
weekly_average_response_time_df = pd.DataFrame(data=weekly_average_response_time, index=weekly_average_response_time.index)

In [None]:
weekly_average_response_time_df = pd.merge(weekly_average_response_time_df, weekly_average_temperature_data, left_index=True, right_index=True)
weekly_average_response_time_df = pd.merge(weekly_average_response_time_df, weekly_sum_precipitation, left_index=True, right_index=True)
weekly_average_response_time_df = pd.merge(weekly_average_response_time_df, weekly_sum_snowfall, left_index=True, right_index=True)
weekly_average_response_time_df = pd.merge(weekly_average_response_time_df, weekly_sum_holidays, left_index=True, right_index=True)

In [None]:
weekly_average_response_time_df.columns = ['avg_response_time_min', 'avg_temp', 'total_precip', 'total_snowfall', 'total_holidays']

In [None]:
weekly_average_response_time_df.head()

In [None]:
weekly_call_volume_df = pd.DataFrame(data=weekly_call_volume, index=weekly_call_volume.index)

In [None]:
weekly_call_volume_df = pd.merge(weekly_call_volume_df, weekly_average_temperature_data, left_index=True, right_index=True)
weekly_call_volume_df = pd.merge(weekly_call_volume_df, weekly_sum_precipitation, left_index=True, right_index=True)
weekly_call_volume_df = pd.merge(weekly_call_volume_df, weekly_sum_snowfall, left_index=True, right_index=True)
weekly_call_volume_df = pd.merge(weekly_call_volume_df, weekly_sum_holidays, left_index=True, right_index=True)

In [None]:
weekly_call_volume_df.columns = ['sum of weekly calls', 'avg_temp', 'total_precip', 'total_snowfall', 'total_holidays']

In [None]:
weekly_call_volume_df.head()

## Let's see what this all looks like

In [None]:
from statsmodels.tsa.stattools import adfuller

def dickey_fuller(ser):
    #Perform Dickey-Fuller test:
    print ('Results of Dickey-Fuller Test:')
    dftest = adfuller(ser.values)

    # Extract and display test results in a user friendly manner
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print (dfoutput)
    return None

In [None]:
plt.plot(weekly_average_response_time_df['avg_temp'])
plt.show()
dickey_fuller(weekly_average_response_time_df['avg_temp'])

In [None]:
plt.plot(weekly_average_response_time_df['total_precip'])
plt.show()
dickey_fuller(weekly_average_response_time_df['total_precip'])

In [None]:
plt.plot(weekly_average_response_time_df['total_snowfall'])
plt.show()
dickey_fuller(weekly_average_response_time_df['total_snowfall'])

In [None]:
plt.plot(weekly_average_response_time_df['total_holidays'])
plt.show()
dickey_fuller(weekly_average_response_time_df['total_holidays'])

In [None]:
plt.plot(weekly_call_volume)
plt.show()

In [None]:
plt.plot(weekly_average_response_time)

In [None]:
dickey_fuller(weekly_call_volume), print('\n'), dickey_fuller(weekly_average_response_time)

In [None]:
dickey_fuller(daily_average_response_time), dickey_fuller(daily_call_volume)

At first glance, we get a decent P-val for our stationarity check, but we know we can do better. There must be seasonality to our data.

In [None]:
# for year in [2013, 2014, 2015, 2016, 2017]:
#     print('\n' + str(year) + '\n')
#     print(dickey_fuller(weekly_average_response_time['{}-01-01'.format(str(year)):'{}-12-31'.format(str(year))]))

In [None]:
def rolling_statistics(timeseries):
    rolmean = timeseries.rolling(window = 8, center = False).mean()
    rolstd = timeseries.rolling(window = 8, center = False).std()
    fig = plt.figure(figsize=(12,7))
    orig = plt.plot(timeseries, color='blue',label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

In [None]:
rolling_statistics(weekly_average_response_time)
rolling_statistics(weekly_call_volume)

In [None]:
rolling_statistics(daily_average_response_time)
rolling_statistics(daily_call_volume)

In [None]:
decomposition = seasonal_decompose(weekly_average_response_time, freq=52)  
fig = plt.figure()  
fig = decomposition.plot()  
fig.set_size_inches(15, 8)

In [None]:
decomposition = seasonal_decompose(daily_average_response_time, freq=365)  
fig = plt.figure()  
fig = decomposition.plot()  
fig.set_size_inches(15, 8)

## Differencing

We take the first difference between weeks to improve stationarity. We tried taking a seasonal difference, but it didn't improve our DF score.

In [None]:
# Log mode:
# weekly_average_response_time = weekly_average_response_time.apply(lambda x: np.log(x))
# Tried this, didn't help.

In [None]:
week_first_diff = weekly_average_response_time - weekly_average_response_time.shift(1)

In [None]:
daily_first_diff = daily_average_response_time - daily_average_response_time.shift(1)

In [None]:
week_first_diff = week_first_diff.dropna()
dickey_fuller(week_first_diff)

In [None]:
daily_first_diff = daily_first_diff.dropna()
dickey_fuller(daily_first_diff)

In [None]:
rcParams['figure.figsize'] = 14, 5
plot_acf(week_first_diff, lags = 10);

rcParams['figure.figsize'] = 14, 5
plot_pacf(week_first_diff, lags = 10);

In [None]:
rcParams['figure.figsize'] = 14, 5
plot_acf(daily_first_diff, lags = 10);

rcParams['figure.figsize'] = 14, 5
plot_pacf(daily_first_diff, lags = 10);

### Seasonal Difference:

In [None]:
weekly_seasonal_difference = (weekly_average_response_time - weekly_average_response_time.shift(52)).dropna()

In [None]:
daily_seasonal_difference = (daily_average_response_time - daily_average_response_time.shift(52)).dropna()

In [None]:
dickey_fuller(weekly_seasonal_difference)

In [None]:
dickey_fuller(daily_seasonal_difference)

In [None]:
rcParams['figure.figsize'] = 14, 5
plot_acf(weekly_seasonal_difference, lags = 10);

rcParams['figure.figsize'] = 14, 5
plot_pacf(weekly_seasonal_difference, lags = 10);

In [None]:
rcParams['figure.figsize'] = 14, 5
plot_acf(daily_seasonal_difference, lags = 10);

rcParams['figure.figsize'] = 14, 5
plot_pacf(daily_seasonal_difference, lags = 10);

### Seasonal FIRST Differences

In [None]:
weekly_seasonal_first_difference = (week_first_diff - week_first_diff.shift(52)).dropna()

In [None]:
daily_seasonal_first_difference = (daily_first_diff - daily_first_diff.shift(52)).dropna()

In [None]:
dickey_fuller(weekly_seasonal_first_difference)

In [None]:
dickey_fuller(daily_seasonal_first_difference)

In [None]:
rcParams['figure.figsize'] = 14, 5
plot_acf(weekly_seasonal_first_difference, lags = 10);

rcParams['figure.figsize'] = 14, 5
plot_pacf(weekly_seasonal_first_difference, lags = 10);

In [None]:
rcParams['figure.figsize'] = 14, 5
plot_acf(daily_seasonal_first_difference, lags = 10);

rcParams['figure.figsize'] = 14, 5
plot_pacf(daily_seasonal_first_difference, lags = 10);

# Forecasting

### SARIMA Step 1: Grid Search for Ideal Params

####  Test Train Split:

In [None]:
endogenous_train = weekly_average_response_time[:-52]
exogenous_train = weekly_average_response_time_df.drop(['avg_response_time_min'], axis=1)[:-52]
endogenous_test = weekly_average_response_time[-52:]
exogenous_test = weekly_average_response_time_df.drop(['avg_response_time_min'], axis=1)[-52:]

In [None]:
# Define the p, d and q parameters to take any value between 0 and 2
p = d = q = range(0, 2)

# Generate all different combinations of p, q and q triplets
pdq = list(itertools.product(p, d, q))

# Generate all different combinations of seasonal p, q and q triplets
pdqs = [(x[0], x[1], x[2], 52) for x in list(itertools.product(p, d, q))]

In [None]:
# Run a grid with pdq and seasonal pdq parameters calculated above and get the best AIC value
ans = []
for comb in pdq:
    for combs in pdqs:
        try:
            mod = sm.tsa.statespace.SARIMAX(weekly_average_response_time,
                                            exog=weekly_average_response_time_df.drop(['avg_response_time_min'],                                                                                             axis=1),
                                            order=comb,
                                            seasonal_order=combs,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)

            output = mod.fit()
            ans.append([comb, combs, output.aic])
            print('ARIMA {} x {}52 : AIC Calculated ={}'.format(comb, combs, output.aic))
        except:
            continue
            
# Find the parameters with minimal AIC value.

ans_df = pd.DataFrame(ans, columns=['pdq', 'pdqs', 'aic'])
ans_df.loc[ans_df['aic'].idxmin()]

We noticed that when we grid search SARIMAX params with access to exogenous variables (and hold-out a testing set), we get slightly different SARIMAX params than if we were to run the grid search on all data (without exogenous vars).

Now plug ideal params into SARIMAX model:

In [None]:
ARIMA_MODEL = sm.tsa.statespace.SARIMAX(endogenous_train,
                                        exog=exogenous_train,
                                order=(1, 0, 1),
                                seasonal_order=(0, 1, 1, 52),
                                enforce_stationarity=False,
                                enforce_invertibility=False)

output = ARIMA_MODEL.fit()

print(output.summary().tables[1])

In [None]:
output.plot_diagnostics(figsize=(15, 8))
plt.show()

In [None]:
# Get dynamic predictions with confidence intervals as above.

pred_static = output.get_prediction(start = endogenous_test.index[0],
                                     end = endogenous_test.index[-1],
                                     exog = exogenous_test,
                                     dynamic = False, 
                                     full_results = True)

pred_static_conf = pred_static.conf_int()

In [None]:
# Plot the dynamic forecast with confidence intervals.

ax = weekly_average_response_time[100:].plot(label='observed', figsize=(18, 6))
pred_static.predicted_mean.plot(label='Static Forecast', ax=ax)

# ax.fill_between(pred_static_conf.index,
#                 pred_static_conf.iloc[:, 0],
#                 pred_static_conf.iloc[:, 1], color='g', alpha=.3)

ax.fill_betweenx(ax.get_ylim(), 
                 weekly_average_response_time[-52:].index[0], 
                 '2017-12-31', 
                 alpha=.1, zorder=-1)

ax.set_xlabel('Week')
ax.set_ylabel('Average Response Time')

plt.legend()
plt.show()

#### Predictive Power: Metrics

In [None]:
response_time_mean = np.zeros((len(weekly_average_response_time),1))
response_time_mean.fill(np.mean(weekly_average_response_time[:-52]))
response_time_mean = pd.DataFrame(response_time_mean, 
                                      index=weekly_average_response_time.index)

baseline_mse = np.mean((response_time_mean[-52:][0] - endogenous_test)**2)
baseline_rmse = np.sqrt(baseline_mse)

print('The MSE when guessing the mean is {}'.format(baseline_mse))
print('The RMSE when guessing the mean is {}'.format(baseline_rmse))

In [None]:
mse = ((pred_static.predicted_mean - endogenous_test)**2).mean()
rmse = np.sqrt(mse)
print('The MSE for this model is {}'.format(mse))
print('The RMSE for this model is {}'.format(rmse))

## Facebook Prophet

In [None]:
df = pd.DataFrame(weekly_average_response_time) # Formatting a DF how Prophet likes it
df.columns = ['y']
df['ds'] = df.index
m = Prophet(weekly_seasonality=True)
m.fit(df)

In [None]:
future = m.make_future_dataframe(periods=52, freq='W')
forecast = m.predict(future)

In [None]:
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']][:3]

In [None]:
forecast.index = pd.date_range(start=df.index[0], periods=len(forecast), freq='W')

In [None]:
plt.figure(figsize=(18,8))
plt.plot(df['y']['2016-07-01':])
plt.plot(forecast['yhat']['2016-07-01':])
plt.show()

In [None]:
plt.figure(figsize=(18,8))
plt.plot(df['y']['2015-07-01':])
plt.plot(forecast['yhat']['2015-07-01':])
plt.show()

In [None]:
fig1 = m.plot(forecast, xlabel='Date', ylabel='EMS Avg Response Time')

## Let's see what some multivariate stuff looks like 

In [None]:
# Run a grid with pdq and seasonal pdq parameters calculated above and get the best AIC value
ans = []
for comb in pdq:
    for combs in pdqs:
        try:
            mod = sm.tsa.statespace.SARIMAX(weekly_average_response_time,
                                            order=comb,
                                            seasonal_order=combs,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)

            output = mod.fit()
            ans.append([comb, combs, output.aic])
            print('ARIMA {} x {}52 : AIC Calculated ={}'.format(comb, combs, output.aic))
        except:
            continue

In [None]:
# Find the parameters with minimal AIC value.

ans_df = pd.DataFrame(ans, columns=['pdq', 'pdqs', 'aic'])
ans_df.loc[ans_df['aic'].idxmin()]

In [None]:
weekly_average_response_time_df.head()

In [None]:
endogenous_train = weekly_average_response_time[:-52]
exogenous_train = weekly_average_response_time_df.drop(['avg_response_time_min'], axis=1)[:-52]
endogenous_test = weekly_average_response_time[-52:]
exogenous_test = weekly_average_response_time_df.drop(['avg_response_time_min'], axis=1)[-52:]

In [None]:
ARIMA_MODEL = sm.tsa.statespace.SARIMAX(endogenous_train,
                                exog = exogenous_train,
                                order=(1, 0, 1),
                                seasonal_order=(0, 1, 1, 52),
                                enforce_stationarity=False,
                                enforce_invertibility=False)

output = ARIMA_MODEL.fit()

print(output.summary().tables[1])

In [None]:
output.plot_diagnostics(figsize=(15, 8))
plt.show()

###### Static 

In [None]:
endogenous_test.index[0:2]

In [None]:
prediction_f = output.get_forecast(steps=52, exog = exogenous_test)
pred_conf_f = prediction_f.conf_int()

In [None]:
prediction = output.get_prediction(start = endogenous_test.index[0],
                                    end = endogenous_test.index[-1],
                                    exog = exogenous_test,
                                    dynamic = False)

pred_conf = prediction.conf_int()

In [None]:
pred_conf_f.head()

In [None]:
pred_conf.head()

In [None]:
# Plot the static forecast with confidence intervals.

ax = weekly_average_response_time[-100:].plot(label='observed', figsize=(18, 6))
prediction.predicted_mean.plot(label='Static Forecast', ax=ax)

# ax.fill_between(pred_conf.index,
#                 pred_conf.iloc[:, 0],
#                 pred_conf.iloc[:, 1], color='g', alpha=.3)

# ax.fill_betweenx(ax.get_ylim(), 
#                  weekly_average_response_time[-50:-49].index[0], 
#                  '2017-12-31', 
#                  alpha=.1, zorder=-1)

ax.set_xlabel('Date')
ax.set_ylabel('Avg Weekly Call Time')

plt.legend()
plt.show()

In [None]:
response_time_mean = np.zeros((len(weekly_average_response_time),1))
response_time_mean.fill(np.mean(weekly_average_response_time[:-52]))
response_time_mean = pd.DataFrame(response_time_mean, 
                                      index=weekly_average_response_time.index)

baseline_mse = np.mean((response_time_mean[-52:][0] - endogenous_test)**2)
baseline_rmse = np.sqrt(baseline_mse)
print('The MSE when guessing the mean is {}'.format(baseline_mse))
print('The RMSE when guessing the mean is {}'.format(baseline_rmse))

In [None]:
mse = ((prediction.predicted_mean - endogenous_test)**2).mean()
rmse = np.sqrt(mse)
print('The MSE for this model is {}'.format(mse))
print('The RMSE for this model is {}'.format(rmse))

In [None]:
print(f'''Our model does {np.round(((baseline_rmse - rmse) / baseline_rmse * 100), 
                                    decimals=2)}% better than guessing the mean response time!''')

## Let's look at some stuff with call volume

In [None]:
endogenous_train = weekly_call_volume[:-52]
exogenous_train = weekly_call_volume_df.drop(['sum of weekly calls'], axis=1)[:-52]
endogenous_test = weekly_call_volume[-52:]
exogenous_test = weekly_call_volume_df.drop(['sum of weekly calls'], axis=1)[-52:]

In [30]:
# Run a grid with pdq and seasonal pdq parameters calculated above and get the best AIC value
def arima_gs(ts):
    ans = []
    for comb in pdq:
        for combs in pdqs:
            try:
                mod = sm.tsa.statespace.SARIMAX(ts,
                                                order=comb,
                                                seasonal_order=combs,
                                                enforce_stationarity=False,
                                                enforce_invertibility=False)

                output = mod.fit()
                ans.append([comb, combs, output.aic])
                print('ARIMA {} x {}52 : AIC Calculated ={}'.format(comb, combs, output.aic))
            except:
                continue

    # Find the parameters with minimal AIC value.

    ans_df = pd.DataFrame(ans, columns=['pdq', 'pdqs', 'aic'])
    ans_df.loc[ans_df['aic'].idxmin()]
    print(ans_df)
    print(ans_df.loc[ans_df['aic'].idxmin()])

In [None]:
# arima_gs(weekly_call_volume) # Use to get ideal arima params.

In [None]:
ARIMA_MODEL = sm.tsa.statespace.SARIMAX(endogenous_train,
                                exog = exogenous_train,
                                order=(1, 0, 1),
                                seasonal_order=(0, 1, 1, 52),
                                enforce_stationarity=False,
                                enforce_invertibility=False)

output = ARIMA_MODEL.fit()

print(output.summary().tables[1])

In [None]:
output.plot_diagnostics(figsize=(15, 8))
plt.show()

In [None]:
prediction = output.get_prediction(start = endogenous_test.index[0],
                                    end = endogenous_test.index[-1],
                                    exog = exogenous_test,
                                    dynamic = False)

pred_conf = prediction.conf_int()

In [None]:
# Plot the static forecast with confidence intervals.

ax = weekly_call_volume[-100:].plot(label='observed', figsize=(18, 8))
prediction.predicted_mean.plot(label='Static Forecast', ax=ax)

# ax.fill_between(pred_conf.index,
#                 pred_conf.iloc[:, 0],
#                 pred_conf.iloc[:, 1], color='g', alpha=.3)

# ax.fill_betweenx(ax.get_ylim(), 
#                  weekly_average_response_time[-50:-49].index[0], 
#                  '2017-12-31', 
#                  alpha=.1, zorder=-1)

ax.set_xlabel('Date')
ax.set_ylabel('Avg Weekly Call Volume')

plt.legend()
plt.show()

In [None]:
call_vol_mean = np.zeros((len(weekly_call_volume),1))
call_vol_mean.fill(np.mean(weekly_call_volume[:-52]))
call_vol_mean = pd.DataFrame(call_vol_mean, 
                                      index=weekly_call_volume.index)

In [None]:
baseline_mse = np.mean((call_vol_mean[-52:][0] - endogenous_test)**2)
baseline_rmse = np.sqrt(baseline_mse)
print('The MSE when guessing the mean is {}'.format(baseline_mse))
print('The RMSE when guessing the mean is {}'.format(baseline_rmse))

In [None]:
mse = ((prediction.predicted_mean - endogenous_test)**2).mean()
rmse = np.sqrt(mse)
print('The MSE for this model is {}'.format(mse))
print('The RMSE for this model is {}'.format(rmse))

In [None]:
print(f'''Our model does {np.round(((baseline_rmse - rmse) / baseline_rmse * 100), 
                                    decimals=2)}% better than guessing the mean call volume!''')

In [None]:
print(f'Our model is off by about {np.round((rmse/7),2)} calls per day on average.')
print(f'There are usually around {np.round((26908.908046/7),2)} calls per day.')

## Some EDA

In [None]:
weekly_call_volume_df.describe()

In [None]:
weekly_average_response_time_df.describe()

In [None]:
weekly_average_response_time_df.nlargest(10, 'avg_response_time_min')

In [None]:
weekly_average_response_time_df.nsmallest(10, 'avg_response_time_min')

In [None]:
weekly_call_volume_df.nlargest(10, 'sum of weekly calls')

In [None]:
weekly_call_volume_df.nsmallest(10, 'sum of weekly calls')