In [None]:
# Packages
import numpy as np
import pandas as pd

# Load the input CSVs (paths are relative to the notebook's working directory).
# `train` supplies the 'Province_State', 'Confirmed' and 'Deaths' columns read below;
# `test` is used later as the submission template.
# NOTE(review): `graph` is not referenced in any visible cell — confirm it is still needed.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
graph = pd.read_csv("graph.csv")

# Split the training frame by state so that every state can be modelled on its own.
states = train['Province_State'].unique()
num_states = len(states)

# key = state name, value = the rows of `train` belonging to that state (all columns)
statesdata = {
    state: train.loc[train['Province_State'] == state, :]
    for state in states
}

In [None]:
# Forecast horizon: passed to predictARIMA and used as the number of
# per-state rows written into the submission columns.
days = 26 # our goal is to predict 26 days

def predictARIMA(X, p, d, q, days):
    """Fit an ARIMA(p, d, q) model to ``X`` and forecast the steps that follow it.

    Parameters
    ----------
    X : array-like
        Observed time series the model is fitted on.
    p, d, q : int
        ARIMA order, passed to the model as ``order=(p, d, q)``.
    days : int
        Number of out-of-sample steps to forecast; must be at least 1.

    Returns
    -------
    The forecast returned by the fitted model's ``predict`` for exactly
    ``days`` steps after the last observation in ``X``.

    Raises
    ------
    ValueError
        If ``days`` is smaller than 1.
    """
    if days < 1:
        raise ValueError("days must be a positive integer")

    model = ARIMA(X, order=(p, d, q))  # initialize ARIMA model with order params
    model_fit = model.fit()  # fit model to time series

    # Index len(X) is the first step after the observed data. predict() treats
    # `end` as inclusive, so the last requested index is start + days - 1;
    # using start + days would return one forecast too many.
    start = len(X)
    return model_fit.predict(start=start, end=start + days - 1)

# def predictARIMA_Validation(X, days, C = True):
#     if C == True:
#         arima = arima = pm.auto_arima(X, error_action='ignore', trace=True, max_p=3, max_d=2, max_q=2,
#                                       suppress_warnings=True, maxiter=25, stepwise=False,
#                                       seasonal= False, out_of_sample_size=10)
#     else:
#         arima = arima = pm.auto_arima(X, error_action='ignore', trace=True, max_p=9, max_d=2, max_q=7, start_p=5, start_q=4,
#                                       suppress_warnings=True, maxiter=25, stepwise=False,
#                                       seasonal= False, out_of_sample_size=10)
#     forecast = arima.predict(n_periods = days)
#     return(forecast)

def difference(dataset, interval = 1):
    """Return the lag-``interval`` differences of ``dataset`` as a NumPy array.

    Element ``k`` of the result is ``dataset[k + interval] - dataset[k]``, so
    the output has ``len(dataset) - interval`` entries (none when the input is
    not longer than ``interval``).
    """
    deltas = [
        dataset[pos] - dataset[pos - interval]
        for pos in range(interval, len(dataset))
    ]
    return np.array(deltas)

# invert differenced value
def inverse_difference(history, yhat, interval=1):
    """Undo a lag-``interval`` difference for one forecast value.

    A lag-``interval`` difference is ``x[t] - x[t - interval]``, so the value
    that follows ``history`` is recovered by adding the observation
    ``interval`` steps from the end of ``history`` back onto ``yhat``.

    Parameters
    ----------
    history : sequence
        Observed (undifferenced) values; must hold at least ``interval`` items.
    yhat : number
        Forecast on the differenced scale.
    interval : int, optional
        Lag used when differencing (default 1); must be at least 1.

    Returns
    -------
    The forecast on the original scale.

    Raises
    ------
    ValueError
        If ``interval`` is smaller than 1.
    """
    if interval < 1:
        raise ValueError("interval must be a positive integer")
    # history[-interval] is the observation the difference was taken against;
    # for the default interval=1 this is simply the last observation.
    return yhat + history[-interval]

from statsmodels.tsa.arima.model import ARIMA

# ARIMA orders (p, d, q) used for each target series.
cp, cd, cq = 2,2,1  # confirmed cases
dp, dd, dq = 4,2,3  # deaths

# key = state, value = DataFrame holding that state's projected
# 'Confirmed' and 'Deaths' series
proj = {}
for s in states:
    state_history = statesdata[s].reset_index()

    # Forecast confirmed cases first, then deaths, each with its own order.
    proj[s] = pd.DataFrame({
        'Confirmed': predictARIMA(state_history['Confirmed'].values, cp, cd, cq, days),
        'Deaths': predictARIMA(state_history['Deaths'].values, dp, dd, dq, days),
    })


In [None]:
# State order expected by the submission: the 'Province_State' values of the
# rows of `test` labelled 0 through 49.
order = test.loc[0:49,'Province_State']

# Walk the forecast horizon day by day and, within each day, the states in
# submission order, picking the matching row of every state's projection.
projected_rows = [proj[state].iloc[day] for day in range(0, days) for state in order]

conf = [int(row['Confirmed']) for row in projected_rows]  # confirmed column
dead = [int(row['Deaths']) for row in projected_rows]  # deaths column
fid = len(conf)  # number of submission rows generated

In [None]:
# Attach the projected columns to the submission template.
test = test.assign(Confirmed=conf, Deaths=dead)

In [None]:
# Drop the two descriptive columns; everything else in `test` is kept.
submission = test.drop(['Province_State', 'Date'], axis=1)
submission

In [None]:
#submission.to_csv('Team31_1.csv', index = False, header = True)

## Our best parameters:
After performing validation on the training data, we found our best ARIMA parameters to be:

Confirmed cases: p = 2, d = 2, q = 1

Deaths: p = 4, d = 2, q = 3

In [None]:
def MAPE(y_true, y_pred): 
    """Mean absolute percentage error between ``y_true`` and ``y_pred``.

    The result is returned as a fraction (it is not multiplied by 100).
    Every error is divided by the matching entry of ``y_true``, so entries
    equal to zero produce ``inf``/``nan`` in the result.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error))