In [5]:
# Packages
import numpy as np
import sklearn 
import pandas as pd
import matplotlib.pyplot as plt
import scipy
from scipy.integrate import odeint
from scipy.optimize import minimize
from utility_code.utility import utils

In [6]:
trainrd2 = pd.read_csv("train_round2.csv")
datestofind = trainrd2['Date'] == '09-01-2020'
start = datestofind[datestofind == True].index[0]
datestofind2 = trainrd2['Date'] == '09-26-2020'
end = datestofind[datestofind == True].index[49]

bestsub = pd.read_csv("team31-nov29-2.csv")

validation = trainrd2.iloc[start:end,:]
def MAPE(pred, valid):
    pred = pred.reset_index()
    valid = valid.reset_index()
    pred = pred.astype('int64')
    valid = valid.astype('int64')
    v = pred.subtract(valid)
    v = v.divide(valid)
    v = v.abs()
    v = v.sum(axis = 0)
    #v = v[0]+v[1]+v[2]
    n = len(pred)
    return v/n

In [7]:
#import csvs
train = pd.read_csv("ucla2020-cs145-covid19-prediction/train.csv")
test = pd.read_csv("ucla2020-cs145-covid19-prediction/test.csv")
graph = pd.read_csv("ucla2020-cs145-covid19-prediction/graph.csv")
supp = pd.read_csv("data-test/raw_data_test.csv", skiprows=2, thousands=',')
supp = supp[supp['Location'].isin(train['Province_State'])]
supp['Population'] = supp['Number of COVID-19 Cases'].divide(supp['COVID-19 Cases per 1,000,000 Population']) * 1e6

states = pd.Series.unique(train['Province_State'])
num_states = len(states)

In [9]:
## MANIPULATES MODEL AND MODEL PARAMS
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
reg = MultiOutputRegressor(estimator=Ridge(random_state=123, max_iter=600))


## MANIPULATES FEATURES FROM DF TO USE AND WINDOW SIZE
#only look at the features in features list
features = ['Confirmed','Deaths']
num_features = len(features)

#stratify by state (into state dictionary)
statesdata = {}
for s in states:
    statesdata[s] = train.loc[train['Province_State'] == s,features]

WINDOW_SIZE = 4
state_feature_indices = utils.get_column_indices(statesdata['California'],features)

#append the feature spaces from the W days prior (where W is the window length)
new_features = []
for day in range(WINDOW_SIZE):
    for f in features:
        new_features.append(f + "(-"+ str(WINDOW_SIZE-day) + " days)")
all_new_features = new_features + features

## Set up dictionary of projections
proj = {}

## Loop over states
for s in states:
    
    a = statesdata[s]

    #fill the knn data using days from training set
    knndata = pd.DataFrame(columns = all_new_features)
    num_training_days = len(statesdata['California'])

    #fill the table
    for d in range(WINDOW_SIZE,num_training_days):
        knndata_row_index = knndata.shape[0]
        knn_row = utils.flatten_dataframe(a,slice(d-WINDOW_SIZE,d+1), state_feature_indices)
        utils.dataframe_append_row(knndata,knn_row,s,d)   
    
    # Actual recursive prediction
    days_to_predict = 26
    for d in range(days_to_predict):
        x = knndata.drop(columns=features)
        y = knndata.drop(columns=features)
        #x = knndata.drop(['Confirmed', 'Deaths'], axis = 1)
        #y = knndata[['Confirmed', 'Deaths']]
        toguess = 1
        trainx = x.head(len(x))
        trainy = y.head(len(y))
        # testy = y.tail(toguess)

        reg.fit(trainx, trainy)

        rmv = [i for i in range(num_features)]
        ftrs = knndata.drop(columns=knndata.columns[rmv]).tail(1)
        #ftrs = knndata.drop(columns=knndata.columns[[0,1]]).tail(1)
        #ftrs.drop(columns=knndata.columns[[0,1]])

        ftrs.columns = knndata.columns[0:num_features*WINDOW_SIZE]

        new = reg.predict(ftrs)
        ftrs = np.append(ftrs, new)
        ftrs = ftrs.astype(int)
        #if d==0: print(ftrs)
        knndata = knndata.append(dict(zip(knndata.columns, ftrs)), ignore_index=True)

        # append to knndata
        #if d == 0: 
            #print(knndata)
    done = knndata.tail(days_to_predict)
    done = done[['Confirmed', 'Deaths']]
    #print(done)
    proj[s] = done
    
## Get ordering of states in test    
order = test.loc[0:49,'Province_State']

# format submission
conf = []
dead = []
fid = 0
for i in range(days_to_predict):
    for j in order:
        projection = proj[j].iloc[i]
        #print(j, 'day', i)
        conf.append(int(projection['Confirmed']))
        dead.append(int(projection['Deaths']))
        #print(fid)
        fid+=1
    
test['Confirmed'] = conf
test['Deaths'] = dead

print("Error of best submission so far: nov29-2 with random state 123, defaults for Ridge and WINDOW SIZE 4")
print(MAPE(bestsub[['Confirmed', 'Deaths']], validation[['Confirmed', 'Deaths']]))
print("Error of current run")
print(MAPE(test[['Confirmed', 'Deaths']], validation[['Confirmed', 'Deaths']]))

Error of best submission so far: nov29-2 with random state 123, defaults for Ridge and WINDOW SIZE 4
index        0.037565
Confirmed    0.000080
Deaths       0.000178
dtype: float64
Error of current run
index        0.037565
Confirmed    0.001255
Deaths       0.001052
dtype: float64


In [None]:
submission = test.drop(columns=['Date', 'Province_State'])
print(submission)
filename = "blah.csv"
submission.to_csv(filename)