In [1]:
# Packages
import numpy as np
import sklearn 
import pandas as pd
import matplotlib.pyplot as plt

from utility_code.utility import utils

In [2]:
# Read the three competition CSV files (train, test and graph) into DataFrames.
train, test, graph = map(
    pd.read_csv,
    (
        "ucla2020-cs145-covid19-prediction/train.csv",
        "ucla2020-cs145-covid19-prediction/test.csv",
        "ucla2020-cs145-covid19-prediction/graph.csv",
    ),
)


In [14]:
# Display the test frame to inspect its columns and row count.
test

Unnamed: 0,ForecastID,Province_State,Date,Confirmed,Deaths
0,0,Alabama,09-01-2020,-1,-1
1,1,Alaska,09-01-2020,-1,-1
2,2,Arizona,09-01-2020,-1,-1
3,3,Arkansas,09-01-2020,-1,-1
4,4,California,09-01-2020,-1,-1
...,...,...,...,...,...
1295,1295,Virginia,09-26-2020,-1,-1
1296,1296,Washington,09-26-2020,-1,-1
1297,1297,West Virginia,09-26-2020,-1,-1
1298,1298,Wisconsin,09-26-2020,-1,-1


In [3]:
# Restrict the analysis to these columns; everything downstream is sized by this list.
features = ["Confirmed", "Deaths"]
num_features = len(features)


In [4]:
# Distinct values of the Province_State column, plus how many there are.
states = train['Province_State'].unique()
num_states = len(states)


In [5]:
# One frame per state, holding only that state's rows and the selected feature columns.
statesdata = {
    state: train.loc[train['Province_State'] == state, features]
    for state in states
}

# Map each feature name to its column index, looked up on California's frame.
state_feature_indices = utils.get_column_indices(statesdata['California'], features)


In [6]:
# Display the column indices resolved for the selected features.
state_feature_indices

[0, 1]

In [7]:
#set the WINDOW size -- how many days in the past to look
# This value controls how many lagged copies of each feature are generated
# and how many leading training days are skipped when building the KNN table.
WINDOW_SIZE = 3

In [8]:
# Column names for the lagged copies of every feature, ordered from the oldest
# lag (-WINDOW_SIZE days) to the most recent (-1 days), followed by the
# unlagged feature names for the current day.
new_features = [
    "{}(-{} days)".format(feature, lag)
    for lag in range(WINDOW_SIZE, 0, -1)
    for feature in features
]
all_new_features = new_features + features


In [9]:
# Display the full list of column names (lagged features followed by current-day features).
all_new_features

['Confirmed(-3 days)',
 'Deaths(-3 days)',
 'Confirmed(-2 days)',
 'Deaths(-2 days)',
 'Confirmed(-1 days)',
 'Deaths(-1 days)',
 'Confirmed',
 'Deaths']

In [14]:
from itertools import chain  # NOTE(review): not used in this cell; kept in case another cell relies on it

# Build the lagged-feature table from the training days.
# Each row corresponds to one (state, day) pair and holds the features of the
# WINDOW_SIZE preceding days followed by that day's own features.
knndata = pd.DataFrame(columns = all_new_features)

# Number of training days, taken from California's row count.
# assumes every state has the same number of training rows -- TODO confirm
num_training_days = len(statesdata['California'])

# The first WINDOW_SIZE days lack a complete history, so start at day WINDOW_SIZE.
for d in range(WINDOW_SIZE,num_training_days):
    for s in states:
        # Flatten rows d-WINDOW_SIZE .. d (inclusive) of this state's frame into one row.
        knn_row = utils.flatten_dataframe(statesdata[s],slice(d-WINDOW_SIZE,d+1), state_feature_indices)
        # presumably appends knn_row to knndata in place, labelled by state and day -- verify in utils
        utils.dataframe_append_row(knndata,knn_row,s,d)


In [52]:
# Display the lagged-feature table.
knndata

Unnamed: 0,Confirmed(-6 days),Deaths(-6 days),Confirmed(-5 days),Deaths(-5 days),Confirmed(-4 days),Deaths(-4 days),Confirmed(-3 days),Deaths(-3 days),Confirmed(-2 days),Deaths(-2 days),Confirmed(-1 days),Deaths(-1 days),Confirmed,Deaths
Alabama (day 6),3563,93,3734,99,3953,114,4075,118,4345,133,4571,148,4712,153
Alaska (day 6),272,8,277,8,285,9,293,9,300,9,309,9,314,9
Arizona (day 6),3542,115,3705,122,3809,131,3964,142,4237,150,4511,169,4724,180
Arkansas (day 6),1280,27,1410,29,1498,32,1569,33,1620,37,1695,37,1744,38
California (day 6),22795,640,23931,714,25356,767,26686,860,27677,956,29157,1037,30491,1140
Colorado (day 6),7307,289,7691,306,7950,327,7956,328,8286,355,8691,372,9047,389
Connecticut (day 6),12035,554,13381,602,13989,671,14755,868,15884,971,16809,1036,17550,1086
Delaware (day 6),1625,35,1758,41,1926,43,2014,46,2070,55,2317,61,2538,67
Florida (day 6),19895,461,21019,499,21628,571,22511,596,23343,668,24759,725,25492,748
Georgia (day 6),12452,433,13315,465,14578,525,14987,552,15669,587,17194,650,17669,673


In [16]:
# How many nearest neighbours the model looks up for each query.
K_NEIGHBORS = 2

# Forecast horizon in days: the number of test rows split evenly across the
# states (for now, predict for the entirety of the test labels).
NUM_DAYS_INTO_FUTURE = len(test) // num_states


In [17]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors = K_NEIGHBORS)

# Drop rows appended by a previous run of this cell so that it can be re-run.
# .copy() yields an independent frame: rows are appended to knndata in place
# below, and writing into an .iloc slice can trigger SettingWithCopyWarning.
knndata = knndata.iloc[:(num_training_days - WINDOW_SIZE)*num_states,:].copy()

#feature name -> column indices
knn_feature_indices = utils.get_column_indices(knndata,features)

# Roll the forecast forward one day at a time for NUM_DAYS_INTO_FUTURE days.
for d in range(num_training_days,num_training_days + NUM_DAYS_INTO_FUTURE):

    # Refit after every day on all rows accumulated so far (earlier predictions
    # included). Only the lagged columns -- everything except the last
    # num_features columns -- form the neighbour search space.
    knn.fit(knndata.iloc[:,:-1 * num_features])

    for s in states:
        # knndata holds one row per state per day and gains one row per state
        # inside this loop, so stepping through the last num_states * WINDOW_SIZE
        # rows in strides of num_states selects this state's rows for the most
        # recent WINDOW_SIZE days.
        row_indicies = slice(knndata.shape[0] - num_states * WINDOW_SIZE, knndata.shape[0], num_states)
        # Take the current-day feature columns of those rows as the query vector;
        # they play the role of the lagged features for the day being predicted.
        knn_row_features = utils.flatten_dataframe(knndata,row_indicies,slice(-1 * num_features,None,None))
        # kneighbors returns (distances, indices); keep the indices for the single query.
        knn_ret = knn.kneighbors([knn_row_features])[1][0]
        knn_ret_df = knndata.iloc[knn_ret,-1 * num_features:]
        # Predict each feature as the (truncated) mean of the neighbours' values
        # and extend the query vector into a complete table row.
        for f in features:
            knn_row_features += [int(knn_ret_df[f].mean())]
        utils.dataframe_append_row(knndata,knn_row_features,s,d)


In [18]:
# Compare the forecast rows against the test labels.

# Lift pandas' display limits so frames are shown without row/column truncation.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Rows past the training-derived portion of knndata are predictions; keep only
# their current-day feature columns.
first_predicted_row = (num_training_days - WINDOW_SIZE) * num_states
knn_predictions = knndata.iloc[first_predicted_row:, -num_features:]
knn_labels = utils.dataframe_copy_rows_and_filter_columns(test, knn_predictions)
knn_errors = knn_predictions.subtract(knn_labels)


In [19]:
# Number of predicted rows.
knn_predictions.shape[0]

1300

In [57]:
# Display the predicted Confirmed/Deaths values.
knn_predictions

Unnamed: 0,Confirmed,Deaths
Alabama (day 142),127761,1852
Alaska (day 142),5302,55
Arizona (day 142),201594,5022
Arkansas (day 142),62289,1063
California (day 142),706821,12951
Colorado (day 142),57690,1835
Connecticut (day 142),52623,4465
Delaware (day 142),17373,604
Florida (day 142),625932,11695
Georgia (day 142),269067,5604


In [29]:
# Product of the training-day count and the number of states.
num_training_days * num_states

7100

In [21]:
# Build the submission table: a sequential ForecastID followed by the predictions.
# .assign returns a new frame, so knn_predictions (itself a slice of knndata) is
# not modified; plain assignment would alias it and add the column there too.
submission = knn_predictions.assign(
    ForecastID=np.arange(len(knn_predictions))
)[['ForecastID','Confirmed','Deaths']]


In [22]:
# Display the submission table before writing it out.
submission

Unnamed: 0,ForecastID,Confirmed,Deaths
Alabama (day 142),0,128613,1687
Alaska (day 142),1,5315,51
Arizona (day 142),2,201748,5029
Arkansas (day 142),3,62024,1008
California (day 142),4,709213,12979
Colorado (day 142),5,58022,1811
Connecticut (day 142),6,52687,4465
Delaware (day 142),7,17628,598
Florida (day 142),8,626520,11935
Georgia (day 142),9,269722,5618


In [24]:
# Write the submission to CSV; index=False leaves the row labels out of the file.
submission.to_csv('test_submission_Nov10.csv',index = False)