In [1]:
# Packages
import numpy as np
import sklearn 
import pandas as pd
import matplotlib.pyplot as plt

from utility_code.utility import utils

In [2]:
# Load the three competition CSV files, in order: training set, test set, graph.
train_import, test_import, graph = (
    pd.read_csv(csv_path)
    for csv_path in (
        "ucla2020-cs145-covid19-prediction/train.csv",
        "ucla2020-cs145-covid19-prediction/test.csv",
        "ucla2020-cs145-covid19-prediction/graph.csv",
    )
)

In [3]:
# Columns of the raw data the model works with; every other column is ignored.
features = ["Confirmed", "Deaths"]
num_features = len(features)

In [4]:
# Distinct values of the Province_State column of the training file.
states = train_import['Province_State'].unique()
num_states = len(states)

In [5]:
#split the labelled training file into a local training part and a held-out part
#PERCENT_TRAIN and num_states are handed to utils.split_dataframe, which returns the
#(train, test) pair; presumably the first PERCENT_TRAIN% of each state's days go to
#`train` -- TODO confirm against utility_code.utility
PERCENT_TRAIN = 80
train , test = utils.split_dataframe(train_import,PERCENT_TRAIN,num_states)

In [6]:
# Per-state slices of the training split, restricted to the modelled columns
# (state name -> DataFrame).
statesdata = {
    state_name: train.loc[train['Province_State'] == state_name, features]
    for state_name in states
}

# feature name -> column indices, looked up on one state's frame
state_feature_indices = utils.get_column_indices(statesdata['California'], features)

In [7]:
#set the WINDOW size: the number of preceding days whose feature values are
#attached to each sample as lagged input columns
WINDOW_SIZE = 10

In [8]:
# Column names for the lagged inputs, ordered from the oldest day in the window
# (-WINDOW_SIZE days) to the most recent (-1 days), one name per feature per day.
new_features = [
    f"{name}(-{lag} days)"
    for lag in range(WINDOW_SIZE, 0, -1)
    for name in features
]
# The un-lagged feature names come last: they hold the current day's values.
all_new_features = new_features + features

In [9]:
from itertools import chain  # NOTE(review): not referenced anywhere in this cell

# Build the k-NN table from the training split: one row per (day, state), holding the
# feature values of the WINDOW_SIZE preceding days followed by that day's own values.
knndata = pd.DataFrame(columns = all_new_features)
# assumes every state has as many training rows as California -- TODO confirm
num_training_days = len(statesdata['California'])

# Rows are appended day-major: all states for day d, then all states for day d + 1, ...
for d in range(WINDOW_SIZE,num_training_days):
    # positions d-WINDOW_SIZE .. d inclusive: the lag window plus the day itself;
    # identical for every state, so it is built once per day
    window = slice(d-WINDOW_SIZE,d+1)
    for s in states:
        knn_row = utils.flatten_dataframe(statesdata[s],window, state_feature_indices)
        # grows knndata in place (the return value is not used); s and d presumably
        # form the row label -- verify against utility_code.utility
        utils.dataframe_append_row(knndata,knn_row,s,d)

In [10]:
# how many nearest neighbours contribute to each prediction
K_NEIGHBORS = 4

# forecast horizon in days: for now, cover the whole held-out split
# (rows in the held-out frame divided by the number of states)
NUM_DAYS_INTO_FUTURE = len(test) // num_states

In [11]:
# Roll the forecast forward one day at a time. Each predicted row is appended to
# knndata, so it becomes both an input for later days and part of the data the
# model is refit on.
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors = K_NEIGHBORS)

#get rid of previously predicted rows, so this cell can be re-run: only the
#(num_training_days - WINDOW_SIZE) * num_states rows built from training data are kept
knndata = knndata.iloc[:(num_training_days - WINDOW_SIZE)*num_states,:]

#feature name -> column indices
#NOTE(review): knn_feature_indices is not read anywhere else in this cell
knn_feature_indices = utils.get_column_indices(knndata,features)

#build table for next NUM_DAYS_INTO_FUTURE entries
for d in range(num_training_days,num_training_days + NUM_DAYS_INTO_FUTURE):
    
    #refit the model after every day, on every column except the last num_features
    #(i.e. on the lagged columns only); rows predicted on earlier iterations are included
    knn.fit(knndata.iloc[:,:-1 * num_features])
    
    for s in states:
        #get rows for past WINDOW_SIZE days
        #knndata holds num_states rows per day, so stepping by num_states selects one
        #row per day. The slice start is derived from the current row count, which
        #increases when the row is appended at the end of this loop body; that shift
        #is what moves the slice on to the next state. This relies on
        #dataframe_append_row adding exactly one row per call, in `states` order.
        row_indicies = slice(knndata.shape[0] - num_states * WINDOW_SIZE, knndata.shape[0], num_states)
        #flatten the last num_features columns (each day's own values) of those rows
        knn_row_features = utils.flatten_dataframe(knndata,row_indicies,slice(-1 * num_features,None,None))
        #find nearest neighbors based on these features
        #kneighbors returns (distances, indices); [1][0] is the index array for the single query row
        knn_ret = knn.kneighbors([knn_row_features])[1][0]
        #current-day values (last num_features columns) of the neighbouring rows
        knn_ret_df = knndata.iloc[knn_ret,-1 * num_features:]
        #interpolate btwn the labels in the df: the prediction for each feature is the
        #mean over the neighbours, truncated by int()
        #assumes flatten_dataframe returned a list, so += appends -- TODO confirm
        for f in features:
            knn_row_features += [int(knn_ret_df[f].mean())]
        utils.dataframe_append_row(knndata,knn_row_features,s,d)     

In [16]:
# Compare the forecasts with the held-out labels.
# Lift pandas' row/column display limits so the full table is shown when printed.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Everything after the training-derived rows of knndata is a forecast; the last
# num_features columns of those rows hold the predicted values.
first_forecast_row = (num_training_days - WINDOW_SIZE) * num_states
knn_predictions = knndata.iloc[first_forecast_row:, -num_features:]
knn_labels = utils.dataframe_copy_rows_and_filter_columns(test, knn_predictions)
# signed error: prediction minus label
knn_error = knn_predictions - knn_labels

In [17]:
#print the signed error table (prediction minus label), one row per state and day,
#one column per feature
print(knn_error)

                         Confirmed Deaths
Alabama (day 113)             -873    144
Alaska (day 113)                39     22
Arizona (day 113)            -1030    -14
Arkansas (day 113)             250    390
California (day 113)         -4676    -45
Colorado (day 113)             944    279
Connecticut (day 113)         -252     -5
Delaware (day 113)            -106      0
Florida (day 113)             8672   2057
Georgia (day 113)            -2258     -2
Hawaii (day 113)              -204    -15
Idaho (day 113)                  9    156
Illinois (day 113)           -1298     -9
Indiana (day 113)             -576     -5
Iowa (day 113)                -182     -6
Kansas (day 113)               686     99
Kentucky (day 113)             556    147
Louisiana (day 113)           -586   1366
Maine (day 113)                -12     -1
Maryland (day 113)            -870     -8
Massachusetts (day 113)       -199    -10
Michigan (day 113)            -599    217
Minnesota (day 113)           -785