In [1]:
# Packages
import numpy as np
import sklearn 
import pandas as pd
import matplotlib.pyplot as plt

from utility_code.utility import utils

In [2]:
# Load the three competition CSVs (train, test, graph); paths are relative to the notebook
train_import, test_import, graph = (
    pd.read_csv(csv_path)
    for csv_path in (
        "ucla2020-cs145-covid19-prediction/train.csv",
        "ucla2020-cs145-covid19-prediction/test.csv",
        "ucla2020-cs145-covid19-prediction/graph.csv",
    )
)

In [3]:
# Columns that are modelled; only these are kept when the per-state tables are built
features = ["Confirmed", "Deaths"]
num_features = len(features)

In [4]:
# Distinct values of the Province_State column of the training CSV
states = train_import["Province_State"].unique()
num_states = len(states)

In [5]:
# Hold out part of the provided training data for local evaluation.
# PERCENT_TRAIN is forwarded to utils.split_dataframe together with the state count;
# presumably it is the percentage of days kept for training -- confirm in utility_code.
PERCENT_TRAIN = 80
train, test = utils.split_dataframe(
    train_import,
    PERCENT_TRAIN,
    num_states,
)

In [6]:
# Per-state lookup: state name -> that state's training rows, restricted to `features`
statesdata = {
    state: train.loc[train["Province_State"] == state, features]
    for state in states
}

# feature name -> column indices; taken from a single state's table, since every
# entry of statesdata is selected with the same `features` columns
state_feature_indices = utils.get_column_indices(statesdata["California"], features)

In [7]:
#set the WINDOW size: how many prior days of features are attached to each table row
WINDOW_SIZE = 10

In [8]:
# Lagged column names, oldest lag first: "<feature>(-W days)" down to "<feature>(-1 days)",
# where W is WINDOW_SIZE. The un-lagged feature names are appended at the end.
new_features = [
    f"{name}(-{lag} days)"
    for lag in range(WINDOW_SIZE, 0, -1)
    for name in features
]
all_new_features = new_features + features

In [10]:
# show the final column order: lagged features (oldest first), then the current-day features
all_new_features

['Confirmed(-10 days)',
 'Deaths(-10 days)',
 'Confirmed(-9 days)',
 'Deaths(-9 days)',
 'Confirmed(-8 days)',
 'Deaths(-8 days)',
 'Confirmed(-7 days)',
 'Deaths(-7 days)',
 'Confirmed(-6 days)',
 'Deaths(-6 days)',
 'Confirmed(-5 days)',
 'Deaths(-5 days)',
 'Confirmed(-4 days)',
 'Deaths(-4 days)',
 'Confirmed(-3 days)',
 'Deaths(-3 days)',
 'Confirmed(-2 days)',
 'Deaths(-2 days)',
 'Confirmed(-1 days)',
 'Deaths(-1 days)',
 'Confirmed',
 'Deaths']

In [11]:
# NOTE(review): `chain` is not used in this cell; the import is kept in case another cell relies on it
from itertools import chain

#fill the knn data using days from training set
# one column per lagged feature plus the current-day features (see all_new_features)
knndata = pd.DataFrame(columns = all_new_features)
# every state is assumed to have the same number of training days as California -- TODO confirm
num_training_days = len(statesdata['California'])

#fill the table: one row per (day, state), days in the outer loop so that the
#rows of a single day are contiguous and ordered like `states`
for d in range(WINDOW_SIZE, num_training_days):
    for s in states:
        # slice(d - WINDOW_SIZE, d + 1) spans WINDOW_SIZE + 1 entries (the window plus day d),
        # matching the (WINDOW_SIZE + 1) * num_features columns of knndata
        knn_row = utils.flatten_dataframe(statesdata[s], slice(d - WINDOW_SIZE, d + 1), state_feature_indices)
        utils.dataframe_append_row(knndata, knn_row, s, d)

In [12]:
# inspect the assembled table (filled one row per state per day, starting at day WINDOW_SIZE)
knndata

Unnamed: 0,Confirmed(-10 days),Deaths(-10 days),Confirmed(-9 days),Deaths(-9 days),Confirmed(-8 days),Deaths(-8 days),Confirmed(-7 days),Deaths(-7 days),Confirmed(-6 days),Deaths(-6 days),...,Confirmed(-4 days),Deaths(-4 days),Confirmed(-3 days),Deaths(-3 days),Confirmed(-2 days),Deaths(-2 days),Confirmed(-1 days),Deaths(-1 days),Confirmed,Deaths
Alabama (day 10),3563,93,3734,99,3953,114,4075,118,4345,133,...,4712,153,4888,157,5079,163,5317,183,5593,196
Alaska (day 10),272,8,277,8,285,9,293,9,300,9,...,314,9,319,9,321,9,329,9,335,9
Arizona (day 10),3542,115,3705,122,3809,131,3964,142,4237,150,...,4724,180,4933,184,5068,191,5256,208,5473,231
Arkansas (day 10),1280,27,1410,29,1498,32,1569,33,1620,37,...,1744,38,1781,39,1973,41,1990,42,2276,42
California (day 10),22795,640,23931,714,25356,767,26686,860,27677,956,...,30491,1140,31431,1177,33686,1225,35465,1282,37344,1421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Virginia (day 112),81237,2054,82364,2067,83609,2075,84567,2078,86072,2082,...,87993,2125,88904,2141,89888,2174,90801,2215,91782,2218
Washington (day 112),50009,1482,50824,1495,51849,1494,52635,1501,53321,1518,...,54985,1555,55803,1564,55803,1564,57541,1592,58173,1596
West Virginia (day 112),5452,102,5653,103,5775,103,5889,103,5999,104,...,6277,111,6375,112,6579,116,6738,116,6858,117
Wisconsin (day 112),45899,878,46917,878,47870,891,48827,892,49417,893,...,51049,911,52108,919,52940,934,54002,947,54924,948


In [14]:
# k used for the nearest-neighbour lookup
K_NEIGHBORS = 4

# Forecast horizon in days: held-out rows divided by the number of states
# (for now, predict for the entirety of the held-out labels)
NUM_DAYS_INTO_FUTURE = test.shape[0] // num_states

In [15]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors = K_NEIGHBORS)

#get rid of previously predicted rows (so re-running this cell starts from the training rows only)
# .copy() gives knndata its own data instead of a slice of the previous frame; without it the
# row assignment inside utils.dataframe_append_row emits pandas' SettingWithCopyWarning
knndata = knndata.iloc[:(num_training_days - WINDOW_SIZE)*num_states,:].copy()

#feature name -> column indices
knn_feature_indices = utils.get_column_indices(knndata,features)

#build table for next NUM_DAYS_INTO_FUTURE entries
for d in range(num_training_days,num_training_days + NUM_DAYS_INTO_FUTURE):

    #refit the model after every day, so rows predicted for earlier days become candidates too;
    #the last num_features columns (the current-day values) are excluded from the fit
    knn.fit(knndata.iloc[:,:-1 * num_features])

    for s in states:
        #get rows for past WINDOW_SIZE days
        # rows are stored day by day with one row per state in `states` order, so stepping
        # back from the end in strides of num_states walks a single state's history.
        # knndata grows by one row per state inside this loop, which shifts the window
        # to the next state automatically -- do not hoist this slice out of the loop.
        row_indicies = slice(knndata.shape[0] - num_states * WINDOW_SIZE, knndata.shape[0], num_states)
        # current-day values of those rows become the lagged features of the new row
        knn_row_features = utils.flatten_dataframe(knndata,row_indicies,slice(-1 * num_features,None,None))
        #find nearest neighbors based on these features ([1] = neighbor indices, [0] = the single query)
        knn_ret = knn.kneighbors([knn_row_features])[1][0]
        knn_ret_df = knndata.iloc[knn_ret,-1 * num_features:]
        #interpolate btwn the labels in the df: prediction = mean of the neighbors' values, truncated to int
        for f in features:
            knn_row_features += [int(knn_ret_df[f].mean())]
        utils.dataframe_append_row(knndata,knn_row_features,s,d)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.loc[row_name,:] = row


In [16]:
#analyze labels vs predictions
pd.set_option("display.max_rows", None, "display.max_columns", None)
knn_predictions = knndata.iloc[(num_training_days - WINDOW_SIZE)*num_states:,-1 * num_features:]
knn_labels = utils.dataframe_copy_rows_and_filter_columns(test,knn_predictions)
knn_errors = knn_predictions.subtract(knn_labels)

In [17]:
#print error
print(knn_errors)

                         Confirmed Deaths
Alabama (day 113)              787    173
Alaska (day 113)                79     14
Arizona (day 113)             7080    463
Arkansas (day 113)            -226    286
California (day 113)        -13782   -196
Colorado (day 113)             432     73
Connecticut (day 113)         -287     -6
Delaware (day 113)              96    118
Florida (day 113)             4924   1510
Georgia (day 113)             2351    992
Hawaii (day 113)              -168     -3
Idaho (day 113)                284     78
Illinois (day 113)           -3715    -29
Indiana (day 113)            -1484   -186
Iowa (day 113)                 176    345
Kansas (day 113)               -37    192
Kentucky (day 113)             368    -75
Louisiana (day 113)           -541    143
Maine (day 113)                  3      6
Maryland (day 113)             328   -654
Massachusetts (day 113)       -855    -35
Michigan (day 113)            -389     84
Minnesota (day 113)           -448

In [23]:
# absolute relative error per predicted row: |prediction - label| / |label|
print((knn_errors / knn_labels).abs())

                            Confirmed       Deaths
Alabama (day 113)          0.00849333      0.10594
Alaska (day 113)            0.0236456         0.56
Arizona (day 113)           0.0394436     0.122519
Arkansas (day 113)         0.00506761     0.602105
California (day 113)        0.0266653    0.0207605
Colorado (day 113)         0.00900938    0.0395879
Connecticut (day 113)      0.00573289   0.00135227
Delaware (day 113)         0.00637662     0.201709
Florida (day 113)           0.0100105     0.210982
Georgia (day 113)           0.0120296     0.258199
Hawaii (day 113)            0.0686275     0.115385
Idaho (day 113)             0.0131057         0.39
Illinois (day 113)          0.0201331   0.00375502
Indiana (day 113)           0.0216854    0.0624161
Iowa (day 113)             0.00383401     0.390271
Kansas (day 113)           0.00130135     0.523161
Kentucky (day 113)          0.0116796     0.100806
Louisiana (day 113)        0.00447677    0.0355368
Maine (day 113)           0.000

In [24]:
# mean absolute relative error of each feature column, averaged over all predicted rows
print('average errors: ', (knn_errors / knn_labels).abs().mean(axis=0))

average errors:  Confirmed    0.096440
Deaths       0.488567
dtype: float64
