In [20]:
# Packages
import numpy as np
import sklearn 
import pandas as pd
import matplotlib.pyplot as plt

from utility_code.utility import utils

In [21]:
# Load the three competition CSVs (training rows, test rows, and the graph file)
# in one pass; the paths are relative to the notebook's working directory.
train, test, graph = (
    pd.read_csv(csv_path)
    for csv_path in (
        "ucla2020-cs145-covid19-prediction/train.csv",
        "ucla2020-cs145-covid19-prediction/test.csv",
        "ucla2020-cs145-covid19-prediction/graph.csv",
    )
)

In [3]:
# Display the raw training frame as a sanity check on the load
# (one row per Province_State per Date, as seen in the rendered output).
train

Unnamed: 0,ID,Province_State,Date,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
0,0,Alabama,04-12-2020,3563,93,,3470.0,75.988020,21583.0,437.0,2.610160,460.300152,12.264945
1,1,Alaska,04-12-2020,272,8,66.0,264.0,45.504049,8038.0,31.0,2.941176,1344.711576,11.397059
2,2,Arizona,04-12-2020,3542,115,,3427.0,48.662422,42109.0,,3.246753,578.522286,
3,3,Arkansas,04-12-2020,1280,27,367.0,1253.0,49.439423,19722.0,130.0,2.109375,761.753354,10.156250
4,4,California,04-12-2020,22795,640,,22155.0,58.137726,190328.0,5234.0,2.812020,485.423869,22.961176
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7095,7095,Virginia,08-31-2020,120575,2580,15085.0,102910.0,1412.626461,1586551.0,,2.139747,18587.633628,
7096,7096,Washington,08-31-2020,74635,1915,,72720.0,980.118828,1461354.0,,2.565820,19190.735838,
7097,7097,West Virginia,08-31-2020,10249,215,8017.0,2017.0,571.883891,436047.0,,2.097766,24330.984010,
7098,7098,Wisconsin,08-31-2020,75603,1122,67234.0,7247.0,1298.477578,1253422.0,,1.484068,21527.457417,


In [25]:
# Columns of the training data that the model uses; every other column is ignored.
features = ["Confirmed", "Deaths"]
# Number of selected columns; later cells use it to slice the trailing label columns.
num_features = len(features)

In [26]:
# Distinct state names, in order of first appearance in the training data.
states = train['Province_State'].unique()
num_states = len(states)

In [27]:
# Stratify by state: state name -> that state's training rows, restricted to
# the selected feature columns.
statesdata = {
    state: train.loc[train['Province_State'] == state, features]
    for state in states
}

# feature name -> column indices (taken from California's frame; every state
# frame is built with the same `features` column selection)
state_feature_indices = utils.get_column_indices(statesdata['California'],features)

In [28]:
# Number of prior days whose feature values form the input of each KNN sample.
WINDOW_SIZE = 12

In [29]:
# Column names for the lagged copies of every feature, oldest day first:
# "<feature>(-W days)", ..., "<feature>(-1 days)", where W is the window length.
new_features = [
    f"{name}(-{days_back} days)"
    for days_back in range(WINDOW_SIZE, 0, -1)
    for name in features
]
# Lagged columns first, then the current-day features (the prediction targets).
all_new_features = new_features + features

In [None]:
# NOTE(review): `chain` is not used anywhere in this cell; the import is kept
# in case another cell relies on it.
from itertools import chain

#fill the knn data using days from training set
knndata = pd.DataFrame(columns = all_new_features)
# NOTE(review): California's row count is used as the number of training days
# for every state -- confirm that all states have the same number of rows.
num_training_days = len(statesdata['California'])

#fill the table: one row per (day, state), with the day in the outer loop so
#that all rows of a given day are contiguous and states keep a fixed order
for d in range(WINDOW_SIZE,num_training_days):
    for s in states:
        #rows d-WINDOW_SIZE .. d inclusive: the WINDOW_SIZE prior days followed
        #by day d itself, matching the layout of all_new_features
        knn_row = utils.flatten_dataframe(statesdata[s],slice(d-WINDOW_SIZE,d+1), state_feature_indices)
        utils.dataframe_append_row(knndata,knn_row,s,d)

In [10]:
# Display the windowed KNN table built in the previous cell.
# NOTE(review): the saved output under this cell shows 10-day windows and an
# 'Active' feature, which does not match WINDOW_SIZE = 12 and
# features = ['Confirmed','Deaths'] -- re-run the notebook to refresh it.
knndata

Unnamed: 0,Confirmed(-10 days),Deaths(-10 days),Active(-10 days),Confirmed(-9 days),Deaths(-9 days),Active(-9 days),Confirmed(-8 days),Deaths(-8 days),Active(-8 days),Confirmed(-7 days),...,Active(-3 days),Confirmed(-2 days),Deaths(-2 days),Active(-2 days),Confirmed(-1 days),Deaths(-1 days),Active(-1 days),Confirmed,Deaths,Active
Alabama (day 10),3563,93,3470,3734,99,3635,3953,114,3839,4075,...,4731,5079,163,4916,5317,183,5134,5593,196,5397
Alaska (day 10),272,8,264,277,8,269,285,9,276,293,...,310,321,9,312,329,9,320,335,9,326
Arizona (day 10),3542,115,3427,3705,122,3583,3809,131,3678,3964,...,4749,5068,191,4877,5256,208,5048,5473,231,5242
Arkansas (day 10),1280,27,1253,1410,29,1381,1498,32,1466,1569,...,1742,1973,41,1932,1990,42,1948,2276,42,2234
California (day 10),22795,640,22155,23931,714,23217,25356,767,24589,26686,...,30254,33686,1225,32461,35465,1282,34183,37344,1421,35923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Virginia (day 141),110857,2436,94172,112068,2443,95272,112960,2467,96050,113623,...,100162,118794,2568,101269,119729,2569,102161,120575,2580,102910
Washington (day 141),69779,1850,67929,70595,1857,68738,71012,1863,69149,71371,...,71396,73879,1905,71974,74320,1905,72415,74635,1915,72720
West Virginia (day 141),9077,171,1766,9185,177,1701,9273,179,1736,9315,...,1766,9970,215,1820,10109,214,1912,10249,215,2017
Wisconsin (day 141),69059,1068,7936,70009,1081,7995,70462,1081,7661,70854,...,7603,74800,1119,7606,75337,1122,7516,75603,1122,7247


In [11]:
# How many nearest neighbors are averaged for each prediction.
K_NEIGHBORS = 3

# Forecast horizon in days: cover every day present in the test labels.
# Assumes the test file holds the same number of rows for each state.
NUM_DAYS_INTO_FUTURE = len(test) // num_states

In [12]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors = K_NEIGHBORS)

#get rid of previously predicted rows so this cell can be re-run from a clean state.
#.copy() makes knndata an independent frame: without it the row appends below
#write into an .iloc slice and pandas emits SettingWithCopyWarning
knndata = knndata.iloc[:(num_training_days - WINDOW_SIZE)*num_states,:].copy()

#feature name -> column indices
knn_feature_indices = utils.get_column_indices(knndata,features)

#build table for next NUM_DAYS_INTO_FUTURE entries, one day at a time
for d in range(num_training_days,num_training_days + NUM_DAYS_INTO_FUTURE):

    #refit the model after every day, on the lagged columns only (the trailing
    #num_features columns are the labels); rows predicted for earlier days
    #therefore become neighbor candidates as well
    knn.fit(knndata.iloc[:,:-num_features])

    for s in states:
        #get rows for past WINDOW_SIZE days of the current state.
        #knndata is ordered by day with one row per state in a fixed order, so
        #stepping by num_states stays on one state. The slice is anchored to the
        #current row count, which is expected to grow by one row per state
        #inside this loop (the helper appends in place), shifting the slice to
        #the next state on each iteration.
        row_indicies = slice(knndata.shape[0] - num_states * WINDOW_SIZE, knndata.shape[0], num_states)
        #the current-day values of those rows, flattened, form the query vector
        knn_row_features = utils.flatten_dataframe(knndata,row_indicies,slice(-num_features,None,None))
        #find nearest neighbors based on these features;
        #kneighbors returns (distances, indices) -> keep the indices of the single query
        knn_ret = knn.kneighbors([knn_row_features])[1][0]
        knn_ret_df = knndata.iloc[knn_ret,-num_features:]
        #interpolate btwn the labels in the df: mean of the neighbors' labels,
        #truncated to int, appended after the lagged values to complete the row
        #(presumably knn_row_features is a plain list -- verify against utils.flatten_dataframe)
        for f in features:
            knn_row_features += [int(knn_ret_df[f].mean())]
        utils.dataframe_append_row(knndata,knn_row_features,s,d)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.loc[row_name,:] = row


In [13]:
# Analyze labels vs predictions.
# Lift pandas' display limits so frames shown after this cell are not truncated.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# Everything after the training-derived rows was produced by the forecast loop;
# keep only the trailing label columns of those rows.
first_predicted_row = (num_training_days - WINDOW_SIZE) * num_states
knn_predictions = knndata.iloc[first_predicted_row:, -num_features:]
knn_labels = utils.dataframe_copy_rows_and_filter_columns(test,knn_predictions)
# Signed error: prediction minus label.
knn_errors = knn_predictions - knn_labels

KeyError: "['Active'] not in index"

In [23]:
# Number of predicted rows (the forecast loop appends one row per state per forecast day).
knn_predictions.shape[0]

1300

In [24]:
# Display the predicted label columns for the forecast rows.
# NOTE(review): the saved output under this cell includes an 'Incident_Rate'
# column, which is not in features = ['Confirmed','Deaths'] -- it appears to
# come from an earlier run; re-run to refresh.
knn_predictions

Unnamed: 0,Confirmed,Deaths,Incident_Rate
Alabama (day 142),128741,1913,1736
Alaska (day 142),5380,66,664
Arizona (day 142),201360,5011,2766
Arkansas (day 142),62214,1027,2030
California (day 142),704462,12914,1782
Colorado (day 142),58278,1798,844
Connecticut (day 142),52591,4465,1475
Delaware (day 142),17301,604,1776
Florida (day 142),624199,11547,2715
Georgia (day 142),268143,5571,2525


In [29]:
# Sanity check: training days per state times number of states.
# This equals the number of training rows only if every state has as many rows
# as California (num_training_days is taken from California's frame).
num_training_days * num_states

7100

In [15]:
# Build the submission frame with columns ForecastID, Confirmed, Deaths.
# Select the two required columns directly rather than dropping 'Active':
# with features = ['Confirmed','Deaths'] there is no 'Active' column, so
# drop(['Active']) raised KeyError on a fresh run. The selection works for any
# feature list that contains both columns. .copy() keeps the insert below from
# writing into a view of knn_predictions.
submission = knn_predictions[['Confirmed','Deaths']].copy()
# ForecastID is a running 0-based row id in prediction order.
submission.insert(0, 'ForecastID', np.arange(len(submission)))

In [16]:
submission

Unnamed: 0,ForecastID,Confirmed,Deaths
Alabama (day 142),0,123678,2135
Alaska (day 142),1,5147,37
Arizona (day 142),2,201116,4994
Arkansas (day 142),3,61547,1342
California (day 142),4,701934,12867
Colorado (day 142),5,58217,1833
Connecticut (day 142),6,52542,4465
Delaware (day 142),7,17236,604
Florida (day 142),8,626426,11235
Georgia (day 142),9,267129,5535


In [19]:
# Write the submission to disk; index = False leaves out the "<state> (day N)"
# row labels so the file contains only the submission columns.
submission.to_csv('test_submission_Nov9.csv',index = False)