In [15]:
# Packages
import numpy as np
import sklearn 
import pandas as pd
import matplotlib.pyplot as plt

In [16]:
train = pd.read_csv("ucla2020-cs145-covid19-prediction/train.csv")
test = pd.read_csv("ucla2020-cs145-covid19-prediction/test.csv")
graph = pd.read_csv("ucla2020-cs145-covid19-prediction/graph.csv")

In [17]:
test.head

<bound method NDFrame.head of       ForecastID Province_State        Date  Confirmed  Deaths
0              0        Alabama  09-01-2020         -1      -1
1              1         Alaska  09-01-2020         -1      -1
2              2        Arizona  09-01-2020         -1      -1
3              3       Arkansas  09-01-2020         -1      -1
4              4     California  09-01-2020         -1      -1
...          ...            ...         ...        ...     ...
1295        1295       Virginia  09-26-2020         -1      -1
1296        1296     Washington  09-26-2020         -1      -1
1297        1297  West Virginia  09-26-2020         -1      -1
1298        1298      Wisconsin  09-26-2020         -1      -1
1299        1299        Wyoming  09-26-2020         -1      -1

[1300 rows x 5 columns]>

In [18]:
train.head
# We start out looking at only confirmed cases & deaths

<bound method NDFrame.head of         ID Province_State        Date  Confirmed  Deaths  Recovered    Active  \
0        0        Alabama  04-12-2020       3563      93        NaN    3470.0   
1        1         Alaska  04-12-2020        272       8       66.0     264.0   
2        2        Arizona  04-12-2020       3542     115        NaN    3427.0   
3        3       Arkansas  04-12-2020       1280      27      367.0    1253.0   
4        4     California  04-12-2020      22795     640        NaN   22155.0   
...    ...            ...         ...        ...     ...        ...       ...   
7095  7095       Virginia  08-31-2020     120575    2580    15085.0  102910.0   
7096  7096     Washington  08-31-2020      74635    1915        NaN   72720.0   
7097  7097  West Virginia  08-31-2020      10249     215     8017.0    2017.0   
7098  7098      Wisconsin  08-31-2020      75603    1122    67234.0    7247.0   
7099  7099        Wyoming  08-31-2020       3850      37     3181.0     632.0  

# Preprocess: 
For each state: 
    statedata(state) = tuples of features for each day for that state

In [19]:
states = pd.Series.unique(train['Province_State'])
features = ['Confirmed', 'Deaths']
statedata = {}
for index, row in train.iterrows():
    state = row['Province_State']
    info= []
    for ftr in features:
        info.append(row[ftr])
    if state in statedata:
        statedata[state].append(info)
    else:
        statedata[state] = [info]
statedata # in chronological order

{'Alabama': [[3563, 93],
  [3734, 99],
  [3953, 114],
  [4075, 118],
  [4345, 133],
  [4571, 148],
  [4712, 153],
  [4888, 157],
  [5079, 163],
  [5317, 183],
  [5593, 196],
  [5832, 202],
  [6026, 209],
  [6026, 209],
  [6421, 219],
  [6539, 228],
  [6750, 242],
  [6912, 256],
  [7088, 272],
  [7294, 289],
  [7611, 288],
  [7888, 290],
  [8112, 298],
  [8437, 315],
  [8691, 343],
  [9046, 369],
  [9385, 383],
  [9668, 390],
  [9889, 393],
  [10164, 403],
  [10464, 435],
  [10700, 450],
  [11101, 473],
  [11373, 483],
  [11674, 485],
  [11771, 488],
  [12086, 489],
  [12376, 504],
  [13052, 522],
  [13288, 529],
  [13670, 541],
  [14117, 549],
  [14478, 551],
  [14986, 566],
  [15650, 580],
  [16032, 583],
  [16530, 591],
  [17031, 610],
  [17359, 618],
  [17952, 630],
  [18630, 646],
  [18766, 653],
  [18851, 653],
  [19072, 653],
  [19387, 676],
  [20043, 689],
  [20500, 692],
  [20925, 718],
  [21422, 729],
  [21989, 744],
  [22845, 755],
  [23710, 769],
  [24601, 773],
  [25615, 77

# Preprocess: 
Scan window size windows in order to get feature vectors

In [27]:
num_days = len(statedata['California'])

In [21]:
from itertools import chain

window_size = 5
colnames = []
for i in range(window_size):
    for j in features:
        colnames.append('days_ago:'+ str(window_size-i) + '_'  + j)

knndata = pd.DataFrame(columns = colnames)

labelcols = []
for f in features:
    labelcols.append(f)
    
labels = pd.DataFrame(columns = labelcols)

for state, row in statedata.items():
    for i in range(window_size, num_days):
        stateandday = 'day' + str(i + 1) + state
        toflatten = row[i-window_size:i]
        knndata.loc[stateandday] = list(chain(*toflatten))
        labels.loc[stateandday] = row[i]
        

knndata
    

Unnamed: 0,days_ago:5_Confirmed,days_ago:5_Deaths,days_ago:4_Confirmed,days_ago:4_Deaths,days_ago:3_Confirmed,days_ago:3_Deaths,days_ago:2_Confirmed,days_ago:2_Deaths,days_ago:1_Confirmed,days_ago:1_Deaths
day6Alabama,3563,93,3734,99,3953,114,4075,118,4345,133
day7Alabama,3734,99,3953,114,4075,118,4345,133,4571,148
day8Alabama,3953,114,4075,118,4345,133,4571,148,4712,153
day9Alabama,4075,118,4345,133,4571,148,4712,153,4888,157
day10Alabama,4345,133,4571,148,4712,153,4888,157,5079,163
...,...,...,...,...,...,...,...,...,...,...
day138Wyoming,3543,37,3578,37,3603,37,3632,37,3680,37
day139Wyoming,3578,37,3603,37,3632,37,3680,37,3733,37
day140Wyoming,3603,37,3632,37,3680,37,3733,37,3803,37
day141Wyoming,3632,37,3680,37,3733,37,3803,37,3795,37


In [22]:
# Get corresponding labels (true values for the features on that day)
labels

Unnamed: 0,Confirmed,Deaths
day6Alabama,4571,148
day7Alabama,4712,153
day8Alabama,4888,157
day9Alabama,5079,163
day10Alabama,5317,183
...,...,...
day138Wyoming,3733,37
day139Wyoming,3803,37
day140Wyoming,3795,37
day141Wyoming,3829,37


In [23]:
# Can technically run knn on this
num_neighbors = 5
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors = num_neighbors)
knn.fit(knndata)

NearestNeighbors()

In [24]:
knn_ret = knn.kneighbors([knndata.loc['day142Wyoming']])
print(knn_ret)
pred_conf = 0
pred_death = 0
for i in knn_ret[1][0]:
    pred_conf += labels.iloc[i][0]
    pred_death += labels.iloc[i][1]

pred = [int(pred_conf/num_neighbors), int(pred_death/num_neighbors)]
print(pred)

(array([[  0.        ,  97.33447488, 105.98584811, 106.3578864 ,
        133.31166491]]), array([[6849,  255, 6848, 4602, 5509]], dtype=int64))
[3899, 45]


# Prediction:
dynamically build the labels structure and (knndata structure) for the days we want to predict

Input:
days_to_predict: # days into the future

In [28]:
days_to_predict = 30

In [None]:
for day in range(1,days_to_predict+1):
    for state in 
    
    