In [1]:
#Imports Pandas and Numpy which have features for data science work not included in base Python
#I will wait to import other tools until the cells where I need them so I can write the explanations in those posts.
import numpy as np
import pandas as pd

#Locations of the three time series CSVs (confirmed, deaths, recovered) in the CSSEGISandData/COVID-19 GitHub repository
confirmed_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv"
deaths_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv"
recovered_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv"

#Downloads each CSV in turn and loads it into its own Pandas dataframe
data_confirmed, data_deaths, data_recovered = (
    pd.read_csv(source_url)
    for source_url in (confirmed_url, deaths_url, recovered_url)
)

#Prints the first five rows of the confirmed cases as a quick look at the layout
print(data_confirmed.head())

Province/State Country/Region      Lat      Long  1/22/20  1/23/20  1/24/20  \
0            NaN       Thailand  15.0000  101.0000        2        3        5   
1            NaN          Japan  36.0000  138.0000        2        1        2   
2            NaN      Singapore   1.2833  103.8333        0        1        3   
3            NaN          Nepal  28.1667   84.2500        0        0        0   
4            NaN       Malaysia   2.5000  112.5000        0        0        0   

   1/25/20  1/26/20  1/27/20  ...  3/7/20  3/8/20  3/9/20  3/10/20  3/11/20  \
0        7        8        8  ...      50      50      50       53       59   
1        2        4        4  ...     461     502     511      581      639   
2        3        4        5  ...     138     150     150      160      178   
3        1        1        1  ...       1       1       1        1        1   
4        3        4        4  ...      93      99     117      129      149   

   3/12/20  3/13/20  3/14/20  3/15/20  3

In [12]:
def us_rows_only(cases):
    """Return only the rows of `cases` whose "Country/Region" is exactly "US".

    An exact comparison is used instead of a substring search so that a country
    whose name merely contains the letters "US" is never picked up, and so that
    a missing country value counts as "not US" instead of breaking the filter.
    """
    return cases[cases["Country/Region"] == "US"]

#Filters each of the three to contain only US data
data_confirmed = us_rows_only(data_confirmed)
data_deaths = us_rows_only(data_deaths)
data_recovered = us_rows_only(data_recovered)

#Sample data again
print(data_confirmed.head())
print(data_deaths.head())
print(data_recovered.head())

Province/State Country/Region      Lat      Long  1/22/20  1/23/20  \
98         Washington             US  47.4009 -121.4905        0        0   
99           New York             US  42.1657  -74.9481        0        0   
100        California             US  36.1162 -119.6816        0        0   
101     Massachusetts             US  42.2302  -71.5301        0        0   
102  Diamond Princess             US  35.4437  139.6380        0        0   

     1/24/20  1/25/20  1/26/20  1/27/20  ...  3/7/20  3/8/20  3/9/20  3/10/20  \
98         0        0        0        0  ...       0       0       0      267   
99         0        0        0        0  ...       0       0       0      173   
100        0        0        0        0  ...       0       0       0      144   
101        0        0        0        0  ...       0       0       0       92   
102        0        0        0        0  ...      45      45      45       46   

     3/11/20  3/12/20  3/13/20  3/14/20  3/15/20  3/16/20

In [13]:
#US territories that are removed from the data
EXCLUDED_TERRITORIES = ["Guam", "Puerto Rico", "Virgin Islands"]

def keep_states_only(cases):
    """Return a cleaned copy of `cases` containing one row per state.

    Rows are dropped when their "Province/State" value:
      * is missing (a row without a name cannot be matched to a state),
      * contains "Princess",
      * contains "," (which catches the city entries),
      * is one of EXCLUDED_TERRITORIES.

    The remaining rows are sorted alphabetically by state name and given a fresh
    0..n-1 index. The input dataframe is not modified.
    """
    names = cases["Province/State"]
    #regex=False makes both searches plain substring matches;
    #na=False keeps a missing name from turning the mask into a non-boolean
    unwanted = (
        names.isna()
        | names.str.contains("Princess", na=False, regex=False)
        | names.str.contains(",", na=False, regex=False)
        | names.isin(EXCLUDED_TERRITORIES)
    )
    #Sorting returns a new dataframe, so nothing is modified in place on a filtered slice
    return (
        cases[~unwanted]
        .sort_values(by=["Province/State"], ascending=True)
        .reset_index(drop=True)
    )

#Applies the same clean-up to all three dataframes
data_confirmed = keep_states_only(data_confirmed)
data_deaths = keep_states_only(data_deaths)
data_recovered = keep_states_only(data_recovered)

#Sampling again
print(data_confirmed.head())

Province/State Country/Region      Lat      Long  1/22/20  1/23/20  1/24/20  \
0        Alabama             US  32.3182  -86.9023        0        0        0   
1         Alaska             US  61.3707 -152.4044        0        0        0   
2        Arizona             US  33.7298 -111.4312        0        0        0   
3       Arkansas             US  34.9697  -92.3731        0        0        0   
4     California             US  36.1162 -119.6816        0        0        0   

   1/25/20  1/26/20  1/27/20  ...  3/7/20  3/8/20  3/9/20  3/10/20  3/11/20  \
0        0        0        0  ...       0       0       0        0        0   
1        0        0        0  ...       0       0       0        0        0   
2        0        0        0  ...       0       0       0        6        9   
3        0        0        0  ...       0       0       0        0        1   
4        0        0        0  ...       0       0       0      144      177   

   3/12/20  3/13/20  3/14/20  3/15/20  3

In [14]:
#Population estimates per state, loaded from a local CSV
#Source of the estimates: https://www.census.gov/data/datasets/time-series/demo/popest/2010s-national-total.html#par_textimage_401631162
population_csv_path = "state_pop.csv"
data_pop = pd.read_csv(population_csv_path)

#Prints the population table
print(data_pop)

NAME  population
0                Alabama     4903185
1                 Alaska      731545
2                Arizona     7278717
3               Arkansas     3017804
4             California    39512223
5               Colorado     5758736
6            Connecticut     3565287
7               Delaware      973764
8   District of Columbia      705749
9                Florida    21477737
10               Georgia    10617423
11                Hawaii     1415872
12                 Idaho     1787065
13              Illinois    12671821
14               Indiana     6732219
15                  Iowa     3155070
16                Kansas     2913314
17              Kentucky     4467673
18             Louisiana     4648794
19                 Maine     1344212
20              Maryland     6045680
21         Massachusetts     6892503
22              Michigan     9986857
23             Minnesota     5639632
24           Mississippi     2976149
25              Missouri     6137428
26               Mont

In [15]:
#Columns used to initiate both dataframes: the state name plus its latitude and longitude
location_columns = ["Province/State", "Lat", "Long"]

#Drops the rows for the US territories and keeps only the location columns
is_territory = data_confirmed["Province/State"].isin(["Guam", "Puerto Rico", "Virgin Islands"])
state_locations = data_confirmed.loc[~is_territory, location_columns]

#Each dataframe gets its own independent copy, so columns can be added to one later
#without touching the other (or data_confirmed) and without a SettingWithCopyWarning
train_df = state_locations.copy()
predict_df = state_locations.copy()

print(train_df.head())

Province/State      Lat      Long
0        Alabama  32.3182  -86.9023
1         Alaska  61.3707 -152.4044
2        Arizona  33.7298 -111.4312
3       Arkansas  34.9697  -92.3731
4     California  36.1162 -119.6816


In [16]:
def _active_cases(day):
    """Active cases for the date column at position `day` (-1 is the latest date):
    confirmed cases minus the sum of deaths and recoveries."""
    resolved = data_deaths.iloc[:, day].add(data_recovered.iloc[:, day])
    return data_confirmed.iloc[:, day].subtract(resolved)

def _confirmed_increase(end_day, start_day):
    """Growth in confirmed cases between the date columns at positions `start_day` and `end_day`."""
    return data_confirmed.iloc[:, end_day].subtract(data_confirmed.iloc[:, start_day])

#Creating variables for current active cases
#The prediction features use the latest date; the training features use the date before it
pred_curr_day_active = _active_cases(-1)
train_curr_day_active = _active_cases(-2)

#Setting the result we are trying to predict in the training model
#The result to predict is the number of new cases between yesterday and today using data leading to today
train_target = _confirmed_increase(-1, -2)

#Creating variables for the past day's increases in cases
train_increase = _confirmed_increase(-2, -3)
train_3day_increase = _confirmed_increase(-2, -5)

#Creating variables for the current day's increases in cases
pred_increase = _confirmed_increase(-1, -2)
pred_3day_increase = _confirmed_increase(-1, -4)

#Looks the population up by state name instead of relying on data_pop having its rows
#in the same order as the case data; a state missing from data_pop shows up as NaN
population_by_state = dict(zip(data_pop["NAME"], data_pop["population"]))

#Adds the columns to the training dataframe
#assign() returns a new dataframe, so nothing is written into a slice of data_confirmed
train_df = train_df.assign(**{
    "population": train_df["Province/State"].map(population_by_state),
    "active": train_curr_day_active,
    "1_day": train_increase,
    "3_day": train_3day_increase,
})

#Adds the columns to the prediction dataframe
predict_df = predict_df.assign(**{
    "population": predict_df["Province/State"].map(population_by_state),
    "active": pred_curr_day_active,
    "1_day": pred_increase,
    "3_day": pred_3day_increase,
})

display(train_df.head())

Unnamed: 0,Province/State,Lat,Long,population,active,1_day,3_day
0,Alabama,32.3182,-86.9023,4903185,12,6,12
1,Alaska,61.3707,-152.4044,731545,1,0,1
2,Arizona,33.7298,-111.4312,7278717,12,1,4
3,Arkansas,34.9697,-92.3731,3017804,16,4,10
4,California,36.1162,-119.6816,39512223,414,86,205


In [21]:
#Importing needed libraries
from math import sin, cos, sqrt, atan2, radians

#Defines a function which takes the lat and lon of two different points as inputs
def haversine(lat1, lon1, lat2, lon2, radius=3958.8):
    """Return the great circle distance between two points on a sphere.

    Parameters:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.
        radius: radius of the sphere. The default, 3958.8, is the estimated
            radius of the Earth in miles, so the result is in miles; pass the
            radius in another unit to get the distance in that unit.

    Returns:
        The distance as a float, in the same unit as `radius`.
    """
    #Transforms the lat and lon into radians for the calculation
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    #Programmed version of the Haversine equation
    #Found here: https://en.wikipedia.org/wiki/Haversine_formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2

    #Floating point rounding can push a slightly outside [0, 1] for points on
    #(nearly) opposite sides of the sphere, which would make sqrt(1 - a) fail
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    #Returns the great circle distance between the two points.
    return radius * c

In [18]:
#Largest distance (in the unit returned by haversine) at which another state counts as "nearby"
NEARBY_DISTANCE = 250

def nearby_active_cases(states, max_distance=NEARBY_DISTANCE):
    """Return, for every row of `states`, the total active cases in nearby rows.

    `states` must have "Lat", "Long" and "active" columns. A row counts as nearby
    when its haversine distance from the current row is more than 0 and at most
    `max_distance`. The "more than 0" part keeps a state from counting itself.
    For example: comparing WV to WV returns 0, and we want nearby, not in-state cases.

    The result is a list with one total per row, in the same order as the rows.
    """
    #Pulls the three needed columns out once instead of building a row object on every loop pass
    locations = list(zip(states["Lat"], states["Long"], states["active"]))

    totals = []
    for lat, lon, _ in locations:
        #Baseline of 0 for the nearby case count
        case_count = 0
        #Second, internal loop over every state again to compare against the selected state
        for other_lat, other_lon, other_active in locations:
            distance = haversine(lat, lon, other_lat, other_lon)
            if 0 < distance <= max_distance:
                #Adds the case count for the nearby state to the running total
                case_count += other_active
        totals.append(case_count)
    return totals

#Nearby case totals for the prediction data and for the training data
nearby_cases_pred = nearby_active_cases(predict_df)
nearby_cases_train = nearby_active_cases(train_df)

#Takes the lists created and adds them as new columns in the dataframes.
train_df["nearby"] = nearby_cases_train
predict_df["nearby"] = nearby_cases_pred

display(predict_df.head())

Unnamed: 0,Province/State,Lat,Long,population,active,1_day,3_day,nearby
0,Alabama,32.3182,-86.9023,4903185,29,17,24,185
1,Alaska,61.3707,-152.4044,731545,1,0,0,0
2,Arizona,33.7298,-111.4312,7278717,17,5,9,0
3,Arkansas,34.9697,-92.3731,3017804,22,6,16,19
4,California,36.1162,-119.6816,39512223,544,131,275,44


In [19]:
from sklearn.linear_model import LinearRegression

#Features used by the model, and the target it learns to predict
feature_columns = ["nearby", "active",  "1_day", "3_day"]
X = train_df[feature_columns]
Y = train_target

#Fits an ordinary least squares linear regression on the training data
regressor = LinearRegression()
regressor.fit(X, Y)

#One row per feature showing the coefficient the model learned for it
coeff_df = pd.DataFrame(data=regressor.coef_, index=X.columns, columns=['Coefficient'])
display(coeff_df)

#The R^2 value is scored on the same data the model was fitted on
r_squared = round(regressor.score(X, Y), 2)
print("Intercept: " + str(regressor.intercept_))
print("R^2 value: " + str(r_squared))

Unnamed: 0,Coefficient
nearby,0.001153
active,0.453221
1_day,-0.614545
3_day,0.077439


Intercept: -1.354738573622054
R^2 value: 0.95


In [23]:
#Runs the fitted model back over the training features
y_train_pred = regressor.predict(X)

#Puts the model's output next to the real values for comparison
prediction_columns = {"Predicted": y_train_pred, "Actual":Y}
training_predictions = pd.DataFrame(prediction_columns)

display(training_predictions.head())

Unnamed: 0,Predicted,Actual
0,1.495397,17
1,-0.824079,0
2,3.779119,5
3,4.230294,6
4,149.330339,131


In [24]:
#Same four feature columns the model was fitted on, taken from the prediction dataframe
feature_columns = ["nearby", "active", "1_day", "3_day"]
x_prediction = predict_df[feature_columns]

y_pred = regressor.predict(x_prediction)

#Case counts must be whole and non-negative: anything at or below 0 becomes 0,
#everything else is rounded to the nearest integer
adjusted_predictions = [
    0 if raw_value <= 0 else int(raw_value.round())
    for raw_value in y_pred
]

#Final table pairing each state with its predicted number of new cases
state_names = predict_df["Province/State"]
predicted_by_state = pd.DataFrame({"State": state_names, "Predicted new cases": adjusted_predictions})
display(predicted_by_state)

Unnamed: 0,State,Predicted new cases
0,Alabama,3
1,Alaska,0
2,Arizona,4
3,Arkansas,6
4,California,186
5,Colorado,61
6,Connecticut,12
7,Delaware,4
8,District of Columbia,6
9,Florida,50
