# Finding a trend in COVID cases in the United States through Linear Regression

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load the confirmed-case time series (one row per region, one column per date)
# and end the cell on the frame so the notebook renders a preview of it.
confirmed_csv = '../Data/time_series_covid_19_confirmed.csv'
cases = pd.read_csv(confirmed_csv)
cases

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,1/10/21,1/11/21,1/12/21,1/13/21,1/14/21,1/15/21,1/16/21,1/17/21,1/18/21,1/19/21
0,,Afghanistan,33.939110,67.709953,0,0,0,0,0,0,...,53489,53538,53584,53584,53775,53831,53938,53984,54062,54141
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,63595,63971,64627,65334,65994,66635,67216,67690,67982,68568
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,102144,102369,102641,102860,103127,103381,103611,103833,104092,104341
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,8586,8586,8682,8818,8868,8946,9038,9083,9083,9194
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,18193,18254,18343,18425,18613,18679,18765,18875,18926,19011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,,Vietnam,14.058324,108.277199,0,2,2,2,2,2,...,1514,1515,1520,1521,1531,1536,1537,1537,1539,1540
268,,West Bank and Gaza,31.952200,35.233200,0,0,0,0,0,0,...,147400,148171,148968,149769,150505,151142,151569,152031,152555,153093
269,,Yemen,15.552727,48.516388,0,0,0,0,0,0,...,2104,2105,2107,2109,2110,2111,2112,2112,2113,2115
270,,Zambia,-13.133897,27.849332,0,0,0,0,0,0,...,27728,28596,29757,31100,32800,34278,36074,37605,38207,39515


Here, I grabbed the index of the US row so that I could turn it into its own DataFrame. I then dropped the four non-date column labels and replaced the dates with consecutive day numbers (1, 2, 3, ...) so that the regression algorithm can work with them as numeric input.

In [2]:
# Locate the US row with an exact match on the country name. A substring search
# would also accept any name that merely contains "US" and would treat a missing
# (NaN) country name as a hit; and a missing match must fail loudly rather than
# silently defaulting to row 0.
us_positions = np.flatnonzero((cases['Country/Region'] == 'US').to_numpy())
if us_positions.size == 0:
    raise ValueError("No row with Country/Region == 'US' found in `cases`")
# Use the last matching position; it is positional, so it is safe to pass to .iloc.
US_index = int(us_positions[-1])

# The first four columns are not dates; every remaining column holds one day of counts.
N_META_COLUMNS = 4
date_columns = cases.columns[N_META_COLUMNS:]

# Feature: consecutive day numbers 1..n, one per date column, built in a single
# vectorised step so the column has an integer dtype. Row labels start at 4 and
# the column label is 0.
X = pd.DataFrame(
    {0: np.arange(1, len(date_columns) + 1)},
    index=range(N_META_COLUMNS, N_META_COLUMNS + len(date_columns)),
)

# Target: the US counts as a single column indexed by the date strings.
y = cases.iloc[[US_index], N_META_COLUMNS:].transpose()

assert len(X) == len(y), "feature and target must have one row per date"

# NOTE: train_test_split shuffles by default, so the held-out days are scattered
# through the series (the score reflects interpolation, not forecasting).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

       0
4      1
5      2
6      3
7      4
8      5
..   ...
363  360
364  361
365  362
366  363
367  364

[364 rows x 1 columns]
              248
1/22/20         1
1/23/20         1
1/24/20         2
1/25/20         2
1/26/20         5
...           ...
1/15/21  23556097
1/16/21  23758855
1/17/21  23936773
1/18/21  24078772
1/19/21  24246830

[364 rows x 1 columns]


# Creating and training our LinearRegression model

In [3]:
from sklearn.linear_model import LinearRegression

# Fit a least-squares line of case count against day number on the training
# split, then predict the case counts for the held-out days.
regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Print the shapes of the training features and targets, one per line.
for training_part in (X_train, y_train):
    print(training_part.shape)

(291, 1)
(291, 1)


In [5]:
from sklearn import metrics

# Show the predicted case counts for the held-out days.
print(y_pred)

# Score the fit with regression metrics. accuracy_score is a classification
# metric and does not apply to continuous predictions, so R^2 and the mean
# absolute error (in cases) are reported instead.
print('R^2: ', metrics.r2_score(y_test, y_pred))
print('Mean absolute error: ', metrics.mean_absolute_error(y_test, y_pred))

[[ 6.37163295e+06]
 [-2.32306687e+06]
 [-3.30122060e+06]
 [ 1.47402815e+07]
 [-1.01886190e+06]
 [ 5.82821421e+06]
 [ 1.36337039e+04]
 [ 2.35033428e+06]
 [ 4.14361612e+06]
 [ 2.73072740e+06]
 [ 8.21925666e+06]
 [-1.99701563e+06]
 [ 1.23492391e+07]
 [-1.12754565e+06]
 [ 3.32848801e+06]
 [ 1.20775297e+07]
 [-4.11634871e+06]
 [ 8.11057291e+06]
 [ 1.37077859e+07]
 [ 1.37218055e+06]
 [-3.62727185e+06]
 [ 4.85006048e+06]
 [-2.03733792e+05]
 [ 1.42512047e+07]
 [ 9.30609414e+06]
 [ 6.48031670e+06]
 [ 7.78452167e+06]
 [ 7.83886355e+06]
 [ 1.55554096e+07]
 [ 1.54467259e+07]
 [-2.75780186e+06]
 [ 1.75257367e+06]
 [-1.83399001e+06]
 [ 1.49576490e+07]
 [ 6.53465857e+06]
 [ 2.07862491e+06]
 [-6.92810657e+05]
 [-2.92082749e+06]
 [ 1.44142303e+07]
 [ 1.50663328e+07]
 [ 1.30556834e+07]
 [ 9.36043601e+06]
 [-2.48609250e+06]
 [ 1.00668804e+07]
 [ 1.80691554e+06]
 [ 8.54530791e+06]
 [ 7.74419939e+05]
 [-1.07320377e+06]
 [-3.84463934e+06]
 [-1.67096439e+06]
 [ 8.32794041e+06]
 [ 1.08276666e+07]
 [ 1.1208059