In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, IndexToString, RFormula,VectorSlicer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model
import seaborn as sns
import warnings
# Install a blanket 'ignore' filter: every Python warning is silenced from here on.
warnings.filterwarnings('ignore')

In [11]:
# Reuse the active Spark session if there is one; otherwise create it under this app name.
spark = (
    SparkSession.builder
    .appName('logregconsult')
    .getOrCreate()
)

In [12]:
# Source file and the reader format used to parse it
file_location = "COVID_confirmed_cases.csv"
file_type = "csv"

# Reader settings, handed to Spark as string-valued options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# Build the reader as one parenthesised chain. A field holding a single space
# is configured as both the NaN marker and the null marker.
data = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .option('nanValue', ' ')
    .option('nullValue', ' ')
    .load(file_location)
)

# Pre-processing

In [13]:
# Convert the Spark DataFrame into a pandas DataFrame; the cells below use pandas only.
df = data.toPandas()

In [14]:
# Preview the first rows of the pandas frame.
df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,7/25/20,7/26/20,7/27/20,7/28/20,7/29/20,7/30/20,7/31/20,08-01-20,08-02-20,08-03-20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,36036,36157,36263,36368,36471,36542,36675,36710,36710,36747
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,4637,4763,4880,4997,5105,5197,5276,5396,5519,5620
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,26764,27357,27973,28615,29229,29831,30394,30950,31465,31972
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,897,897,907,907,918,922,925,925,925,937
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,916,932,950,1000,1078,1109,1148,1164,1199,1280


In [28]:
# Row-wise sum over every column from position 4 onward (the date columns).
# An existing 'Total' column is dropped from the operand first, so re-running
# this cell cannot fold the previous total back into the new one.
# NOTE(review): the date columns look like cumulative counts (non-decreasing in
# the previews). If so, the latest date column is the actual case count and this
# sum is only usable as a ranking score — confirm the intent.
df['Total'] = df.iloc[:, 4:].drop(columns='Total', errors='ignore').sum(axis=1)

In [37]:
# The three rows with the largest 'Total', showing only the country and the total.
(
    df.loc[:, ['Country/Region', 'Total']]
    .sort_values(by='Total', ascending=False)
    .head(3)
)

Unnamed: 0,Country/Region,Total
225,US,256188640
28,Brazil,108024882
131,India,52737993


In [42]:
# One filtered frame per country of interest, unpacked in the same order as the names.
df_us, df_brazil, df_india = (
    df[df['Country/Region'] == country]
    for country in ('US', 'Brazil', 'India')
)

In [57]:
# Keep only the per-date columns for India: everything from position 4 onward,
# minus the derived 'Total' column when it is present, so the frame holds
# nothing but dated observations.
x = df_india.iloc[:, 4:].drop(columns='Total', errors='ignore')

In [76]:
# Value of the '1/22/20' column in the first row of x.
x['1/22/20'].values[0]

0

In [80]:
# First seven columns of x.
x.iloc[:,:7]

Unnamed: 0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20
131,0,0,0,0,0,0,0


In [77]:
# Accumulators; nothing in this cell appends to them.
# NOTE(review): named *_us although x is sliced from df_india in the visible cells —
# confirm which country is meant.
arr_us_x = []
arr_us_y = []
# One iteration per column of x.
for i in range(len(list(x))):
    # NOTE(review): this expression never uses i and its result is discarded, so every
    # pass computes the same row-wise sum of the first seven columns and the loop has
    # no effect. Presumably a sliding window over the columns was intended — confirm
    # before relying on this cell.
    x.iloc[:,:7].sum(axis=1)

0
0
0
0
0
0
0
0
1
1
1
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
5
5
28
30
31
34
39
43
56
62
73
82
102
113
119
142
156
194
244
330
396
499
536
657
727
887
987
1024
1251
1397
1998
2543
2567
3082
3588
4778
5311
5916
6725
7598
8446
9205
10453
11487
12322
13430
14352
15722
17615
18539
20080
21370
23077
24530
26283
27890
29451
31324
33062
34863
37257
39699
42505
46437
49400
52987
56351
59695
62808
67161
70768
74292
78055
81997
85784
90648
95698
100328
106475
112028
118226
124794
131423
138536
144950
150793
158086
165386
173491
181827
190609
198370
207191
216824
226713
236184
246622
257486
265928
276146
286605
297535
308993
320922
332424
343091
354065
366946
380532
395048
410451
425282
440215
456183
473105
490401
508953
528859
548318
566840
585481
604641
625544
648315
673165
697413
719664
742417
767296
793802
820916
849522
878254
906752
936181
968857
1003832
1039084
1077781
1118206
1155338
1193078
1238798
1288108
1337024
1385635
1435616
1480073
1531669
1581963
1634746
1695988


In [64]:
# Take the first row of x as a plain Python list of its values.
# The cell previously ended with an unfinished `x = ` assignment, which is a
# syntax error. It is removed so the cell runs and x stays the DataFrame that
# the other cells index into.
y = x.values.tolist()[0]
y

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 5,
 5,
 28,
 30,
 31,
 34,
 39,
 43,
 56,
 62,
 73,
 82,
 102,
 113,
 119,
 142,
 156,
 194,
 244,
 330,
 396,
 499,
 536,
 657,
 727,
 887,
 987,
 1024,
 1251,
 1397,
 1998,
 2543,
 2567,
 3082,
 3588,
 4778,
 5311,
 5916,
 6725,
 7598,
 8446,
 9205,
 10453,
 11487,
 12322,
 13430,
 14352,
 15722,
 17615,
 18539,
 20080,
 21370,
 23077,
 24530,
 26283,
 27890,
 29451,
 31324,
 33062,
 34863,
 37257,
 39699,
 42505,
 46437,
 49400,
 52987,
 56351,
 59695,
 62808,
 67161,
 70768,
 74292,
 78055,
 81997,
 85784,
 90648,
 95698,
 100328,
 106475,
 112028,
 118226,
 124794,
 131423,
 138536,
 144950,
 150793,
 158086,
 165386,
 173491,
 181827,
 190609,
 198370,
 207191,
 216824,
 226713,
 236184,
 246622,
 257486,
 265928,
 276146,
 286605,
 297535,
 308993,
 320922,
 332424,
 343091,
 354065,
 366946,
 380532,
 395048,
 410451,
 

In [59]:
# Column labels of x as a plain list.
x.columns.tolist()

['1/22/20',
 '1/23/20',
 '1/24/20',
 '1/25/20',
 '1/26/20',
 '1/27/20',
 '1/28/20',
 '1/29/20',
 '1/30/20',
 '1/31/20',
 '02-01-20',
 '02-02-20',
 '02-03-20',
 '02-04-20',
 '02-05-20',
 '02-06-20',
 '02-07-20',
 '02-08-20',
 '02-09-20',
 '02-10-20',
 '02-11-20',
 '02-12-20',
 '2/13/20',
 '2/14/20',
 '2/15/20',
 '2/16/20',
 '2/17/20',
 '2/18/20',
 '2/19/20',
 '2/20/20',
 '2/21/20',
 '2/22/20',
 '2/23/20',
 '2/24/20',
 '2/25/20',
 '2/26/20',
 '2/27/20',
 '2/28/20',
 '2/29/20',
 '03-01-20',
 '03-02-20',
 '03-03-20',
 '03-04-20',
 '03-05-20',
 '03-06-20',
 '03-07-20',
 '03-08-20',
 '03-09-20',
 '03-10-20',
 '03-11-20',
 '03-12-20',
 '3/13/20',
 '3/14/20',
 '3/15/20',
 '3/16/20',
 '3/17/20',
 '3/18/20',
 '3/19/20',
 '3/20/20',
 '3/21/20',
 '3/22/20',
 '3/23/20',
 '3/24/20',
 '3/25/20',
 '3/26/20',
 '3/27/20',
 '3/28/20',
 '3/29/20',
 '3/30/20',
 '3/31/20',
 '04-01-20',
 '04-02-20',
 '04-03-20',
 '04-04-20',
 '04-05-20',
 '04-06-20',
 '04-07-20',
 '04-08-20',
 '04-09-20',
 '04-10-20',
 '04-1

In [36]:
# Column labels of df as a plain list.
df.columns.tolist()

['Province/State',
 'Country/Region',
 'Lat',
 'Long',
 '1/22/20',
 '1/23/20',
 '1/24/20',
 '1/25/20',
 '1/26/20',
 '1/27/20',
 '1/28/20',
 '1/29/20',
 '1/30/20',
 '1/31/20',
 '02-01-20',
 '02-02-20',
 '02-03-20',
 '02-04-20',
 '02-05-20',
 '02-06-20',
 '02-07-20',
 '02-08-20',
 '02-09-20',
 '02-10-20',
 '02-11-20',
 '02-12-20',
 '2/13/20',
 '2/14/20',
 '2/15/20',
 '2/16/20',
 '2/17/20',
 '2/18/20',
 '2/19/20',
 '2/20/20',
 '2/21/20',
 '2/22/20',
 '2/23/20',
 '2/24/20',
 '2/25/20',
 '2/26/20',
 '2/27/20',
 '2/28/20',
 '2/29/20',
 '03-01-20',
 '03-02-20',
 '03-03-20',
 '03-04-20',
 '03-05-20',
 '03-06-20',
 '03-07-20',
 '03-08-20',
 '03-09-20',
 '03-10-20',
 '03-11-20',
 '03-12-20',
 '3/13/20',
 '3/14/20',
 '3/15/20',
 '3/16/20',
 '3/17/20',
 '3/18/20',
 '3/19/20',
 '3/20/20',
 '3/21/20',
 '3/22/20',
 '3/23/20',
 '3/24/20',
 '3/25/20',
 '3/26/20',
 '3/27/20',
 '3/28/20',
 '3/29/20',
 '3/30/20',
 '3/31/20',
 '04-01-20',
 '04-02-20',
 '04-03-20',
 '04-04-20',
 '04-05-20',
 '04-06-20',
 '0