In [11]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import os
from datetime import datetime 
from functools import reduce
from operator import add
import pandas as pd
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
import findspark
findspark.init()

# Filter data to keep only the rows whose country is Australia

In [2]:
def filterData(df, country="Australia"):
    """Return only the rows of ``df`` whose "Country/Region" equals ``country``.

    Args:
        df: Spark DataFrame that has a "Country/Region" column.
        country: Country name to keep. Defaults to "Australia", which is the
            value this function previously had hard-coded.

    Returns:
        A new Spark DataFrame containing only the matching rows.
    """
    return df.filter(col("Country/Region") == country)

## Transpose data

In [7]:
def t_data(df):
    """Unpivot ``df`` from wide to long form.

    Each column of every input row becomes one output row holding the column
    name (as "Date") and that column's value (as "Cases").

    Args:
        df: Spark DataFrame whose columns all share a single data type.

    Returns:
        A Spark DataFrame with exactly two columns, "Date" and "Cases".

    Raises:
        AssertionError: if the columns of ``df`` do not all have the same type.
    """
    # df.dtypes is a list of (column_name, type_name) pairs; split it in two
    cols, dtypes = zip(*df.dtypes)

    # The values end up in one array column, so they must be homogeneous
    assert len(set(dtypes)) == 1, "All columns have to be of the same type"

    # One (Date, Cases) struct per source column ...
    name_value_pairs = [
        struct(lit(name).alias("Date"), col(name).alias("Cases"))
        for name in cols
    ]
    # ... gathered into an array and exploded into one row per struct
    kvs = explode(array(name_value_pairs)).alias("kvs")

    exploded = df.select([kvs])
    return exploded.select(["kvs.Date", "kvs.Cases"])

# Handling the file

In [12]:
# Directory holding the input CSV. The default is the original hard-coded
# location; set the COVID_DATA_DIR environment variable to read the file from
# another directory without editing the notebook.
dirname = os.environ.get(
    "COVID_DATA_DIR",
    os.path.dirname("C:\\Users\\HP\\git\\Covid19-Big-Data\\data\\"),
)
filepath = os.path.join(dirname, 'time_series_19-covid-Confirmed_archived_0325.csv')

# Reuse an already-running SparkContext if there is one, otherwise start a
# local one with this configuration.
conf = SparkConf().setMaster("local").setAppName("covid_big_data")
sc = SparkContext.getOrCreate(conf = conf)
sqlContext = SQLContext(sc)

# Read the CSV using its first line as the header and inferring column types
df = (
    sqlContext.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(filepath)
)

# Keep only the Australian rows
df = filterData(df)

counts = df.count()
print ("Number of rows in df -> %i" % (counts))

#df.show()

Number of rows in df -> 9


In [35]:
# Collect the filtered Spark DataFrame to the driver as a pandas DataFrame.
# NOTE(review): Arrow-accelerated conversion applies only if it is enabled in the
# Spark configuration; no such setting is visible in this notebook - confirm.
result_pdf = df.toPandas()

In [24]:
# Drop every date column that is not strictly between start_date and end_date
names = df.schema.names
start_date = datetime.strptime("1/31/20", '%m/%d/%y')
end_date = datetime.strptime("3/22/20", '%m/%d/%y')

# Columns that describe the location rather than a date; these are always kept
non_date_columns = ("Province/State", "Country/Region", "Lat", "Long")

for header in names:
    if header in non_date_columns:
        continue
    # Every remaining column name is a date in month/day/two-digit-year form
    header_date = datetime.strptime(header, '%m/%d/%y')
    if header_date <= start_date or header_date >= end_date:
        df = df.drop(header)
    


#df.toPandas()

In [15]:
# Summary statistics for every remaining column, transposed so that each
# DataFrame column is shown as one row.
df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Province/State,9,,,Australian Capital Territory,Western Australia
Country/Region,9,,,Australia,Australia
Lat,9,-24.502866666666666,23.946964075796746,-41.4545,35.4437
Long,9,141.0555888888889,11.754214953203432,115.8605,153.4
2/1/20,9,1.3333333333333333,1.8027756377319948,0,4
2/2/20,9,1.3333333333333333,1.7320508075688774,0,4
2/3/20,9,1.3333333333333333,1.7320508075688774,0,4
2/4/20,9,1.4444444444444444,1.810463415200036,0,4
2/5/20,9,1.4444444444444444,1.810463415200036,0,4


In [16]:
# Isolate the New South Wales row and keep only its per-date columns
df_new_south_wales = (
    df
    .filter(col("Province/State") == "New South Wales")
    .drop("Province/State", "Country/Region", "Lat", "Long")
)
df_new_south_wales.toPandas()

Unnamed: 0,2/1/20,2/2/20,2/3/20,2/4/20,2/5/20,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,...,3/12/20,3/13/20,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20
0,4,4,4,4,4,4,4,4,4,4,...,65,92,112,134,171,210,267,307,353,436


In [231]:
from pyspark.sql import functions

# NOTE: df_new_south_wales_t is created by the t_data(...) cell further down in
# this notebook; that cell has to be run before this one.
dates = df_new_south_wales_t.select("Date").collect()

# collect() returns Row objects, so read the "Date" string out of each row and
# parse it with the explicit month/day/two-digit-year format of the column
# headers instead of handing the whole Row to pd.to_datetime.
date_format = [pd.to_datetime(d["Date"], format='%m/%d/%y') for d in dates]
X = date_format

In [232]:

from datetime import datetime, timedelta

# Position of the first day used for fitting; everything before it is discarded
starting_date = 35

# One single-feature sample [i] per entry of X, numbered from 0
day_numbers = [[i] for i in range(len(X))]

# Features and targets, both cut at the same starting position
X = day_numbers[starting_date:]
Y = df_new_south_wales_t.select("Cases").collect()[starting_date:]

In [233]:
from sklearn import linear_model

# Fit a scikit-learn linear regression of Y against the day numbers in X
linear_regr = linear_model.LinearRegression()
linear_regr.fit(X=X, y=Y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [234]:
# Report the model's score on the same X, Y it was fitted on
print ("Linear Regression Model Score: %s" % (linear_regr.score(X, Y)))

Linear Regression Model Score: 0.889347066847035


In [235]:
# Predict future trend
import math

# Day numbers to predict for. Defined here so this cell runs top-to-bottom on a
# fresh kernel; the same assignment also appears in a later cell.
Z = [[49], [50], [51]]
y_pred = linear_regr.predict(Z)


In [236]:
# Display the values predicted by linear_regr for the day numbers in Z
y_pred

array([[348.18333333],
       [375.24761905],
       [402.31190476]])

In [187]:
# Day numbers passed to linear_regr.predict; each inner list is one
# single-feature sample.
Z = [[49], [50], [51]]

In [27]:
from pyspark.sql.window import Window

# Reshape the single wide New South Wales row into one (Date, Cases) row per day
df_new_south_wales_t = t_data(df_new_south_wales)

# Number the rows 0, 1, 2, ... in their current order. On its own,
# monotonically_increasing_id() only guarantees increasing, unique ids - not
# consecutive ones - so it is used purely as the ordering key for row_number()
# (which starts at 1, hence the "- 1").
df_new_south_wales_t = df_new_south_wales_t.withColumn(
    "Date_value",
    row_number().over(Window.orderBy(monotonically_increasing_id())) - 1,
)
df_new_south_wales_t.toPandas()

Unnamed: 0,Date,Cases,Date_value
0,2/1/20,4,0
1,2/2/20,4,1
2,2/3/20,4,2
3,2/4/20,4,3
4,2/5/20,4,4
5,2/6/20,4,5
6,2/7/20,4,6
7,2/8/20,4,7
8,2/9/20,4,8
9,2/10/20,4,9


# Regression analysis

## Regression for New South Wales

IllegalArgumentException: 'Field "X" does not exist.\nAvailable fields: Date, Cases, Date_value'

In [29]:
# Spark ML linear regression on v_df.
# The column names must exist in v_df: the evaluation cell below reads
# "features" and "Date_value" from the same DataFrame, whereas the previous
# 'Y' / 'X' names fail with 'Field "X" does not exist'.
# NOTE(review): v_df is not built in the visible cells - presumably a
# VectorAssembler output with columns `features` and `Date_value`; confirm.
lr = LinearRegression(featuresCol='features', labelCol='Date_value', maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(v_df)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

Coefficients: [0.09947314890089527]
Intercept: 19.34331196097759
numIterations: 3
objectiveHistory: [0.4999999999999998, 0.4360246316911617, 0.2726352066098193]
+-------------------+
|          residuals|
+-------------------+
| -19.74120455658117|
| -18.74120455658117|
| -17.74120455658117|
| -16.74120455658117|
| -15.74120455658117|
| -14.74120455658117|
| -13.74120455658117|
| -12.74120455658117|
| -11.74120455658117|
| -10.74120455658117|
|  -9.74120455658117|
|  -8.74120455658117|
|  -7.74120455658117|
|  -6.74120455658117|
|  -5.74120455658117|
|  -4.74120455658117|
|-3.7412045565811702|
|-2.7412045565811702|
|-1.7412045565811702|
|-0.7412045565811702|
+-------------------+
only showing top 20 rows

RMSE: 10.416292
r2: 0.478996


In [44]:
# Score the same DataFrame the model was fitted on (v_df), so the metric
# printed below is a training-set R2 rather than a held-out test score.
lr_predictions = lrModel.transform(v_df)
lr_predictions.select("prediction","Date_value","features").show(60)
lr_evaluator = RegressionEvaluator(predictionCol="prediction",
                                   labelCol="Date_value", metricName="r2")
print("R Squared (R2) on training data = %g" % lr_evaluator.evaluate(lr_predictions))

+------------------+----------+--------+
|        prediction|Date_value|features|
+------------------+----------+--------+
| 19.74120455658117|         0|   [4.0]|
| 19.74120455658117|         1|   [4.0]|
| 19.74120455658117|         2|   [4.0]|
| 19.74120455658117|         3|   [4.0]|
| 19.74120455658117|         4|   [4.0]|
| 19.74120455658117|         5|   [4.0]|
| 19.74120455658117|         6|   [4.0]|
| 19.74120455658117|         7|   [4.0]|
| 19.74120455658117|         8|   [4.0]|
| 19.74120455658117|         9|   [4.0]|
| 19.74120455658117|        10|   [4.0]|
| 19.74120455658117|        11|   [4.0]|
| 19.74120455658117|        12|   [4.0]|
| 19.74120455658117|        13|   [4.0]|
| 19.74120455658117|        14|   [4.0]|
| 19.74120455658117|        15|   [4.0]|
| 19.74120455658117|        16|   [4.0]|
| 19.74120455658117|        17|   [4.0]|
| 19.74120455658117|        18|   [4.0]|
| 19.74120455658117|        19|   [4.0]|
| 19.74120455658117|        20|   [4.0]|
| 19.74120455658

In [None]:
## Generalised Linear model

In [31]:
# Second scikit-learn linear-regression fit on day numbers.
# NOTE(review): `data` is not defined anywhere in the visible notebook, so this
# cell cannot run as written. Its comments mention "April 1st" and a "peak",
# which do not obviously apply to the date range used above - the cell looks
# adapted from another analysis; confirm before relying on it.
# NOTE(review): running this cell overwrites X and linear_regr from earlier cells.
from datetime import datetime, timedelta
import numpy as np
from sklearn import linear_model

# prepare the lists for the model
X = date_format
# Drops the first element of the column; `data` must already exist (see note above)
y = data['gravi_deceduti'].tolist()[1:]
# date format is not suitable for modeling, let's transform the date into incrementals number starting from April 1st
starting_date = 37  # April 1st is the 37th day of the series
day_numbers = []
# Starts at 1, so this yields len(X) - 1 samples
for i in range(1, len(X)):
    day_numbers.append([i])
X = day_numbers
# # let's train our model only with data after the peak
X = X[starting_date:]
y = y[starting_date:]
# Instantiate Linear Regression
linear_regr = linear_model.LinearRegression()
# Train the model using the training sets
linear_regr.fit(X, y)
# Score is computed on the same X, y used for fitting
print ("Linear Regression Model Score: %s" % (linear_regr.score(X, y)))

IllegalArgumentException: 'Field "label" does not exist.\nAvailable fields: features, Date_value'

In [12]:
#result_pdf.head()
#To map using geography
import folium
from folium import plugins

In [165]:
# Base map positioned at latitude -25, longitude 140
map = folium.Map(location = [-25, 140], zoom_start = 4, tiles='Stamenterrain')

# One red circle per row of result_pdf, with a radius proportional to that
# row's '3/23/20' value and a popup showing the province name and the value
marker_rows = zip(result_pdf['Lat'], result_pdf['Long'], result_pdf['Province/State'], result_pdf['3/23/20'])
for lat, long, province, value in marker_rows:
    popup_html = ('<strong>Province</strong>: ' + str(province).capitalize() + '<br>'
                  + '<strong>value</strong>: ' + str(value) + '<br>')
    marker = folium.CircleMarker([lat, long], radius = value*0.2, popup = popup_html,
                                 color='red', fill_color='red', fill_opacity=0.3)
    marker.add_to(map)


In [166]:
# Render the folium map built in the previous cell
display(map)

In [150]:
# List the Province/State values present in result_pdf
result_pdf['Province/State']

0                 New South Wales
1                        Victoria
2                      Queensland
3                 South Australia
4           From Diamond Princess
5               Western Australia
6                        Tasmania
7              Northern Territory
8    Australian Capital Territory
Name: Province/State, dtype: object

In [9]:
# Show result_pdf as a styled table with a 'Reds' colour-map background gradient
result_pdf.style.background_gradient(cmap = 'Reds')

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,2/1/20,2/2/20,2/3/20,2/4/20,2/5/20,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20,2/16/20,2/17/20,2/18/20,2/19/20,2/20/20,2/21/20,2/22/20,2/23/20,2/24/20,2/25/20,2/26/20,2/27/20,2/28/20,2/29/20,3/1/20,3/2/20,3/3/20,3/4/20,3/5/20,3/6/20,3/7/20,3/8/20,3/9/20,3/10/20,3/11/20,3/12/20,3/13/20,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20
0,New South Wales,Australia,-33.8688,151.209,0,0,0,0,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,6,6,13,22,22,26,28,38,48,55,65,65,92,112,134,171,210,267,307,353,436,533,533
1,Victoria,Australia,-37.8136,144.963,0,0,0,0,1,1,1,1,2,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,7,7,9,9,10,10,10,11,11,15,18,21,21,36,49,57,71,94,121,121,121,229,296,296
2,Queensland,Australia,-28.0167,153.4,0,0,0,0,0,0,0,1,3,2,3,2,2,3,3,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,9,9,9,11,11,13,13,13,15,15,18,20,20,35,46,61,68,78,94,144,184,221,221,221
3,South Australia,Australia,-34.9285,138.601,0,0,0,0,0,0,0,0,0,0,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,5,5,7,7,7,7,7,9,9,16,19,20,29,29,37,42,50,67,100,100
4,From Diamond Princess,Australia,35.4437,139.638,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,7,7,7,7,7,8,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,Western Australia,Australia,-31.9505,115.861,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,3,3,3,3,4,6,9,9,14,17,17,28,31,35,52,64,90,120,120
6,Tasmania,Australia,-41.4545,145.971,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,3,3,5,5,6,7,7,10,10,10,16,22,22
7,Northern Territory,Australia,-12.4634,130.846,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,3,3,3,3
8,Australian Capital Territory,Australia,-35.4735,149.012,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,2,2,3,4,6,9,19,19
