First Cell in Notebook - Starting SparkSession & Register;
Dataset Used - COVID 19 Dataset

In [60]:
#read the dataset
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CovidAnalysis").getOrCreate()
country_wise_df = spark.read.csv("D:/BL_DI_DE/Database Problem/data/country_wise_latest.csv", header=True, inferSchema=True)
country_wise_df.show()


+-------------------+---------+------+---------+------+---------+----------+-------------+------------------+---------------------+----------------------+-------------------+-------------+-----------------+--------------------+
|     Country/Region|Confirmed|Deaths|Recovered|Active|New cases|New deaths|New recovered|Deaths / 100 Cases|Recovered / 100 Cases|Deaths / 100 Recovered|Confirmed last week|1 week change|1 week % increase|          WHO Region|
+-------------------+---------+------+---------+------+---------+----------+-------------+------------------+---------------------+----------------------+-------------------+-------------+-----------------+--------------------+
|        Afghanistan|    36263|  1269|    25198|  9796|      106|        10|           18|               3.5|                69.49|                  5.04|              35526|          737|             2.07|Eastern Mediterra...|
|            Albania|     4880|   144|     2745|  1991|      117|         6|           6

In [21]:
###Check the schema
country_wise_df = spark.read.csv("country_wise_latest.csv", header=True, inferSchema=True)
country_wise_df.printSchema()

root
 |-- Country/Region: string (nullable = true)
 |-- Confirmed: integer (nullable = true)
 |-- Deaths: integer (nullable = true)
 |-- Recovered: integer (nullable = true)
 |-- Active: integer (nullable = true)
 |-- New cases: integer (nullable = true)
 |-- New deaths: integer (nullable = true)
 |-- New recovered: integer (nullable = true)
 |-- Deaths / 100 Cases: double (nullable = true)
 |-- Recovered / 100 Cases: double (nullable = true)
 |-- Deaths / 100 Recovered: string (nullable = true)
 |-- Confirmed last week: integer (nullable = true)
 |-- 1 week change: integer (nullable = true)
 |-- 1 week % increase: double (nullable = true)
 |-- WHO Region: string (nullable = true)



In [25]:
#prints the header, limited to 3
country_wise_df.head(3)

[Row(Country/Region='Afghanistan', Confirmed=36263, Deaths=1269, Recovered=25198, Active=9796, New cases=106, New deaths=10, New recovered=18, Deaths / 100 Cases=3.5, Recovered / 100 Cases=69.49, Deaths / 100 Recovered='5.04', Confirmed last week=35526, 1 week change=737, 1 week % increase=2.07, WHO Region='Eastern Mediterranean'),
 Row(Country/Region='Albania', Confirmed=4880, Deaths=144, Recovered=2745, Active=1991, New cases=117, New deaths=6, New recovered=63, Deaths / 100 Cases=2.95, Recovered / 100 Cases=56.25, Deaths / 100 Recovered='5.25', Confirmed last week=4171, 1 week change=709, 1 week % increase=17.0, WHO Region='Europe'),
 Row(Country/Region='Algeria', Confirmed=27973, Deaths=1163, Recovered=18837, Active=7973, New cases=616, New deaths=8, New recovered=749, Deaths / 100 Cases=4.16, Recovered / 100 Cases=67.34, Deaths / 100 Recovered='6.17', Confirmed last week=23691, 1 week change=4282, 1 week % increase=18.07, WHO Region='Africa')]

In [26]:
##type
type(country_wise_df)

pyspark.sql.dataframe.DataFrame

In [27]:
#basic column check
country_wise_df.columns

['Country/Region',
 'Confirmed',
 'Deaths',
 'Recovered',
 'Active',
 'New cases',
 'New deaths',
 'New recovered',
 'Deaths / 100 Cases',
 'Recovered / 100 Cases',
 'Deaths / 100 Recovered',
 'Confirmed last week',
 '1 week change',
 '1 week % increase',
 'WHO Region']

In [35]:
##checking few particular columns
country_wise_df.select(['Confirmed', 'Active', 'Deaths / 100 Cases']).show()

+---------+------+------------------+
|Confirmed|Active|Deaths / 100 Cases|
+---------+------+------------------+
|    36263|  9796|               3.5|
|     4880|  1991|              2.95|
|    27973|  7973|              4.16|
|      907|    52|              5.73|
|      950|   667|              4.32|
|       86|    18|              3.49|
|   167416| 91782|              1.83|
|    37390| 10014|               1.9|
|    15303|  5825|              1.09|
|    20558|  1599|              3.47|
|    30446|  6781|              1.39|
|      382|   280|              2.88|
|    39482|  3231|              0.36|
|   226225| 97577|              1.31|
|      110|     9|              6.36|
|    67251|  6221|               0.8|
|    66428| 39154|             14.79|
|       48|    20|              4.17|
|     1770|   699|              1.98|
|       99|    13|               0.0|
+---------+------+------------------+
only showing top 20 rows



In [37]:
###checking how many dtypes are present
country_wise_df.dtypes

[('Country/Region', 'string'),
 ('Confirmed', 'int'),
 ('Deaths', 'int'),
 ('Recovered', 'int'),
 ('Active', 'int'),
 ('New cases', 'int'),
 ('New deaths', 'int'),
 ('New recovered', 'int'),
 ('Deaths / 100 Cases', 'double'),
 ('Recovered / 100 Cases', 'double'),
 ('Deaths / 100 Recovered', 'string'),
 ('Confirmed last week', 'int'),
 ('1 week change', 'int'),
 ('1 week % increase', 'double'),
 ('WHO Region', 'string')]

In [62]:
##checking the describe option
country_wise_df.describe()

DataFrame[summary: string, Country/Region: string, Confirmed: string, Deaths: string, Recovered: string, Active: string, New cases: string, New deaths: string, New recovered: string, Deaths / 100 Cases: string, Recovered / 100 Cases: string, Deaths / 100 Recovered: string, Confirmed last week: string, 1 week change: string, 1 week % increase: string, WHO Region: string]

In [66]:
##Adding Columns in data frame
country_wise_df.withColumn('India ka COVID', country_wise_df['Deaths']+4).show()

+-------------------+---------+------+---------+------+---------+----------+-------------+------------------+---------------------+----------------------+-------------------+-------------+-----------------+--------------------+--------------+
|     Country/Region|Confirmed|Deaths|Recovered|Active|New cases|New deaths|New recovered|Deaths / 100 Cases|Recovered / 100 Cases|Deaths / 100 Recovered|Confirmed last week|1 week change|1 week % increase|          WHO Region|India ka COVID|
+-------------------+---------+------+---------+------+---------+----------+-------------+------------------+---------------------+----------------------+-------------------+-------------+-----------------+--------------------+--------------+
|        Afghanistan|    36263|  1269|    25198|  9796|      106|        10|           18|               3.5|                69.49|                  5.04|              35526|          737|             2.07|Eastern Mediterra...|          1273|
|            Albania|     48

In [67]:
###Dropping the column
country_wise_df.drop('India ka COVID').show()

+-------------------+---------+------+---------+------+---------+----------+-------------+------------------+---------------------+----------------------+-------------------+-------------+-----------------+--------------------+
|     Country/Region|Confirmed|Deaths|Recovered|Active|New cases|New deaths|New recovered|Deaths / 100 Cases|Recovered / 100 Cases|Deaths / 100 Recovered|Confirmed last week|1 week change|1 week % increase|          WHO Region|
+-------------------+---------+------+---------+------+---------+----------+-------------+------------------+---------------------+----------------------+-------------------+-------------+-----------------+--------------------+
|        Afghanistan|    36263|  1269|    25198|  9796|      106|        10|           18|               3.5|                69.49|                  5.04|              35526|          737|             2.07|Eastern Mediterra...|
|            Albania|     4880|   144|     2745|  1991|      117|         6|           6