In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [0]:
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("COVID19Analysis")
    .getOrCreate()
)

# Synthetic COVID-19 sample: one (date, country, cases) tuple per row.
# Dates are plain "YYYY-MM-DD" strings; column names are given by `schema`.
schema = ["date", "country", "cases"]
data = [
    ("2021-01-01", "USA", 10000),
    ("2021-01-01", "India", 8000),
    ("2021-01-02", "USA", 10500),
    ("2021-01-02", "India", 8200),
    ("2021-01-01", "russia", 200),
    ("2021-01-02", "russia", 8100),
]

# Build the DataFrame from the in-memory rows and preview it.
covid_df = spark.createDataFrame(data, schema=schema)
covid_df.show()

+----------+-------+-----+
|      date|country|cases|
+----------+-------+-----+
|2021-01-01|    USA|10000|
|2021-01-01|  India| 8000|
|2021-01-02|    USA|10500|
|2021-01-02|  India| 8200|
|2021-01-01| russia|  200|
|2021-01-02| russia| 8100|
+----------+-------+-----+



In [0]:
# Total cases per country: sum the "cases" column within each country group.
# The aggregate is referenced as F.sum so this cell does not depend on the
# wildcard import shadowing Python's built-in sum(), which would raise a
# TypeError if called as sum("cases").
total_cases_per_country = (
    covid_df
    .groupBy("country")
    .agg(F.sum("cases").alias("total_cases"))
)
total_cases_per_country.show()

+-------+-----------+
|country|total_cases|
+-------+-----------+
|    USA|      20500|
|  India|      16200|
| russia|       8300|
+-------+-----------+



<h3>We can pass a default value to lag, or replace the resulting nulls with fillna instead.</h3>

In [0]:
# Daily new cases: current count minus the count on the country's previous
# row, with rows ordered by date inside each country partition.
# lag("cases", 1, 0) yields 0 when a country has no earlier row, so the first
# row's new_cases equals its cases value. Alternative: drop the default and
# run covid_df.fillna({"prev_date_cases": 0}) afterwards.
country_by_date = Window.partitionBy("country").orderBy("date")
previous_cases = lag("cases", 1, 0).over(country_by_date)

covid_df = (
    covid_df
    .withColumn("prev_date_cases", previous_cases)
    .withColumn("new_cases", col("cases") - col("prev_date_cases"))
)
covid_df.show()

+----------+-------+-----+---------------+---------+
|      date|country|cases|prev_date_cases|new_cases|
+----------+-------+-----+---------------+---------+
|2021-01-01|  India| 8000|              0|     8000|
|2021-01-02|  India| 8200|           8000|      200|
|2021-01-01|    USA|10000|              0|    10000|
|2021-01-02|    USA|10500|          10000|      500|
|2021-01-01| russia|  200|              0|      200|
|2021-01-02| russia| 8100|            200|     7900|
+----------+-------+-----+---------------+---------+



In [0]:
# Rank countries by total case count (largest first) and keep the top two.
ranked_by_total = total_cases_per_country.orderBy(col("total_cases").desc())
top_countries = ranked_by_total.limit(2)
top_countries.show()

+-------+-----------+
|country|total_cases|
+-------+-----------+
|    USA|      20500|
|  India|      16200|
+-------+-----------+

