Start a simple Spark session in local mode


In [None]:
import org.apache.spark.sql.SparkSession
// Create (or reuse) the notebook's SparkSession, running in local mode.
val spark = SparkSession
  .builder()
  .master("local[*]")
  .appName("Analytical Spark App")
  // Set the number of shuffle partitions used by Spark SQL to 4.
  .config("spark.sql.shuffle.partitions", 4)
  .getOrCreate()

Read the data for the COVID-19 vaccination drive in India

In [24]:
// Load the state-wise vaccination CSV (first line is the header, column types
// are inferred) and add `updated_on_ts`: the `updated_on` string parsed with
// the dd/MM/yyyy pattern.
val vaccineDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("../data/covid_india/covid_vaccine_statewise.csv")
  .withColumn("updated_on_ts", to_timestamp($"updated_on", "dd/MM/yyyy"))

vaccineDF: org.apache.spark.sql.DataFrame = [Updated_On: string, State: string ... 17 more fields]


See sample data using show

In [11]:
// Preview the first two rows without truncating wide cell values.
vaccineDF.show(numRows = 2, truncate = false)

+----------+-----+----------------------------+------------------------+-----------+-----------------------+------------------------+---------------------------+-----------------------------+----------------------------------+--------------------------+-----------------------------+----+---------------+---------------+---------------+-----------------+------------------------+
|Updated_On|State|Total_Individuals_Vaccinated|Total_Sessions_Conducted|Total_Sites|First_Dose_Administered|Second_Dose_Administered|Male_Individuals_Vaccinated|Female_Individuals_Vaccinated|Transgender_Individuals_Vaccinated|Total_Covaxin_Administered|Total_CoviShield_Administered|AEFI|Age_18_30_years|Age_30_45_years|Age_45_60_years|Age_60_plus_years|Total_Doses_Administered|
+----------+-----+----------------------------+------------------------+-----------+-----------------------+------------------------+---------------------------+-----------------------------+----------------------------------+--------------

In [25]:
// Check the parsed timestamp next to the state it belongs to.
vaccineDF
  .select("updated_on_ts", "state")
  .show(truncate = false)

+-------------------+-----+
|updated_on_ts      |state|
+-------------------+-----+
|2021-01-16 00:00:00|India|
|2021-01-17 00:00:00|India|
|2021-01-18 00:00:00|India|
|2021-01-19 00:00:00|India|
|2021-01-20 00:00:00|India|
|2021-01-21 00:00:00|India|
|2021-01-22 00:00:00|India|
|2021-01-23 00:00:00|India|
|2021-01-24 00:00:00|India|
|2021-01-25 00:00:00|India|
|2021-01-26 00:00:00|India|
|2021-01-27 00:00:00|India|
|2021-01-28 00:00:00|India|
|2021-01-29 00:00:00|India|
|2021-01-30 00:00:00|India|
|2021-01-31 00:00:00|India|
|2021-02-01 00:00:00|India|
|2021-02-02 00:00:00|India|
|2021-02-03 00:00:00|India|
|2021-02-04 00:00:00|India|
+-------------------+-----+
only showing top 20 rows



In [28]:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.rank

// Keep only rows whose state is not "India" and reduce the timestamp to a date.
val statesDF = vaccineDF
  .where($"state" =!= "India")
  .select($"state", to_date($"updated_on_ts").alias("updated_on_dt"))

// One partition per state, ordered from the most recent date to the oldest.
val windowSpecs = Window
  .partitionBy($"state")
  .orderBy($"updated_on_dt".desc)

// updt_rank == 1 marks the most recent update date within each state.
val rankedDF = statesDF.withColumn("updt_rank", rank().over(windowSpecs))
rankedDF.show(truncate = false)

+-----------------+-------------+---------+
|state            |updated_on_dt|updt_rank|
+-----------------+-------------+---------+
|Arunachal Pradesh|2021-05-07   |1        |
|Arunachal Pradesh|2021-05-06   |2        |
|Arunachal Pradesh|2021-05-05   |3        |
|Arunachal Pradesh|2021-05-04   |4        |
|Arunachal Pradesh|2021-05-03   |5        |
|Arunachal Pradesh|2021-05-02   |6        |
|Arunachal Pradesh|2021-05-01   |7        |
|Arunachal Pradesh|2021-04-30   |8        |
|Arunachal Pradesh|2021-04-29   |9        |
|Arunachal Pradesh|2021-04-28   |10       |
|Arunachal Pradesh|2021-04-27   |11       |
|Arunachal Pradesh|2021-04-26   |12       |
|Arunachal Pradesh|2021-04-25   |13       |
|Arunachal Pradesh|2021-04-24   |14       |
|Arunachal Pradesh|2021-04-23   |15       |
|Arunachal Pradesh|2021-04-22   |16       |
|Arunachal Pradesh|2021-04-21   |17       |
|Arunachal Pradesh|2021-04-20   |18       |
|Arunachal Pradesh|2021-04-19   |19       |
|Arunachal Pradesh|2021-04-18   

statesDF: org.apache.spark.sql.DataFrame = [state: string, updated_on_dt: date]
import org.apache.spark.sql.functions.rank
import org.apache.spark.sql.expressions.Window
windowSpecs: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@6e52c858
rankedDF: org.apache.spark.sql.DataFrame = [state: string, updated_on_dt: date ... 1 more field]


Find the latest vaccine update date for each state

In [30]:
// Rank 1 is the most recent update date of each state; print up to 40 rows untruncated.
rankedDF
  .where($"updt_rank" === 1)
  .show(numRows = 40, truncate = false)

+----------------------------------------+-------------+---------+
|state                                   |updated_on_dt|updt_rank|
+----------------------------------------+-------------+---------+
|Arunachal Pradesh                       |2021-05-07   |1        |
|Assam                                   |2021-05-07   |1        |
|Dadra and Nagar Haveli and Daman and Diu|2021-05-07   |1        |
|Gujarat                                 |2021-05-07   |1        |
|Himachal Pradesh                        |2021-05-07   |1        |
|Ladakh                                  |2021-05-07   |1        |
|Lakshadweep                             |2021-05-07   |1        |
|Maharashtra                             |2021-05-07   |1        |
|Nagaland                                |2021-05-07   |1        |
|Telangana                               |2021-05-07   |1        |
|Tripura                                 |2021-05-07   |1        |
|West Bengal                             |2021-05-07   |1     

Date of maximum Total_Individuals_Vaccinated by state

In [33]:
import org.apache.spark.sql.functions.dense_rank
// One partition per state, ordered from the highest Total_Individuals_Vaccinated
// value to the lowest, so rank 1 is the state's maximum.
val windowSpecs1 = Window
  .partitionBy($"state")
  .orderBy($"Total_Individuals_Vaccinated".desc)

vaccineDF
  .filter($"state" =!= "India")
  .select(
    $"state",
    to_date($"updated_on_ts").as("updated_on_dt"),
    $"Total_Individuals_Vaccinated")
  // Drop rows without a total so nulls cannot take part in the ranking.
  .filter($"Total_Individuals_Vaccinated".isNotNull)
  // Rank over windowSpecs1 (ordered by the vaccinated total). Ranking over
  // `windowSpecs` (ordered by date) would return the latest date per state
  // rather than the date of the maximum.
  // dense_rank keeps every date that ties for the maximum, so a state can
  // appear on more than one row.
  .withColumn("vcn_rank", dense_rank().over(windowSpecs1))
  .filter($"vcn_rank" === 1)
  .show(false)

+----------------------------------------+-------------+----------------------------+--------+
|state                                   |updated_on_dt|Total_Individuals_Vaccinated|vcn_rank|
+----------------------------------------+-------------+----------------------------+--------+
|Arunachal Pradesh                       |2021-05-06   |202965.0                    |1       |
|Assam                                   |2021-05-06   |2211808.0                   |1       |
|Dadra and Nagar Haveli and Daman and Diu|2021-05-05   |80984.0                     |1       |
|Gujarat                                 |2021-05-06   |1.0505574E7                 |1       |
|Himachal Pradesh                        |2021-05-06   |1671620.0                   |1       |
|Ladakh                                  |2021-05-06   |81061.0                     |1       |
|Lakshadweep                             |2021-05-06   |19986.0                     |1       |
|Maharashtra                             |2021-05-

import org.apache.spark.sql.functions.dense_rank
windowSpecs1: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@631b9808


Read employee data

In [36]:
// DDL-style schema applied to the employees file instead of inferring types.
val smployeeSchema = "name string,dept string,salary int"

// The file is pipe-delimited; no header option is set.
val employees = spark.read
  .schema(smployeeSchema)
  .option("delimiter", "|")
  .csv("../data/dummy_data/employees.csv")
employees.show(truncate = false)

+-------+----------+------+
|name   |dept      |salary|
+-------+----------+------+
|  James|     Sales|3000  |
|  James|     Sales|3000  |
| Robert|     Sales|4100  |
|   Saif|     Sales|4100  |
|Michael|     Sales|4600  |
|  Maria|   Finance|3000  |
|  Scott|   Finance|3300  |
|    Jen|   Finance|3900  |
|  Kumar| Marketing|2000  |
|   Jeff| Marketing|3000  |
+-------+----------+------+



smployeeSchema: String = name string,dept string,salary int
employees: org.apache.spark.sql.DataFrame = [name: string, dept: string ... 1 more field]


In [58]:
import org.apache.spark.sql.functions._

// Per-department windows: highest salary first, and lowest salary first.
val wcByDept = Window.partitionBy($"dept").orderBy($"salary".desc)
val wcByDeptAsc = Window.partitionBy($"dept").orderBy($"salary".asc)

// NOTE(review): rows with equal salary have no defined relative order inside
// these windows, so row_num / lag / lead may differ between runs for ties.
employees
  // Ranking functions over the descending window (top earner gets 1).
  .withColumn("row_num", row_number().over(wcByDept))
  .withColumn("rnk", rank().over(wcByDept))
  .withColumn("dense_rnk", dense_rank().over(wcByDept))
  // Relative-position functions over the ascending window.
  .withColumn("pcnt_rnk", percent_rank().over(wcByDeptAsc))
  // Split each department into 2 buckets, following the descending order.
  .withColumn("ntile", ntile(2).over(wcByDept))
  .withColumn("cume_dist", cume_dist().over(wcByDeptAsc))
  // Salary one row before / one row after in ascending order; null at the edges.
  .withColumn("lag", lag("salary", 1).over(wcByDeptAsc))
  .withColumn("lead", lead("salary", 1).over(wcByDeptAsc))
  .orderBy($"dept", $"salary".desc)
  .show(truncate = false)

+-------+----------+------+-------+---+---------+--------+-----+------------------+----+----+
|name   |dept      |salary|row_num|rnk|dense_rnk|pcnt_rnk|ntile|cume_dist         |lag |lead|
+-------+----------+------+-------+---+---------+--------+-----+------------------+----+----+
|Michael|     Sales|4600  |1      |1  |1        |1.0     |1    |1.0               |4100|null|
|   Saif|     Sales|4100  |3      |2  |2        |0.5     |1    |0.8               |4100|4600|
| Robert|     Sales|4100  |2      |2  |2        |0.5     |1    |0.8               |3000|4100|
|  James|     Sales|3000  |4      |4  |3        |0.0     |2    |0.4               |null|3000|
|  James|     Sales|3000  |5      |4  |3        |0.0     |2    |0.4               |3000|4100|
|    Jen|   Finance|3900  |1      |1  |1        |1.0     |1    |1.0               |3300|null|
|  Scott|   Finance|3300  |2      |2  |2        |0.5     |1    |0.6666666666666666|3000|3900|
|  Maria|   Finance|3000  |3      |3  |3        |0.0     |2 

import org.apache.spark.sql.functions._
wcByDept: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@bfc3add
wcByDeptAsc: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@59cc58b


In [69]:
import org.apache.spark.sql.functions._

// Unordered per-department window: each aggregate below spans the whole department.
val wcByDept = Window.partitionBy($"dept")

employees
  // Number the rows of each department so exactly one row per department can be kept.
  .withColumn("row", row_number().over(wcByDept.orderBy($"salary")))
  .withColumn("avg", avg($"salary").over(wcByDept))
  .withColumn("sum", sum($"salary").over(wcByDept))
  .withColumn("min", min($"salary").over(wcByDept))
  .withColumn("max", max($"salary").over(wcByDept))
  // Every row of a department carries the same aggregates; keep the first one only.
  .where($"row" === 1)
  // Remove the per-employee columns and the helper row number.
  .drop("row", "name", "salary")
  .show(truncate = false)

+----------+------+-----+----+----+
|dept      |avg   |sum  |min |max |
+----------+------+-----+----+----+
|   Finance|3400.0|10200|3000|3900|
|     Sales|3760.0|18800|3000|4600|
| Marketing|2500.0|5000 |2000|3000|
+----------+------+-----+----+----+



import org.apache.spark.sql.functions._
wcByDept: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@6c279c07
