## [Spark Tutorial](https://www.youtube.com/watch?v=5RosqOeJrrs)

### Spark Session

In [1]:
# Spark Session
from pyspark.sql import SparkSession

# Create the session for this notebook, or reuse one that is already running.
# The `local[*]` master runs Spark on this machine using all available cores.
spark = SparkSession.builder.master("local[*]").appName("spark-intro").getOrCreate()

In [2]:
# Display the session object as the cell's output.
spark

In [3]:
# Show the default parallelism of the SparkContext.
# NOTE(review): with a `local[*]` master this is expected to equal the number of
# cores on the machine — confirm against the Spark configuration docs.

spark.sparkContext.defaultParallelism

24

In [4]:
# Load the employees CSV: the first line is the header, column types are inferred.

emp = spark.read.options(header=True, inferSchema=True).csv('data/emp.csv')
emp.show()

# Report the size and the partitioning of the loaded DataFrame.
print(f"Number of rows in the Employees DataFrame: {emp.count()}")
print(f"Number of partitions in the Employees DataFrame: {emp.rdd.getNumPartitions()}")

+-----------+-------------+-------------+---+------+------+-------------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|
+-----------+-------------+-------------+---+------+------+-------------------+
|          1|          101|     John Doe| 30|  Male| 50000|2015-01-01 00:00:00|
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|
|          4|          102|    Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|
|          5|          103|    Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01 00:00:00|
|          7|          101|James Johnson| 42|  Male| 70000|2012-03-15 00:00:00|
|          8|          102|     Kate Kim| 29|Female| 51000|2019-10-01 00:00:00|
|          9|          103|      Tom Tan| 33|  Male| 58000|2016-06-01 00:00:00|
|         10|          104|     Lisa Lee

In [5]:
# Print the schema of the dataframe (column names, inferred types, nullability)

emp.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hire_date: timestamp (nullable = true)



In [6]:
# Redistribute the rows over 10 partitions.
# To lower the partition count instead, use `coalesce()`.

emp_re = emp.repartition(numPartitions=10)
emp_re.rdd.getNumPartitions()

10

In [7]:
# Show the repartitioned data, then report its row count and partition count.
emp_re.show()

print(f"Number of rows in the Employees (Repartitioned) DataFrame: {emp_re.count()}")
print(f"Number of partitions in the Employees (Repartitioned) DataFrame: {emp_re.rdd.getNumPartitions()}")

+-----------+-------------+-------------+---+------+------+-------------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|
+-----------+-------------+-------------+---+------+------+-------------------+
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|
|         12|          105|   Susan Chen| 31|Female| 54000|2017-02-15 00:00:00|
|         20|          102|    Grace Kim| 32|Female| 53000|2018-11-01 00:00:00|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|
|         16|          107|  Kelly Zhang| 30|Female| 49000|2018-04-01 00:00:00|
|         15|          106|  Michael Lee| 37|  Male| 63000|2014-09-30 00:00:00|
|         19|          103|  Steven Chen| 36|  Male| 62000|2015-08-01 00:00:00|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01 00:00:00|
|         17|          105|  George Wang| 34|  Male| 57000|2016-03-15 00:00:00|
|          5|          103|    Jack Chan

In [8]:
# Load another (much larger) csv into a new dataframe and check its properties.

cities = spark.read.options(header=True, inferSchema=True).csv('data/cities.csv')

# Report how many rows were read and how many partitions they are spread over.
print(f"Number of rows in the Cities-DataFrame: {cities.count()}")
print(f"Number of partitions in the Cities-DataFrame: {cities.rdd.getNumPartitions()}")

Number of rows in the Cities-DataFrame: 2349391
Number of partitions in the Cities-DataFrame: 19


### Basic Transformations 1

In [9]:
# Inspect the schema as a StructType object (the programmatic form of printSchema()).
emp.schema

StructType([StructField('employee_id', IntegerType(), True), StructField('department_id', IntegerType(), True), StructField('name', StringType(), True), StructField('age', IntegerType(), True), StructField('gender', StringType(), True), StructField('salary', IntegerType(), True), StructField('hire_date', TimestampType(), True)])

In [10]:
# Preview the employees DataFrame.
emp.show()

+-----------+-------------+-------------+---+------+------+-------------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|
+-----------+-------------+-------------+---+------+------+-------------------+
|          1|          101|     John Doe| 30|  Male| 50000|2015-01-01 00:00:00|
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|
|          4|          102|    Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|
|          5|          103|    Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01 00:00:00|
|          7|          101|James Johnson| 42|  Male| 70000|2012-03-15 00:00:00|
|          8|          102|     Kate Kim| 29|Female| 51000|2019-10-01 00:00:00|
|          9|          103|      Tom Tan| 33|  Male| 58000|2016-06-01 00:00:00|
|         10|          104|     Lisa Lee

In [None]:
# Creating a manual schema in Spark (two equivalent ways)
from pyspark.sql.types import _parse_datatype_string
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# ! FROM A DDL STRING
# Spark parses a DDL-formatted string ("<name> <type>, ...") into a StructType.
# NOTE: `_parse_datatype_string` is a private helper (leading underscore), so it
# may change between releases; it is used here only to show the parsed result.
schema_string = "name string, age int"
print(_parse_datatype_string(schema_string))

# ! FROM StructType / StructField OBJECTS
# Template: StructType([StructField(name, dataType, nullable)])
schema_spark = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True)
])
print(schema_spark)

StructType([StructField('name', StringType(), True), StructField('age', IntegerType(), True)])
StructType([StructField('name', StringType(), True), StructField('age', IntegerType(), True)])


In [13]:
# Columns and Expressions
from pyspark.sql.functions import col, expr

# ? The same column can be referenced in interchangeable ways: col("..."),
# ? expr("..."), attribute access (emp.age) or item access (emp["age"]).
# SQL equivalent: select employee_id, name, age, salary from emp

emp_filtered = emp.select(    # ! TRANSFORMATION
    col("employee_id"),
    expr("name"),
    emp["age"],
    emp["salary"],
)
emp_filtered.show()    # ! ACTION

+-----------+-------------+---+------+
|employee_id|         name|age|salary|
+-----------+-------------+---+------+
|          1|     John Doe| 30| 50000|
|          2|   Jane Smith| 25| 45000|
|          3|    Bob Brown| 35| 55000|
|          4|    Alice Lee| 28| 48000|
|          5|    Jack Chan| 40| 60000|
|          6|    Jill Wong| 32| 52000|
|          7|James Johnson| 42| 70000|
|          8|     Kate Kim| 29| 51000|
|          9|      Tom Tan| 33| 58000|
|         10|     Lisa Lee| 27| 47000|
|         11|   David Park| 38| 65000|
|         12|   Susan Chen| 31| 54000|
|         13|    Brian Kim| 45| 75000|
|         14|    Emily Lee| 26| 46000|
|         15|  Michael Lee| 37| 63000|
|         16|  Kelly Zhang| 30| 49000|
|         17|  George Wang| 34| 57000|
|         18|    Nancy Liu| 29| 50000|
|         19|  Steven Chen| 36| 62000|
|         20|    Grace Kim| 32| 53000|
+-----------+-------------+---+------+



In [14]:
# Rename employee_id to emp_id and cast age to int using SQL expression strings.
emp_casted = emp_filtered.select(
    expr("employee_id as emp_id"),
    emp_filtered["name"],
    expr("cast(age as int) as age"),
    emp_filtered["salary"],
)
emp_casted.show()

+------+-------------+---+------+
|emp_id|         name|age|salary|
+------+-------------+---+------+
|     1|     John Doe| 30| 50000|
|     2|   Jane Smith| 25| 45000|
|     3|    Bob Brown| 35| 55000|
|     4|    Alice Lee| 28| 48000|
|     5|    Jack Chan| 40| 60000|
|     6|    Jill Wong| 32| 52000|
|     7|James Johnson| 42| 70000|
|     8|     Kate Kim| 29| 51000|
|     9|      Tom Tan| 33| 58000|
|    10|     Lisa Lee| 27| 47000|
|    11|   David Park| 38| 65000|
|    12|   Susan Chen| 31| 54000|
|    13|    Brian Kim| 45| 75000|
|    14|    Emily Lee| 26| 46000|
|    15|  Michael Lee| 37| 63000|
|    16|  Kelly Zhang| 30| 49000|
|    17|  George Wang| 34| 57000|
|    18|    Nancy Liu| 29| 50000|
|    19|  Steven Chen| 36| 62000|
|    20|    Grace Kim| 32| 53000|
+------+-------------+---+------+



In [16]:
# Check the column names and types produced by the projection.
emp_casted.printSchema()

root
 |-- emp_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [17]:
# Same projection as emp_casted, written with selectExpr: every argument is a SQL expression string.
emp_casted_alt = emp_filtered.selectExpr(
    "employee_id as emp_id",
    "name",
    "cast(age as int) as age",
    "salary",
)
emp_casted_alt.show()

+------+-------------+---+------+
|emp_id|         name|age|salary|
+------+-------------+---+------+
|     1|     John Doe| 30| 50000|
|     2|   Jane Smith| 25| 45000|
|     3|    Bob Brown| 35| 55000|
|     4|    Alice Lee| 28| 48000|
|     5|    Jack Chan| 40| 60000|
|     6|    Jill Wong| 32| 52000|
|     7|James Johnson| 42| 70000|
|     8|     Kate Kim| 29| 51000|
|     9|      Tom Tan| 33| 58000|
|    10|     Lisa Lee| 27| 47000|
|    11|   David Park| 38| 65000|
|    12|   Susan Chen| 31| 54000|
|    13|    Brian Kim| 45| 75000|
|    14|    Emily Lee| 26| 46000|
|    15|  Michael Lee| 37| 63000|
|    16|  Kelly Zhang| 30| 49000|
|    17|  George Wang| 34| 57000|
|    18|    Nancy Liu| 29| 50000|
|    19|  Steven Chen| 36| 62000|
|    20|    Grace Kim| 32| 53000|
+------+-------------+---+------+



In [18]:
# Keep only the employees older than 30 (the condition is a SQL expression string).

emp_casted.select("emp_id", "name", "age", "salary").filter("age > 30").show()

+------+-------------+---+------+
|emp_id|         name|age|salary|
+------+-------------+---+------+
|     3|    Bob Brown| 35| 55000|
|     5|    Jack Chan| 40| 60000|
|     6|    Jill Wong| 32| 52000|
|     7|James Johnson| 42| 70000|
|     9|      Tom Tan| 33| 58000|
|    11|   David Park| 38| 65000|
|    12|   Susan Chen| 31| 54000|
|    13|    Brian Kim| 45| 75000|
|    15|  Michael Lee| 37| 63000|
|    17|  George Wang| 34| 57000|
|    19|  Steven Chen| 36| 62000|
|    20|    Grace Kim| 32| 53000|
+------+-------------+---+------+



### Basic Transformations 2

In [21]:
# Preview the source DataFrame before the next set of transformations.
emp.show()

+-----------+-------------+-------------+---+------+------+-------------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|
+-----------+-------------+-------------+---+------+------+-------------------+
|          1|          101|     John Doe| 30|  Male| 50000|2015-01-01 00:00:00|
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|
|          4|          102|    Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|
|          5|          103|    Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01 00:00:00|
|          7|          101|James Johnson| 42|  Male| 70000|2012-03-15 00:00:00|
|          8|          102|     Kate Kim| 29|Female| 51000|2019-10-01 00:00:00|
|          9|          103|      Tom Tan| 33|  Male| 58000|2016-06-01 00:00:00|
|         10|          104|     Lisa Lee

In [23]:
# Column types before casting.
emp.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hire_date: timestamp (nullable = true)



In [24]:
from pyspark.sql.functions import col

# Casting is done with the Column.cast() method, so no separate `cast` import is needed.
emp.select("employee_id", "name", "age", col("salary").cast("double")).printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: double (nullable = true)



In [26]:
# Adding new columns to the DataFrame

# NOTE: this rebinds `emp_casted` to a different projection of `emp`
# (employee_id, name, age and salary cast to double).
emp_casted = emp.select(
    "employee_id",
    "name",
    "age",
    col("salary").cast("double"),
)

# tax is 20% of the salary
emp_taxed = emp_casted.withColumn("tax", col("salary") * 0.2)
emp_taxed.show()

+-----------+-------------+---+-------+-------+
|employee_id|         name|age| salary|    tax|
+-----------+-------------+---+-------+-------+
|          1|     John Doe| 30|50000.0|10000.0|
|          2|   Jane Smith| 25|45000.0| 9000.0|
|          3|    Bob Brown| 35|55000.0|11000.0|
|          4|    Alice Lee| 28|48000.0| 9600.0|
|          5|    Jack Chan| 40|60000.0|12000.0|
|          6|    Jill Wong| 32|52000.0|10400.0|
|          7|James Johnson| 42|70000.0|14000.0|
|          8|     Kate Kim| 29|51000.0|10200.0|
|          9|      Tom Tan| 33|58000.0|11600.0|
|         10|     Lisa Lee| 27|47000.0| 9400.0|
|         11|   David Park| 38|65000.0|13000.0|
|         12|   Susan Chen| 31|54000.0|10800.0|
|         13|    Brian Kim| 45|75000.0|15000.0|
|         14|    Emily Lee| 26|46000.0| 9200.0|
|         15|  Michael Lee| 37|63000.0|12600.0|
|         16|  Kelly Zhang| 30|49000.0| 9800.0|
|         17|  George Wang| 34|57000.0|11400.0|
|         18|    Nancy Liu| 29|50000.0|1

In [28]:
# Literals: lit() wraps a Python constant in a Column so it can be added to every row
from pyspark.sql.functions import lit

emp_new_cols = (
    emp_taxed
    .withColumn("columnOne", lit(1))
    .withColumn("columnTwo", lit("two"))
)
emp_new_cols.show()

+-----------+-------------+---+-------+-------+---------+---------+
|employee_id|         name|age| salary|    tax|columnOne|columnTwo|
+-----------+-------------+---+-------+-------+---------+---------+
|          1|     John Doe| 30|50000.0|10000.0|        1|      two|
|          2|   Jane Smith| 25|45000.0| 9000.0|        1|      two|
|          3|    Bob Brown| 35|55000.0|11000.0|        1|      two|
|          4|    Alice Lee| 28|48000.0| 9600.0|        1|      two|
|          5|    Jack Chan| 40|60000.0|12000.0|        1|      two|
|          6|    Jill Wong| 32|52000.0|10400.0|        1|      two|
|          7|James Johnson| 42|70000.0|14000.0|        1|      two|
|          8|     Kate Kim| 29|51000.0|10200.0|        1|      two|
|          9|      Tom Tan| 33|58000.0|11600.0|        1|      two|
|         10|     Lisa Lee| 27|47000.0| 9400.0|        1|      two|
|         11|   David Park| 38|65000.0|13000.0|        1|      two|
|         12|   Susan Chen| 31|54000.0|10800.0| 

In [29]:
# Rename a column. The result is only displayed, not assigned, so `emp_new_cols` is unchanged.
emp_new_cols.withColumnRenamed("employee_id", "emp_id").show()

+------+-------------+---+-------+-------+---------+---------+
|emp_id|         name|age| salary|    tax|columnOne|columnTwo|
+------+-------------+---+-------+-------+---------+---------+
|     1|     John Doe| 30|50000.0|10000.0|        1|      two|
|     2|   Jane Smith| 25|45000.0| 9000.0|        1|      two|
|     3|    Bob Brown| 35|55000.0|11000.0|        1|      two|
|     4|    Alice Lee| 28|48000.0| 9600.0|        1|      two|
|     5|    Jack Chan| 40|60000.0|12000.0|        1|      two|
|     6|    Jill Wong| 32|52000.0|10400.0|        1|      two|
|     7|James Johnson| 42|70000.0|14000.0|        1|      two|
|     8|     Kate Kim| 29|51000.0|10200.0|        1|      two|
|     9|      Tom Tan| 33|58000.0|11600.0|        1|      two|
|    10|     Lisa Lee| 27|47000.0| 9400.0|        1|      two|
|    11|   David Park| 38|65000.0|13000.0|        1|      two|
|    12|   Susan Chen| 31|54000.0|10800.0|        1|      two|
|    13|    Brian Kim| 45|75000.0|15000.0|        1|   

In [30]:
# Dropping columns from the DataFrame
# (the result is only displayed, not assigned, so `emp_new_cols` keeps columnTwo)

emp_new_cols.drop("columnTwo").show()

+-----------+-------------+---+-------+-------+---------+
|employee_id|         name|age| salary|    tax|columnOne|
+-----------+-------------+---+-------+-------+---------+
|          1|     John Doe| 30|50000.0|10000.0|        1|
|          2|   Jane Smith| 25|45000.0| 9000.0|        1|
|          3|    Bob Brown| 35|55000.0|11000.0|        1|
|          4|    Alice Lee| 28|48000.0| 9600.0|        1|
|          5|    Jack Chan| 40|60000.0|12000.0|        1|
|          6|    Jill Wong| 32|52000.0|10400.0|        1|
|          7|James Johnson| 42|70000.0|14000.0|        1|
|          8|     Kate Kim| 29|51000.0|10200.0|        1|
|          9|      Tom Tan| 33|58000.0|11600.0|        1|
|         10|     Lisa Lee| 27|47000.0| 9400.0|        1|
|         11|   David Park| 38|65000.0|13000.0|        1|
|         12|   Susan Chen| 31|54000.0|10800.0|        1|
|         13|    Brian Kim| 45|75000.0|15000.0|        1|
|         14|    Emily Lee| 26|46000.0| 9200.0|        1|
|         15| 

In [31]:
# Display emp_new_cols again to check its columns after the rename/drop calls above.
emp_new_cols.show()

+-----------+-------------+---+-------+-------+---------+---------+
|employee_id|         name|age| salary|    tax|columnOne|columnTwo|
+-----------+-------------+---+-------+-------+---------+---------+
|          1|     John Doe| 30|50000.0|10000.0|        1|      two|
|          2|   Jane Smith| 25|45000.0| 9000.0|        1|      two|
|          3|    Bob Brown| 35|55000.0|11000.0|        1|      two|
|          4|    Alice Lee| 28|48000.0| 9600.0|        1|      two|
|          5|    Jack Chan| 40|60000.0|12000.0|        1|      two|
|          6|    Jill Wong| 32|52000.0|10400.0|        1|      two|
|          7|James Johnson| 42|70000.0|14000.0|        1|      two|
|          8|     Kate Kim| 29|51000.0|10200.0|        1|      two|
|          9|      Tom Tan| 33|58000.0|11600.0|        1|      two|
|         10|     Lisa Lee| 27|47000.0| 9400.0|        1|      two|
|         11|   David Park| 38|65000.0|13000.0|        1|      two|
|         12|   Susan Chen| 31|54000.0|10800.0| 

In [33]:
# Keep rows whose tax exceeds 10000 and show at most 5 of them

emp_taxed.filter("tax > 10000").limit(5).show()

+-----------+-------------+---+-------+-------+
|employee_id|         name|age| salary|    tax|
+-----------+-------------+---+-------+-------+
|          3|    Bob Brown| 35|55000.0|11000.0|
|          5|    Jack Chan| 40|60000.0|12000.0|
|          6|    Jill Wong| 32|52000.0|10400.0|
|          7|James Johnson| 42|70000.0|14000.0|
|          8|     Kate Kim| 29|51000.0|10200.0|
+-----------+-------------+---+-------+-------+



In [37]:
# Bonus: withColumns() adds several columns in one call from a {name: Column} mapping

columns = dict(
    tax=col('salary') * 0.2,
    bonus=col('salary') * 0.1,
)

emp_casted.withColumns(columns).show()

+-----------+-------------+---+-------+-------+------+
|employee_id|         name|age| salary|    tax| bonus|
+-----------+-------------+---+-------+-------+------+
|          1|     John Doe| 30|50000.0|10000.0|5000.0|
|          2|   Jane Smith| 25|45000.0| 9000.0|4500.0|
|          3|    Bob Brown| 35|55000.0|11000.0|5500.0|
|          4|    Alice Lee| 28|48000.0| 9600.0|4800.0|
|          5|    Jack Chan| 40|60000.0|12000.0|6000.0|
|          6|    Jill Wong| 32|52000.0|10400.0|5200.0|
|          7|James Johnson| 42|70000.0|14000.0|7000.0|
|          8|     Kate Kim| 29|51000.0|10200.0|5100.0|
|          9|      Tom Tan| 33|58000.0|11600.0|5800.0|
|         10|     Lisa Lee| 27|47000.0| 9400.0|4700.0|
|         11|   David Park| 38|65000.0|13000.0|6500.0|
|         12|   Susan Chen| 31|54000.0|10800.0|5400.0|
|         13|    Brian Kim| 45|75000.0|15000.0|7500.0|
|         14|    Emily Lee| 26|46000.0| 9200.0|4600.0|
|         15|  Michael Lee| 37|63000.0|12600.0|6300.0|
|         

### String and Dates

In [38]:
# getOrCreate() returns the already-running session when one exists.
# NOTE(review): the new appName may therefore not take effect here — confirm.
spark = SparkSession.builder.appName("String & Dates").master("local[*]").getOrCreate()

spark

In [45]:
# Add a "Case" column to the DataFrame based on conditions
from pyspark.sql.functions import when, col

# Chained when() calls are evaluated in order; otherwise() supplies the fallback value.
emp.withColumn(
    'new_gender',
    when(col('gender') == 'Male', 'M')
    .when(col('gender') == 'Female', 'F')
    .otherwise(None),
).show()

+-----------+-------------+-------------+---+------+------+-------------------+----------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|new_gender|
+-----------+-------------+-------------+---+------+------+-------------------+----------+
|          1|          101|     John Doe| 30|  Male| 50000|2015-01-01 00:00:00|         M|
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|         F|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|         M|
|          4|          102|    Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|         F|
|          5|          103|    Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|         M|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01 00:00:00|         F|
|          7|          101|James Johnson| 42|  Male| 70000|2012-03-15 00:00:00|         M|
|          8|          102|     Kate Kim| 29|Female| 51000|2019-10-01 00:00:00|         F|

In [46]:
# Replace in Strings: every match of the pattern "J" in `name` becomes "Z"

from pyspark.sql.functions import regexp_replace

emp.withColumn(
    "new_name",
    regexp_replace("name", "J", "Z"),
).show()

+-----------+-------------+-------------+---+------+------+-------------------+-------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|     new_name|
+-----------+-------------+-------------+---+------+------+-------------------+-------------+
|          1|          101|     John Doe| 30|  Male| 50000|2015-01-01 00:00:00|     Zohn Doe|
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|   Zane Smith|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|    Bob Brown|
|          4|          102|    Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|    Alice Lee|
|          5|          103|    Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|    Zack Chan|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01 00:00:00|    Zill Wong|
|          7|          101|James Johnson| 42|  Male| 70000|2012-03-15 00:00:00|Zames Zohnson|
|          8|          102|     Kate Kim| 29|Female| 51000|2

In [53]:
# Convert the Timestamp column to Date type

from pyspark.sql.functions import to_date, col

# `hire_date` was inferred as a timestamp (see the printed schema), not a string,
# so there is nothing to parse and no format pattern is needed.
emp.withColumn("hire_date", to_date(col("hire_date"))).printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hire_date: date (nullable = true)



In [55]:
# Add columns holding the current date and the current timestamp

from pyspark.sql.functions import current_date, current_timestamp

columns = dict(
    current_date=current_date(),
    current_timestamp=current_timestamp(),
)

# truncate=False prints the full timestamp instead of cutting long values.
emp.withColumns(columns).show(truncate=False)

+-----------+-------------+-------------+---+------+------+-------------------+------------+--------------------------+
|employee_id|department_id|name         |age|gender|salary|hire_date          |current_date|current_timestamp         |
+-----------+-------------+-------------+---+------+------+-------------------+------------+--------------------------+
|1          |101          |John Doe     |30 |Male  |50000 |2015-01-01 00:00:00|2025-04-30  |2025-04-30 02:52:20.972595|
|2          |101          |Jane Smith   |25 |Female|45000 |2016-02-15 00:00:00|2025-04-30  |2025-04-30 02:52:20.972595|
|3          |102          |Bob Brown    |35 |Male  |55000 |2014-05-01 00:00:00|2025-04-30  |2025-04-30 02:52:20.972595|
|4          |102          |Alice Lee    |28 |Female|48000 |2017-09-30 00:00:00|2025-04-30  |2025-04-30 02:52:20.972595|
|5          |103          |Jack Chan    |40 |Male  |60000 |2013-04-01 00:00:00|2025-04-30  |2025-04-30 02:52:20.972595|
|6          |103          |Jill Wong    

In [59]:
# Drop rows that contain a null value

# Blank out one employee's gender so that there is a null to drop.
temp = emp.withColumn(
    "gender",
    when(col("name") == "Nancy Liu", None).otherwise(col("gender")),
)
temp.dropna().show()

del temp

+-----------+-------------+-------------+---+------+------+-------------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|
+-----------+-------------+-------------+---+------+------+-------------------+
|          1|          101|     John Doe| 30|  Male| 50000|2015-01-01 00:00:00|
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|
|          4|          102|    Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|
|          5|          103|    Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01 00:00:00|
|          7|          101|James Johnson| 42|  Male| 70000|2012-03-15 00:00:00|
|          8|          102|     Kate Kim| 29|Female| 51000|2019-10-01 00:00:00|
|          9|          103|      Tom Tan| 33|  Male| 58000|2016-06-01 00:00:00|
|         10|          104|     Lisa Lee

In [60]:
# Replace null values with a default using coalesce (first non-null argument wins)
from pyspark.sql.functions import coalesce, lit

# Blank out one employee's gender so that there is a null to fill.
temp = emp.withColumn(
    "gender",
    when(col("name") == "Nancy Liu", None).otherwise(col("gender")),
)

temp.withColumn(
    "gender",
    coalesce(col("gender"), lit("Unknown")),
).show()

+-----------+-------------+-------------+---+-------+------+-------------------+
|employee_id|department_id|         name|age| gender|salary|          hire_date|
+-----------+-------------+-------------+---+-------+------+-------------------+
|          1|          101|     John Doe| 30|   Male| 50000|2015-01-01 00:00:00|
|          2|          101|   Jane Smith| 25| Female| 45000|2016-02-15 00:00:00|
|          3|          102|    Bob Brown| 35|   Male| 55000|2014-05-01 00:00:00|
|          4|          102|    Alice Lee| 28| Female| 48000|2017-09-30 00:00:00|
|          5|          103|    Jack Chan| 40|   Male| 60000|2013-04-01 00:00:00|
|          6|          103|    Jill Wong| 32| Female| 52000|2018-07-01 00:00:00|
|          7|          101|James Johnson| 42|   Male| 70000|2012-03-15 00:00:00|
|          8|          102|     Kate Kim| 29| Female| 51000|2019-10-01 00:00:00|
|          9|          103|      Tom Tan| 33|   Male| 58000|2016-06-01 00:00:00|
|         10|          104| 

In [61]:
# Format a date/timestamp as a string; the "yyyy" pattern keeps only the year
from pyspark.sql.functions import date_format

emp.withColumn(
    "hire_year",
    date_format(col("hire_date"), "yyyy"),
).show()

+-----------+-------------+-------------+---+------+------+-------------------+---------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|hire_year|
+-----------+-------------+-------------+---+------+------+-------------------+---------+
|          1|          101|     John Doe| 30|  Male| 50000|2015-01-01 00:00:00|     2015|
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|     2016|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|     2014|
|          4|          102|    Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|     2017|
|          5|          103|    Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|     2013|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01 00:00:00|     2018|
|          7|          101|James Johnson| 42|  Male| 70000|2012-03-15 00:00:00|     2012|
|          8|          102|     Kate Kim| 29|Female| 51000|2019-10-01 00:00:00|     2019|
|         

### Sort, Union & Aggregation

In [70]:
# Cast every column to string, keeping the original column names

emp_str = emp.select(*(col(name).cast("string") for name in emp.columns))
emp_str.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)



In [75]:
# Split the dataframe into 2 parts (employee ids up to 10, and above 10)
from pyspark.sql.functions import col

# Filter on emp_str's own column instead of a column object taken from `emp`.
# employee_id is a string in emp_str, so cast it to int to compare numerically.
emp_str1 = emp_str.filter(col("employee_id").cast("int") < 11)
emp_str2 = emp_str.filter(col("employee_id").cast("int") > 10)

In [None]:
# Union: DataFrame.union() behaves like SQL UNION ALL, so duplicates are kept (chain .distinct() to remove them)
# ! Columns are matched by position, so both DataFrames must list them in the same order with compatible types
# ? unionByName() matches columns by name instead, for when the column order differs

emp_str2.union(emp_str1).show()

+-----------+-------------+-------------+---+------+------+-------------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|
+-----------+-------------+-------------+---+------+------+-------------------+
|         11|          104|   David Park| 38|  Male| 65000|2015-11-01 00:00:00|
|         12|          105|   Susan Chen| 31|Female| 54000|2017-02-15 00:00:00|
|         13|          106|    Brian Kim| 45|  Male| 75000|2011-07-01 00:00:00|
|         14|          107|    Emily Lee| 26|Female| 46000|2019-01-01 00:00:00|
|         15|          106|  Michael Lee| 37|  Male| 63000|2014-09-30 00:00:00|
|         16|          107|  Kelly Zhang| 30|Female| 49000|2018-04-01 00:00:00|
|         17|          105|  George Wang| 34|  Male| 57000|2016-03-15 00:00:00|
|         18|          104|    Nancy Liu| 29|Female| 50000|2017-06-01 00:00:00|
|         19|          103|  Steven Chen| 36|  Male| 62000|2015-08-01 00:00:00|
|         20|          102|    Grace Kim

In [85]:
# Sorting the dataframe

from pyspark.sql.functions import asc, desc

# Five highest salaries first.
emp.sort(desc("salary")).show(5)

# Five earliest hire dates first.
emp.sort(asc("hire_date")).show(5)

+-----------+-------------+-------------+---+------+------+-------------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|
+-----------+-------------+-------------+---+------+------+-------------------+
|         13|          106|    Brian Kim| 45|  Male| 75000|2011-07-01 00:00:00|
|          7|          101|James Johnson| 42|  Male| 70000|2012-03-15 00:00:00|
|         11|          104|   David Park| 38|  Male| 65000|2015-11-01 00:00:00|
|         15|          106|  Michael Lee| 37|  Male| 63000|2014-09-30 00:00:00|
|         19|          103|  Steven Chen| 36|  Male| 62000|2015-08-01 00:00:00|
+-----------+-------------+-------------+---+------+------+-------------------+
only showing top 5 rows

+-----------+-------------+-------------+---+------+------+-------------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|
+-----------+-------------+-------------+---+------+------+-------------------+
|         13|  

In [92]:
# Aggregation functions
# Use a module alias for sum: `from pyspark.sql.functions import sum, max, min`
# would replace Python's built-in sum/max/min for every later cell.
from pyspark.sql import functions as F
from pyspark.sql.functions import count, avg  # these names do not clash with built-ins

# Head count and total pay per department, largest payroll first.
(
    emp.groupBy("department_id")
    .agg(
        count("employee_id").alias("dept_count"),
        F.sum("salary").alias("dept_pay"),
    )
    .orderBy(F.desc("dept_pay"))
    .show()
)


+-------------+----------+--------+
|department_id|dept_count|dept_pay|
+-------------+----------+--------+
|          103|         4|  232000|
|          102|         4|  207000|
|          101|         3|  165000|
|          104|         3|  162000|
|          106|         2|  138000|
|          105|         2|  111000|
|          107|         2|   95000|
+-------------+----------+--------+



In [93]:
# Departments whose average salary exceeds 50000; the filter runs on the aggregated result.
(
    emp.groupBy("department_id")
    .agg(avg("salary").alias("avg_dept_salary"))
    .filter(col("avg_dept_salary") > 50000)
    .show()
)

+-------------+---------------+
|department_id|avg_dept_salary|
+-------------+---------------+
|          101|        55000.0|
|          103|        58000.0|
|          102|        51750.0|
|          105|        55500.0|
|          106|        69000.0|
|          104|        54000.0|
+-------------+---------------+



### Unique Data and Window

In [95]:
# Get unique data from the dataframe
# distinct() compares entire rows, i.e. all columns together.

emp.distinct().show()

+-----------+-------------+-------------+---+------+------+-------------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|
+-----------+-------------+-------------+---+------+------+-------------------+
|         10|          104|     Lisa Lee| 27|Female| 47000|2018-08-01 00:00:00|
|         11|          104|   David Park| 38|  Male| 65000|2015-11-01 00:00:00|
|         13|          106|    Brian Kim| 45|  Male| 75000|2011-07-01 00:00:00|
|         16|          107|  Kelly Zhang| 30|Female| 49000|2018-04-01 00:00:00|
|         20|          102|    Grace Kim| 32|Female| 53000|2018-11-01 00:00:00|
|          8|          102|     Kate Kim| 29|Female| 51000|2019-10-01 00:00:00|
|         19|          103|  Steven Chen| 36|  Male| 62000|2015-08-01 00:00:00|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01 00:00:00|
|         12|          105|   Susan Chen| 31|Female| 54000|2017-02-15 00:00:00|
|          2|          101|   Jane Smith

In [104]:
# Unique department ids: project the single column first, then de-duplicate.
emp.select('department_id').distinct().show()

+-------------+
|department_id|
+-------------+
|          101|
|          103|
|          107|
|          102|
|          105|
|          106|
|          104|
+-------------+



In [105]:
# Window functions
# ? Allows to compute values based on a "window" of rows without collapsing them into a single row, unlike groupBy()

from pyspark.sql.window import Window
from pyspark.sql import functions as F  # F.max avoids shadowing Python's built-in max
from pyspark.sql.functions import col, desc

# An ordered window uses a running frame by default (partition start up to the
# current row). Ordering by salary descending puts the top salary first, so the
# running max equals the department max on every row.
window_spec = Window.partitionBy("department_id").orderBy(desc("salary"))
max_func = F.max(col("salary")).over(window_spec)

emp.withColumn("max_dept_salary", max_func).show()

+-----------+-------------+-------------+---+------+------+-------------------+---------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|max_dept_salary|
+-----------+-------------+-------------+---+------+------+-------------------+---------------+
|          7|          101|James Johnson| 42|  Male| 70000|2012-03-15 00:00:00|          70000|
|          1|          101|     John Doe| 30|  Male| 50000|2015-01-01 00:00:00|          70000|
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|          70000|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|          55000|
|         20|          102|    Grace Kim| 32|Female| 53000|2018-11-01 00:00:00|          55000|
|          8|          102|     Kate Kim| 29|Female| 51000|2019-10-01 00:00:00|          55000|
|          4|          102|    Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|          55000|
|         19|          103|  Steven Chen

In [107]:
# Get the 2nd highest salary in each department

from pyspark.sql.functions import dense_rank, col

# dense_rank() leaves no gaps after ties, so rank 2 is always the second-highest
# distinct salary; rank() would skip 2 when two employees share the top salary.
rank_spec = Window.partitionBy("department_id").orderBy(desc("salary"))
rank_func = dense_rank().over(rank_spec)

emp.withColumn("rank", rank_func).filter(col("rank") == 2).show()

+-----------+-------------+-----------+---+------+------+-------------------+----+
|employee_id|department_id|       name|age|gender|salary|          hire_date|rank|
+-----------+-------------+-----------+---+------+------+-------------------+----+
|          1|          101|   John Doe| 30|  Male| 50000|2015-01-01 00:00:00|   2|
|         20|          102|  Grace Kim| 32|Female| 53000|2018-11-01 00:00:00|   2|
|          5|          103|  Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|   2|
|         18|          104|  Nancy Liu| 29|Female| 50000|2017-06-01 00:00:00|   2|
|         12|          105| Susan Chen| 31|Female| 54000|2017-02-15 00:00:00|   2|
|         15|          106|Michael Lee| 37|  Male| 63000|2014-09-30 00:00:00|   2|
|         14|          107|  Emily Lee| 26|Female| 46000|2019-01-01 00:00:00|   2|
+-----------+-------------+-----------+---+------+------+-------------------+----+

