In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, Window as W
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Comprehensive PySpark Practice") \
    .getOrCreate()

Sample Data

In [3]:
employee_data = [
    (1, "John", "Sales", 50000, "2020-01-15", "NY", None, 100),
    (2, "Jane", "Engineering", None, "2021-02-20", "CA", "T2", 150),
    (3, "Bob", "Sales", 55000, "2019-03-10", None, "T1", 120),
    (4, "Alice", "Engineering", 65000, "2022-04-05", "CA", "T2", None),
    (5, "Charlie", None, 75000, "2018-05-01", "NY", "T3", 200)
]

sales_data = [
    (1, "2023-01-01", 100, "2023-Q1", "A"),
    (1, "2023-01-02", 150, "2023-Q1", "B"),
    (2, "2023-01-01", 200, "2023-Q1", "A"),
    (2, "2023-01-02", None, "2023-Q1", "C"),
    (3, "2023-01-03", 300, "2023-Q1", None)
]

Create DataFrames

In [4]:
df_emp = spark.createDataFrame(
    employee_data, 
    ["emp_id", "name", "dept", "salary", "hire_date", "state", "team", "score"]
)

df_sales = spark.createDataFrame(
    sales_data,
    ["emp_id", "sale_date", "amount", "quarter", "product"]
)

Problem 1: Handling Null Values
Problem

Clean employee data by handling missing values with appropriate defaults:

    Fill missing department values with "Unassigned".
    Replace missing salaries with the department's average salary.
    Assign "Unknown" to missing states.
    Default missing scores to 0.

Solution

Use coalesce, window functions for averages, and default literals for missing data.

In [30]:
# df_emp.show()

+------+-------+-----------+------+----------+-----+----+-----+
|emp_id|   name|       dept|salary| hire_date|state|team|score|
+------+-------+-----------+------+----------+-----+----+-----+
|     1|   John|      Sales| 50000|2020-01-15|   NY|NULL|  100|
|     2|   Jane|Engineering|  NULL|2021-02-20|   CA|  T2|  150|
|     3|    Bob|      Sales| 55000|2019-03-10| NULL|  T1|  120|
|     4|  Alice|Engineering| 65000|2022-04-05|   CA|  T2| NULL|
|     5|Charlie|       NULL| 75000|2018-05-01|   NY|  T3|  200|
+------+-------+-----------+------+----------+-----+----+-----+



In [14]:
dept_window = W.partitionBy("dept").orderBy("emp_id")

df_emp = (
    df_emp
    .withColumn(
        "dept", 
        F.coalesce("dept", F.lit("Unassigned"))
    )
    .withColumn(
        "salary", 
        F.coalesce(
            "salary", 
            F.avg("salary").over(dept_window))
    )
    .withColumn(
        "state", 
        F.coalesce("state", F.lit("Unknown"))
    )
    .withColumn(
        "score", 
        F.coalesce("score", F.lit(0))
    )
    .withColumn(
        "team", 
        F.coalesce("team", F.lit("Unknown"))
    )
)

In [15]:
df_emp.show()

+------+-------+-----------+-------+----------+-------+-------+-----+
|emp_id|   name|       dept| salary| hire_date|  state|   team|score|
+------+-------+-----------+-------+----------+-------+-------+-----+
|     2|   Jane|Engineering|65000.0|2021-02-20|     CA|     T2|  150|
|     4|  Alice|Engineering|65000.0|2022-04-05|     CA|     T2|    0|
|     1|   John|      Sales|50000.0|2020-01-15|     NY|Unknown|  100|
|     3|    Bob|      Sales|55000.0|2019-03-10|Unknown|     T1|  120|
|     5|Charlie| Unassigned|75000.0|2018-05-01|     NY|     T3|  200|
+------+-------+-----------+-------+----------+-------+-------+-----+



Problem 2: Window Functions
Problem

Calculate various metrics using window functions:

    Rank salaries within departments.
    Calculate running totals.
    Find differences with the next and previous salary.

Solution

Use rank, sum, lead, and lag functions within a defined window.

In [22]:
df_emp = (
    df_emp
    .withColumn("salary_rank", F.rank().over(dept_window))
    .withColumn("running_total", F.sum("salary").over(dept_window.rowsBetween(W.unboundedPreceding, W.currentRow)))
    .withColumn("next_salary", F.lead("salary", 1).over(dept_window))
    .withColumn("prev_salary", F.lag("salary", 1).over(dept_window))
)

In [23]:
df_emp.show()

+------+-------+-----------+-------+----------+-------+-------+-----+-----------+-------------+-----------+-----------+
|emp_id|   name|       dept| salary| hire_date|  state|   team|score|salary_rank|running_total|next_salary|prev_salary|
+------+-------+-----------+-------+----------+-------+-------+-----+-----------+-------------+-----------+-----------+
|     2|   Jane|Engineering|65000.0|2021-02-20|     CA|     T2|  150|          1|      65000.0|    65000.0|       NULL|
|     4|  Alice|Engineering|65000.0|2022-04-05|     CA|     T2|    0|          2|     130000.0|       NULL|    65000.0|
|     1|   John|      Sales|50000.0|2020-01-15|     NY|Unknown|  100|          1|      50000.0|    55000.0|       NULL|
|     3|    Bob|      Sales|55000.0|2019-03-10|Unknown|     T1|  120|          2|     105000.0|       NULL|    50000.0|
|     5|Charlie| Unassigned|75000.0|2018-05-01|     NY|     T3|  200|          1|      75000.0|       NULL|       NULL|
+------+-------+-----------+-------+----

In [24]:
df_sales.show()

+------+----------+------+-------+-------+
|emp_id| sale_date|amount|quarter|product|
+------+----------+------+-------+-------+
|     1|2023-01-01|   100|2023-Q1|      A|
|     1|2023-01-02|   150|2023-Q1|      B|
|     2|2023-01-01|   200|2023-Q1|      A|
|     2|2023-01-02|  NULL|2023-Q1|      C|
|     3|2023-01-03|   300|2023-Q1|   NULL|
+------+----------+------+-------+-------+



In [25]:
pivoted = df_sales.groupBy("emp_id").pivot("quarter").agg(F.sum("amount"))


In [26]:
pivoted.show()

+------+-------+
|emp_id|2023-Q1|
+------+-------+
|     1|    250|
|     3|    300|
|     2|    200|
+------+-------+

