In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window

In [2]:
print(pyspark.__version__)

3.5.5


In [3]:
spark = SparkSession.builder \
    .appName("Performance Reviews") \
    .master("local[*]") \
    .getOrCreate()

25/05/04 17:00:22 WARN Utils: Your hostname, Ameys-Mac-mini.local resolves to a loopback address: 127.0.0.1; using 192.168.1.12 instead (on interface en1)
25/05/04 17:00:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/04 17:00:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
employees_df = spark.read.csv("employees.csv", header=True, inferSchema=True)
performance_reviews_df = spark.read.csv("performance_reviews.csv", header=True, inferSchema=True)

In [5]:
employees_df.createOrReplaceTempView("employees")
performance_reviews_df.createOrReplaceTempView("performance_reviews")

## Data Exploration

### Show the schema and 5 sample records from each DataFrame.

In [6]:
employees_df.show(5)

+-----------+----------------+----------+---------+----------+
|employee_id|   employee_name|department| position| hire_date|
+-----------+----------------+----------+---------+----------+
|          1|Jennifer Estrada|     Sales|  Analyst|2022-11-05|
|          2|    Tammy Jordan| Marketing|  Manager|2022-11-29|
|          3|     Emily Walsh| Marketing|  Manager|2024-05-01|
|          4|      Cindy Hall| Marketing|     Lead|2023-03-06|
|          5|     Aaron Smith|     Sales|Associate|2021-06-21|
+-----------+----------------+----------+---------+----------+
only showing top 5 rows



In [7]:
performance_reviews_df.show(5)

+---------+-----------+-----------+-----+
|review_id|employee_id|review_date|score|
+---------+-----------+-----------+-----+
|        1|         33| 2024-10-20|    2|
|        2|         38| 2025-04-20|    5|
|        3|         29| 2023-06-09|    5|
|        4|         13| 2023-12-29|    1|
|        5|         27| 2023-12-10|    5|
+---------+-----------+-----------+-----+
only showing top 5 rows



In [8]:
employees_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- position: string (nullable = true)
 |-- hire_date: date (nullable = true)



In [9]:
performance_reviews_df.printSchema()

root
 |-- review_id: integer (nullable = true)
 |-- employee_id: integer (nullable = true)
 |-- review_date: date (nullable = true)
 |-- score: integer (nullable = true)



### How many performance reviews does each employee have?

In [10]:
spark.sql(
    """
    SELECT
        e.employee_id,
        count(r.employee_id) as review_count
    FROM employees e
    LEFT JOIN performance_reviews r
    USING(employee_id)
    GROUP BY 1
    ORDER BY 2 DESC
    """
).show(5)

+-----------+------------+
|employee_id|review_count|
+-----------+------------+
|         27|           8|
|          8|           7|
|         29|           7|
|         37|           6|
|         21|           6|
+-----------+------------+
only showing top 5 rows



In [11]:
employees_df.join(
    performance_reviews_df,
    on=["employee_id"],
    how="left"
).groupBy("employee_id") \
    .agg(count("employee_id").alias("review_count")) \
    .orderBy(desc("review_count")) \
    .show(5)

+-----------+------------+
|employee_id|review_count|
+-----------+------------+
|         27|           8|
|          8|           7|
|         29|           7|
|         37|           6|
|         21|           6|
+-----------+------------+
only showing top 5 rows



## Window Functions Practice

### Assign a rank to each review based on the review_date within each employee.

In [12]:
spark.sql(
    """
    SELECT
        employee_id,
        review_date,
        score,
        RANK() OVER (PARTITION BY employee_id ORDER BY review_date DESC) as rank
    FROM performance_reviews
    """
).show(5)

+-----------+-----------+-----+----+
|employee_id|review_date|score|rank|
+-----------+-----------+-----+----+
|          1| 2024-10-08|    4|   1|
|          1| 2024-07-01|    1|   2|
|          1| 2023-12-07|    3|   3|
|          2| 2024-07-12|    5|   1|
|          2| 2023-12-26|    1|   2|
+-----------+-----------+-----+----+
only showing top 5 rows



In [13]:
window_spec = Window.partitionBy("employee_id").orderBy(desc("review_date"))
performance_reviews_df.withColumn(
    "rank",
    rank().over(window_spec)
).select(
    "employee_id",
    "review_date",
    "score",
    "rank"
).show(5)

+-----------+-----------+-----+----+
|employee_id|review_date|score|rank|
+-----------+-----------+-----+----+
|          1| 2024-10-08|    4|   1|
|          1| 2024-07-01|    1|   2|
|          1| 2023-12-07|    3|   3|
|          2| 2024-07-12|    5|   1|
|          2| 2023-12-26|    1|   2|
+-----------+-----------+-----+----+
only showing top 5 rows



### Assign a row number to each review ordered by review_date per employee.

In [14]:
spark.sql(
    """
    SELECT
        employee_id,
        review_date,
        score,
        ROW_NUMBER() OVER (PARTITION BY employee_id ORDER BY review_date DESC) as row_number
    FROM performance_reviews
    """
).show(5)

+-----------+-----------+-----+----------+
|employee_id|review_date|score|row_number|
+-----------+-----------+-----+----------+
|          1| 2024-10-08|    4|         1|
|          1| 2024-07-01|    1|         2|
|          1| 2023-12-07|    3|         3|
|          2| 2024-07-12|    5|         1|
|          2| 2023-12-26|    1|         2|
+-----------+-----------+-----+----------+
only showing top 5 rows



In [15]:
window_spec = Window.partitionBy("employee_id").orderBy(desc("review_date"))
performance_reviews_df.withColumn(
    "row_number",
    row_number().over(window_spec)
).select(
    "employee_id",
    "review_date",
    "score",
    "row_number"
).show(5)

+-----------+-----------+-----+----------+
|employee_id|review_date|score|row_number|
+-----------+-----------+-----+----------+
|          1| 2024-10-08|    4|         1|
|          1| 2024-07-01|    1|         2|
|          1| 2023-12-07|    3|         3|
|          2| 2024-07-12|    5|         1|
|          2| 2023-12-26|    1|         2|
+-----------+-----------+-----+----------+
only showing top 5 rows



### Use a window function to calculate the average score for each employee across all reviews.

In [16]:
spark.sql(
    """
    SELECT
        employee_id,
        review_date,
        score,
        ROUND(AVG(score) OVER (PARTITION BY employee_id), 2) as avg_score
    FROM performance_reviews
    """
).show(5)

+-----------+-----------+-----+---------+
|employee_id|review_date|score|avg_score|
+-----------+-----------+-----+---------+
|          1| 2023-12-07|    3|     2.67|
|          1| 2024-07-01|    1|     2.67|
|          1| 2024-10-08|    4|     2.67|
|          2| 2023-12-26|    1|      3.0|
|          2| 2024-07-12|    5|      3.0|
+-----------+-----------+-----+---------+
only showing top 5 rows



In [17]:
window_spec = Window.partitionBy("employee_id")
performance_reviews_df.withColumn(
    "avg_score",
    round(avg("score").over(window_spec), 2)
).select(
    "employee_id",
    "review_date",
    "score",
    "avg_score"
).show(5)

+-----------+-----------+-----+---------+
|employee_id|review_date|score|avg_score|
+-----------+-----------+-----+---------+
|          1| 2023-12-07|    3|     2.67|
|          1| 2024-07-01|    1|     2.67|
|          1| 2024-10-08|    4|     2.67|
|          2| 2023-12-26|    1|      3.0|
|          2| 2024-07-12|    5|      3.0|
+-----------+-----------+-----+---------+
only showing top 5 rows



### Find the difference between each review's score and the employee’s average score using window functions.

In [22]:
spark.sql(
    """
    SELECT
        employee_id,
        review_date,
        score,
        round(AVG(score) OVER (PARTITION BY employee_id), 2) as avg_score,
        round(score - AVG (score) OVER (PARTITION BY employee_id), 2) as score_diff
    FROM performance_reviews
    """
).show(5)

+-----------+-----------+-----+---------+----------+
|employee_id|review_date|score|avg_score|score_diff|
+-----------+-----------+-----+---------+----------+
|          1| 2023-12-07|    3|     2.67|      0.33|
|          1| 2024-07-01|    1|     2.67|     -1.67|
|          1| 2024-10-08|    4|     2.67|      1.33|
|          2| 2023-12-26|    1|      3.0|      -2.0|
|          2| 2024-07-12|    5|      3.0|       2.0|
+-----------+-----------+-----+---------+----------+
only showing top 5 rows



In [23]:
window_spec = Window.partitionBy("employee_id")
performance_reviews_df.withColumn(
    "avg_score",
    round(avg("score").over(window_spec), 2)
).withColumn(
    "score_diff",
    round(col("score") - avg("score").over(window_spec), 2)
).select(
    "employee_id",
    "review_date",
    "score",
    "avg_score",
    "score_diff"
).show(5)

+-----------+-----------+-----+---------+----------+
|employee_id|review_date|score|avg_score|score_diff|
+-----------+-----------+-----+---------+----------+
|          1| 2023-12-07|    3|     2.67|      0.33|
|          1| 2024-07-01|    1|     2.67|     -1.67|
|          1| 2024-10-08|    4|     2.67|      1.33|
|          2| 2023-12-26|    1|      3.0|      -2.0|
|          2| 2024-07-12|    5|      3.0|       2.0|
+-----------+-----------+-----+---------+----------+
only showing top 5 rows



### Use LAG to get the previous review's score per employee and calculate the change in score.

In [24]:
spark.sql(
    """
    SELECT
        employee_id,
        review_date,
        score,
        LAG(score) OVER (PARTITION BY employee_id ORDER BY review_date) as previous_score,
        score - LAG(score) OVER (PARTITION BY employee_id ORDER BY review_date) as score_change
    FROM performance_reviews
    """
).show(5)

+-----------+-----------+-----+--------------+------------+
|employee_id|review_date|score|previous_score|score_change|
+-----------+-----------+-----+--------------+------------+
|          1| 2023-12-07|    3|          NULL|        NULL|
|          1| 2024-07-01|    1|             3|          -2|
|          1| 2024-10-08|    4|             1|           3|
|          2| 2023-12-26|    1|          NULL|        NULL|
|          2| 2024-07-12|    5|             1|           4|
+-----------+-----------+-----+--------------+------------+
only showing top 5 rows



In [25]:
window_spec = Window.partitionBy("employee_id").orderBy("review_date")
performance_reviews_df.withColumn(
    "previous_score",
    lag("score", 1).over(window_spec)
).withColumn(
    "score_change",
    col("score") - lag("score", 1).over(window_spec)
).select(
    "employee_id",
    "review_date",
    "score",
    "previous_score",
    "score_change"
).show(5)

+-----------+-----------+-----+--------------+------------+
|employee_id|review_date|score|previous_score|score_change|
+-----------+-----------+-----+--------------+------------+
|          1| 2023-12-07|    3|          NULL|        NULL|
|          1| 2024-07-01|    1|             3|          -2|
|          1| 2024-10-08|    4|             1|           3|
|          2| 2023-12-26|    1|          NULL|        NULL|
|          2| 2024-07-12|    5|             1|           4|
+-----------+-----------+-----+--------------+------------+
only showing top 5 rows



### Use LEAD to compare each review's score to the next one per employee.

In [26]:
spark.sql(
    """
    SELECT
        employee_id,
        review_date,
        score,
        LEAD(score) OVER (PARTITION BY employee_id ORDER BY review_date) as next_score
    FROM performance_reviews
    """
).show(5)

+-----------+-----------+-----+----------+
|employee_id|review_date|score|next_score|
+-----------+-----------+-----+----------+
|          1| 2023-12-07|    3|         1|
|          1| 2024-07-01|    1|         4|
|          1| 2024-10-08|    4|      NULL|
|          2| 2023-12-26|    1|         5|
|          2| 2024-07-12|    5|      NULL|
+-----------+-----------+-----+----------+
only showing top 5 rows



In [28]:
window_spec = Window.partitionBy("employee_id").orderBy("review_date")
performance_reviews_df.withColumn(
    "next_score",
    lead("score", 1).over(window_spec)
).select(
    "employee_id",
    "review_date",
    "score",
    "next_score"
).show(5)

+-----------+-----------+-----+----------+
|employee_id|review_date|score|next_score|
+-----------+-----------+-----+----------+
|          1| 2023-12-07|    3|         1|
|          1| 2024-07-01|    1|         4|
|          1| 2024-10-08|    4|      NULL|
|          2| 2023-12-26|    1|         5|
|          2| 2024-07-12|    5|      NULL|
+-----------+-----------+-----+----------+
only showing top 5 rows



### Get the most recent review per employee using ROW_NUMBER.

In [35]:
spark.sql(
    """
    with cte as (
        SELECT
            employee_id,
            score,
            review_date,
            ROW_NUMBER() OVER (PARTITION BY employee_id ORDER BY review_date DESC) as row_number
        FROM performance_reviews
    )
    
    SELECT 
        c.employee_id,
        e.employee_name,
        c.review_date as latest_review_date,
        c.score as latest_score 
    FROM cte c
    JOIN employees e
    USING(employee_id)
    WHERE row_number = 1
    """
).show(5)

+-----------+----------------+------------------+------------+
|employee_id|   employee_name|latest_review_date|latest_score|
+-----------+----------------+------------------+------------+
|          1|Jennifer Estrada|        2024-10-08|           4|
|          2|    Tammy Jordan|        2024-07-12|           5|
|          3|     Emily Walsh|        2024-03-02|           2|
|          4|      Cindy Hall|        2023-05-05|           2|
|          5|     Aaron Smith|        2023-10-10|           4|
+-----------+----------------+------------------+------------+
only showing top 5 rows



In [38]:
window_spec = Window.partitionBy("employee_id").orderBy(desc("review_date"))

windowed_df = performance_reviews_df.withColumn(
    "row_number",
    row_number().over(window_spec)
)

filtered_df = windowed_df.filter(
    col("row_number") == 1
)

employees_df.join(
    filtered_df,
    on=["employee_id"],
    how="inner"
).select(
    "employee_id",
    "employee_name",
    filtered_df["review_date"].alias("latest_review_date"),
    filtered_df["score"].alias("latest_review_score")
).show(5)

+-----------+----------------+------------------+-------------------+
|employee_id|   employee_name|latest_review_date|latest_review_score|
+-----------+----------------+------------------+-------------------+
|          1|Jennifer Estrada|        2024-10-08|                  4|
|          2|    Tammy Jordan|        2024-07-12|                  5|
|          3|     Emily Walsh|        2024-03-02|                  2|
|          4|      Cindy Hall|        2023-05-05|                  2|
|          5|     Aaron Smith|        2023-10-10|                  4|
+-----------+----------------+------------------+-------------------+
only showing top 5 rows



### Calculate a rolling average score over the last 3 reviews per employee.

In [41]:
spark.sql(
    """
    SELECT
        employee_id,
        review_date,
        score,
        ROUND(AVG(score) OVER (
            PARTITION BY employee_id 
            ORDER BY review_date 
            ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
        ), 2) as rolling_avg
    FROM performance_reviews
    """
).show()

+-----------+-----------+-----+-----------+
|employee_id|review_date|score|rolling_avg|
+-----------+-----------+-----+-----------+
|          1| 2023-12-07|    3|        3.0|
|          1| 2024-07-01|    1|        2.0|
|          1| 2024-10-08|    4|       2.67|
|          2| 2023-12-26|    1|        1.0|
|          2| 2024-07-12|    5|        3.0|
|          3| 2023-11-24|    4|        4.0|
|          3| 2024-01-26|    5|        4.5|
|          3| 2024-03-02|    2|       3.67|
|          4| 2023-05-05|    2|        2.0|
|          5| 2023-10-10|    4|        4.0|
|          6| 2023-06-24|    2|        2.0|
|          6| 2023-11-15|    3|        2.5|
|          6| 2024-12-20|    1|        2.0|
|          7| 2023-10-27|    2|        2.0|
|          7| 2024-05-22|    4|        3.0|
|          7| 2025-01-08|    3|        3.0|
|          8| 2023-06-18|    5|        5.0|
|          8| 2023-10-01|    4|        4.5|
|          8| 2024-02-10|    2|       3.67|
|          8| 2024-05-30|    1| 

In [42]:
window_spec = Window.partitionBy("employee_id").orderBy("review_date").rowsBetween(-2, 0)

performance_reviews_df.withColumn(
    "rolling_avg",
    round(avg("score").over(window_spec), 2)
).select(
    "employee_id",
    "review_date",
    "score",
    "rolling_avg"
).show(5)

+-----------+-----------+-----+-----------+
|employee_id|review_date|score|rolling_avg|
+-----------+-----------+-----+-----------+
|          1| 2023-12-07|    3|        3.0|
|          1| 2024-07-01|    1|        2.0|
|          1| 2024-10-08|    4|       2.67|
|          2| 2023-12-26|    1|        1.0|
|          2| 2024-07-12|    5|        3.0|
+-----------+-----------+-----+-----------+
only showing top 5 rows



### For each department, rank employees based on their latest review score (highest to lowest).

In [44]:
spark.sql(
    """
    with cte as (
        SELECT
            e.department,
            e.employee_id,
            r.score,
            ROW_NUMBER() OVER (PARTITION BY employee_id ORDER BY review_date DESC) as latest_review
        FROM employees e
        JOIN performance_reviews r
        USING(employee_id)
    )
    SELECT
        department,
        employee_id,
        score,
        RANK() OVER (PARTITION BY department ORDER BY score DESC) as rank
    FROM cte
    WHERE latest_review = 1
    """
).show()

+-----------+-----------+-----+----+
| department|employee_id|score|rank|
+-----------+-----------+-----+----+
|Engineering|         38|    5|   1|
|Engineering|         25|    4|   2|
|Engineering|          9|    3|   3|
|Engineering|         24|    3|   3|
|Engineering|         11|    2|   5|
|Engineering|         29|    2|   5|
|Engineering|         30|    2|   5|
|Engineering|         31|    2|   5|
|Engineering|          6|    1|   9|
|Engineering|         19|    1|   9|
|    Finance|         33|    4|   1|
|    Finance|         27|    2|   2|
|    Finance|          8|    1|   3|
|         HR|         17|    5|   1|
|         HR|         26|    5|   1|
|         HR|         37|    4|   3|
|         HR|          7|    3|   4|
|         HR|         16|    3|   4|
|         HR|         39|    3|   4|
|         HR|         13|    1|   7|
+-----------+-----------+-----+----+
only showing top 20 rows



In [None]:
window_spec_1 = Window.partitionBy("employee_id").orderBy(desc("review_date"))
latest_reviews_df = performance_reviews_df.withColumn(
    "latest_review",
    row_number().over(window_spec_1)
).filter(col("latest_review") == 1)

joined_df = latest_reviews_df.join(
    employees_df, 
    on="employee_id", 
    how="inner"
)

window_spec_2 = Window.partitionBy("department").orderBy(desc("score"))
final_df = joined_df.withColumn(
    "rank", 
    rank().over(window_spec_2)
)

final_df.select(
    "department", 
    "employee_id", 
    "score", 
    "rank"
).show()

+-----------+-----------+-----+----+
| department|employee_id|score|rank|
+-----------+-----------+-----+----+
|Engineering|         38|    5|   1|
|Engineering|         25|    4|   2|
|Engineering|          9|    3|   3|
|Engineering|         24|    3|   3|
|Engineering|         11|    2|   5|
|Engineering|         29|    2|   5|
|Engineering|         30|    2|   5|
|Engineering|         31|    2|   5|
|Engineering|          6|    1|   9|
|Engineering|         19|    1|   9|
|    Finance|         33|    4|   1|
|    Finance|         27|    2|   2|
|    Finance|          8|    1|   3|
|         HR|         17|    5|   1|
|         HR|         26|    5|   1|
|         HR|         37|    4|   3|
|         HR|          7|    3|   4|
|         HR|         16|    3|   4|
|         HR|         39|    3|   4|
|         HR|         13|    1|   7|
+-----------+-----------+-----+----+
only showing top 20 rows



### Use a window function to get cumulative review score per employee.

In [48]:
spark.sql(
    """
    SELECT
        employee_id,
        review_date,
        score,
        SUM(score) OVER (PARTITION BY employee_id) as cumulative_score
    FROM performance_reviews
    """
).show(5)

+-----------+-----------+-----+----------------+
|employee_id|review_date|score|cumulative_score|
+-----------+-----------+-----+----------------+
|          1| 2023-12-07|    3|               8|
|          1| 2024-07-01|    1|               8|
|          1| 2024-10-08|    4|               8|
|          2| 2023-12-26|    1|               6|
|          2| 2024-07-12|    5|               6|
+-----------+-----------+-----+----------------+
only showing top 5 rows



In [49]:
window_spec = Window.partitionBy("employee_id")
performance_reviews_df.withColumn(
    "cumulative_score",
    sum("score").over(window_spec)
).select(
    "employee_id",
    "review_date",
    "score",
    "cumulative_score"
).show(5)

+-----------+-----------+-----+----------------+
|employee_id|review_date|score|cumulative_score|
+-----------+-----------+-----+----------------+
|          1| 2023-12-07|    3|               8|
|          1| 2024-07-01|    1|               8|
|          1| 2024-10-08|    4|               8|
|          2| 2023-12-26|    1|               6|
|          2| 2024-07-12|    5|               6|
+-----------+-----------+-----+----------------+
only showing top 5 rows



### For each employee, calculate the number of days between each review and the previous one.

In [53]:
spark.sql(
    """
    SELECT
        employee_id,
        review_date,
        LAG(review_date) OVER (PARTITION BY employee_id ORDER BY review_date) as previous_date,
        DATEDIFF(review_date, LAG(review_date) OVER (PARTITION BY employee_id ORDER BY review_date)) as days_between
    FROM performance_reviews
    """
).show(5)

+-----------+-----------+-------------+------------+
|employee_id|review_date|previous_date|days_between|
+-----------+-----------+-------------+------------+
|          1| 2023-12-07|         NULL|        NULL|
|          1| 2024-07-01|   2023-12-07|         207|
|          1| 2024-10-08|   2024-07-01|          99|
|          2| 2023-12-26|         NULL|        NULL|
|          2| 2024-07-12|   2023-12-26|         199|
+-----------+-----------+-------------+------------+
only showing top 5 rows



In [54]:
window_spec = Window.partitionBy("employee_id").orderBy('review_date')
performance_reviews_df.withColumn(
    "previous_date",
    lag("review_date", 1).over(window_spec)
).withColumn(
    "days_between",
    datediff(col("review_date"), lag("review_date", 1).over(window_spec))
).select(
    "employee_id",
    "review_date",
    "previous_date",
    "days_between"
).show(5)

+-----------+-----------+-------------+------------+
|employee_id|review_date|previous_date|days_between|
+-----------+-----------+-------------+------------+
|          1| 2023-12-07|         NULL|        NULL|
|          1| 2024-07-01|   2023-12-07|         207|
|          1| 2024-10-08|   2024-07-01|          99|
|          2| 2023-12-26|         NULL|        NULL|
|          2| 2024-07-12|   2023-12-26|         199|
+-----------+-----------+-------------+------------+
only showing top 5 rows

