In [1]:
# =====================================================
# LeetCode-Style Practice Dataset for PySpark
# Target: PySpark (Spark 3.x)
# Purpose: Same logical data as SQL Server dataset
# =====================================================

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Window

# Obtain the notebook's Spark session (getOrCreate returns the already
# active session when there is one instead of starting a second).
spark = (
    SparkSession.builder
    .appName("LeetCodeStyleSQLDataset")
    .getOrCreate()
)

# ---------------------
# Departments
# ---------------------
# DeptId is the required key; DeptName is nullable.
departments_schema = (
    StructType()
    .add("DeptId", IntegerType(), nullable=False)
    .add("DeptName", StringType(), nullable=True)
)

# (DeptId, DeptName)
departments_data = [(1, "IT"), (2, "HR"), (3, "Sales")]

Departments = spark.createDataFrame(data=departments_data, schema=departments_schema)

# ---------------------
# Employees
# ---------------------
# EmpId is the required key. DeptId and ManagerId are nullable so the data
# can contain employees with no department and employees with no manager.
employees_schema = (
    StructType()
    .add("EmpId", IntegerType(), nullable=False)
    .add("EmpName", StringType(), nullable=True)
    .add("Salary", IntegerType(), nullable=True)
    .add("DeptId", IntegerType(), nullable=True)
    .add("ManagerId", IntegerType(), nullable=True)
)

# (EmpId, EmpName, Salary, DeptId, ManagerId)
# ManagerId refers to another row's EmpId; None means "no manager".
employees_data = [
    (1, "Alice",   90000, 1,    None),
    (2, "Bob",     80000, 1,    1),
    (3, "Charlie", 80000, 1,    1),
    (4, "David",   60000, 2,    None),
    (5, "Eva",     70000, None, None),  # no department assigned
    (6, "Frank",   90000, 3,    None),
]

Employees = spark.createDataFrame(data=employees_data, schema=employees_schema)

# ---------------------
# Customers
# ---------------------
# CustomerId is the required key; CustomerName is nullable.
customers_schema = (
    StructType()
    .add("CustomerId", IntegerType(), nullable=False)
    .add("CustomerName", StringType(), nullable=True)
)

# (CustomerId, CustomerName)
customers_data = [(1, "John"), (2, "Jane"), (3, "Alex")]

Customers = spark.createDataFrame(data=customers_data, schema=customers_schema)

# ---------------------
# Products
# ---------------------
# ProductId is the required key; Price is stored as a whole number.
products_schema = (
    StructType()
    .add("ProductId", IntegerType(), nullable=False)
    .add("ProductName", StringType(), nullable=True)
    .add("Price", IntegerType(), nullable=True)
)

# (ProductId, ProductName, Price)
products_data = [(1, "Laptop", 1000), (2, "Phone", 500), (3, "Tablet", 300)]

Products = spark.createDataFrame(data=products_data, schema=products_schema)

# ---------------------
# Orders
# ---------------------
# OrderId is the required key. OrderDate is held as a yyyy-MM-dd string
# (StringType), not as a DateType column.
orders_schema = (
    StructType()
    .add("OrderId", IntegerType(), nullable=False)
    .add("CustomerId", IntegerType(), nullable=True)
    .add("ProductId", IntegerType(), nullable=True)
    .add("OrderDate", StringType(), nullable=True)
    .add("Quantity", IntegerType(), nullable=True)
)

# (OrderId, CustomerId, ProductId, OrderDate, Quantity)
orders_data = [
    (1, 1, 1, "2024-01-01", 1),
    (2, 1, 2, "2024-01-02", 2),
    (3, 2, 2, "2024-01-03", 1),
    (4, 2, 3, "2024-01-10", 3),
    (5, 3, 1, "2024-01-11", 1),
]

Orders = spark.createDataFrame(data=orders_data, schema=orders_schema)

# ---------------------
# Logs
# ---------------------
# Single nullable integer column; values repeat, so LogId is not a key here.
logs_schema = StructType().add("LogId", IntegerType(), nullable=True)

# One-element tuples, in the order the values are meant to appear.
logs_data = [(value,) for value in (1, 1, 2, 3, 3, 3, 5, 6)]

Logs = spark.createDataFrame(data=logs_data, schema=logs_schema)

# ---------------------
# Stadium
# ---------------------
# VisitDate is held as a yyyy-MM-dd string (StringType); People is the
# attendance figure for that date.
stadium_schema = (
    StructType()
    .add("VisitDate", StringType(), nullable=True)
    .add("People", IntegerType(), nullable=True)
)

# (VisitDate, People)
stadium_data = [
    ("2024-01-01", 10),
    ("2024-01-02", 120),
    ("2024-01-03", 130),
    ("2024-01-04", 140),
    ("2024-01-05", 20),
]

Stadium = spark.createDataFrame(data=stadium_data, schema=stadium_schema)

# ---------------------
# Users
# ---------------------
# UserId is the required key; Banned is a "Yes"/"No" string flag.
users_schema = (
    StructType()
    .add("UserId", IntegerType(), nullable=False)
    .add("Banned", StringType(), nullable=True)
)

# (UserId, Banned)
users_data = [(1, "No"), (2, "Yes"), (3, "No"), (4, "No")]

Users = spark.createDataFrame(data=users_data, schema=users_schema)

# ---------------------
# Trips
# ---------------------
# TripId is the required key. Status is a free-form string and RequestDate
# is held as a yyyy-MM-dd string (StringType).
trips_schema = (
    StructType()
    .add("TripId", IntegerType(), nullable=False)
    .add("ClientId", IntegerType(), nullable=True)
    .add("DriverId", IntegerType(), nullable=True)
    .add("Status", StringType(), nullable=True)
    .add("RequestDate", StringType(), nullable=True)
)

# (TripId, ClientId, DriverId, Status, RequestDate)
trips_data = [
    (1, 1, 3, "completed",           "2024-01-01"),
    (2, 2, 3, "cancelled_by_driver", "2024-01-01"),
    (3, 1, 4, "cancelled_by_client", "2024-01-02"),
    (4, 3, 4, "completed",           "2024-01-02"),
]

Trips = spark.createDataFrame(data=trips_data, schema=trips_schema)

# ---------------------
# Movies
# ---------------------
# MovieId is the required key; Title is nullable.
movies_schema = (
    StructType()
    .add("MovieId", IntegerType(), nullable=False)
    .add("Title", StringType(), nullable=True)
)

# (MovieId, Title)
movies_data = [(1, "Inception"), (2, "Interstellar"), (3, "Dunkirk")]

Movies = spark.createDataFrame(data=movies_data, schema=movies_schema)

# ---------------------
# MovieRatings
# ---------------------
# One row per (movie, user) rating; all three columns are nullable.
ratings_schema = (
    StructType()
    .add("MovieId", IntegerType(), nullable=True)
    .add("UserId", IntegerType(), nullable=True)
    .add("Rating", IntegerType(), nullable=True)
)

# (MovieId, UserId, Rating)
ratings_data = [
    (1, 1, 5),
    (1, 2, 4),
    (2, 1, 5),
    (2, 3, 5),
    (3, 2, 3),
]

MovieRatings = spark.createDataFrame(data=ratings_data, schema=ratings_schema)

# Register every DataFrame as a temp view under its own name so the data can
# also be queried through spark.sql(...). Without this step nothing is
# actually "registered", contrary to what the message below reports.
for view_name, frame in {
    "Departments": Departments,
    "Employees": Employees,
    "Customers": Customers,
    "Products": Products,
    "Orders": Orders,
    "Logs": Logs,
    "Stadium": Stadium,
    "Users": Users,
    "Trips": Trips,
    "Movies": Movies,
    "MovieRatings": MovieRatings,
}.items():
    # createOrReplaceTempView keeps the cell safe to re-run.
    frame.createOrReplaceTempView(view_name)

print("LeetCode-style PySpark datasets loaded and registered.")


LeetCode-style PySpark datasets loaded and registered.


In [2]:
# Preview the Employees rows used by the questions below.
Employees.show()

+-----+-------+------+------+---------+
|EmpId|EmpName|Salary|DeptId|ManagerId|
+-----+-------+------+------+---------+
|    1|  Alice| 90000|     1|     null|
|    2|    Bob| 80000|     1|        1|
|    3|Charlie| 80000|     1|        1|
|    4|  David| 60000|     2|     null|
|    5|    Eva| 70000|  null|     null|
|    6|  Frank| 90000|     3|     null|
+-----+-------+------+------+---------+



### Question 1 — Second Highest Salary (Tie-aware)

Using the `Employees` table, return the **second highest distinct salary**.

Constraints:

- If there is no second highest salary, return `NULL`.
- Duplicate salaries must not create false ranks.

Expected columns:

- `SecondHighestSalary`

In [3]:
# Rank the DISTINCT salaries from highest to lowest, so duplicate salaries
# cannot push the runner-up to a later rank.
window_spec = Window.orderBy(desc(col('Salary')))
emp = (
    Employees.select('Salary').distinct()
    .withColumn('rn', row_number().over(window_spec))
    .filter(col('rn') == 2)
    # Aggregating (rather than just renaming) always yields exactly one row:
    # the rank-2 salary when it exists, or NULL when fewer than two distinct
    # salaries are present. A plain filter would return an empty result in
    # that case instead of the required NULL.
    # `max` here is pyspark.sql.functions.max (brought in by the star import).
    .agg(max(col('Salary')).alias('SecondHighestSalary'))
)
emp.show()

+-------------------+
|SecondHighestSalary|
+-------------------+
|              80000|
+-------------------+



### Question 2 — Employees Earning More Than Their Manager

From the `Employees` table, list employees who earn **strictly more** than their manager.

Notes:

- Some employees have no manager.
- Managers are also employees.

Expected columns:

- `EmpName`

In [4]:
# Self-join: 'e1' plays the manager, 'e2' the direct report. Employees with
# a NULL ManagerId have no match and fall out of the inner join.
manager_side = Employees.alias('e1')
report_side = Employees.alias('e2')
is_manager_of = col('e1.EmpId') == col('e2.ManagerId')

emp = (
    manager_side.join(report_side, on=is_manager_of, how='inner')
    # Keep only reports paid strictly more than their manager.
    .filter(col('e2.Salary') > col('e1.Salary'))
    .select('e2.EmpName')
)
emp.show()

+-------+
|EmpName|
+-------+
+-------+



### Question 3 — Customers Who Never Ordered

Using `Customers` and `Orders`, find customers who **never placed an order**.

Constraints:

- Do not assume referential completeness.
- Avoid false positives caused by joins.

Expected columns:

- `CustomerName`

In [5]:
# Left join keeps every customer; a customer with no matching order gets
# NULLs on the order side, which the OrderId IS NULL check picks out.
same_customer = col('c.CustomerId') == col('o.CustomerId')

cust = (
    Customers.alias('c')
    .join(Orders.alias('o'), on=same_customer, how='left')
    .where(col('o.OrderId').isNull())
    .select('CustomerName')
)
cust.show()

+------------+
|CustomerName|
+------------+
+------------+



### Question 4 — Consecutive Numbers

Using the `Logs` table, find all numbers that appear **at least three times consecutively**.

Notes:

- Order is determined by the natural insertion order.
- Repeated values must be adjacent to count.

Expected columns:

- `ConsecutiveNums`

In [6]:
# Preview the Logs rows in their stored order.
Logs.show()

+-----+
|LogId|
+-----+
|    1|
|    1|
|    2|
|    3|
|    3|
|    3|
|    5|
|    6|
+-----+



In [7]:
# Logs has no ordering column, so capture the current row order first.
# Ordering the window by LogId itself would sort equal values next to each
# other and report numbers that were never adjacent in the input.
# NOTE(review): monotonically_increasing_id() follows the DataFrame's physical
# row order, which matches insertion order for this small local dataset;
# confirm before relying on it for data that has been shuffled or repartitioned.
by_row_order = Window.orderBy(col('row_order'))

log = (
    Logs
    .withColumn('row_order', monotonically_increasing_id())
    .withColumn('Lag', lag(col('LogId')).over(by_row_order))
    .withColumn('Lead', lead(col('LogId')).over(by_row_order))
    .drop('row_order')
)

# A row is the middle of a run of three when both neighbours carry the same
# value. distinct() collapses the extra matches produced by runs longer than
# three, and by a value that forms several separate runs.
(
    log
    .filter((col('LogId') == col('Lag')) & (col('Lead') == col('LogId')))
    .selectExpr('LogId as ConsecutiveNums')
    .distinct()
    .show()
)

+---------------+
|ConsecutiveNums|
+---------------+
|              3|
+---------------+



### Question 5 — High Attendance Periods

From the `Stadium` table, report all rows that belong to a period of **at least three consecutive days** where `People >= 100`.

Notes:

- All qualifying rows in the streak must be returned.
- Single-day spikes should be excluded.

Expected columns:

- `VisitDate`, `People`

In [8]:
# Preview the Stadium attendance rows.
Stadium.show()

+----------+------+
| VisitDate|People|
+----------+------+
|2024-01-01|    10|
|2024-01-02|   120|
|2024-01-03|   130|
|2024-01-04|   140|
|2024-01-05|    20|
+----------+------+



In [36]:
# Gaps-and-islands over the high-attendance days.
# 1. Keep only days with People >= 100 and parse the string date.
# 2. Number those days in date order. Within a run of consecutive calendar
#    days, (day number since epoch - row number) is constant, so it serves
#    as a streak id; any gap in the dates changes the value.
# 3. Keep the streaks that contain at least three days.
# Assumes at most one row per VisitDate.
busy_days = (
    Stadium
    .withColumn('visit_dt', to_date(col('VisitDate')))
    .filter(col('People') >= 100)
)

days_since_epoch = datediff(col('visit_dt'), to_date(lit('1970-01-01')))
streak_id = days_since_epoch - row_number().over(Window.orderBy('visit_dt'))

stad = (
    busy_days
    .withColumn('streak_id', streak_id)
    .withColumn('streak_len', count(lit(1)).over(Window.partitionBy('streak_id')))
    .filter(col('streak_len') >= 3)
    .orderBy('visit_dt')
    .select('VisitDate', 'People')
)
stad.show()

+----------+------+
| VisitDate|People|
+----------+------+
|2024-01-02|   120|
|2024-01-03|   130|
|2024-01-04|   140|
+----------+------+

