In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [10]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, lit

# Initialize Spark
spark = SparkSession.builder.appName("EmployeeHierarchy").getOrCreate()

# Sample Employee Data
data = [
    (1, 'John', None),  # CEO
    (2, 'Alice', 1),    # Manager reporting to John
    (3, 'Bob', 1),      # Manager reporting to John
    (4, 'David', 2),    # Employee reporting to Alice
    (5, 'Eva', 2),      # Employee reporting to Alice
    (6, 'Charlie', 3),  # Employee reporting to Bob
    (7, 'Mike', 3)      # Employee reporting to Bob
]

columns = ["emp_id", "emp_name", "manager_id"]

# Create DataFrame
df = spark.createDataFrame(data, columns)


df.createOrReplaceTempView("employee")


df.show()



+------+--------+----------+
|emp_id|emp_name|manager_id|
+------+--------+----------+
|     1|    John|      null|
|     2|   Alice|         1|
|     3|     Bob|         1|
|     4|   David|         2|
|     5|     Eva|         2|
|     6| Charlie|         3|
|     7|    Mike|         3|
+------+--------+----------+



In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, concat, when

# Initialize Spark Session
spark = SparkSession.builder.appName("EmployeeHierarchy").getOrCreate()

# Sample Data: Employee - Manager Table
data = [
    (1, "Alice", None),
    (2, "Bob", 1),
    (3, "Charlie", 1),
    (4, "David", 2),
    (5, "Eva", 2),
    (6, "Frank", 3)
]

# Convert to DataFrame and create a temp view
df = spark.createDataFrame(data, ["emp_id", "emp_name", "mgr_id"])
df.createOrReplaceTempView("employee")

# Start with the top-level employee (e.g., CEO) who has no manager (mgr_id is NULL)
hierarchy_df = df.filter(col("mgr_id").isNull()) \
    .withColumn("hierarchy_path", col("emp_name"))

# Iteratively join to find next levels in the hierarchy
for i in range(5):  # Assuming a maximum of 5 levels deep, adjust as needed
    hierarchy_df = hierarchy_df.alias("h").join(
        df.alias("e"), col("h.emp_id") == col("e.mgr_id"), "left"
    ).select(
        col("e.emp_id"),
        col("e.emp_name"),
        col("e.mgr_id"),
        concat(col("h.hierarchy_path"), lit(" -> "), col("e.emp_name")).alias("hierarchy_path")
    ).where(col("e.emp_id").isNotNull()).union(hierarchy_df)

# Show the result
hierarchy_df.show(truncate=False)



+------+--------+------+-------------------------+
|emp_id|emp_name|mgr_id|hierarchy_path           |
+------+--------+------+-------------------------+
|2     |Bob     |1     |Alice -> Bob             |
|3     |Charlie |1     |Alice -> Charlie         |
|4     |David   |2     |Alice -> Bob -> David    |
|5     |Eva     |2     |Alice -> Bob -> Eva      |
|4     |David   |2     |Alice -> Bob -> David    |
|5     |Eva     |2     |Alice -> Bob -> Eva      |
|4     |David   |2     |Alice -> Bob -> David    |
|5     |Eva     |2     |Alice -> Bob -> Eva      |
|4     |David   |2     |Alice -> Bob -> David    |
|5     |Eva     |2     |Alice -> Bob -> Eva      |
|6     |Frank   |3     |Alice -> Charlie -> Frank|
|6     |Frank   |3     |Alice -> Charlie -> Frank|
|6     |Frank   |3     |Alice -> Charlie -> Frank|
|6     |Frank   |3     |Alice -> Charlie -> Frank|
|2     |Bob     |1     |Alice -> Bob             |
|3     |Charlie |1     |Alice -> Charlie         |
|4     |David   |2     |Alice -