In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [2]:
from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder 
    .appName("OptimizedLocalSpark") 
    .config("spark.driver.memory", "8g")        
    .config("spark.executor.memory", "8g")    
    .config("spark.executor.cores", "4")       
    .config("spark.cores.max", "12")           
    .config("spark.sql.shuffle.partitions", "28")  
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
    .getOrCreate()
)


In [5]:

from pyspark.sql import Row

# Sample employee data
data = [
    Row(EmployeeID=1, EmployeeName="Alice", Department="HR", Salary=5000, ManagerID=None),
    Row(EmployeeID=2, EmployeeName="Bob", Department="HR", Salary=4500, ManagerID=1),
    Row(EmployeeID=3, EmployeeName="Charlie", Department="HR", Salary=5500, ManagerID=1),
    Row(EmployeeID=4, EmployeeName="David", Department="IT", Salary=6000, ManagerID=None),
    Row(EmployeeID=5, EmployeeName="Eve", Department="IT", Salary=6500, ManagerID=4),
    Row(EmployeeID=6, EmployeeName="Frank", Department="IT", Salary=5800, ManagerID=4),
    Row(EmployeeID=7, EmployeeName="Grace", Department="Finance", Salary=5200, ManagerID=None),
    Row(EmployeeID=8, EmployeeName="Heidi", Department="Finance", Salary=4800, ManagerID=7),
    Row(EmployeeID=9, EmployeeName="Ivan", Department="Finance", Salary=5300, ManagerID=7),
    Row(EmployeeID=10, EmployeeName="Judy", Department="Sales", Salary=4000, ManagerID=None),
    Row(EmployeeID=11, EmployeeName="Kevin", Department="Sales", Salary=4200, ManagerID=10),
    Row(EmployeeID=12, EmployeeName="Laura", Department="Sales", Salary=4500, ManagerID=10),
    Row(EmployeeID=13, EmployeeName="Mallory", Department="Marketing", Salary=4900, ManagerID=None),
    Row(EmployeeID=14, EmployeeName="Niaj", Department="Marketing", Salary=5000, ManagerID=13),
    Row(EmployeeID=15, EmployeeName="Oscar", Department="Marketing", Salary=4600, ManagerID=13)
]

# Create DataFrame
df = spark.createDataFrame(data)
df.cache()
df.createOrReplaceTempView("Employees")
# Display the DataFrame
df.show()

+----------+------------+----------+------+---------+
|EmployeeID|EmployeeName|Department|Salary|ManagerID|
+----------+------------+----------+------+---------+
|         1|       Alice|        HR|  5000|     null|
|         2|         Bob|        HR|  4500|        1|
|         3|     Charlie|        HR|  5500|        1|
|         4|       David|        IT|  6000|     null|
|         5|         Eve|        IT|  6500|        4|
|         6|       Frank|        IT|  5800|        4|
|         7|       Grace|   Finance|  5200|     null|
|         8|       Heidi|   Finance|  4800|        7|
|         9|        Ivan|   Finance|  5300|        7|
|        10|        Judy|     Sales|  4000|     null|
|        11|       Kevin|     Sales|  4200|       10|
|        12|       Laura|     Sales|  4500|       10|
|        13|     Mallory| Marketing|  4900|     null|
|        14|        Niaj| Marketing|  5000|       13|
|        15|       Oscar| Marketing|  4600|       13|
+----------+------------+---

# Spark SQL

In [7]:
res = spark.sql(""" 
    Select e.EmployeeID,  e.EmployeeName, m.EmployeeName as Manager_Name,   e. Department ,  e. Salary ,  m. ManagerID from Employees e 
    join Employees m 
    on e.ManagerID = m.EmployeeID
     and e.Salary > m.Salary  
                """)
res.show()

+----------+------------+------------+----------+------+---------+
|EmployeeID|EmployeeName|Manager_Name|Department|Salary|ManagerID|
+----------+------------+------------+----------+------+---------+
|         3|     Charlie|       Alice|        HR|  5500|     null|
|         5|         Eve|       David|        IT|  6500|     null|
|         9|        Ivan|       Grace|   Finance|  5300|     null|
|        11|       Kevin|        Judy|     Sales|  4200|     null|
|        12|       Laura|        Judy|     Sales|  4500|     null|
|        14|        Niaj|     Mallory| Marketing|  5000|     null|
+----------+------------+------------+----------+------+---------+



In [15]:
res5 = spark.sql("""  

SELECT emp.EmployeeID, emp.EmployeeName, emp.Department, emp.Salary AS EmployeeSalary,
       mgr.EmployeeName AS ManagerName, mgr.Salary AS ManagerSalary
FROM Employees emp
INNER JOIN Employees mgr 
ON emp.ManagerID = mgr.EmployeeID
WHERE emp.Salary > mgr.Salary
            
                 """)
res5.show()
                 

+----------+------------+----------+--------------+-----------+-------------+
|EmployeeID|EmployeeName|Department|EmployeeSalary|ManagerName|ManagerSalary|
+----------+------------+----------+--------------+-----------+-------------+
|         3|     Charlie|        HR|          5500|      Alice|         5000|
|         5|         Eve|        IT|          6500|      David|         6000|
|         9|        Ivan|   Finance|          5300|      Grace|         5200|
|        11|       Kevin|     Sales|          4200|       Judy|         4000|
|        12|       Laura|     Sales|          4500|       Judy|         4000|
|        14|        Niaj| Marketing|          5000|    Mallory|         4900|
+----------+------------+----------+--------------+-----------+-------------+



In [16]:
# SQL Query with CTE to find employees with a higher salary than their manager
advanced_query = """
WITH ManagerSalaries AS (
    SELECT EmployeeID AS ManagerID, Salary AS ManagerSalary
    FROM Employees
)


SELECT e.EmployeeID, e.EmployeeName, e.Department, e.Salary AS EmployeeSalary,
       m.ManagerSalary
FROM Employees e
LEFT JOIN ManagerSalaries m 
ON e.ManagerID = m.ManagerID
WHERE e.Salary > m.ManagerSalary
"""

# Execute the query
advanced_sql_result = spark.sql(advanced_query)
advanced_sql_result.show()


+----------+------------+----------+--------------+-------------+
|EmployeeID|EmployeeName|Department|EmployeeSalary|ManagerSalary|
+----------+------------+----------+--------------+-------------+
|         3|     Charlie|        HR|          5500|         5000|
|         5|         Eve|        IT|          6500|         6000|
|         9|        Ivan|   Finance|          5300|         5200|
|        11|       Kevin|     Sales|          4200|         4000|
|        12|       Laura|     Sales|          4500|         4000|
|        14|        Niaj| Marketing|          5000|         4900|
+----------+------------+----------+--------------+-------------+



# Pyspark 

In [18]:
from pyspark.sql.functions import col

# Step 1: Self-join the DataFrame to match employees with their managers
employee_manager_df = df.alias("emp").join(
    df.alias("mgr"),
    col("emp.ManagerID") == col("mgr.EmployeeID"),
    "inner"
).select(
    col("emp.EmployeeID").alias("EmployeeID"),
    col("emp.EmployeeName").alias("EmployeeName"),
    col("emp.Department").alias("Department"),
    col("emp.Salary").alias("EmployeeSalary"),
    col("mgr.EmployeeName").alias("ManagerName"),
    col("mgr.Salary").alias("ManagerSalary")
)

# Step 2: Filter to find employees with a salary greater than their manager's salary
result_df = employee_manager_df.filter(col("EmployeeSalary") > col("ManagerSalary"))

# Show the result
result_df.show()


+----------+------------+----------+--------------+-----------+-------------+
|EmployeeID|EmployeeName|Department|EmployeeSalary|ManagerName|ManagerSalary|
+----------+------------+----------+--------------+-----------+-------------+
|         3|     Charlie|        HR|          5500|      Alice|         5000|
|         5|         Eve|        IT|          6500|      David|         6000|
|         9|        Ivan|   Finance|          5300|      Grace|         5200|
|        11|       Kevin|     Sales|          4200|       Judy|         4000|
|        12|       Laura|     Sales|          4500|       Judy|         4000|
|        14|        Niaj| Marketing|          5000|    Mallory|         4900|
+----------+------------+----------+--------------+-----------+-------------+

