In [4]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [3]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("SelfJoinExamples").getOrCreate()

# Sample data for Spark DataFrame
data = [
    (1, 'Alice', 'Marketing', 60000),
    (2, 'Bob', 'Sales', 50000),
    (3, 'Charlie', 'Marketing', 60000),
    (4, 'David', 'Sales', 70000),
    (5, 'Eve', 'Marketing', 65000),
    (6, 'Frank', 'Sales', 50000)
]

columns = ["EmployeeID", "EmployeeName", "Department", "Salary"]

# Create DataFrame
df = spark.createDataFrame(data, columns)

# Create a temporary table to use in SQL
df.createOrReplaceTempView("employee_table")

# Show the table
df.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Alice| Marketing| 60000|
|         2|         Bob|     Sales| 50000|
|         3|     Charlie| Marketing| 60000|
|         4|       David|     Sales| 70000|
|         5|         Eve| Marketing| 65000|
|         6|       Frank|     Sales| 50000|
+----------+------------+----------+------+



----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 61068)
Traceback (most recent call last):
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 755, in __init__
    self.handle()
  File "C:\spark\python\pyspark\accumulators.py", line 281, in handle
    poll(accum_updates)
  File "C:\spark\python\pyspark\accumulators.py", line 253, in poll
    if func():
       ^^^^^^
  File "C:\spark\python\py

# Find Employees Earning the Same Salary Within the Same Department

In [10]:
#Find Employees Earning the Same Salary Within the Same Department
df.createOrReplaceTempView("employee_table")
 
res = spark.sql("""
WITH cte_e AS (
    SELECT 
        Department, 
        dense_rank() OVER (PARTITION BY Department ORDER BY Salary) AS rnk,
        EmployeeID 
    FROM employee_table
)
SELECT e.*, d.rnk
FROM employee_table e
INNER JOIN cte_e d 
ON e.EmployeeID = d.EmployeeID 
WHERE d.rnk = 1
""")

# Show the result
res.show()



+----------+------------+----------+------+---+
|EmployeeID|EmployeeName|Department|Salary|rnk|
+----------+------------+----------+------+---+
|         1|       Alice| Marketing| 60000|  1|
|         2|         Bob|     Sales| 50000|  1|
|         3|     Charlie| Marketing| 60000|  1|
|         6|       Frank|     Sales| 50000|  1|
+----------+------------+----------+------+---+



In [19]:
res2 = spark.sql("""

  SELECT e1.EmployeeName AS Employee1, e2.EmployeeName AS Employee2, e1.Salary
FROM employee_table e1
join employee_table e2
ON e1.Department = e2.Department AND e1.Salary = e2.Salary AND e1.EmployeeID <> e2.EmployeeID;

""")

# Show the result
res2.show()

+---------+---------+------+
|Employee1|Employee2|Salary|
+---------+---------+------+
|    Alice|  Charlie| 60000|
|  Charlie|    Alice| 60000|
|      Bob|    Frank| 50000|
|    Frank|      Bob| 50000|
+---------+---------+------+



In [20]:
# Detect Employees Who Have the Same Salary in Different Departments

res7 = spark.sql("""                     

SELECT e1.EmployeeName AS Employee1, e1.Department AS Department1, 
       e2.EmployeeName AS Employee2, e2.Department AS Department2, e1.Salary
FROM employee_table e1
JOIN employee_table e2 
ON e1.Salary = e2.Salary AND e1.EmployeeID < e2.EmployeeID;               
                 
                 """)
res7.show()

+---------+-----------+---------+-----------+------+
|Employee1|Department1|Employee2|Department2|Salary|
+---------+-----------+---------+-----------+------+
|      Bob|      Sales|    Frank|      Sales| 50000|
|    Alice|  Marketing|  Charlie|  Marketing| 60000|
+---------+-----------+---------+-----------+------+



# Identify Employees Who Earn More Than Others in the Same Department

In [14]:
# Identify Employees Who Earn More Than Others in the Same Department
res3 = spark.sql("""
SELECT e1.EmployeeName AS HigherEarner, e2.EmployeeName AS LowerEarner, e1.Salary - e2.Salary AS SalaryDifference
FROM employee_table e1
JOIN employee_table e2
ON e1.Department = e2.Department AND e1.Salary > e2.Salary;
""")
res3.show()


+------------+-----------+----------------+
|HigherEarner|LowerEarner|SalaryDifference|
+------------+-----------+----------------+
|         Eve|      Alice|            5000|
|         Eve|    Charlie|            5000|
|       David|        Bob|           20000|
|       David|      Frank|           20000|
+------------+-----------+----------------+



In [15]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("RealWorldSelfJoinExamples").getOrCreate()

# Sample data for Spark DataFrame
data1 = [
    (1, 'Alice', 'Sales', 50000, 3),
    (2, 'Bob', 'Sales', 60000, 3),
    (3, 'Charlie', 'Sales', 90000, None), # Charlie is the manager
    (4, 'David', 'Engineering', 75000, 6),
    (5, 'Eve', 'Engineering', 80000, 6),
    (6, 'Frank', 'Engineering', 100000, None), # Frank is the manager
]

columns1 = ["EmployeeID", "EmployeeName", "Department", "Salary", "ManagerID"]

# Create DataFrame and Table
df1 = spark.createDataFrame(data1, columns1)
df1.createOrReplaceTempView("employee_manager")
df1.show()


+----------+------------+-----------+------+---------+
|EmployeeID|EmployeeName| Department|Salary|ManagerID|
+----------+------------+-----------+------+---------+
|         1|       Alice|      Sales| 50000|        3|
|         2|         Bob|      Sales| 60000|        3|
|         3|     Charlie|      Sales| 90000|     null|
|         4|       David|Engineering| 75000|        6|
|         5|         Eve|Engineering| 80000|        6|
|         6|       Frank|Engineering|100000|     null|
+----------+------------+-----------+------+---------+



# Identify Employees and Their Managers

In [16]:
#Identify Employees and Their Managers
res5 = spark.sql("""                       
SELECT e1.EmployeeName AS Employee, e2.EmployeeName AS Manager
FROM employee_manager e1
LEFT JOIN employee_manager e2 ON e1.ManagerID = e2.EmployeeID;               
                 """)

res5.show()

res6 = spark.sql("""                       
SELECT e1.EmployeeName AS Employee, e2.EmployeeName AS Manager
FROM employee_manager e1
JOIN employee_manager e2 ON e1.ManagerID = e2.EmployeeID;               
                 """)
res6.show()


+--------+-------+
|Employee|Manager|
+--------+-------+
|   Alice|Charlie|
|     Bob|Charlie|
| Charlie|   null|
|   David|  Frank|
|     Eve|  Frank|
|   Frank|   null|
+--------+-------+

+--------+-------+
|Employee|Manager|
+--------+-------+
|   Alice|Charlie|
|     Bob|Charlie|
|   David|  Frank|
|     Eve|  Frank|
+--------+-------+



##### Compare and Identify Employees Who Earn More Than Their Managers

In [25]:
res11= spark.sql(""" 
                 
 SELECT e1.EmployeeName AS Employee, e1.Salary AS EmployeeSalary, 
       e2.EmployeeName AS Manager, e2.Salary AS ManagerSalary
FROM employee_manager e1
JOIN employee_manager e2 
ON e1.ManagerID = e2.EmployeeID 
WHERE e1.Salary > e2.Salary;
                
                 
                 """)
res11.show()


+--------+--------------+-------+-------------+
|Employee|EmployeeSalary|Manager|ManagerSalary|
+--------+--------------+-------+-------------+
+--------+--------------+-------+-------------+



# Find Consecutive Dates for an Employee’s Work Log

In [21]:
# Sample data for employee work logs
data3 = [
    (1, 'Alice', '2024-10-20'),
    (1, 'Alice', '2024-10-21'),
    (1, 'Alice', '2024-10-23'),
    (2, 'Bob', '2024-10-20'),
    (2, 'Bob', '2024-10-22'),
    (2, 'Bob', '2024-10-23'),
]

columns3 = ["EmployeeID", "EmployeeName", "WorkDate"]
df3 = spark.createDataFrame(data3, columns3)
df3.createOrReplaceTempView("work_logs")
df3.show()


+----------+------------+----------+
|EmployeeID|EmployeeName|  WorkDate|
+----------+------------+----------+
|         1|       Alice|2024-10-20|
|         1|       Alice|2024-10-21|
|         1|       Alice|2024-10-23|
|         2|         Bob|2024-10-20|
|         2|         Bob|2024-10-22|
|         2|         Bob|2024-10-23|
+----------+------------+----------+



In [22]:
res10 = spark.sql(""" 
   SELECT e1.EmployeeName, e1.WorkDate AS Date1, e2.WorkDate AS Date2
FROM work_logs e1
JOIN work_logs e2 
ON e1.EmployeeID = e2.EmployeeID AND DATE_ADD(e1.WorkDate, 1) = e2.WorkDate;
            
                  """)

res10.show()

+------------+----------+----------+
|EmployeeName|     Date1|     Date2|
+------------+----------+----------+
|       Alice|2024-10-20|2024-10-21|
|         Bob|2024-10-22|2024-10-23|
+------------+----------+----------+

