In [1]:
import os

# Move the kernel's working directory to the project folder.
# A raw string is required: "\p" is not a valid escape sequence, so the plain
# literal only worked by accident and raises a SyntaxWarning on Python 3.12+.
os.chdir(r"H:\pyspark_advanced-coding_interview")
# Last expression of the cell: displays the directory now in effect.
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's SparkSession. The options are collected in
# one mapping so they can be scanned and adjusted in a single place.
spark_settings = {
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
    "spark.executor.cores": "4",
    "spark.cores.max": "12",
    "spark.sql.shuffle.partitions": "28",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}

session_builder = SparkSession.builder.appName("OptimizedLocalSpark")
for option_name, option_value in spark_settings.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()
sc = spark.sparkContext

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import Row


# Sample employee data as (EmployeeID, EmployeeName, Department, Salary) tuples
employee_records = [
    (1, "Alice", "HR", 4000),
    (2, "Bob", "HR", 5000),
    (3, "Charlie", "HR", 4500),
    (4, "David", "IT", 6000),
    (5, "Eve", "IT", 7500),
    (6, "Frank", "IT", 7000),
    (7, "Grace", "Finance", 5500),
    (8, "Heidi", "Finance", 5000),
    (9, "Ivan", "Finance", 4500),
    (10, "Judy", "Sales", 3000),
    (11, "Kevin", "Sales", 3200),
    (12, "Laura", "Sales", 3500),
    (13, "Mallory", "Marketing", 4000),
    (14, "Niaj", "Marketing", 4500),
    (15, "Oscar", "Marketing", 4800),
]

# One Row per record, with the same keyword fields in the same order.
data = [
    Row(EmployeeID=emp_id, EmployeeName=name, Department=dept, Salary=salary)
    for emp_id, name, dept, salary in employee_records
]

# Create the DataFrame, register it for the Spark SQL cells, and mark it for caching.
df = spark.createDataFrame(data)
df.createOrReplaceTempView("Employees")
df.cache()

# Display the DataFrame
df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Alice|        HR|  4000|
|         2|         Bob|        HR|  5000|
|         3|     Charlie|        HR|  4500|
|         4|       David|        IT|  6000|
|         5|         Eve|        IT|  7500|
|         6|       Frank|        IT|  7000|
|         7|       Grace|   Finance|  5500|
|         8|       Heidi|   Finance|  5000|
|         9|        Ivan|   Finance|  4500|
|        10|        Judy|     Sales|  3000|
|        11|       Kevin|     Sales|  3200|
|        12|       Laura|     Sales|  3500|
|        13|     Mallory| Marketing|  4000|
|        14|        Niaj| Marketing|  4500|
|        15|       Oscar| Marketing|  4800|
+----------+------------+----------+------+



# Spark SQL

In [10]:
# Highest-paid employee(s) per department, join approach: compute each
# department's maximum salary in a subquery, then keep the employees whose
# salary equals that maximum (employees tied on the maximum are all returned).
max_salary_join_sql = """
   SELECT e.EmployeeID, e.EmployeeName, e.Department, e.Salary,maxSalEmp.max_salary from Employees  e   
   inner join      
   (select  Department, Max(Salary) as max_salary from Employees group by  Department) maxSalEmp
   on e.Department = maxSalEmp. Department
   and e.Salary = maxSalEmp.max_salary
                
                """

res = spark.sql(max_salary_join_sql)
res.show()

+----------+------------+----------+------+----------+
|EmployeeID|EmployeeName|Department|Salary|max_salary|
+----------+------------+----------+------+----------+
|         2|         Bob|        HR|  5000|      5000|
|         5|         Eve|        IT|  7500|      7500|
|         7|       Grace|   Finance|  5500|      5500|
|        12|       Laura|     Sales|  3500|      3500|
|        15|       Oscar| Marketing|  4800|      4800|
+----------+------------+----------+------+----------+



In [16]:
# Highest-paid employee(s) per department, correlated-subquery approach: an
# employee is kept when their salary equals the maximum salary found among
# the rows of their own department.
correlated_max_sql = """
SELECT e.EmployeeID, e.EmployeeName, e.Department, e.Salary
FROM Employees e
WHERE e.Salary = (
    SELECT MAX(Salary) 
    FROM Employees 
    WHERE Department = e.Department
)
"""

res3 = spark.sql(correlated_max_sql)
res3.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         2|         Bob|        HR|  5000|
|         5|         Eve|        IT|  7500|
|         7|       Grace|   Finance|  5500|
|        12|       Laura|     Sales|  3500|
|        15|       Oscar| Marketing|  4800|
+----------+------------+----------+------+



In [13]:
# Highest-paid employee(s) per department, window approach: rank salaries
# within each department (highest first) and keep the rows ranked 1.
ranked_salary_sql = """
    select * from (
   SELECT EmployeeID, EmployeeName, Department, Salary,
   rank() over (partition by Department order by Salary desc) as rank_sal
   from Employees ) as emp 
   where emp.rank_sal = 1
  
    """

res2 = spark.sql(ranked_salary_sql)
res2.show()

+----------+------------+----------+------+--------+
|EmployeeID|EmployeeName|Department|Salary|rank_sal|
+----------+------------+----------+------+--------+
|         7|       Grace|   Finance|  5500|       1|
|         2|         Bob|        HR|  5000|       1|
|         5|         Eve|        IT|  7500|       1|
|        15|       Oscar| Marketing|  4800|       1|
|        12|       Laura|     Sales|  3500|       1|
+----------+------------+----------+------+--------+



# PySpark

In [14]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number

# Window per department, highest salary first.
# EmployeeID is a secondary sort key: row_number() assigns distinct numbers
# even to rows with equal salaries, so without a tie-breaker the employee that
# receives row_num == 1 on a tie would be arbitrary and could change between runs.
window_spec = Window.partitionBy("Department").orderBy(
    col("Salary").desc(), col("EmployeeID")
)

# Keep exactly one top earner per department.
# NOTE: unlike the rank()/dense_rank()/join variants in this notebook, this
# returns a single row per department even when several employees share the
# top salary (the lowest EmployeeID wins).
highest_salary_df = (
    df.withColumn("row_num", row_number().over(window_spec))
    .filter(col("row_num") == 1)
    .select("EmployeeID", "EmployeeName", "Department", "Salary")
)

# Show the result
highest_salary_df.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         7|       Grace|   Finance|  5500|
|         2|         Bob|        HR|  5000|
|         5|         Eve|        IT|  7500|
|        15|       Oscar| Marketing|  4800|
|        12|       Laura|     Sales|  3500|
+----------+------------+----------+------+



In [15]:
from pyspark.sql.functions import dense_rank

# Per-department window with the highest salary first.
window_spec_rank = Window.partitionBy("Department").orderBy(col("Salary").desc())

# dense_rank() gives every employee sharing the top salary the value 1, so
# filtering on rank == 1 keeps all top earners of each department.
salary_rank = dense_rank().over(window_spec_rank)
ranked_salary_df = (
    df.withColumn("rank", salary_rank)
    .filter(col("rank") == 1)
    .select("EmployeeID", "EmployeeName", "Department", "Salary")
)

# Display the top earners
ranked_salary_df.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         7|       Grace|   Finance|  5500|
|         2|         Bob|        HR|  5000|
|         5|         Eve|        IT|  7500|
|        15|       Oscar| Marketing|  4800|
|        12|       Laura|     Sales|  3500|
+----------+------------+----------+------+



In [17]:
from pyspark.sql.window import Window
# NOTE: this import shadows Python's built-in max() for the rest of the kernel session.
from pyspark.sql.functions import col, max

# Window covering each whole department. No ordering is needed: max() over an
# unordered partition is evaluated across all rows of the partition. With an
# ordered window, max() would become a running maximum, which only equals the
# department maximum when the sort happens to be descending.
dept_window = Window.partitionBy("Department")

# Attach the department's maximum salary to every employee row.
# The column keeps its historical name "rank", but it holds the maximum
# salary of the department, not a rank.
df_with_rank = df.withColumn("rank", max("Salary").over(dept_window))

# Keep the employees whose salary equals their department's maximum
# (employees tied on the maximum are all kept).
highest_salary_df = df_with_rank.filter(col("rank") == col("Salary"))

# Show the result
highest_salary_df.show()


+----------+------------+----------+------+----+
|EmployeeID|EmployeeName|Department|Salary|rank|
+----------+------------+----------+------+----+
|         7|       Grace|   Finance|  5500|5500|
|         2|         Bob|        HR|  5000|5000|
|         5|         Eve|        IT|  7500|7500|
|        15|       Oscar| Marketing|  4800|4800|
|        12|       Laura|     Sales|  3500|3500|
+----------+------------+----------+------+----+



In [19]:
from pyspark.sql.functions import col, max

# Step 1: Calculate maximum salary for each department
dept_max_salary = df.groupBy("Department").agg(max("Salary").alias("MaxSalary"))

# Step 2: Alias both sides of the join. dept_max_salary is derived from df, so
# this is effectively a self-join; the aliases "e" and "d" let every column be
# referenced unambiguously by qualified name.
df_alias = df.alias("e")
dept_max_salary_alias = dept_max_salary.alias("d")

# Step 3: Join on department and keep the employees whose salary equals the
# department maximum. Columns are addressed through the aliases rather than
# through DataFrame attribute access, which would refer to columns sharing the
# same lineage on both sides of the join.
highest_salary_df = df_alias.join(
    dept_max_salary_alias,
    (col("e.Department") == col("d.Department"))
    & (col("e.Salary") == col("d.MaxSalary")),
).select(
    col("e.EmployeeID"),
    col("e.EmployeeName"),
    col("e.Department"),
    col("e.Salary"),
)

# Show the result
highest_salary_df.show()



+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         2|         Bob|        HR|  5000|
|         5|         Eve|        IT|  7500|
|         7|       Grace|   Finance|  5500|
|        12|       Laura|     Sales|  3500|
|        15|       Oscar| Marketing|  4800|
+----------+------------+----------+------+

