In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [2]:
from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder 
    .appName("OptimizedLocalSpark") 
    .config("spark.driver.memory", "8g")        
    .config("spark.executor.memory", "8g")    
    .config("spark.executor.cores", "4")       
    .config("spark.cores.max", "12")           
    .config("spark.sql.shuffle.partitions", "28")  
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import Row


# Sample employee data
data = [
    Row(EmployeeID=1, EmployeeName="Alice", Department="HR", Salary=4000),
    Row(EmployeeID=2, EmployeeName="Bob", Department="HR", Salary=5000),
    Row(EmployeeID=3, EmployeeName="Charlie", Department="HR", Salary=4500),
    Row(EmployeeID=4, EmployeeName="David", Department="IT", Salary=6000),
    Row(EmployeeID=5, EmployeeName="Eve", Department="IT", Salary=7500),
    Row(EmployeeID=6, EmployeeName="Frank", Department="IT", Salary=7000),
    Row(EmployeeID=7, EmployeeName="Grace", Department="Finance", Salary=5500),
    Row(EmployeeID=8, EmployeeName="Heidi", Department="Finance", Salary=5000),
    Row(EmployeeID=9, EmployeeName="Ivan", Department="Finance", Salary=4500),
    Row(EmployeeID=10, EmployeeName="Judy", Department="Sales", Salary=3000),
    Row(EmployeeID=11, EmployeeName="Kevin", Department="Sales", Salary=3200),
    Row(EmployeeID=12, EmployeeName="Laura", Department="Sales", Salary=3500),
    Row(EmployeeID=13, EmployeeName="Mallory", Department="Marketing", Salary=4000),
    Row(EmployeeID=14, EmployeeName="Niaj", Department="Marketing", Salary=4500),
    Row(EmployeeID=15, EmployeeName="Oscar", Department="Marketing", Salary=4800)
]

# Create DataFrame
df = spark.createDataFrame(data)
df.createOrReplaceTempView("Employees")

# Display the DataFrame
df.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Alice|        HR|  4000|
|         2|         Bob|        HR|  5000|
|         3|     Charlie|        HR|  4500|
|         4|       David|        IT|  6000|
|         5|         Eve|        IT|  7500|
|         6|       Frank|        IT|  7000|
|         7|       Grace|   Finance|  5500|
|         8|       Heidi|   Finance|  5000|
|         9|        Ivan|   Finance|  4500|
|        10|        Judy|     Sales|  3000|
|        11|       Kevin|     Sales|  3200|
|        12|       Laura|     Sales|  3500|
|        13|     Mallory| Marketing|  4000|
|        14|        Niaj| Marketing|  4500|
|        15|       Oscar| Marketing|  4800|
+----------+------------+----------+------+



# Spark SQL

In [12]:
res = spark.sql("""
SELECT e.EmployeeID, e.EmployeeName, e.Department, e.Salary 
FROM Employees e
INNER JOIN (
    SELECT Department, AVG(Salary) AS avg_salary 
    FROM Employees 
    GROUP BY Department
) avgEmpSal
ON e.Department = avgEmpSal.Department
WHERE e.Salary > avgEmpSal.avg_salary
AND e.Salary < (SELECT MAX(avg_salary) FROM (
    SELECT AVG(Salary) AS avg_salary 
    FROM Employees 
    GROUP BY Department
) tempAvg)
""")

res.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         2|         Bob|        HR|  5000|
|         7|       Grace|   Finance|  5500|
|        12|       Laura|     Sales|  3500|
|        14|        Niaj| Marketing|  4500|
|        15|       Oscar| Marketing|  4800|
+----------+------------+----------+------+



In [13]:
query = """
WITH DepartmentAverages AS (
    SELECT Department, AVG(Salary) AS avg_salary FROM Employees GROUP BY Department
    
    
), EmployeeDetails AS (
    SELECT e.EmployeeID, e.EmployeeName, e.Department, e.Salary, d.avg_salary AS dept_avg_salary
    FROM Employees e
    JOIN DepartmentAverages d ON e.Department = d.Department
)

SELECT ed.EmployeeID, ed.EmployeeName, ed.Department, ed.Salary
FROM EmployeeDetails ed
WHERE ed.Salary < ed.dept_avg_salary
AND ed.Salary > (SELECT MIN(avg_salary) FROM DepartmentAverages WHERE Department != ed.Department)
"""

result = spark.sql(query)
result.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Alice|        HR|  4000|
|         4|       David|        IT|  6000|
|         9|        Ivan|   Finance|  4500|
|        13|     Mallory| Marketing|  4000|
+----------+------------+----------+------+



In [18]:
from pyspark.sql.window import Window
from pyspark.sql.functions import avg, min, col



# Step 1: Calculate Department Averages
dept_avg_df = df.groupBy("Department").agg(avg("Salary").alias("dept_avg_salary"))
dept_avg_df.show()

# Step 2: Calculate the Minimum Average Salary of Other Departments
other_dept_avg = dept_avg_df.agg(avg("dept_avg_salary").alias("other_dept_avg")).collect()[0]["other_dept_avg"]


# Step 3: Join the Employee Data with Department Averages
joined_df = df.join(dept_avg_df, "Department")
joined_df.show()
# Step 4: Filter Employees
result_df = joined_df.filter((col("Salary") < col("dept_avg_salary")) & (col("Salary") > other_dept_avg))

result_df.show()



+----------+------------------+
|Department|   dept_avg_salary|
+----------+------------------+
|        HR|            4500.0|
|        IT| 6833.333333333333|
|   Finance|            5000.0|
|     Sales|3233.3333333333335|
| Marketing| 4433.333333333333|
+----------+------------------+

+----------+----------+------------+------+------------------+
|Department|EmployeeID|EmployeeName|Salary|   dept_avg_salary|
+----------+----------+------------+------+------------------+
|        HR|         1|       Alice|  4000|            4500.0|
|        HR|         2|         Bob|  5000|            4500.0|
|        HR|         3|     Charlie|  4500|            4500.0|
|        IT|         4|       David|  6000| 6833.333333333333|
|        IT|         5|         Eve|  7500| 6833.333333333333|
|        IT|         6|       Frank|  7000| 6833.333333333333|
|   Finance|         7|       Grace|  5500|            5000.0|
|   Finance|         8|       Heidi|  5000|            5000.0|
|   Finance|      

In [19]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, avg, min

# Step 1: Calculate Department Average Using Window Function
dept_avg_window = Window.partitionBy("Department")
df = df.withColumn("dept_avg_salary", avg("Salary").over(dept_avg_window))
df.show()

# Step 2: Calculate Minimum Average of Other Departments
other_dept_avg = df.select("Department", "dept_avg_salary").distinct()
min_other_avg = other_dept_avg.agg(min("dept_avg_salary").alias("min_other_avg")).collect()[0]["min_other_avg"]

# Step 3: Filter Employees
filtered_df = df.filter((col("Salary") < col("dept_avg_salary")) & (col("Salary") > min_other_avg))

filtered_df.show()


+----------+------------+----------+------+-----------------+
|EmployeeID|EmployeeName|Department|Salary|  dept_avg_salary|
+----------+------------+----------+------+-----------------+
|         9|        Ivan|   Finance|  4500|           5000.0|
|         1|       Alice|        HR|  4000|           4500.0|
|         4|       David|        IT|  6000|6833.333333333333|
|        13|     Mallory| Marketing|  4000|4433.333333333333|
+----------+------------+----------+------+-----------------+



In [21]:
# Step 1: Calculate Department Average
dept_avg = df.groupBy("Department").agg(avg("Salary").alias("dept_avg_sal"))

# Step 2: Calculate the Overall Average Salary
overall_avg = df.agg(avg("Salary").alias("overall_avg")).collect()[0]["overall_avg"]

# Step 3: Join and Filter
result_df = df.join(dept_avg, "Department") \
    .filter((col("Salary") < col("dept_avg_sal")) & (col("Salary") > overall_avg))

result_df.show()


+----------+----------+------------+------+-----------------+-----------------+
|Department|EmployeeID|EmployeeName|Salary|  dept_avg_salary|     dept_avg_sal|
+----------+----------+------------+------+-----------------+-----------------+
|        IT|         4|       David|  6000|6833.333333333333|6833.333333333333|
+----------+----------+------------+------+-----------------+-----------------+

