In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [2]:
from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder 
    .appName("OptimizedLocalSpark") 
    .config("spark.driver.memory", "8g")        
    .config("spark.executor.memory", "8g")    
    .config("spark.executor.cores", "4")       
    .config("spark.cores.max", "12")           
    .config("spark.sql.shuffle.partitions", "28")  
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
    .getOrCreate()
)
sc = spark.sparkContext

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import Row


# Sample employee data
data = [
    Row(EmployeeID=1, EmployeeName="Alice", Department="HR", Salary=4000),
    Row(EmployeeID=2, EmployeeName="Bob", Department="HR", Salary=5000),
    Row(EmployeeID=3, EmployeeName="Charlie", Department="HR", Salary=4500),
    Row(EmployeeID=4, EmployeeName="David", Department="IT", Salary=6000),
    Row(EmployeeID=5, EmployeeName="Eve", Department="IT", Salary=7500),
    Row(EmployeeID=6, EmployeeName="Frank", Department="IT", Salary=7000),
    Row(EmployeeID=7, EmployeeName="Grace", Department="Finance", Salary=5500),
    Row(EmployeeID=8, EmployeeName="Heidi", Department="Finance", Salary=5000),
    Row(EmployeeID=9, EmployeeName="Ivan", Department="Finance", Salary=4500),
    Row(EmployeeID=10, EmployeeName="Judy", Department="Sales", Salary=3000),
    Row(EmployeeID=11, EmployeeName="Kevin", Department="Sales", Salary=3200),
    Row(EmployeeID=12, EmployeeName="Laura", Department="Sales", Salary=3500),
    Row(EmployeeID=13, EmployeeName="Mallory", Department="Marketing", Salary=4000),
    Row(EmployeeID=14, EmployeeName="Niaj", Department="Marketing", Salary=4500),
    Row(EmployeeID=15, EmployeeName="Oscar", Department="Marketing", Salary=4800)
]

# Create DataFrame
df = spark.createDataFrame(data)
df.createOrReplaceTempView("Employees")
df.cache()
# Display the DataFrame
df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Alice|        HR|  4000|
|         2|         Bob|        HR|  5000|
|         3|     Charlie|        HR|  4500|
|         4|       David|        IT|  6000|
|         5|         Eve|        IT|  7500|
|         6|       Frank|        IT|  7000|
|         7|       Grace|   Finance|  5500|
|         8|       Heidi|   Finance|  5000|
|         9|        Ivan|   Finance|  4500|
|        10|        Judy|     Sales|  3000|
|        11|       Kevin|     Sales|  3200|
|        12|       Laura|     Sales|  3500|
|        13|     Mallory| Marketing|  4000|
|        14|        Niaj| Marketing|  4500|
|        15|       Oscar| Marketing|  4800|
+----------+------------+----------+------+



# Spark SQL

In [11]:
# Register the DataFrame as a temporary SQL view
df.createOrReplaceTempView("Employees")

# SQL Query to Find Employees Based on the Criteria
res5 = spark.sql( """
WITH DepartmentAverages AS (
    SELECT Department, AVG(Salary) AS Dept_Avg_Salary 
    FROM Employees 
    GROUP BY Department
),


CompanyAverage AS (
    SELECT AVG(Salary) AS Company_Avg_Salary FROM Employees
)


SELECT e.EmployeeID, e.EmployeeName, e.Department, e.Salary
FROM Employees e
JOIN DepartmentAverages d ON e.Department = d.Department
JOIN CompanyAverage c
WHERE e.Salary > d.Dept_Avg_Salary 
AND e.Salary < c.Company_Avg_Salary
""")



res5.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|        12|       Laura|     Sales|  3500|
|        14|        Niaj| Marketing|  4500|
+----------+------------+----------+------+



In [8]:
# Employees having greater than average salary of the department


res = spark.sql("""
   SELECT e.EmployeeID, e.EmployeeName, e.Department, e.Salary,avgSalEmp.avg_salary from Employees  e   
   inner join      
   (select  Department, avg(Salary) as avg_salary from Employees group by  Department) avgSalEmp
   on e.Department = avgSalEmp. Department
   and e.Salary > avgSalEmp.avg_salary
                
                """)
res.show()

+----------+------------+----------+------+------------------+
|EmployeeID|EmployeeName|Department|Salary|        avg_salary|
+----------+------------+----------+------+------------------+
|         2|         Bob|        HR|  5000|            4500.0|
|         6|       Frank|        IT|  7000| 6833.333333333333|
|         5|         Eve|        IT|  7500| 6833.333333333333|
|         7|       Grace|   Finance|  5500|            5000.0|
|        12|       Laura|     Sales|  3500|3233.3333333333335|
|        15|       Oscar| Marketing|  4800| 4433.333333333333|
|        14|        Niaj| Marketing|  4500| 4433.333333333333|
+----------+------------+----------+------+------------------+



In [9]:
# Employees having greater than average salary of the department but less than the overall average 
res1 = spark.sql("""
   SELECT e.EmployeeID, e.EmployeeName, e.Department, e.Salary,avgSalEmp.avg_salary from Employees  e   
   inner join      
   (select  Department, avg(Salary) as avg_salary from Employees group by  Department) avgSalEmp
   on e.Department = avgSalEmp. Department
   and e.Salary > avgSalEmp.avg_salary
   where e.Salary < (select avg(Salary) from Employees)
                
                """)
res1.show()

+----------+------------+----------+------+------------------+
|EmployeeID|EmployeeName|Department|Salary|        avg_salary|
+----------+------------+----------+------+------------------+
|        12|       Laura|     Sales|  3500|3233.3333333333335|
|        14|        Niaj| Marketing|  4500| 4433.333333333333|
+----------+------------+----------+------+------------------+



# Pyspark

In [10]:
from pyspark.sql.functions import avg, col

# Step 1: Calculate the Department Average Salary
dept_avg_df = df.groupBy("Department").agg(avg("Salary").alias("Dept_Avg_Salary"))

# Step 2: Calculate the Company Average Salary
company_avg = df.agg(avg("Salary").alias("Company_Avg_Salary")).collect()[0]["Company_Avg_Salary"]

# Step 3: Join the Employee Data with Department Averages
joined_df = df.join(dept_avg_df, "Department")

# Step 4: Filter Employees
result_df = joined_df.filter((col("Salary") > col("Dept_Avg_Salary")) & (col("Salary") < company_avg))

result_df.show()


+----------+----------+------------+------+------------------+
|Department|EmployeeID|EmployeeName|Salary|   Dept_Avg_Salary|
+----------+----------+------------+------+------------------+
|     Sales|        12|       Laura|  3500|3233.3333333333335|
| Marketing|        14|        Niaj|  4500| 4433.333333333333|
+----------+----------+------------+------+------------------+



In [12]:
from pyspark.sql.window import Window
from pyspark.sql.functions import avg

# Step 1: Calculate Department Average Using Window Function
dept_avg_window = Window.partitionBy("Department")
df = df.withColumn("Dept_Avg_Salary", avg("Salary").over(dept_avg_window))

# Step 2: Calculate Company Average
company_avg = df.agg(avg("Salary").alias("Company_Avg_Salary")).collect()[0]["Company_Avg_Salary"]

# Step 3: Filter Employees
result_window_df = df.filter((col("Salary") > col("Dept_Avg_Salary")) & (col("Salary") < company_avg))

result_window_df.show()


+----------+------------+----------+------+------------------+
|EmployeeID|EmployeeName|Department|Salary|   Dept_Avg_Salary|
+----------+------------+----------+------+------------------+
|        14|        Niaj| Marketing|  4500| 4433.333333333333|
|        12|       Laura|     Sales|  3500|3233.3333333333335|
+----------+------------+----------+------+------------------+



In [14]:
# Step 1: Calculate Department and Company Averages Separately
dept_avg = df.groupBy("Department").agg(avg("Salary").alias("Dept_Avg_Sal"))
company_avg = df.agg(avg("Salary").alias("Company_Avg_Salary")).collect()[0]["Company_Avg_Salary"]

# Step 2: Join the Department Averages with Employee Data
joined_avg_df = df.join(dept_avg, "Department")

# Step 3: Apply Filtering Criteria
filtered_result = joined_avg_df.filter((col("Salary") > col("Dept_Avg_Sal")) & (col("Salary") < company_avg))

filtered_result.show()


+----------+----------+------------+------+------------------+------------------+
|Department|EmployeeID|EmployeeName|Salary|   Dept_Avg_Salary|      Dept_Avg_Sal|
+----------+----------+------------+------+------------------+------------------+
| Marketing|        14|        Niaj|  4500| 4433.333333333333| 4433.333333333333|
|     Sales|        12|       Laura|  3500|3233.3333333333335|3233.3333333333335|
+----------+----------+------------+------+------------------+------------------+

