In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Initialize Spark Session
spark = SparkSession.builder.appName("RowToCommaSeparated").getOrCreate()

# Define schema using StructType and StructField
schema = StructType([
    StructField("EmployeeID", IntegerType(), True),
    StructField("EmployeeName", StringType(), True),
    StructField("Department", StringType(), True)
])

# Sample data
data = [
    (1, "Alice", "HR"),
    (2, "Bob", "HR"),
    (3, "Charlie", "IT"),
    (4, "David", "IT"),
    (5, "Eve", "IT"),
    (6, "Frank", "Finance"),
    (7, "Grace", "Finance"),
    (8, "Heidi", "HR"),
    (9, "Ivan", "Sales"),
    (10, "Judy", "Sales"),
    (11, "Kevin", "Sales"),
    (12, "Laura", "IT"),
    (13, "Mallory", "Marketing"),
    (14, "Niaj", "Marketing"),
    (15, "Oscar", "Marketing")
]

# Create DataFrame
df = spark.createDataFrame(data, schema)
df.cache()
# Register the DataFrame as a temporary table
df.createOrReplaceTempView("Employees")

df.show()


+----------+------------+----------+
|EmployeeID|EmployeeName|Department|
+----------+------------+----------+
|         1|       Alice|        HR|
|         2|         Bob|        HR|
|         3|     Charlie|        IT|
|         4|       David|        IT|
|         5|         Eve|        IT|
|         6|       Frank|   Finance|
|         7|       Grace|   Finance|
|         8|       Heidi|        HR|
|         9|        Ivan|     Sales|
|        10|        Judy|     Sales|
|        11|       Kevin|     Sales|
|        12|       Laura|        IT|
|        13|     Mallory| Marketing|
|        14|        Niaj| Marketing|
|        15|       Oscar| Marketing|
+----------+------------+----------+



# Saprk SQL

In [5]:
# How to convert data from rows into comma separated single column | FOR XML PATH | STUFF

query = spark.sql(""" 
                   
SELECT Department, CONCAT_WS(', ', COLLECT_LIST(EmployeeName)) AS EmployeeNames
FROM Employees
GROUP BY Department

                  """)
query.show()

+----------+--------------------+
|Department|       EmployeeNames|
+----------+--------------------+
|        HR|   Alice, Bob, Heidi|
|        IT|Charlie, David, E...|
|   Finance|        Frank, Grace|
|     Sales|   Ivan, Judy, Kevin|
| Marketing|Mallory, Niaj, Oscar|
+----------+--------------------+



In [8]:
query3 = spark.sql(""" 
                   
SELECT Department, CONCAT_WS(', ', COLLECT_LIST(EmployeeName)) AS EmployeeNames
FROM (
    SELECT Department, EmployeeName
    FROM Employees
    ORDER BY Department, EmployeeID
) AS OrderedEmployees
GROUP BY Department

                  """)
query3.show()

+----------+--------------------+
|Department|       EmployeeNames|
+----------+--------------------+
|     Sales|   Ivan, Judy, Kevin|
|        HR|   Alice, Bob, Heidi|
|   Finance|        Frank, Grace|
| Marketing|Mallory, Niaj, Oscar|
|        IT|Charlie, David, E...|
+----------+--------------------+



# Pyspark

In [4]:
from pyspark.sql.functions import collect_list, concat_ws

# Group by Department and concatenate employee names into a single comma-separated string
result_df = df.groupBy("Department") \
    .agg(concat_ws(", ", collect_list("EmployeeName")).alias("EmployeeNames"))

result_df.show(truncate=False)


+----------+--------------------------+
|Department|EmployeeNames             |
+----------+--------------------------+
|HR        |Alice, Bob, Heidi         |
|IT        |Charlie, David, Eve, Laura|
|Finance   |Frank, Grace              |
|Sales     |Ivan, Judy, Kevin         |
|Marketing |Mallory, Niaj, Oscar      |
+----------+--------------------------+



In [12]:
from pyspark.sql.functions import collect_set, concat_ws
# Use collect_set to gather unique employee names into a single string
distinct_agg_df = df.groupBy("Department") \
    .agg(concat_ws(", ", collect_set("EmployeeName")).alias("DistinctEmployeeNames"))

distinct_agg_df.show(truncate=False)


+----------+--------------------------+
|Department|DistinctEmployeeNames     |
+----------+--------------------------+
|HR        |Heidi, Bob, Alice         |
|IT        |Laura, Eve, David, Charlie|
|Finance   |Frank, Grace              |
|Sales     |Kevin, Judy, Ivan         |
|Marketing |Mallory, Oscar, Niaj      |
+----------+--------------------------+



In [11]:
# Define custom function to concatenate names within each partition
def aggregate_names(partition):
    from itertools import groupby
    from operator import itemgetter

    sorted_partition = sorted(partition, key=itemgetter(2))  # Sort by Department
    for key, group in groupby(sorted_partition, key=itemgetter(2)):
        employees = [item[1] for item in group]
        yield (key, ", ".join(employees))

# Convert to RDD, apply custom aggregation, and convert back to DataFrame
rdd = df.rdd.mapPartitions(aggregate_names)
custom_agg_df = rdd.toDF(["Department", "EmployeeNames"])
custom_agg_df.show(truncate=False)


+----------+-------------+
|Department|EmployeeNames|
+----------+-------------+
|HR        |Alice        |
|HR        |Bob          |
|IT        |Charlie      |
|IT        |David        |
|IT        |Eve          |
|Finance   |Frank        |
|Finance   |Grace        |
|HR        |Heidi        |
|Sales     |Ivan         |
|Sales     |Judy         |
|Sales     |Kevin        |
|IT        |Laura        |
|Marketing |Mallory      |
|Marketing |Niaj         |
|Marketing |Oscar        |
+----------+-------------+

