In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

# Convert data from rows into single concatenated and delimited string | STRING_AGG

In [2]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("StringAggregation").getOrCreate()

# Sample Data: Names of employees from different departments
data = [
    (1, "Alice", "HR"),
    (2, "Bob", "Finance"),
    (3, "Charlie", "IT"),
    (4, "David", "HR"),
    (5, "Eva", "Finance"),
    (6, "Frank", "IT"),
    (7, "Grace", "HR"),
    (8, "Henry", "Finance"),
    (9, "Ivy", "IT"),
    (10, "Jack", "Marketing")
]

# Create DataFrame
df = spark.createDataFrame(data, ["id", "name", "department"])

# Create a Temporary View for Spark SQL
df.createOrReplaceTempView("employees")

# Show the Original DataFrame
df.show(truncate=False)


+---+-------+----------+
|id |name   |department|
+---+-------+----------+
|1  |Alice  |HR        |
|2  |Bob    |Finance   |
|3  |Charlie|IT        |
|4  |David  |HR        |
|5  |Eva    |Finance   |
|6  |Frank  |IT        |
|7  |Grace  |HR        |
|8  |Henry  |Finance   |
|9  |Ivy    |IT        |
|10 |Jack   |Marketing |
+---+-------+----------+



In [3]:
res = spark.sql(""" 
                
SELECT department, 
       concat_ws(', ', collect_list(name)) AS employees
FROM employees
GROUP BY department;
                         
                
                """)
res.show()

+----------+-------------------+
|department|          employees|
+----------+-------------------+
|        HR|Alice, David, Grace|
|   Finance|    Bob, Eva, Henry|
|        IT|Charlie, Frank, Ivy|
| Marketing|               Jack|
+----------+-------------------+



In [4]:
res1 = spark.sql(""" 
                
SELECT department, 
       concat_ws(', ', collect_set(name)) AS unique_employees
FROM employees
GROUP BY department;
                         
                
                """)
res1.show()

+----------+-------------------+
|department|   unique_employees|
+----------+-------------------+
|        HR|Grace, Alice, David|
|   Finance|    Eva, Henry, Bob|
|        IT|Ivy, Frank, Charlie|
| Marketing|               Jack|
+----------+-------------------+



In [5]:
from pyspark.sql.functions import collect_list, concat_ws

# Aggregate names into a single concatenated string by department
df_concat = df.groupBy("department").agg(
    concat_ws(", ", collect_list("name")).alias("employees")
)

# Show the Result
df_concat.show(truncate=False)


+----------+-------------------+
|department|employees          |
+----------+-------------------+
|HR        |Alice, David, Grace|
|Finance   |Bob, Eva, Henry    |
|IT        |Charlie, Frank, Ivy|
|Marketing |Jack               |
+----------+-------------------+



In [6]:
from pyspark.sql.functions import collect_set

# Aggregate names into a unique concatenated string by department
df_unique_concat = df.groupBy("department").agg(
    concat_ws(", ", collect_set("name")).alias("unique_employees")
)

# Show the Result
df_unique_concat.show(truncate=False)


+----------+-------------------+
|department|unique_employees   |
+----------+-------------------+
|HR        |Grace, Alice, David|
|Finance   |Eva, Henry, Bob    |
|IT        |Ivy, Frank, Charlie|
|Marketing |Jack               |
+----------+-------------------+



In [7]:
res3 = spark.sql(""" 
 
 SELECT concat_ws(', ', collect_list(name)) AS all_employees
FROM employees;                
             
                 """)

res3.show()

+--------------------+
|       all_employees|
+--------------------+
|Alice, Bob, Charl...|
+--------------------+



In [8]:

df_all_concat = df.agg(
    concat_ws(", ", collect_list("name")).alias("all_employees")
)

df_all_concat.show(truncate=False)

+---------------------------------------------------------------+
|all_employees                                                  |
+---------------------------------------------------------------+
|Alice, Bob, Charlie, David, Eva, Frank, Grace, Henry, Ivy, Jack|
+---------------------------------------------------------------+



In [9]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Define UDF to concatenate names
def concatenate_names(names):
    return ", ".join(names)

# Register UDF
concat_udf = udf(concatenate_names, StringType())

# Use UDF to aggregate names
df_udf_concat = df.groupBy("department").agg(
    concat_udf(collect_list("name")).alias("employees")
)

# Show the Result
df_udf_concat.show(truncate=False)


+----------+-------------------+
|department|employees          |
+----------+-------------------+
|HR        |Alice, David, Grace|
|Finance   |Bob, Eva, Henry    |
|IT        |Charlie, Frank, Ivy|
|Marketing |Jack               |
+----------+-------------------+

