In [1]:
# Execute the companion notebook in this kernel. The `spark` name used in the
# cells below is presumably defined there — TODO confirm.
%run _spark_init.ipynb


- Write a structured query that splits a column by using delimiters from another column.

- EXTRA: Write a structured query that also removes empty tokens from the split result.

In [2]:
# Sample rows: each one pairs a delimited string with the delimiter it uses.
dept = (
    ("50000.0#0#0#", "#"),
    ("0@1000.0@", "@"),
    ("1$", "$"),
    ("1000.00^Test_string", "^"),
)

# Column names are positional: the raw string first, then its delimiter.
dept_df = spark.createDataFrame(data=dept, schema=["values", "delimeter"])

In [3]:
# Print the input rows before any splitting is applied.
dept_df.show()

+-------------------+---------+
|             values|delimeter|
+-------------------+---------+
|       50000.0#0#0#|        #|
|          0@1000.0@|        @|
|                 1$|        $|
|1000.00^Test_string|        ^|
+-------------------+---------+



In [4]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType
# Define a user-defined function (UDF) to split values based on delimiter
def split_values_udf(values, delimeter):
    """Split ``values`` on the literal string ``delimeter``.

    Empty tokens are kept, so a trailing delimiter yields a trailing ``""``
    (for example ``"1$"`` split on ``"$"`` gives ``["1", ""]``).

    Args:
        values: The delimited string to split, or ``None``.
        delimeter: The literal separator (not a regex), or ``None``.

    Returns:
        A list of string tokens, or ``None`` when either argument is ``None``.

    Raises:
        ValueError: If ``delimeter`` is the empty string (raised by ``str.split``).
    """
    # Spark hands SQL NULL to a Python UDF as None. Propagate the null instead
    # of failing the job with AttributeError, and avoid str.split(None), which
    # would silently split on whitespace rather than on a delimiter.
    if values is None or delimeter is None:
        return None
    return values.split(delimeter)

# Wrap the Python splitter as a Spark UDF that returns an array of strings
split_udf = udf(split_values_udf, returnType=ArrayType(StringType()))

# Attach the tokens as a "split_values" column, then print the rows untruncated
dept_df = dept_df.withColumn(
    "split_values",
    split_udf(col("values"), col("delimeter")),
)
dept_df.show(truncate=False)

+-------------------+---------+----------------------+
|values             |delimeter|split_values          |
+-------------------+---------+----------------------+
|50000.0#0#0#       |#        |[50000.0, 0, 0, ]     |
|0@1000.0@          |@        |[0, 1000.0, ]         |
|1$                 |$        |[1, ]                 |
|1000.00^Test_string|^        |[1000.00, Test_string]|
+-------------------+---------+----------------------+



- Extra: split the column as above, but drop the empty tokens from the result.

In [5]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType
# Define a user-defined function (UDF) that splits values and drops empty tokens
def split_values_udf(values, delimeter):
    """Split ``values`` on the literal string ``delimeter``, dropping empty tokens.

    Every empty token is removed, whether it comes from a leading, trailing,
    or repeated delimiter (for example ``"#a##b#"`` split on ``"#"`` gives
    ``["a", "b"]``). An empty ``values`` string yields an empty list.

    Args:
        values: The delimited string to split, or ``None``.
        delimeter: The literal separator (not a regex), or ``None``.

    Returns:
        A list of the non-empty string tokens in their original order, or
        ``None`` when either argument is ``None``.

    Raises:
        ValueError: If ``delimeter`` is the empty string (raised by ``str.split``).
    """
    # Spark hands SQL NULL to a Python UDF as None; propagate the null rather
    # than failing the job with AttributeError.
    if values is None or delimeter is None:
        return None
    # Filter every empty token, not just a single trailing one.
    return [token for token in values.split(delimeter) if token != ""]

# Wrap the splitter defined in this cell as a UDF returning an array of strings
tokens_type = ArrayType(StringType())
split_udf = udf(split_values_udf, tokens_type)

# withColumn replaces "split_values" if the frame already has it, else adds it
dept_df = dept_df.withColumn(
    "split_values",
    split_udf(col("values"), col("delimeter")),
)
dept_df.show(truncate=False)

+-------------------+---------+----------------------+
|values             |delimeter|split_values          |
+-------------------+---------+----------------------+
|50000.0#0#0#       |#        |[50000.0, 0, 0]       |
|0@1000.0@          |@        |[0, 1000.0]           |
|1$                 |$        |[1]                   |
|1000.00^Test_string|^        |[1000.00, Test_string]|
+-------------------+---------+----------------------+

