In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

# extract numbers from String | Split word into characters

In [2]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("ExtractNumbersSplitCharacters").getOrCreate()

# Sample Data: Strings with mixed content
data = [
    (1, "Order123"),
    (2, "Item4567"),
    (3, "Prod98"),
    (4, "Ref2001"),
    (5, "Code33x"),
    (6, "Alpha1234beta"),
    (7, "XYZ000"),
    (8, "num42value"),
    (9, "Num100number"),
    (10, "Val56"),
    (11, "AlphaBeta123"),
    (12, "Box789"),
    (13, "CodeX11"),
    (14, "SampleText2"),
    (15, "DataValue500")
]

# Create DataFrame
df = spark.createDataFrame(data, ["id", "mixed_string"])

# Create a Temporary View for Spark SQL
df.createOrReplaceTempView("mixed_data")
df.cache()
# Show the original DataFrame
df.show(truncate=False)


+---+-------------+
|id |mixed_string |
+---+-------------+
|1  |Order123     |
|2  |Item4567     |
|3  |Prod98       |
|4  |Ref2001      |
|5  |Code33x      |
|6  |Alpha1234beta|
|7  |XYZ000       |
|8  |num42value   |
|9  |Num100number |
|10 |Val56        |
|11 |AlphaBeta123 |
|12 |Box789       |
|13 |CodeX11      |
|14 |SampleText2  |
|15 |DataValue500 |
+---+-------------+



In [3]:
res = spark.sql("""  
                
   SELECT id, mixed_string, 
       regexp_extract(mixed_string, '\\d+', 0) AS extracted_number
FROM mixed_data;
             
                """)

res.show()

+---+-------------+----------------+
| id| mixed_string|extracted_number|
+---+-------------+----------------+
|  1|     Order123|               d|
|  2|     Item4567|                |
|  3|       Prod98|               d|
|  4|      Ref2001|                |
|  5|      Code33x|               d|
|  6|Alpha1234beta|                |
|  7|       XYZ000|                |
|  8|   num42value|                |
|  9| Num100number|                |
| 10|        Val56|                |
| 11| AlphaBeta123|                |
| 12|       Box789|                |
| 13|      CodeX11|               d|
| 14|  SampleText2|                |
| 15| DataValue500|                |
+---+-------------+----------------+



In [4]:
res1 = spark.sql("""  
                
SELECT id, mixed_string, 
       regexp_replace(mixed_string, '[^0-9]', '') AS only_numbers
FROM mixed_data;

             
                """)

res1.show()

+---+-------------+------------+
| id| mixed_string|only_numbers|
+---+-------------+------------+
|  1|     Order123|         123|
|  2|     Item4567|        4567|
|  3|       Prod98|          98|
|  4|      Ref2001|        2001|
|  5|      Code33x|          33|
|  6|Alpha1234beta|        1234|
|  7|       XYZ000|         000|
|  8|   num42value|          42|
|  9| Num100number|         100|
| 10|        Val56|          56|
| 11| AlphaBeta123|         123|
| 12|       Box789|         789|
| 13|      CodeX11|          11|
| 14|  SampleText2|           2|
| 15| DataValue500|         500|
+---+-------------+------------+



In [5]:
from pyspark.sql.functions import regexp_extract

# Extract numbers from the string using regexp_extract
df_numbers = df.withColumn("extracted_number", regexp_extract("mixed_string", "\\d+", 0))

# Show the result
df_numbers.show(truncate=False)


+---+-------------+----------------+
|id |mixed_string |extracted_number|
+---+-------------+----------------+
|1  |Order123     |123             |
|2  |Item4567     |4567            |
|3  |Prod98       |98              |
|4  |Ref2001      |2001            |
|5  |Code33x      |33              |
|6  |Alpha1234beta|1234            |
|7  |XYZ000       |000             |
|8  |num42value   |42              |
|9  |Num100number |100             |
|10 |Val56        |56              |
|11 |AlphaBeta123 |123             |
|12 |Box789       |789             |
|13 |CodeX11      |11              |
|14 |SampleText2  |2               |
|15 |DataValue500 |500             |
+---+-------------+----------------+



In [6]:
from pyspark.sql.functions import regexp_replace

# Remove non-numeric characters to extract only numbers
df_only_numbers = df.withColumn("only_numbers", regexp_replace("mixed_string", "[^0-9]", ""))

# Show the result
df_only_numbers.show(truncate=False)


+---+-------------+------------+
|id |mixed_string |only_numbers|
+---+-------------+------------+
|1  |Order123     |123         |
|2  |Item4567     |4567        |
|3  |Prod98       |98          |
|4  |Ref2001      |2001        |
|5  |Code33x      |33          |
|6  |Alpha1234beta|1234        |
|7  |XYZ000       |000         |
|8  |num42value   |42          |
|9  |Num100number |100         |
|10 |Val56        |56          |
|11 |AlphaBeta123 |123         |
|12 |Box789       |789         |
|13 |CodeX11      |11          |
|14 |SampleText2  |2           |
|15 |DataValue500 |500         |
+---+-------------+------------+



In [7]:
res2 = spark.sql("""  
                
SELECT id, mixed_string, 
       explode(split(mixed_string, '')) AS character
FROM mixed_data;

             
                """)

res2.show()

+---+------------+---------+
| id|mixed_string|character|
+---+------------+---------+
|  1|    Order123|        O|
|  1|    Order123|        r|
|  1|    Order123|        d|
|  1|    Order123|        e|
|  1|    Order123|        r|
|  1|    Order123|        1|
|  1|    Order123|        2|
|  1|    Order123|        3|
|  2|    Item4567|        I|
|  2|    Item4567|        t|
|  2|    Item4567|        e|
|  2|    Item4567|        m|
|  2|    Item4567|        4|
|  2|    Item4567|        5|
|  2|    Item4567|        6|
|  2|    Item4567|        7|
|  3|      Prod98|        P|
|  3|      Prod98|        r|
|  3|      Prod98|        o|
|  3|      Prod98|        d|
+---+------------+---------+
only showing top 20 rows



In [8]:
from pyspark.sql.functions import split, explode

# Split string into characters and explode them into separate rows
df_chars = df.withColumn("character", explode(split("mixed_string", "")))

# Show the result
df_chars.show(truncate=False)


+---+------------+---------+
|id |mixed_string|character|
+---+------------+---------+
|1  |Order123    |O        |
|1  |Order123    |r        |
|1  |Order123    |d        |
|1  |Order123    |e        |
|1  |Order123    |r        |
|1  |Order123    |1        |
|1  |Order123    |2        |
|1  |Order123    |3        |
|2  |Item4567    |I        |
|2  |Item4567    |t        |
|2  |Item4567    |e        |
|2  |Item4567    |m        |
|2  |Item4567    |4        |
|2  |Item4567    |5        |
|2  |Item4567    |6        |
|2  |Item4567    |7        |
|3  |Prod98      |P        |
|3  |Prod98      |r        |
|3  |Prod98      |o        |
|3  |Prod98      |d        |
+---+------------+---------+
only showing top 20 rows



In [9]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

# Define UDF to split string into characters
def split_to_chars(s):
    return list(s)

# Register UDF
split_to_chars_udf = udf(split_to_chars, ArrayType(StringType()))

# Use UDF to create a new column
df_udf = df.withColumn("characters", split_to_chars_udf("mixed_string"))

# Explode characters into rows
df_chars_udf = df_udf.select("id", "mixed_string", explode("characters").alias("character"))
df_chars_udf.show(truncate=False)


+---+------------+---------+
|id |mixed_string|character|
+---+------------+---------+
|1  |Order123    |O        |
|1  |Order123    |r        |
|1  |Order123    |d        |
|1  |Order123    |e        |
|1  |Order123    |r        |
|1  |Order123    |1        |
|1  |Order123    |2        |
|1  |Order123    |3        |
|2  |Item4567    |I        |
|2  |Item4567    |t        |
|2  |Item4567    |e        |
|2  |Item4567    |m        |
|2  |Item4567    |4        |
|2  |Item4567    |5        |
|2  |Item4567    |6        |
|2  |Item4567    |7        |
|3  |Prod98      |P        |
|3  |Prod98      |r        |
|3  |Prod98      |o        |
|3  |Prod98      |d        |
+---+------------+---------+
only showing top 20 rows

