In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [2]:
from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder 
    .appName("OptimizedLocalSpark") 
    .config("spark.driver.memory", "8g")        
    .config("spark.executor.memory", "8g")    
    .config("spark.executor.cores", "4")       
    .config("spark.cores.max", "12")           
    .config("spark.sql.shuffle.partitions", "28")  
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
    .getOrCreate()
)
sc = spark.sparkContext

In [1]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("RegexTutorial").getOrCreate()

# Sample Data: Strings with different patterns
data = [
    (1, "Order123"),
    (2, "Item-456"),
    (3, "Prod_789"),
    (4, "Ref2001"),
    (5, "Test123Test"),
    (6, "Alpha1234beta"),
    (7, "XYZ000"),
    (8, "num42value"),
    (9, "Num100number"),
    (10, "Val-56"),
    (11, "Alpha-Beta123"),
    (12, "Box_789"),
    (13, "CodeX11"),
    (14, "SampleText2"),
    (15, "DataValue500")
]

# Create DataFrame
df = spark.createDataFrame(data, ["id", "mixed_string"])

# Create a Temporary View for Spark SQL
df.createOrReplaceTempView("regex_table")
df.cache()
# Show the Original DataFrame
df.show(truncate=False)


+---+-------------+
|id |mixed_string |
+---+-------------+
|1  |Order123     |
|2  |Item-456     |
|3  |Prod_789     |
|4  |Ref2001      |
|5  |Test123Test  |
|6  |Alpha1234beta|
|7  |XYZ000       |
|8  |num42value   |
|9  |Num100number |
|10 |Val-56       |
|11 |Alpha-Beta123|
|12 |Box_789      |
|13 |CodeX11      |
|14 |SampleText2  |
|15 |DataValue500 |
+---+-------------+



# Extract Numbers from String

In [3]:
res1 = spark.sql (""" 
  
  SELECT id, mixed_string, regexp_extract(mixed_string, '\\d+', 0) AS extracted_number
FROM regex_table;
                 
                   
                   """)

res1.show()

+---+-------------+----------------+
| id| mixed_string|extracted_number|
+---+-------------+----------------+
|  1|     Order123|               d|
|  2|     Item-456|                |
|  3|     Prod_789|               d|
|  4|      Ref2001|                |
|  5|  Test123Test|                |
|  6|Alpha1234beta|                |
|  7|       XYZ000|                |
|  8|   num42value|                |
|  9| Num100number|                |
| 10|       Val-56|                |
| 11|Alpha-Beta123|                |
| 12|      Box_789|                |
| 13|      CodeX11|               d|
| 14|  SampleText2|                |
| 15| DataValue500|                |
+---+-------------+----------------+



In [4]:
from pyspark.sql.functions import regexp_extract

df.withColumn("extracted_number", regexp_extract("mixed_string", "\\d+", 0)).show(truncate=False)


+---+-------------+----------------+
|id |mixed_string |extracted_number|
+---+-------------+----------------+
|1  |Order123     |123             |
|2  |Item-456     |456             |
|3  |Prod_789     |789             |
|4  |Ref2001      |2001            |
|5  |Test123Test  |123             |
|6  |Alpha1234beta|1234            |
|7  |XYZ000       |000             |
|8  |num42value   |42              |
|9  |Num100number |100             |
|10 |Val-56       |56              |
|11 |Alpha-Beta123|123             |
|12 |Box_789      |789             |
|13 |CodeX11      |11              |
|14 |SampleText2  |2               |
|15 |DataValue500 |500             |
+---+-------------+----------------+



# Extract Alphabets Only

In [5]:
res2 = spark.sql (""" 
  
SELECT id, mixed_string, regexp_extract(mixed_string, '[A-Za-z]+', 0) AS extracted_text
FROM regex_table;

                 
                   
                   """)

res2.show()

+---+-------------+--------------+
| id| mixed_string|extracted_text|
+---+-------------+--------------+
|  1|     Order123|         Order|
|  2|     Item-456|          Item|
|  3|     Prod_789|          Prod|
|  4|      Ref2001|           Ref|
|  5|  Test123Test|          Test|
|  6|Alpha1234beta|         Alpha|
|  7|       XYZ000|           XYZ|
|  8|   num42value|           num|
|  9| Num100number|           Num|
| 10|       Val-56|           Val|
| 11|Alpha-Beta123|         Alpha|
| 12|      Box_789|           Box|
| 13|      CodeX11|         CodeX|
| 14|  SampleText2|    SampleText|
| 15| DataValue500|     DataValue|
+---+-------------+--------------+



In [6]:
df.withColumn("extracted_text", regexp_extract("mixed_string", "[A-Za-z]+", 0)).show(truncate=False)


+---+-------------+--------------+
|id |mixed_string |extracted_text|
+---+-------------+--------------+
|1  |Order123     |Order         |
|2  |Item-456     |Item          |
|3  |Prod_789     |Prod          |
|4  |Ref2001      |Ref           |
|5  |Test123Test  |Test          |
|6  |Alpha1234beta|Alpha         |
|7  |XYZ000       |XYZ           |
|8  |num42value   |num           |
|9  |Num100number |Num           |
|10 |Val-56       |Val           |
|11 |Alpha-Beta123|Alpha         |
|12 |Box_789      |Box           |
|13 |CodeX11      |CodeX         |
|14 |SampleText2  |SampleText    |
|15 |DataValue500 |DataValue     |
+---+-------------+--------------+



# Check if String Contains Numbers

In [7]:
res3 = spark.sql (""" 
  
SELECT id, mixed_string, 
       CASE WHEN mixed_string RLIKE '\\d+' THEN 'Yes' ELSE 'No' END AS has_numbers
FROM regex_table;

                   
                   """)

res3.show()

+---+-------------+-----------+
| id| mixed_string|has_numbers|
+---+-------------+-----------+
|  1|     Order123|        Yes|
|  2|     Item-456|         No|
|  3|     Prod_789|        Yes|
|  4|      Ref2001|         No|
|  5|  Test123Test|         No|
|  6|Alpha1234beta|         No|
|  7|       XYZ000|         No|
|  8|   num42value|         No|
|  9| Num100number|         No|
| 10|       Val-56|         No|
| 11|Alpha-Beta123|         No|
| 12|      Box_789|         No|
| 13|      CodeX11|        Yes|
| 14|  SampleText2|         No|
| 15| DataValue500|         No|
+---+-------------+-----------+



In [8]:
df.withColumn("has_numbers", df["mixed_string"].rlike("\\d+")).show(truncate=False)


+---+-------------+-----------+
|id |mixed_string |has_numbers|
+---+-------------+-----------+
|1  |Order123     |true       |
|2  |Item-456     |true       |
|3  |Prod_789     |true       |
|4  |Ref2001      |true       |
|5  |Test123Test  |true       |
|6  |Alpha1234beta|true       |
|7  |XYZ000       |true       |
|8  |num42value   |true       |
|9  |Num100number |true       |
|10 |Val-56       |true       |
|11 |Alpha-Beta123|true       |
|12 |Box_789      |true       |
|13 |CodeX11      |true       |
|14 |SampleText2  |true       |
|15 |DataValue500 |true       |
+---+-------------+-----------+



# Replace Numbers with X

In [9]:
res4 = spark.sql (""" 
  
SELECT id, mixed_string, regexp_replace(mixed_string, '\\d+', 'X') AS replaced_text
FROM regex_table;


                   
                   """)

res4.show()

+---+-------------+-------------+
| id| mixed_string|replaced_text|
+---+-------------+-------------+
|  1|     Order123|     OrXer123|
|  2|     Item-456|     Item-456|
|  3|     Prod_789|     ProX_789|
|  4|      Ref2001|      Ref2001|
|  5|  Test123Test|  Test123Test|
|  6|Alpha1234beta|Alpha1234beta|
|  7|       XYZ000|       XYZ000|
|  8|   num42value|   num42value|
|  9| Num100number| Num100number|
| 10|       Val-56|       Val-56|
| 11|Alpha-Beta123|Alpha-Beta123|
| 12|      Box_789|      Box_789|
| 13|      CodeX11|      CoXeX11|
| 14|  SampleText2|  SampleText2|
| 15| DataValue500| DataValue500|
+---+-------------+-------------+



In [10]:
from pyspark.sql.functions import regexp_replace

df.withColumn("replaced_text", regexp_replace("mixed_string", "\\d+", "X")).show(truncate=False)


+---+-------------+-------------+
|id |mixed_string |replaced_text|
+---+-------------+-------------+
|1  |Order123     |OrderX       |
|2  |Item-456     |Item-X       |
|3  |Prod_789     |Prod_X       |
|4  |Ref2001      |RefX         |
|5  |Test123Test  |TestXTest    |
|6  |Alpha1234beta|AlphaXbeta   |
|7  |XYZ000       |XYZX         |
|8  |num42value   |numXvalue    |
|9  |Num100number |NumXnumber   |
|10 |Val-56       |Val-X        |
|11 |Alpha-Beta123|Alpha-BetaX  |
|12 |Box_789      |Box_X        |
|13 |CodeX11      |CodeXX       |
|14 |SampleText2  |SampleTextX  |
|15 |DataValue500 |DataValueX   |
+---+-------------+-------------+



# Remove Special Characters

In [11]:
res5 = spark.sql (""" 
  
SELECT id, mixed_string, regexp_replace(mixed_string, '[^A-Za-z0-9]', '') AS clean_text
FROM regex_table;


                   
                   """)

res5.show()

+---+-------------+-------------+
| id| mixed_string|   clean_text|
+---+-------------+-------------+
|  1|     Order123|     Order123|
|  2|     Item-456|      Item456|
|  3|     Prod_789|      Prod789|
|  4|      Ref2001|      Ref2001|
|  5|  Test123Test|  Test123Test|
|  6|Alpha1234beta|Alpha1234beta|
|  7|       XYZ000|       XYZ000|
|  8|   num42value|   num42value|
|  9| Num100number| Num100number|
| 10|       Val-56|        Val56|
| 11|Alpha-Beta123| AlphaBeta123|
| 12|      Box_789|       Box789|
| 13|      CodeX11|      CodeX11|
| 14|  SampleText2|  SampleText2|
| 15| DataValue500| DataValue500|
+---+-------------+-------------+



In [12]:
df.withColumn("clean_text", regexp_replace("mixed_string", "[^A-Za-z0-9]", "")).show(truncate=False)


+---+-------------+-------------+
|id |mixed_string |clean_text   |
+---+-------------+-------------+
|1  |Order123     |Order123     |
|2  |Item-456     |Item456      |
|3  |Prod_789     |Prod789      |
|4  |Ref2001      |Ref2001      |
|5  |Test123Test  |Test123Test  |
|6  |Alpha1234beta|Alpha1234beta|
|7  |XYZ000       |XYZ000       |
|8  |num42value   |num42value   |
|9  |Num100number |Num100number |
|10 |Val-56       |Val56        |
|11 |Alpha-Beta123|AlphaBeta123 |
|12 |Box_789      |Box789       |
|13 |CodeX11      |CodeX11      |
|14 |SampleText2  |SampleText2  |
|15 |DataValue500 |DataValue500 |
+---+-------------+-------------+



# Extract First Occurrence of Digits After a Hyphen

In [13]:
res6 = spark.sql (""" 
  
SELECT id, mixed_string, regexp_extract(mixed_string, '-(\\d+)', 1) AS digits_after_hyphen
FROM regex_table;



                   
                   """)

res6.show()

+---+-------------+-------------------+
| id| mixed_string|digits_after_hyphen|
+---+-------------+-------------------+
|  1|     Order123|                   |
|  2|     Item-456|                   |
|  3|     Prod_789|                   |
|  4|      Ref2001|                   |
|  5|  Test123Test|                   |
|  6|Alpha1234beta|                   |
|  7|       XYZ000|                   |
|  8|   num42value|                   |
|  9| Num100number|                   |
| 10|       Val-56|                   |
| 11|Alpha-Beta123|                   |
| 12|      Box_789|                   |
| 13|      CodeX11|                   |
| 14|  SampleText2|                   |
| 15| DataValue500|                   |
+---+-------------+-------------------+



In [14]:
df.withColumn("digits_after_hyphen", regexp_extract("mixed_string", "-(\\d+)", 1)).show(truncate=False)


+---+-------------+-------------------+
|id |mixed_string |digits_after_hyphen|
+---+-------------+-------------------+
|1  |Order123     |                   |
|2  |Item-456     |456                |
|3  |Prod_789     |                   |
|4  |Ref2001      |                   |
|5  |Test123Test  |                   |
|6  |Alpha1234beta|                   |
|7  |XYZ000       |                   |
|8  |num42value   |                   |
|9  |Num100number |                   |
|10 |Val-56       |56                 |
|11 |Alpha-Beta123|                   |
|12 |Box_789      |                   |
|13 |CodeX11      |                   |
|14 |SampleText2  |                   |
|15 |DataValue500 |                   |
+---+-------------+-------------------+



# Replace All Digits with #

In [15]:
res7 = spark.sql (""" 
  
SELECT id, mixed_string, regexp_replace(mixed_string, '\\d', '#') AS replaced_digits
FROM regex_table;



                   
                   """)

res7.show()

+---+-------------+---------------+
| id| mixed_string|replaced_digits|
+---+-------------+---------------+
|  1|     Order123|       Or#er123|
|  2|     Item-456|       Item-456|
|  3|     Prod_789|       Pro#_789|
|  4|      Ref2001|        Ref2001|
|  5|  Test123Test|    Test123Test|
|  6|Alpha1234beta|  Alpha1234beta|
|  7|       XYZ000|         XYZ000|
|  8|   num42value|     num42value|
|  9| Num100number|   Num100number|
| 10|       Val-56|         Val-56|
| 11|Alpha-Beta123|  Alpha-Beta123|
| 12|      Box_789|        Box_789|
| 13|      CodeX11|        Co#eX11|
| 14|  SampleText2|    SampleText2|
| 15| DataValue500|   DataValue500|
+---+-------------+---------------+



In [16]:
df.withColumn("replaced_digits", regexp_replace("mixed_string", "\\d", "#")).show(truncate=False)


+---+-------------+---------------+
|id |mixed_string |replaced_digits|
+---+-------------+---------------+
|1  |Order123     |Order###       |
|2  |Item-456     |Item-###       |
|3  |Prod_789     |Prod_###       |
|4  |Ref2001      |Ref####        |
|5  |Test123Test  |Test###Test    |
|6  |Alpha1234beta|Alpha####beta  |
|7  |XYZ000       |XYZ###         |
|8  |num42value   |num##value     |
|9  |Num100number |Num###number   |
|10 |Val-56       |Val-##         |
|11 |Alpha-Beta123|Alpha-Beta###  |
|12 |Box_789      |Box_###        |
|13 |CodeX11      |CodeX##        |
|14 |SampleText2  |SampleText#    |
|15 |DataValue500 |DataValue###   |
+---+-------------+---------------+



# Extract Letters Before Digits

In [17]:
res8 = spark.sql (""" 
SELECT id, mixed_string, regexp_extract(mixed_string, '([A-Za-z]+)\\d+', 1) AS letters_before_digits
FROM regex_table;




                   
                   """)

res8.show()

+---+-------------+---------------------+
| id| mixed_string|letters_before_digits|
+---+-------------+---------------------+
|  1|     Order123|                   Or|
|  2|     Item-456|                     |
|  3|     Prod_789|                  Pro|
|  4|      Ref2001|                     |
|  5|  Test123Test|                     |
|  6|Alpha1234beta|                     |
|  7|       XYZ000|                     |
|  8|   num42value|                     |
|  9| Num100number|                     |
| 10|       Val-56|                     |
| 11|Alpha-Beta123|                     |
| 12|      Box_789|                     |
| 13|      CodeX11|                   Co|
| 14|  SampleText2|                     |
| 15| DataValue500|                     |
+---+-------------+---------------------+



In [18]:
df.withColumn("letters_before_digits", regexp_extract("mixed_string", "([A-Za-z]+)\\d+", 1)).show(truncate=False)


+---+-------------+---------------------+
|id |mixed_string |letters_before_digits|
+---+-------------+---------------------+
|1  |Order123     |Order                |
|2  |Item-456     |                     |
|3  |Prod_789     |                     |
|4  |Ref2001      |Ref                  |
|5  |Test123Test  |Test                 |
|6  |Alpha1234beta|Alpha                |
|7  |XYZ000       |XYZ                  |
|8  |num42value   |num                  |
|9  |Num100number |Num                  |
|10 |Val-56       |                     |
|11 |Alpha-Beta123|Beta                 |
|12 |Box_789      |                     |
|13 |CodeX11      |CodeX                |
|14 |SampleText2  |SampleText           |
|15 |DataValue500 |DataValue            |
+---+-------------+---------------------+



# Replace Multiple Spaces with Single Space

In [19]:
res9 = spark.sql (""" 
SELECT id, mixed_string, regexp_replace(mixed_string, '\\s+', ' ') AS single_spaced
FROM regex_table;



                   
                   """)

res9.show()

+---+-------------+-------------+
| id| mixed_string|single_spaced|
+---+-------------+-------------+
|  1|     Order123|     Order123|
|  2|     Item-456|     Item-456|
|  3|     Prod_789|     Prod_789|
|  4|      Ref2001|      Ref2001|
|  5|  Test123Test|  Te t123Te t|
|  6|Alpha1234beta|Alpha1234beta|
|  7|       XYZ000|       XYZ000|
|  8|   num42value|   num42value|
|  9| Num100number| Num100number|
| 10|       Val-56|       Val-56|
| 11|Alpha-Beta123|Alpha-Beta123|
| 12|      Box_789|      Box_789|
| 13|      CodeX11|      CodeX11|
| 14|  SampleText2|  SampleText2|
| 15| DataValue500| DataValue500|
+---+-------------+-------------+



In [20]:
df.withColumn("single_spaced", regexp_replace("mixed_string", "\\s+", " ")).show(truncate=False)


+---+-------------+-------------+
|id |mixed_string |single_spaced|
+---+-------------+-------------+
|1  |Order123     |Order123     |
|2  |Item-456     |Item-456     |
|3  |Prod_789     |Prod_789     |
|4  |Ref2001      |Ref2001      |
|5  |Test123Test  |Test123Test  |
|6  |Alpha1234beta|Alpha1234beta|
|7  |XYZ000       |XYZ000       |
|8  |num42value   |num42value   |
|9  |Num100number |Num100number |
|10 |Val-56       |Val-56       |
|11 |Alpha-Beta123|Alpha-Beta123|
|12 |Box_789      |Box_789      |
|13 |CodeX11      |CodeX11      |
|14 |SampleText2  |SampleText2  |
|15 |DataValue500 |DataValue500 |
+---+-------------+-------------+



#  Remove Leading and Trailing Spaces

In [21]:
res10 = spark.sql (""" 
SELECT id, mixed_string, trim(mixed_string) AS trimmed_text
FROM regex_table;


                   
                   """)

res10.show()

+---+-------------+-------------+
| id| mixed_string| trimmed_text|
+---+-------------+-------------+
|  1|     Order123|     Order123|
|  2|     Item-456|     Item-456|
|  3|     Prod_789|     Prod_789|
|  4|      Ref2001|      Ref2001|
|  5|  Test123Test|  Test123Test|
|  6|Alpha1234beta|Alpha1234beta|
|  7|       XYZ000|       XYZ000|
|  8|   num42value|   num42value|
|  9| Num100number| Num100number|
| 10|       Val-56|       Val-56|
| 11|Alpha-Beta123|Alpha-Beta123|
| 12|      Box_789|      Box_789|
| 13|      CodeX11|      CodeX11|
| 14|  SampleText2|  SampleText2|
| 15| DataValue500| DataValue500|
+---+-------------+-------------+



In [23]:
df.show()

+---+-------------+
| id| mixed_string|
+---+-------------+
|  1|     Order123|
|  2|     Item-456|
|  3|     Prod_789|
|  4|      Ref2001|
|  5|  Test123Test|
|  6|Alpha1234beta|
|  7|       XYZ000|
|  8|   num42value|
|  9| Num100number|
| 10|       Val-56|
| 11|Alpha-Beta123|
| 12|      Box_789|
| 13|      CodeX11|
| 14|  SampleText2|
| 15| DataValue500|
+---+-------------+



In [26]:
from pyspark.sql import functions as F

# Assuming 'df' is your DataFrame and 'mixed_string' is your column
df.withColumn("trimmed_text", F.trim(df["mixed_string"])).show(truncate=False)



+---+-------------+-------------+
|id |mixed_string |trimmed_text |
+---+-------------+-------------+
|1  |Order123     |Order123     |
|2  |Item-456     |Item-456     |
|3  |Prod_789     |Prod_789     |
|4  |Ref2001      |Ref2001      |
|5  |Test123Test  |Test123Test  |
|6  |Alpha1234beta|Alpha1234beta|
|7  |XYZ000       |XYZ000       |
|8  |num42value   |num42value   |
|9  |Num100number |Num100number |
|10 |Val-56       |Val-56       |
|11 |Alpha-Beta123|Alpha-Beta123|
|12 |Box_789      |Box_789      |
|13 |CodeX11      |CodeX11      |
|14 |SampleText2  |SampleText2  |
|15 |DataValue500 |DataValue500 |
+---+-------------+-------------+

