# Explanation of Regex Components:
1. \\d - Matches any digit.
2. \\D - Matches any non-digit.
3. [A-Za-z] - Matches any alphabet character.
4. [^...] - Matches any character not in the brackets.
5.    + plus sign - Matches one or more of the preceding token.
6.    * Multiply sign - Matches zero or more of the preceding token.
7.    ? - Makes the preceding token optional.
8.    $ - End of string.
9.    ^ - Beginning of string.
10.   \\b - Word boundary.

In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [2]:
from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder 
    .appName("OptimizedLocalSpark") 
    .config("spark.driver.memory", "8g")        
    .config("spark.executor.memory", "8g")    
    .config("spark.executor.cores", "4")       
    .config("spark.cores.max", "12")           
    .config("spark.sql.shuffle.partitions", "28")  
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
    .getOrCreate()
)
sc = spark.sparkContext

In [27]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("AdvancedRegexExamples").getOrCreate()

# Sample Data: Various strings for regex processing
data = [
    (1, "2024-10-22", "example@domain.com", "https://www.google.com", "This_is-A_Test123", "Product_100", "CamelCaseWords"),
    (2, "1990/11/30", "invalid-email@", "https://mail.yahoo.com", "HELLO_world!", "Prod_450", "anotherCamelCase"),
    (3, "2005.05.10", "user@domain.co.uk", "ftp://example.org", "UpperCASE123Lower", "prod-123", "YetAnotherCamelCase"),
    (4, "12-12-2012", "contact@sub.domain.com", "https://openai.com/blog", "Replace-these_chars", "Ref-200", "thisIsCamelCase"),
    (5, "1980-06-15", "name@example.com", "http://spark.apache.org", "UpperCaseString", "Product_789", "CamelCases")
]

# Create DataFrame
df = spark.createDataFrame(data, ["id", "date", "email", "url", "mixed_string", "product_code", "camel_case"])

# Create a Temporary View for Spark SQL
df.createOrReplaceTempView("regex_examples")
df.cache()
# Show the Original DataFrame
df.show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |
|5  |1980-06-15|name@example.com      |http://spark.apache.org|UpperCaseString    |Product_789 |CamelCases         |
+---+----------+----------------------+-----------------------+-

# Extract Year from a Date String Using Regex

In [2]:
res1 = spark.sql (""" 
  
SELECT id, date, regexp_extract(date, '\\d{4}', 0) AS year
FROM regex_examples;

                 
                   
                   """)

res1.show()

+---+----------+----+
| id|      date|year|
+---+----------+----+
|  1|2024-10-22|    |
|  2|1990/11/30|    |
|  3|2005.05.10|    |
|  4|12-12-2012|    |
|  5|1980-06-15|    |
+---+----------+----+



In [3]:
from pyspark.sql.functions import regexp_extract

df.withColumn("year", regexp_extract("date", "\\d{4}", 0)).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |year|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |2024|
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |1990|
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|2005|
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |2012|
|5  |1980-06-15|name@example.com      |http://spark.apache.org|UpperCaseString    |Product_789 |CamelCases         |1980|
+---+----------+--------

# Validate if a String is a Valid Email Address

In [4]:
res2 = spark.sql (""" 
  
SELECT id, email, 
       CASE WHEN email RLIKE '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$' THEN 'Valid' ELSE 'Invalid' END AS email_status
FROM regex_examples;


                 
                   
                   """)

res2.show()

+---+--------------------+------------+
| id|               email|email_status|
+---+--------------------+------------+
|  1|  example@domain.com|       Valid|
|  2|      invalid-email@|     Invalid|
|  3|   user@domain.co.uk|       Valid|
|  4|contact@sub.domai...|       Valid|
|  5|    name@example.com|       Valid|
+---+--------------------+------------+



In [5]:
df.withColumn("email_status", df["email"].rlike("^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$")).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |email_status|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |true        |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |false       |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|true        |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |true        |
|5  |1980-06-15|name@example.com      |http://spark.apache.org|UpperCaseString    |Product

# Extract Domain Names from URLs

In [6]:
res3 = spark.sql (""" 
  
SELECT id, url, regexp_extract(url, 'https?://(www\\.)?([^/]+)', 2) AS domain
FROM regex_examples;



                 
                   
                   """)

res3.show()

+---+--------------------+----------------+
| id|                 url|          domain|
+---+--------------------+----------------+
|  1|https://www.googl...|      google.com|
|  2|https://mail.yaho...|  mail.yahoo.com|
|  3|   ftp://example.org|                |
|  4|https://openai.co...|      openai.com|
|  5|http://spark.apac...|spark.apache.org|
+---+--------------------+----------------+



In [7]:
df.withColumn("domain", regexp_extract("url", "https?://(www\\.)?([^/]+)", 2)).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |domain          |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |google.com      |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |mail.yahoo.com  |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|                |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |openai.com      |
|5  |1980-06-15|name@example.com      |http://spark.apache.org

# Replace - and _ with a Space

In [8]:
res4 = spark.sql (""" 
  

SELECT id, mixed_string, regexp_replace(mixed_string, '[-_]', ' ') AS replaced
FROM regex_examples;


                 
                   
                   """)

res4.show()

+---+-------------------+-------------------+
| id|       mixed_string|           replaced|
+---+-------------------+-------------------+
|  1|  This_is-A_Test123|  This is A Test123|
|  2|       HELLO_world!|       HELLO world!|
|  3|  UpperCASE123Lower|  UpperCASE123Lower|
|  4|Replace-these_chars|Replace these chars|
|  5|    UpperCaseString|    UpperCaseString|
+---+-------------------+-------------------+



In [10]:
from pyspark.sql.functions import regexp_replace
df.withColumn("replaced", regexp_replace("mixed_string", "[-_]", " ")).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-------------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |replaced           |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-------------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |This is A Test123  |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |HELLO world!       |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|UpperCASE123Lower  |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |Replace these chars|
|5  |1980-06-15|name@example.com      |ht

# Extract Only Uppercase Letters from a String

In [11]:
res5 = spark.sql (""" 
  

SELECT id, mixed_string, regexp_extract(mixed_string, '[A-Z]+', 0) AS uppercase_letters
FROM regex_examples;

                   
                   """)

res5.show()

+---+-------------------+-----------------+
| id|       mixed_string|uppercase_letters|
+---+-------------------+-----------------+
|  1|  This_is-A_Test123|                T|
|  2|       HELLO_world!|            HELLO|
|  3|  UpperCASE123Lower|                U|
|  4|Replace-these_chars|                R|
|  5|    UpperCaseString|                U|
+---+-------------------+-----------------+



In [12]:
df.withColumn("uppercase_letters", regexp_extract("mixed_string", "[A-Z]+", 0)).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |uppercase_letters|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |T                |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |HELLO            |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|U                |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |R                |
|5  |1980-06-15|name@example.com      |http://spark.apa

# Replace Vowels with *

In [13]:
res6 = spark.sql (""" 
  
SELECT id, mixed_string, regexp_replace(mixed_string, '[AEIOUaeiou]', '*') AS replaced_vowels
FROM regex_examples;

                   
                   """)

res6.show()

+---+-------------------+-------------------+
| id|       mixed_string|    replaced_vowels|
+---+-------------------+-------------------+
|  1|  This_is-A_Test123|  Th*s_*s-*_T*st123|
|  2|       HELLO_world!|       H*LL*_w*rld!|
|  3|  UpperCASE123Lower|  *pp*rC*S*123L*w*r|
|  4|Replace-these_chars|R*pl*c*-th*s*_ch*rs|
|  5|    UpperCaseString|    *pp*rC*s*Str*ng|
+---+-------------------+-------------------+



In [14]:
df.withColumn("replaced_vowels", regexp_replace("mixed_string", "[AEIOUaeiou]", "*")).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-------------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |replaced_vowels    |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-------------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |Th*s_*s-*_T*st123  |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |H*LL*_w*rld!       |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|*pp*rC*S*123L*w*r  |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |R*pl*c*-th*s*_ch*rs|
|5  |1980-06-15|name@example.com      |ht

# Extract Numbers That Appear at the End of the String

In [15]:
res7 = spark.sql (""" 
  
SELECT id, product_code, regexp_extract(product_code, '\\d+$', 0) AS end_numbers
FROM regex_examples;


                   
                   """)

res7.show()

+---+------------+-----------+
| id|product_code|end_numbers|
+---+------------+-----------+
|  1| Product_100|           |
|  2|    Prod_450|           |
|  3|    prod-123|           |
|  4|     Ref-200|           |
|  5| Product_789|           |
+---+------------+-----------+



In [16]:
df.withColumn("end_numbers", regexp_extract("product_code", "\\d+$", 0)).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |end_numbers|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |100        |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |450        |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|123        |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |200        |
|5  |1980-06-15|name@example.com      |http://spark.apache.org|UpperCaseString    |Product_789 |C

# Check if String Contains Special Characters

In [17]:
res8 = spark.sql (""" 
  
SELECT id, mixed_string, 
       CASE WHEN mixed_string RLIKE '[^A-Za-z0-9]' THEN 'Yes' ELSE 'No' END AS has_special_char
FROM regex_examples;

                   
                   """)

res8.show()

+---+-------------------+----------------+
| id|       mixed_string|has_special_char|
+---+-------------------+----------------+
|  1|  This_is-A_Test123|             Yes|
|  2|       HELLO_world!|             Yes|
|  3|  UpperCASE123Lower|              No|
|  4|Replace-these_chars|             Yes|
|  5|    UpperCaseString|              No|
+---+-------------------+----------------+



In [18]:
df.withColumn("has_special_char", df["mixed_string"].rlike("[^A-Za-z0-9]")).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |has_special_char|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |true            |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |true            |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|false           |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |true            |
|5  |1980-06-15|name@example.com      |http://spark.apache.org

# Replace Sequences of More Than 1 Digit with#

In [20]:
res9 = spark.sql (""" 
  
SELECT id, product_code, regexp_replace(product_code, '\\d{2,}', '#') AS replaced
FROM regex_examples;

                   
                   """)

res9.show()

+---+------------+-----------+
| id|product_code|   replaced|
+---+------------+-----------+
|  1| Product_100|Product_100|
|  2|    Prod_450|   Prod_450|
|  3|    prod-123|   prod-123|
|  4|     Ref-200|    Ref-200|
|  5| Product_789|Product_789|
+---+------------+-----------+



In [21]:
df.withColumn("replaced", regexp_replace("product_code", "\\d{2,}", "#")).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+---------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |replaced |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+---------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |Product_#|
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |Prod_#   |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|prod-#   |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |Ref-#    |
|5  |1980-06-15|name@example.com      |http://spark.apache.org|UpperCaseString    |Product_789 |CamelCases     

# Extract Words That Start With a Capital Letter

In [22]:
res10 = spark.sql (""" 
  
SELECT id, mixed_string, regexp_extract(mixed_string, '\\b[A-Z][a-zA-Z]*', 0) AS capital_word
FROM regex_examples;

                   
                   """)

res10.show()

+---+-------------------+------------+
| id|       mixed_string|capital_word|
+---+-------------------+------------+
|  1|  This_is-A_Test123|            |
|  2|       HELLO_world!|            |
|  3|  UpperCASE123Lower|            |
|  4|Replace-these_chars|            |
|  5|    UpperCaseString|            |
+---+-------------------+------------+



In [23]:
df.withColumn("capital_word", regexp_extract("mixed_string", "\\b[A-Z][a-zA-Z]*", 0)).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+---------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |capital_word   |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+---------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |This           |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |HELLO          |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|UpperCASE      |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |Replace        |
|5  |1980-06-15|name@example.com      |http://spark.apache.org|UpperC

# Remove All Digits from a String

In [25]:
res11 = spark.sql (""" 
  
SELECT id, mixed_string, regexp_replace(mixed_string, '\\d', '') AS no_digits
FROM regex_examples;

                   
                   """)

res11.show()

+---+-------------------+-------------------+
| id|       mixed_string|          no_digits|
+---+-------------------+-------------------+
|  1|  This_is-A_Test123|  This_is-A_Test123|
|  2|       HELLO_world!|        HELLO_worl!|
|  3|  UpperCASE123Lower|  UpperCASE123Lower|
|  4|Replace-these_chars|Replace-these_chars|
|  5|    UpperCaseString|    UpperCaseString|
+---+-------------------+-------------------+



In [26]:
df.withColumn("no_digits", regexp_replace("mixed_string", "\\d", "")).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-------------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |no_digits          |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-------------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |This_is-A_Test     |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |HELLO_world!       |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|UpperCASELower     |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |Replace-these_chars|
|5  |1980-06-15|name@example.com      |ht

# Check if String Matches a Specific Pattern (like Prod_###)

In [28]:
res12 = spark.sql (""" 
  
SELECT id, product_code, 
       CASE WHEN product_code RLIKE '^Prod_\\d+$' THEN 'Matches' ELSE 'Does Not Match' END AS pattern_check
FROM regex_examples;

                   
                   """)

res12.show()

+---+------------+--------------+
| id|product_code| pattern_check|
+---+------------+--------------+
|  1| Product_100|Does Not Match|
|  2|    Prod_450|Does Not Match|
|  3|    prod-123|Does Not Match|
|  4|     Ref-200|Does Not Match|
|  5| Product_789|Does Not Match|
+---+------------+--------------+



In [29]:
df.withColumn("pattern_check", df["product_code"].rlike("^Prod_\\d+$")).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |pattern_check|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |false        |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |true         |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|false        |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |false        |
|5  |1980-06-15|name@example.com      |http://spark.apache.org|UpperCaseString    |

# Extract First Word from a String

In [30]:
df.withColumn("first_word", regexp_extract("mixed_string", "^\\w+", 0)).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |first_word       |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |This_is          |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |HELLO_world      |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|UpperCASE123Lower|
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |Replace          |
|5  |1980-06-15|name@example.com      |http://spark.apa

In [31]:
res13 = spark.sql (""" 
  
SELECT id, mixed_string, regexp_extract(mixed_string, '^\\w+', 0) AS first_word
FROM regex_examples;

                   
                   """)

res13.show()

+---+-------------------+----------+
| id|       mixed_string|first_word|
+---+-------------------+----------+
|  1|  This_is-A_Test123|          |
|  2|       HELLO_world!|          |
|  3|  UpperCASE123Lower|          |
|  4|Replace-these_chars|          |
|  5|    UpperCaseString|          |
+---+-------------------+----------+



# Validate if String is a Valid IP Address

In [32]:
res14 = spark.sql (""" 
  
SELECT id, email, 
       CASE WHEN email RLIKE '^\\d{1,3}(\\.\\d{1,3}){3}$' THEN 'Valid IP' ELSE 'Invalid' END AS ip_check
FROM regex_examples;


                   
                   """)

res14.show()


df.withColumn("ip_check", df["email"].rlike("^\\d{1,3}(\\.\\d{1,3}){3}$")).show(truncate=False)


+---+--------------------+--------+
| id|               email|ip_check|
+---+--------------------+--------+
|  1|  example@domain.com| Invalid|
|  2|      invalid-email@| Invalid|
|  3|   user@domain.co.uk| Invalid|
|  4|contact@sub.domai...| Invalid|
|  5|    name@example.com| Invalid|
+---+--------------------+--------+

+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+--------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |ip_check|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+--------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |false   |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |false   |
|3  |2005.05.10|user@domain.co.uk     |ftp://

# Extract Year, Month, and Day Separately

In [45]:
res100 = spark.sql (""" 
  
SELECT id, date, 
       regexp_extract(date, '(\\d{4})', 1) AS year,
       regexp_extract(date, '(\\d{2})-(\\d{2})-(\\d{4})', 2) AS month,
       regexp_extract(date, '(\\d{2})-(\\d{2})-(\\d{4})', 1) AS day
FROM regex_examples;


                   
                   """)

res100.show()


df.withColumn("year", regexp_extract("date", "(\\d{4})", 1)) \
  .withColumn("month", regexp_extract("date", "(\\d{2})-(\\d{2})-(\\d{4})", 2)) \
  .withColumn("day", regexp_extract("date", "(\\d{2})-(\\d{2})-(\\d{4})", 1)).show(truncate=False)


+---+----------+----+-----+---+
| id|      date|year|month|day|
+---+----------+----+-----+---+
|  1|2024-10-22|    |     |   |
|  2|1990/11/30|    |     |   |
|  3|2005.05.10|    |     |   |
|  4|12-12-2012|    |     |   |
|  5|1980-06-15|    |     |   |
+---+----------+----+-----+---+

+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----+-----+---+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |year|month|day|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----+-----+---+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |2024|     |   |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |1990|     |   |
|3  |2005.05.10|user@domain.co.uk     |ftp://exampl

# Remove Everything After a Specific Character (e.g., -)

In [33]:
res15 = spark.sql (""" 
  
SELECT id, product_code, regexp_replace(product_code, '-.*', '') AS cleaned_code
FROM regex_examples;


                   
                   """)

res15.show()


df.withColumn("cleaned_code", regexp_replace("product_code", "-.*", "")).show(truncate=False)


+---+------------+------------+
| id|product_code|cleaned_code|
+---+------------+------------+
|  1| Product_100| Product_100|
|  2|    Prod_450|    Prod_450|
|  3|    prod-123|        prod|
|  4|     Ref-200|         Ref|
|  5| Product_789| Product_789|
+---+------------+------------+

+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |cleaned_code|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |Product_100 |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |Prod_450    |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org     

# Replace Uppercase Letters With Lowercase and Vice Versa

In [35]:
from pyspark.sql.functions import translate

df.withColumn("swapped_case", translate("mixed_string", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-------------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |swapped_case       |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-------------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |tHIS_IS-a_tEST123  |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |hello_WORLD!       |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|uPPERcase123lOWER  |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |rEPLACE-THESE_CHARS|
|5  |1980-06-15|name@example.com      |ht

# Extract Second Word from a String

In [36]:
res17 = spark.sql (""" 
  
SELECT id, mixed_string, regexp_extract(mixed_string, '^\\w+\\s(\\w+)', 1) AS second_word
FROM regex_examples;



                   
                   """)

res17.show()

+---+-------------------+-----------+
| id|       mixed_string|second_word|
+---+-------------------+-----------+
|  1|  This_is-A_Test123|           |
|  2|       HELLO_world!|           |
|  3|  UpperCASE123Lower|           |
|  4|Replace-these_chars|           |
|  5|    UpperCaseString|           |
+---+-------------------+-----------+



In [37]:
df.withColumn("second_word", regexp_extract("mixed_string", "^\\w+\\s(\\w+)", 1)).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |second_word|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |           |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |           |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|           |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |           |
|5  |1980-06-15|name@example.com      |http://spark.apache.org|UpperCaseString    |Product_789 |C

# Extract Last Word from a String

In [38]:
res18 = spark.sql (""" 
  
SELECT id, mixed_string, regexp_extract(mixed_string, '\\w+$', 0) AS last_word
FROM regex_examples;

                   
                   """)

res18.show()

+---+-------------------+---------+
| id|       mixed_string|last_word|
+---+-------------------+---------+
|  1|  This_is-A_Test123|         |
|  2|       HELLO_world!|         |
|  3|  UpperCASE123Lower|         |
|  4|Replace-these_chars|         |
|  5|    UpperCaseString|         |
+---+-------------------+---------+



In [39]:
df.withColumn("last_word", regexp_extract("mixed_string", "\\w+$", 0)).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |last_word        |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |A_Test123        |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |                 |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|UpperCASE123Lower|
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |these_chars      |
|5  |1980-06-15|name@example.com      |http://spark.apa

# Validate if String is a Valid Phone Number

In [40]:
res19 = spark.sql (""" 
  
SELECT id, mixed_string, 
       CASE WHEN mixed_string RLIKE '^\\+?\\d{1,3}[- ]?\\d{3}[- ]?\\d{3}[- ]?\\d{4}$' THEN 'Valid' ELSE 'Invalid' END AS phone_check
FROM regex_examples;


                   
                   """)

res19.show()

+---+-------------------+-----------+
| id|       mixed_string|phone_check|
+---+-------------------+-----------+
|  1|  This_is-A_Test123|    Invalid|
|  2|       HELLO_world!|    Invalid|
|  3|  UpperCASE123Lower|    Invalid|
|  4|Replace-these_chars|    Invalid|
|  5|    UpperCaseString|    Invalid|
+---+-------------------+-----------+



In [41]:
df.withColumn("phone_check", df["mixed_string"].rlike("^\\+?\\d{1,3}[- ]?\\d{3}[- ]?\\d{3}[- ]?\\d{4}$")).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |phone_check|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |false      |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |false      |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|false      |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |false      |
|5  |1980-06-15|name@example.com      |http://spark.apache.org|UpperCaseString    |Product_789 |C

# Convert CamelCase Words to Space-Separated Words

In [42]:
df.withColumn("converted", regexp_replace("camel_case", "(?<=[a-z])([A-Z])", " $1")).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----------------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |converted             |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----------------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |Camel Case Words      |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |another Camel Case    |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|Yet Another Camel Case|
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |this Is Camel Case    |
|5  |1980-06-15|name

# Replace https with http

In [48]:
res20 = spark.sql (""" 
  
SELECT id, url, regexp_replace(url, 'https', 'http') AS updated_url 
FROM regex_examples;


                   
                   """)

res20.show()



df.withColumn("updated_url", regexp_replace("url", "https", "http")).show(truncate=False)


+---+--------------------+--------------------+
| id|                 url|         updated_url|
+---+--------------------+--------------------+
|  1|https://www.googl...|http://www.google...|
|  2|https://mail.yaho...|http://mail.yahoo...|
|  3|   ftp://example.org|   ftp://example.org|
|  4|https://openai.co...|http://openai.com...|
|  5|http://spark.apac...|http://spark.apac...|
+---+--------------------+--------------------+

+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |updated_url            |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |http://www.google.com  |
|2 

# Extract Domain Name Without Protocol

In [50]:
res21 = spark.sql (""" 
  
SELECT id, url, regexp_extract(url, '://([^/]+)', 1) AS domain_name FROM regex_examples


                   
                   """)

res21.show()

df.withColumn("domain_name", regexp_extract("url", "://([^/]+)", 1)).show(truncate=False)


+---+--------------------+----------------+
| id|                 url|     domain_name|
+---+--------------------+----------------+
|  1|https://www.googl...|  www.google.com|
|  2|https://mail.yaho...|  mail.yahoo.com|
|  3|   ftp://example.org|     example.org|
|  4|https://openai.co...|      openai.com|
|  5|http://spark.apac...|spark.apache.org|
+---+--------------------+----------------+

+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |domain_name     |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |www.google.com  |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELL

# Extract Numbers That Appear in the Middle of a String

In [51]:
res22 = spark.sql (""" 
  
SELECT id, product_code, regexp_extract(product_code, '\\d+', 0) AS middle_number 

 FROM regex_examples;


                   
                   """)

res22.show()

+---+------------+-------------+
| id|product_code|middle_number|
+---+------------+-------------+
|  1| Product_100|            d|
|  2|    Prod_450|            d|
|  3|    prod-123|            d|
|  4|     Ref-200|             |
|  5| Product_789|            d|
+---+------------+-------------+



In [52]:
df.withColumn("middle_number", regexp_extract("product_code", "\\d+", 0)).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |middle_number|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |100          |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |450          |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|123          |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |200          |
|5  |1980-06-15|name@example.com      |http://spark.apache.org|UpperCaseString    |

# Extract the First Four Characters from a String

In [55]:
res23 = spark.sql (""" 
  
SELECT id, mixed_string, substring(mixed_string, 1, 4) AS prefix 
 

 FROM regex_examples;


                   
                   """)

res23.show()

df.withColumn("prefix", df["mixed_string"].substr(1, 4)).show(truncate=False)


+---+-------------------+------+
| id|       mixed_string|prefix|
+---+-------------------+------+
|  1|  This_is-A_Test123|  This|
|  2|       HELLO_world!|  HELL|
|  3|  UpperCASE123Lower|  Uppe|
|  4|Replace-these_chars|  Repl|
|  5|    UpperCaseString|  Uppe|
+---+-------------------+------+

+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |prefix|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |This  |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |HELL  |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  

# Extract the Last Word from a Name


In [57]:
res24 = spark.sql (""" 
  
SELECT id, mixed_string, regexp_extract(mixed_string , '\\b\\w+$', 0) AS last_word 


 FROM regex_examples;


                   
                   """)

res24.show()

+---+-------------------+---------+
| id|       mixed_string|last_word|
+---+-------------------+---------+
|  1|  This_is-A_Test123|         |
|  2|       HELLO_world!|         |
|  3|  UpperCASE123Lower|         |
|  4|Replace-these_chars|         |
|  5|    UpperCaseString|         |
+---+-------------------+---------+



In [58]:
df.withColumn("last_word", regexp_extract("mixed_string", "\\b\\w+$", 0)).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |last_word        |
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+-----------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |A_Test123        |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |                 |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|UpperCASE123Lower|
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |these_chars      |
|5  |1980-06-15|name@example.com      |http://spark.apa

# Extract Serial Number from a String

In [59]:
res25 = spark.sql (""" 
  
SELECT id, product_code, regexp_extract(product_code, '[A-Z]{3}-\\d{4}', 0) AS extracted_serial 



 FROM regex_examples;


                   
                   """)

res25.show()

+---+------------+----------------+
| id|product_code|extracted_serial|
+---+------------+----------------+
|  1| Product_100|                |
|  2|    Prod_450|                |
|  3|    prod-123|                |
|  4|     Ref-200|                |
|  5| Product_789|                |
+---+------------+----------------+



In [60]:
df.withColumn("extracted_serial", regexp_extract("product_code", "[A-Z]{3}-\\d{4}", 0)).show(truncate=False)


+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----------------+
|id |date      |email                 |url                    |mixed_string       |product_code|camel_case         |extracted_serial|
+---+----------+----------------------+-----------------------+-------------------+------------+-------------------+----------------+
|1  |2024-10-22|example@domain.com    |https://www.google.com |This_is-A_Test123  |Product_100 |CamelCaseWords     |                |
|2  |1990/11/30|invalid-email@        |https://mail.yahoo.com |HELLO_world!       |Prod_450    |anotherCamelCase   |                |
|3  |2005.05.10|user@domain.co.uk     |ftp://example.org      |UpperCASE123Lower  |prod-123    |YetAnotherCamelCase|                |
|4  |12-12-2012|contact@sub.domain.com|https://openai.com/blog|Replace-these_chars|Ref-200     |thisIsCamelCase    |                |
|5  |1980-06-15|name@example.com      |http://spark.apache.org