In [1]:
import os       
# Switch the working directory to the project folder.
# The path is a raw string so the Windows backslash is taken literally; the
# plain-string form relied on "\p" being an unrecognised escape sequence,
# which Python reports with a warning.
os.chdir(r"H:\pyspark-coding-interview")
# Last expression of the cell: displays the directory now in effect.
os.getcwd()

'H:\\pyspark-coding-interview'

In [9]:
from pyspark.sql import SparkSession

# Settings for this Spark session, applied to the builder in the order listed.
session_settings = [
    ("spark.driver.memory", "8g"),
    ("spark.executor.memory", "8g"),
    ("spark.executor.cores", "4"),
    ("spark.cores.max", "12"),
    ("spark.sql.shuffle.partitions", "28"),
    ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
]

session_builder = SparkSession.builder.appName("OptimizedLocalSpark")
for setting_name, setting_value in session_settings:
    session_builder = session_builder.config(setting_name, setting_value)

# Obtain the session from the configured builder and keep a handle on its context.
spark = session_builder.getOrCreate()
sc = spark.sparkContext

# Print every (key, value) pair of the context's configuration.
print(sc.getConf().getAll())




[('spark.app.startTime', '1729289650138'), ('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'), ('spark.sql.warehouse.dir', 'file:/H:/pyspark-coding-interview/

In [24]:
# Sample rows: one single-element tuple per input string.
data = [
    (text,)
    for text in ("abc123xyz456", "test789", "hello123", "abc", "123")
]
columns = ["alphanumeric_string"]

# Build the one-column DataFrame and mark it as cached in the same expression.
df = spark.createDataFrame(data, columns).cache()

# Make the DataFrame queryable from Spark SQL as "alphanumeric_table".
df.createOrReplaceTempView("alphanumeric_table")
df.show()

+-------------------+
|alphanumeric_string|
+-------------------+
|       abc123xyz456|
|            test789|
|           hello123|
|                abc|
|                123|
+-------------------+



## Using Spark SQL

In [25]:
# Delete every non-digit to get `numbers` and every non-letter to get
# `alphabets`, so all digit/letter runs of the string survive, concatenated.
replace_query = """
    SELECT 
        alphanumeric_string,
        regexp_replace(alphanumeric_string, '[^0-9]', '') AS numbers,
        regexp_replace(alphanumeric_string, '[^a-zA-Z]', '') AS alphabets
    FROM alphanumeric_table
"""
result = spark.sql(replace_query)

# Display the three-column result.
result.show()

+-------------------+-------+---------+
|alphanumeric_string|numbers|alphabets|
+-------------------+-------+---------+
|       abc123xyz456| 123456|   abcxyz|
|            test789|    789|     test|
|           hello123|    123|    hello|
|                abc|       |      abc|
|                123|    123|         |
+-------------------+-------+---------+



In [29]:
# regexp_extract with group index 0 yields only the FIRST run that matches,
# e.g. "abc123xyz456" gives numbers "123" and alphabets "abc"; rows with no
# match get an empty string.
extract_query = """
    SELECT 
        alphanumeric_string, 
        regexp_extract(alphanumeric_string, '[0-9]+', 0) AS numbers,
        regexp_extract(alphanumeric_string, '[a-zA-Z]+', 0) AS alphabets
    FROM alphanumeric_table
"""
res2 = spark.sql(extract_query)

# Show the first-match extraction result.
res2.show()

+-------------------+-------+---------+
|alphanumeric_string|numbers|alphabets|
+-------------------+-------+---------+
|       abc123xyz456|    123|      abc|
|            test789|    789|     test|
|           hello123|    123|    hello|
|                abc|       |      abc|
|                123|    123|         |
+-------------------+-------+---------+



# Using PySpark

In [20]:

from pyspark.sql.functions import regexp_replace, regexp_extract
# Column expressions that strip everything except digits / except ASCII letters.
digits_kept = regexp_replace("alphanumeric_string", "[^0-9]", "")
letters_kept = regexp_replace("alphanumeric_string", "[^a-zA-Z]", "")

# Add both derived columns to the source DataFrame.
df1 = (
    df
    .withColumn("numbers", digits_kept)
    .withColumn("alphabets", letters_kept)
)
df1.show()

+-------------------+-------+---------+
|alphanumeric_string|numbers|alphabets|
+-------------------+-------+---------+
|       abc123xyz456| 123456|   abcxyz|
|            test789|    789|     test|
|           hello123|    123|    hello|
|                abc|       |      abc|
|                123|    123|         |
+-------------------+-------+---------+



In [21]:
# Column expressions taking the whole match (group 0) of the first digit run
# and of the first ASCII-letter run in the string.
first_digit_run = regexp_extract("alphanumeric_string", "[0-9]+", 0)
first_letter_run = regexp_extract("alphanumeric_string", "[a-zA-Z]+", 0)

# Add both derived columns to the source DataFrame.
df2 = (
    df
    .withColumn("only_numbers", first_digit_run)
    .withColumn("only_alphabets", first_letter_run)
)
df2.show()

+-------------------+------------+--------------+
|alphanumeric_string|only_numbers|only_alphabets|
+-------------------+------------+--------------+
|       abc123xyz456|         123|           abc|
|            test789|         789|          test|
|           hello123|         123|         hello|
|                abc|            |           abc|
|                123|         123|              |
+-------------------+------------+--------------+



In [22]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import re

# Define UDFs
def extract_numbers(s):
    """Return all ASCII digits of ``s`` concatenated in their original order.

    Args:
        s: The input string, or ``None`` (what a UDF receives for a null cell).

    Returns:
        A string made of every ``[0-9]`` character in ``s`` (empty when there
        are none), or ``None`` when ``s`` is ``None``.
    """
    # re.findall raises TypeError on None, so pass nulls through untouched.
    if s is None:
        return None
    return "".join(re.findall(r"[0-9]+", s))

def extract_alphabets(s):
    """Return all ASCII letters of ``s`` concatenated in their original order.

    Args:
        s: The input string, or ``None`` (what a UDF receives for a null cell).

    Returns:
        A string made of every ``[a-zA-Z]`` character in ``s`` (empty when
        there are none), or ``None`` when ``s`` is ``None``.
    """
    # re.findall raises TypeError on None, so pass nulls through untouched.
    if s is None:
        return None
    return "".join(re.findall(r"[a-zA-Z]+", s))

# Wrap the two Python helpers as UDFs declared to return strings.
extract_numbers_udf = udf(extract_numbers, returnType=StringType())
extract_alphabets_udf = udf(extract_alphabets, returnType=StringType())

# Add both UDF-derived columns; note that `df` itself is rebound to the
# widened DataFrame here.
df = (
    df
    .withColumn("numbers_udf", extract_numbers_udf("alphanumeric_string"))
    .withColumn("alphabets_udf", extract_alphabets_udf("alphanumeric_string"))
)

df.show()

+-------------------+-----------+-------------+
|alphanumeric_string|numbers_udf|alphabets_udf|
+-------------------+-----------+-------------+
|       abc123xyz456|     123456|       abcxyz|
|            test789|        789|         test|
|           hello123|        123|        hello|
|                abc|           |          abc|
|                123|        123|             |
+-------------------+-----------+-------------+

