In [1]:
import os

# Raw string literal: the path contains a backslash followed by "p", which is
# not a valid escape sequence in a normal string and triggers an
# invalid-escape warning on recent Python versions. The resulting path value
# is unchanged.
PROJECT_DIR = r"H:\pyspark_advanced-coding_interview"

# Switch the kernel's working directory to the project folder and echo it so
# the cell output confirms where the notebook is running from.
os.chdir(PROJECT_DIR)
print(os.getcwd())


from pyspark.sql import SparkSession

# Spark settings applied to the session builder, keyed by configuration name.
spark_settings = {
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
    "spark.executor.cores": "4",
    "spark.cores.max": "12",
    "spark.sql.shuffle.partitions": "28",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}

# Build the session step by step: name the application, then register every
# setting from the mapping above before obtaining the session.
session_builder = SparkSession.builder.appName("OptimizedLocalSpark")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, reverse, substring_index, expr

# Obtain a SparkSession via the builder's getOrCreate()
spark = SparkSession.builder.appName("SparkSQL Example").getOrCreate()

# Column names, in the same order as the fields of each sample record
columns = ["orderDate", "OrderKey", "customerID", "filename"]

# Sample order records: (order date, order key, customer id, attached filename)
data = [
    ("2024-10-30", "ORD001", "CUST123", "invoice_report.pdf"),
    ("2024-10-31", "ORD002", "CUST456", "order_summary.csv"),
    ("2024-11-01", "ORD003", "CUST789", "client_data.xlsx"),
]

# Build the DataFrame from the records and display it
df = spark.createDataFrame(data, schema=columns)
df.show()


+----------+--------+----------+------------------+
| orderDate|OrderKey|customerID|          filename|
+----------+--------+----------+------------------+
|2024-10-30|  ORD001|   CUST123|invoice_report.pdf|
|2024-10-31|  ORD002|   CUST456| order_summary.csv|
|2024-11-01|  ORD003|   CUST789|  client_data.xlsx|
+----------+--------+----------+------------------+



----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 52516)
Traceback (most recent call last):
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 755, in __init__
    self.handle()
  File "C:\spark\python\pyspark\accumulators.py", line 281, in handle
    poll(accum_updates)
  File "C:\spark\python\pyspark\accumulators.py", line 253, in poll
    if func():
       ^^^^^^
  File "C:\spark\python\py

# Extracting File Extension

In [2]:
from pyspark.sql.functions import element_at, split

# Extract the file extension as the LAST dot-separated token of the filename.
#
# Indexing the split result with getItem(1) picks the second token, which is
# only the extension when the name contains exactly one dot
# ("archive.tar.gz" would yield "tar"). It also disagreed with the Spark SQL
# variant of this step, which uses SUBSTRING_INDEX(filename, '.', -1).
# element_at with index -1 takes the last array element instead.
#
# Note: a filename without any dot yields the whole filename here (same as the
# SUBSTRING_INDEX form) rather than null.
filename_parts = split(col("filename"), "\\.")  # the pattern is a regex, so the dot is escaped
df_with_ext = df.withColumn("extension", element_at(filename_parts, -1))
df_with_ext.show()


+----------+--------+----------+------------------+---------+
| orderDate|OrderKey|customerID|          filename|extension|
+----------+--------+----------+------------------+---------+
|2024-10-30|  ORD001|   CUST123|invoice_report.pdf|      pdf|
|2024-10-31|  ORD002|   CUST456| order_summary.csv|      csv|
|2024-11-01|  ORD003|   CUST789|  client_data.xlsx|     xlsx|
+----------+--------+----------+------------------+---------+



In [3]:
# Register the DataFrame as the temporary view "orders" so it can be queried with SQL
df.createOrReplaceTempView("orders")

# SQL form of the extension extraction: SUBSTRING_INDEX with a count of -1
# keeps everything to the right of the last '.' in filename.
extension_query = """
SELECT *, 
       SUBSTRING_INDEX(filename, '.', -1) AS extension
FROM orders
"""

df_with_ext_sql = spark.sql(extension_query)
df_with_ext_sql.show()


+----------+--------+----------+------------------+---------+
| orderDate|OrderKey|customerID|          filename|extension|
+----------+--------+----------+------------------+---------+
|2024-10-30|  ORD001|   CUST123|invoice_report.pdf|      pdf|
|2024-10-31|  ORD002|   CUST456| order_summary.csv|      csv|
|2024-11-01|  ORD003|   CUST789|  client_data.xlsx|     xlsx|
+----------+--------+----------+------------------+---------+



# Extracting the Text After the Last Occurrence of a Delimiter

In [4]:
# Keep the part of each filename that follows its last underscore
# (substring_index with a count of -1 works from the right-hand side).
after_last_underscore = expr("substring_index(filename, '_', -1)")

df_with_last_occurrence = df.withColumn("last_occurrence", after_last_underscore)
df_with_last_occurrence.show()


+----------+--------+----------+------------------+---------------+
| orderDate|OrderKey|customerID|          filename|last_occurrence|
+----------+--------+----------+------------------+---------------+
|2024-10-30|  ORD001|   CUST123|invoice_report.pdf|     report.pdf|
|2024-10-31|  ORD002|   CUST456| order_summary.csv|    summary.csv|
|2024-11-01|  ORD003|   CUST789|  client_data.xlsx|      data.xlsx|
+----------+--------+----------+------------------+---------------+



In [5]:
# SQL form of the same step: select every column of the "orders" view plus
# the text that follows the last '_' in filename.
last_occurrence_query = """
SELECT *,
       SUBSTRING_INDEX(filename, '_', -1) AS last_occurrence
FROM orders
"""

df_with_last_occurrence_sql = spark.sql(last_occurrence_query)
df_with_last_occurrence_sql.show()


+----------+--------+----------+------------------+---------------+
| orderDate|OrderKey|customerID|          filename|last_occurrence|
+----------+--------+----------+------------------+---------------+
|2024-10-30|  ORD001|   CUST123|invoice_report.pdf|     report.pdf|
|2024-10-31|  ORD002|   CUST456| order_summary.csv|    summary.csv|
|2024-11-01|  ORD003|   CUST789|  client_data.xlsx|      data.xlsx|
+----------+--------+----------+------------------+---------------+



# Reversing the filename column using reverse



In [6]:
# Add a column holding each filename with its characters in reverse order
reversed_name = reverse(col("filename"))

df_with_reversed = df.withColumn("reversed_filename", reversed_name)
df_with_reversed.show()


+----------+--------+----------+------------------+------------------+
| orderDate|OrderKey|customerID|          filename| reversed_filename|
+----------+--------+----------+------------------+------------------+
|2024-10-30|  ORD001|   CUST123|invoice_report.pdf|fdp.troper_eciovni|
|2024-10-31|  ORD002|   CUST456| order_summary.csv| vsc.yrammus_redro|
|2024-11-01|  ORD003|   CUST789|  client_data.xlsx|  xslx.atad_tneilc|
+----------+--------+----------+------------------+------------------+



In [7]:
# SQL form of the reversal: select every column of the "orders" view plus
# the character-reversed filename.
reverse_query = """
SELECT *,
       REVERSE(filename) AS reversed_filename
FROM orders
"""

df_with_reversed_sql = spark.sql(reverse_query)
df_with_reversed_sql.show()


+----------+--------+----------+------------------+------------------+
| orderDate|OrderKey|customerID|          filename| reversed_filename|
+----------+--------+----------+------------------+------------------+
|2024-10-30|  ORD001|   CUST123|invoice_report.pdf|fdp.troper_eciovni|
|2024-10-31|  ORD002|   CUST456| order_summary.csv| vsc.yrammus_redro|
|2024-11-01|  ORD003|   CUST789|  client_data.xlsx|  xslx.atad_tneilc|
+----------+--------+----------+------------------+------------------+

