# trim leading zeroes

In [2]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
print(os.getcwd())


from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder 
    .appName("OptimizedLocalSpark") 
    .config("spark.driver.memory", "8g")        
    .config("spark.executor.memory", "8g")    
    .config("spark.executor.cores", "4")       
    .config("spark.cores.max", "12")           
    .config("spark.sql.shuffle.partitions", "28")  
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
    .getOrCreate()
)
sc = spark.sparkContext


H:\pyspark_advanced-coding_interview


In [2]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("TrimLeadingZeroes").getOrCreate()

# Sample Data: Strings with leading zeroes
data = [
    (1, "001234"),
    (2, "0005678"),
    (3, "012345"),
    (4, "0000001"),
    (5, "12345"),
    (6, "098765"),
    (7, "000900"),
    (8, "004500"),
    (9, "00000"),
    (10, "007")
]

# Create DataFrame
df = spark.createDataFrame(data, ["id", "number_string"])

# Create a Temporary View for Spark SQL
df.createOrReplaceTempView("number_table")

# Show the Original DataFrame
df.show(truncate=False)



+---+-------------+
|id |number_string|
+---+-------------+
|1  |001234       |
|2  |0005678      |
|3  |012345       |
|4  |0000001      |
|5  |12345        |
|6  |098765       |
|7  |000900       |
|8  |004500       |
|9  |00000        |
|10 |007          |
+---+-------------+



In [3]:

# instr: Returns the position of the first occurrence of 0 in the string. If not found, it returns 0.
res = spark.sql(""" 
                
  SELECT id, number_string, instr(number_string, '0') AS first_zero_position
FROM number_table;
              
                """)
res.show()






+---+-------------+-------------------+
| id|number_string|first_zero_position|
+---+-------------+-------------------+
|  1|       001234|                  1|
|  2|      0005678|                  1|
|  3|       012345|                  1|
|  4|      0000001|                  1|
|  5|        12345|                  0|
|  6|       098765|                  1|
|  7|       000900|                  1|
|  8|       004500|                  1|
|  9|        00000|                  1|
| 10|          007|                  1|
+---+-------------+-------------------+



In [4]:
# regexp_extract: Extracts everything before the first occurrence of 0. length provides the index.
#Conditional Logic: Returns 0 if no zero is found

res1 = spark.sql(""" 
                
SELECT id, number_string, 
       CASE WHEN number_string RLIKE '0' THEN length(regexp_extract(number_string, '^(.*?)0', 1)) + 1 ELSE 0 END AS first_zero_position
FROM number_table;

              
                """)
res1.show()

+---+-------------+-------------------+
| id|number_string|first_zero_position|
+---+-------------+-------------------+
|  1|       001234|                  1|
|  2|      0005678|                  1|
|  3|       012345|                  1|
|  4|      0000001|                  1|
|  5|        12345|                  0|
|  6|       098765|                  1|
|  7|       000900|                  1|
|  8|       004500|                  1|
|  9|        00000|                  1|
| 10|          007|                  1|
+---+-------------+-------------------+



In [5]:
from pyspark.sql.functions import expr

# Use rlike and expr to mimic PATINDEX behavior
df_patindex = df.withColumn("first_zero_position", 
                            expr("CASE WHEN number_string RLIKE '0' THEN length(regexp_extract(number_string, '^(.*?)0', 1)) + 1 ELSE 0 END"))

df_patindex.show(truncate=False)


+---+-------------+-------------------+
|id |number_string|first_zero_position|
+---+-------------+-------------------+
|1  |001234       |1                  |
|2  |0005678      |1                  |
|3  |012345       |1                  |
|4  |0000001      |1                  |
|5  |12345        |0                  |
|6  |098765       |1                  |
|7  |000900       |1                  |
|8  |004500       |1                  |
|9  |00000        |1                  |
|10 |007          |1                  |
+---+-------------+-------------------+



In [6]:
res2 = spark.sql(""" 
                
SELECT id, number_string, 
       CASE WHEN number_string RLIKE '[1-9]' THEN length(regexp_extract(number_string, '^(.*?)\\d', 1)) + 1 ELSE 0 END AS first_non_zero_position
FROM number_table;


              
                """)
res2.show()


df_non_zero_pos = df.withColumn("first_non_zero_position", 
                                expr("CASE WHEN number_string RLIKE '[1-9]' THEN length(regexp_extract(number_string, '^(.*?)\\d', 1)) + 1 ELSE 0 END"))

df_non_zero_pos.show(truncate=False)


+---+-------------+-----------------------+
| id|number_string|first_non_zero_position|
+---+-------------+-----------------------+
|  1|       001234|                      1|
|  2|      0005678|                      1|
|  3|       012345|                      1|
|  4|      0000001|                      1|
|  5|        12345|                      1|
|  6|       098765|                      1|
|  7|       000900|                      1|
|  8|       004500|                      1|
|  9|        00000|                      0|
| 10|          007|                      1|
+---+-------------+-----------------------+

+---+-------------+-----------------------+
|id |number_string|first_non_zero_position|
+---+-------------+-----------------------+
|1  |001234       |1                      |
|2  |0005678      |1                      |
|3  |012345       |1                      |
|4  |0000001      |1                      |
|5  |12345        |1                      |
|6  |098765       |1           