In [1]:
import os
# Working directory before the switch (not the cell's last expression, so the
# notebook does not display it).
os.getcwd()
# Raw string: "\p" is not a recognised escape sequence, and newer Python
# versions warn about such escapes in ordinary string literals. The path value
# itself is unchanged.
os.chdir(r"H:\pyspark_advanced-coding_interview")
# Last expression of the cell: echoes the new working directory.
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

# How to sort alphanumeric data | alphabets and numbers in correct order | PATINDEX

In [2]:
from pyspark.sql import SparkSession

# Obtain the Spark entry point for this notebook.
spark = (
    SparkSession.builder
    .appName("SortAlphanumericData")
    .getOrCreate()
)

# Demo rows as (id, label); every label is a letter prefix followed by digits.
data = [
    (1, "A1"), (2, "A10"), (3, "A2"),
    (4, "B1"), (5, "B10"), (6, "B2"),
    (7, "C3"), (8, "C1"), (9, "C20"),
    (10, "D100"), (11, "D10"), (12, "D2"),
    (13, "E25"), (14, "E5"),
    (15, "F7"),
    (16, "Z1"), (17, "Z10"),
]

# Materialise the rows as a two-column DataFrame.
df = spark.createDataFrame(data, schema=["id", "alphanumeric"])

# Register the DataFrame under the table name the SQL cells query.
df.createOrReplaceTempView("alphanumeric_data")

# Display the unsorted input without truncating cell contents.
df.show(truncate=False)


+---+------------+
|id |alphanumeric|
+---+------------+
|1  |A1          |
|2  |A10         |
|3  |A2          |
|4  |B1          |
|5  |B10         |
|6  |B2          |
|7  |C3          |
|8  |C1          |
|9  |C20         |
|10 |D100        |
|11 |D10         |
|12 |D2          |
|13 |E25         |
|14 |E5          |
|15 |F7          |
|16 |Z1          |
|17 |Z10         |
+---+------------+



In [3]:
# Natural sort in Spark SQL: split each value into its leading letters and its
# trailing digits, then order by the letters and by the digits as an integer.
#
# NULLIF(..., '') guards the cast: regexp_extract returns an empty string when
# the value has no trailing digits, and casting '' to INT raises an error when
# ANSI mode (spark.sql.ansi.enabled) is on. With the guard such rows simply get
# a NULL `numbers` value in every mode.
#
# The statement is written without a trailing ';' because it is not part of
# the query and is not accepted by every Spark version's parser.
res = spark.sql("""
    SELECT *,
           regexp_extract(alphanumeric, '^[A-Za-z]+', 0) AS letters,
           CAST(NULLIF(regexp_extract(alphanumeric, '[0-9]+$', 0), '') AS INT) AS numbers
    FROM alphanumeric_data
    ORDER BY letters ASC, numbers ASC
""")
res.show()

+---+------------+-------+-------+
| id|alphanumeric|letters|numbers|
+---+------------+-------+-------+
|  1|          A1|      A|      1|
|  3|          A2|      A|      2|
|  2|         A10|      A|     10|
|  4|          B1|      B|      1|
|  6|          B2|      B|      2|
|  5|         B10|      B|     10|
|  8|          C1|      C|      1|
|  7|          C3|      C|      3|
|  9|         C20|      C|     20|
| 12|          D2|      D|      2|
| 11|         D10|      D|     10|
| 10|        D100|      D|    100|
| 14|          E5|      E|      5|
| 13|         E25|      E|     25|
| 15|          F7|      F|      7|
| 16|          Z1|      Z|      1|
| 17|         Z10|      Z|     10|
+---+------------+-------+-------+



In [4]:
from pyspark.sql.functions import col, regexp_extract, when

# Natural sort with the DataFrame API: derive a `letters` column (leading
# letters) and a `numbers` column (trailing digits as an integer), then order
# by both.
#
# `when(... != "", ...)` without an `otherwise` yields NULL for values that have
# no trailing digits, so the cast never sees the empty string that
# regexp_extract returns on a non-match (casting "" to int raises an error when
# ANSI mode is enabled).
df_sorted = (
    df.withColumn("letters", regexp_extract("alphanumeric", "^[A-Za-z]+", 0))
    .withColumn(
        "numbers",
        when(
            regexp_extract("alphanumeric", "[0-9]+$", 0) != "",
            regexp_extract("alphanumeric", "[0-9]+$", 0),
        ).cast("int"),
    )
    .orderBy(col("letters").asc(), col("numbers").asc())
)

# Show the sorted DataFrame
df_sorted.show(truncate=False)


+---+------------+-------+-------+
|id |alphanumeric|letters|numbers|
+---+------------+-------+-------+
|1  |A1          |A      |1      |
|3  |A2          |A      |2      |
|2  |A10         |A      |10     |
|4  |B1          |B      |1      |
|6  |B2          |B      |2      |
|5  |B10         |B      |10     |
|8  |C1          |C      |1      |
|7  |C3          |C      |3      |
|9  |C20         |C      |20     |
|12 |D2          |D      |2      |
|11 |D10         |D      |10     |
|10 |D100        |D      |100    |
|14 |E5          |E      |5      |
|13 |E25         |E      |25     |
|15 |F7          |F      |7      |
|16 |Z1          |Z      |1      |
|17 |Z10         |Z      |10     |
+---+------------+-------+-------+



In [5]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType

# UDF to extract letters
def extract_letters(s):
    """Return the first run of ASCII letters found in ``s``.

    Args:
        s: The string to scan, or ``None`` (e.g. a null column value).

    Returns:
        The first substring matching ``[A-Za-z]+``, or ``None`` when ``s`` is
        ``None`` or contains no ASCII letters.
    """
    import re  # imported locally; this cell has no module-level `re` import

    # A null input would make the regex call raise TypeError; pass it through.
    if s is None:
        return None

    # A single search replaces running the same findall twice.
    match = re.search(r'[A-Za-z]+', s)
    return match.group(0) if match else None

# UDF to extract numbers
def extract_numbers(s):
    """Return the first run of digits found in ``s`` as an integer.

    Args:
        s: The string to scan, or ``None`` (e.g. a null column value).

    Returns:
        ``int`` of the first substring matching ``\\d+``, or ``None`` when
        ``s`` is ``None`` or contains no digits.
    """
    import re  # imported locally; this cell has no module-level `re` import

    # A null input would make the regex call raise TypeError; pass it through.
    if s is None:
        return None

    # A single search replaces running the same findall twice.
    match = re.search(r'\d+', s)
    return int(match.group(0)) if match else None

# Wrap the plain-Python helpers as Spark UDFs with explicit return types.
extract_letters_udf = udf(extract_letters, returnType=StringType())
extract_numbers_udf = udf(extract_numbers, returnType=IntegerType())

# Derive the two sort keys through the UDFs, then order by letters and numbers.
df_sorted_udf = (
    df.withColumn("letters", extract_letters_udf("alphanumeric"))
    .withColumn("numbers", extract_numbers_udf("alphanumeric"))
    .orderBy("letters", "numbers")
)

df_sorted_udf.show(truncate=False)


+---+------------+-------+-------+
|id |alphanumeric|letters|numbers|
+---+------------+-------+-------+
|1  |A1          |A      |1      |
|3  |A2          |A      |2      |
|2  |A10         |A      |10     |
|4  |B1          |B      |1      |
|6  |B2          |B      |2      |
|5  |B10         |B      |10     |
|8  |C1          |C      |1      |
|7  |C3          |C      |3      |
|9  |C20         |C      |20     |
|12 |D2          |D      |2      |
|11 |D10         |D      |10     |
|10 |D100        |D      |100    |
|14 |E5          |E      |5      |
|13 |E25         |E      |25     |
|15 |F7          |F      |7      |
|16 |Z1          |Z      |1      |
|17 |Z10         |Z      |10     |
+---+------------+-------+-------+



In [6]:
# Custom bucket ordering with CASE: values starting with 'A' get rank 1, values
# starting with 'B' get rank 2, and everything else gets rank 3.
# Rows are ordered by that rank first and by the raw `alphanumeric` string
# second. The tie-break is a plain string comparison, so within a bucket "A10"
# sorts before "A2" (unlike the letters/numbers split used in earlier cells).
res1 = spark.sql(""" 
                
SELECT *,
       CASE WHEN alphanumeric LIKE 'A%' THEN 1
            WHEN alphanumeric LIKE 'B%' THEN 2
            ELSE 3
       END AS custom_order
FROM alphanumeric_data
ORDER BY custom_order, alphanumeric;

              
                """)
# Display the bucketed ordering.
res1.show()

+---+------------+------------+
| id|alphanumeric|custom_order|
+---+------------+------------+
|  1|          A1|           1|
|  2|         A10|           1|
|  3|          A2|           1|
|  4|          B1|           2|
|  5|         B10|           2|
|  6|          B2|           2|
|  8|          C1|           3|
|  9|         C20|           3|
|  7|          C3|           3|
| 11|         D10|           3|
| 10|        D100|           3|
| 12|          D2|           3|
| 13|         E25|           3|
| 14|          E5|           3|
| 15|          F7|           3|
| 16|          Z1|           3|
| 17|         Z10|           3|
+---+------------+------------+

