In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

# Spark SQL

In [2]:
# Create a Spark Session (if not already created)
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("AlphabeticSequence").getOrCreate()

# Generate the first and second letter sequences using ASCII values
query = spark.sql( """
WITH Alphabet AS (
    SELECT CHAR(SEQUENCE.ID) AS Letter
    FROM (SELECT EXPLODE(SEQUENCE(65, 90)) AS ID) AS SEQUENCE
)
SELECT CONCAT(A.Letter, B.Letter) AS TwoDigitAlpha
FROM Alphabet A
CROSS JOIN Alphabet B
ORDER BY TwoDigitAlpha
"""
)
query.show()


query.show(10)  # Show the first 10 results


+-------------+
|TwoDigitAlpha|
+-------------+
|           AA|
|           AB|
|           AC|
|           AD|
|           AE|
|           AF|
|           AG|
|           AH|
|           AI|
|           AJ|
|           AK|
|           AL|
|           AM|
|           AN|
|           AO|
|           AP|
|           AQ|
|           AR|
|           AS|
|           AT|
+-------------+
only showing top 20 rows

+-------------+
|TwoDigitAlpha|
+-------------+
|           AA|
|           AB|
|           AC|
|           AD|
|           AE|
|           AF|
|           AG|
|           AH|
|           AI|
|           AJ|
+-------------+
only showing top 10 rows



In [3]:
query2 = spark.sql("""WITH Letters AS (
    SELECT char(ascii('A') + n) AS Letter
    FROM (
        SELECT explode(array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)) AS n
    )
)
SELECT CONCAT(l1.Letter, l2.Letter) AS TwoDigitAlphabetic
FROM Letters l1
CROSS JOIN Letters l2
ORDER BY TwoDigitAlphabetic;
""")
query2.show()

+------------------+
|TwoDigitAlphabetic|
+------------------+
|                AA|
|                AB|
|                AC|
|                AD|
|                AE|
|                AF|
|                AG|
|                AH|
|                AI|
|                AJ|
|                AK|
|                AL|
|                AM|
|                AN|
|                AO|
|                AP|
|                AQ|
|                AR|
|                AS|
|                AT|
+------------------+
only showing top 20 rows



In [4]:
query3 = spark.sql(""" 
    WITH Alphabet AS (
    SELECT char(ASCII('A') + idx) AS Letter
    FROM (
        SELECT posexplode(array_repeat(0, 26)) AS (idx, _)
    )
)
SELECT CONCAT(A.Letter, B.Letter) AS TwoDigitAlphabetic
FROM Alphabet A
CROSS JOIN Alphabet B
ORDER BY TwoDigitAlphabetic;
               
                   """)

query3.show()

+------------------+
|TwoDigitAlphabetic|
+------------------+
|                AA|
|                AB|
|                AC|
|                AD|
|                AE|
|                AF|
|                AG|
|                AH|
|                AI|
|                AJ|
|                AK|
|                AL|
|                AM|
|                AN|
|                AO|
|                AP|
|                AQ|
|                AR|
|                AS|
|                AT|
+------------------+
only showing top 20 rows



In [7]:
query4 = spark.sql(""" 
WITH Alphabet AS (
    SELECT char(ASCII('A') + idx) AS Letter
    FROM (
        SELECT posexplode(array_repeat(0, 26)) AS (idx, _)
    )
)
SELECT CONCAT(A.Letter, B.Letter) AS TwoDigitAlphabetic
FROM Alphabet A
CROSS JOIN Alphabet B
ORDER BY TwoDigitAlphabetic;


              
                   """)

query4.show()

+------------------+
|TwoDigitAlphabetic|
+------------------+
|                AA|
|                AB|
|                AC|
|                AD|
|                AE|
|                AF|
|                AG|
|                AH|
|                AI|
|                AJ|
|                AK|
|                AL|
|                AM|
|                AN|
|                AO|
|                AP|
|                AQ|
|                AR|
|                AS|
|                AT|
+------------------+
only showing top 20 rows



# Pyspark

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, ascii, col

# Create Spark session
spark = SparkSession.builder.appName("AlphabetSequence").getOrCreate()

# Generate DataFrame for letters A-Z using ASCII values
letters = spark.sql("SELECT char(ascii('A') + n) AS Letter FROM (SELECT explode(sequence(0, 25)) AS n)")

# Use Cartesian product to create combinations
two_letter_combinations = letters.alias("l1").crossJoin(letters.alias("l2")) \
    .select(expr("concat(l1.Letter, l2.Letter) AS TwoDigitAlphabetic")) \
    .orderBy("TwoDigitAlphabetic")

two_letter_combinations.show(52)


+------------------+
|TwoDigitAlphabetic|
+------------------+
|                AA|
|                AB|
|                AC|
|                AD|
|                AE|
|                AF|
|                AG|
|                AH|
|                AI|
|                AJ|
|                AK|
|                AL|
|                AM|
|                AN|
|                AO|
|                AP|
|                AQ|
|                AR|
|                AS|
|                AT|
|                AU|
|                AV|
|                AW|
|                AX|
|                AY|
|                AZ|
|                BA|
|                BB|
|                BC|
|                BD|
|                BE|
|                BF|
|                BG|
|                BH|
|                BI|
|                BJ|
|                BK|
|                BL|
|                BM|
|                BN|
|                BO|
|                BP|
|                BQ|
|                BR|
|            

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize Spark Session
spark = SparkSession.builder.appName("TwoDigitAlphabeticSequence").getOrCreate()

# Create a DataFrame with letters A to Z
letters = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
df1 = spark.createDataFrame([(l,) for l in letters], ["Letter1"])
df2 = spark.createDataFrame([(l,) for l in letters], ["Letter2"])

# Perform a cartesian join to generate two-letter sequences
two_letter_sequence = df1.crossJoin(df2).select(col("Letter1"), col("Letter2"))

# Combine the two letters to form a two-letter code
result_df = two_letter_sequence.withColumn("TwoLetterCode", col("Letter1") + col("Letter2"))
result_df.show(5)  # Show first 5 results for brevity



df1.createOrReplaceTempView("Letters1")
df2.createOrReplaceTempView("Letters2")

query6 = spark.sql("""
SELECT CONCAT(Letter1, Letter2) AS TwoLetterCode
FROM Letters1 CROSS JOIN Letters2
""")

query6.show()


+-------+-------+-------------+
|Letter1|Letter2|TwoLetterCode|
+-------+-------+-------------+
|      A|      A|         null|
|      A|      B|         null|
|      A|      C|         null|
|      A|      D|         null|
|      A|      E|         null|
+-------+-------+-------------+
only showing top 5 rows

+-------------+
|TwoLetterCode|
+-------------+
|           AA|
|           AB|
|           AC|
|           AD|
|           AE|
|           AF|
|           AG|
|           AH|
|           AI|
|           AJ|
|           AK|
|           AL|
|           AM|
|           AN|
|           AO|
|           AP|
|           AQ|
|           AR|
|           AS|
|           AT|
+-------------+
only showing top 20 rows



In [11]:
from pyspark.sql.functions import lit

# Create a DataFrame for alphabet letters using ASCII values
alphabet_df = spark.createDataFrame([(chr(i),) for i in range(ord('A'), ord('Z') + 1)], ["Letter"])

# Generate the sequence using two loops and a list comprehension
sequence_data = [(chr(i) + chr(j),) for i in range(ord('A'), ord('Z') + 1) for j in range(ord('A'), ord('Z') + 1)]

# Convert to DataFrame
sequence_df = spark.createDataFrame(sequence_data, ["TwoLetterCode"])
sequence_df.show(5)  # Show first 5 results for brevity


+-------------+
|TwoLetterCode|
+-------------+
|           AA|
|           AB|
|           AC|
|           AD|
|           AE|
+-------------+
only showing top 5 rows



In [12]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

# Define a UDF to generate two-letter sequences
def generate_two_letter_code():
    return [chr(i) + chr(j) for i in range(ord('A'), ord('Z') + 1) for j in range(ord('A'), ord('Z') + 1)]

# Convert the list to a PySpark DataFrame
two_letter_code_df = spark.createDataFrame(generate_two_letter_code(), StringType()).toDF("TwoLetterCode")

# Show the DataFrame
two_letter_code_df.show(5)  # Show first 5 results for brevity


+-------------+
|TwoLetterCode|
+-------------+
|           AA|
|           AB|
|           AC|
|           AD|
|           AE|
+-------------+
only showing top 5 rows

