In [1]:
import os
# Switch the notebook's working directory to the project folder.
# A raw string is used so the backslash in the Windows path is taken literally:
# in a normal string "\p" is an invalid escape sequence (newer Python versions
# warn about it) and a folder starting with e.g. "n" or "t" would silently
# turn into a newline/tab.
# NOTE(review): this is a machine-specific absolute path; consider making it
# configurable if the notebook is shared.
os.chdir(r"H:\pyspark_advanced-coding_interview")
print(os.getcwd())


from pyspark.sql import SparkSession

# Spark settings applied to the session builder, in this order.
# NOTE(review): the app name suggests local mode; confirm the executor/cores
# settings actually take effect for the master this notebook runs against.
spark_conf = [
    ("spark.driver.memory", "8g"),
    ("spark.executor.memory", "8g"),
    ("spark.executor.cores", "4"),
    ("spark.cores.max", "12"),
    ("spark.sql.shuffle.partitions", "28"),
    ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
]

# Build (or fetch the already-running) session with the settings above.
session_builder = SparkSession.builder.appName("OptimizedLocalSpark")
for conf_key, conf_value in spark_conf:
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


# Finding numbers that occur at least three times consecutively

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Get the Spark session for this example (getOrCreate returns the active
# session when one already exists).
spark = SparkSession.builder.appName("ConsecutiveOccurrences").getOrCreate()

# Sample values listed in id order; ids are assigned 1..14 below.
# Runs of length >= 3: 1 (ids 1-3), 2 (ids 4-6) and 5 (ids 10-13).
numbers_in_id_order = [1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 5, 5, 6]
data = list(enumerate(numbers_in_id_order, start=1))

columns = ["id", "number"]
df = spark.createDataFrame(data, columns)

# Expose the DataFrame to Spark SQL under the name "number_table".
df.createOrReplaceTempView("number_table")
df.show()


+---+------+
| id|number|
+---+------+
|  1|     1|
|  2|     1|
|  3|     1|
|  4|     2|
|  5|     2|
|  6|     2|
|  7|     3|
|  8|     3|
|  9|     4|
| 10|     5|
| 11|     5|
| 12|     5|
| 13|     5|
| 14|     6|
+---+------+



In [2]:
# Spark SQL solution: for every row look at its neighbours in id order.
# A row whose value equals both the previous and the next value sits in the
# middle of a run of at least three equal numbers; GROUP BY collapses the
# matches to one row per number.
consecutive_numbers_query = """
    SELECT number
    FROM (
        SELECT number,
               LAG(number) OVER (ORDER BY id) AS prev_num,
               LEAD(number) OVER (ORDER BY id) AS next_num
        FROM number_table
    ) AS t
    WHERE number = prev_num AND number = next_num
    GROUP BY number
"""

result_sql = spark.sql(consecutive_numbers_query)
result_sql.show()


+------+
|number|
+------+
|     1|
|     2|
|     5|
+------+



In [3]:
# DataFrame solution: flag the first row of every run, then turn the flags
# into a run identifier with a running sum.
window = Window.orderBy("id")
running_window = window.rowsBetween(Window.unboundedPreceding, Window.currentRow)

# 1 when the value differs from the previous row (or there is no previous
# row, since the comparison with NULL falls through to otherwise), else 0.
starts_new_run = F.when(F.col("number") == F.col("prev_num"), 0).otherwise(1)

df_with_lag = df.withColumn("prev_num", F.lag("number").over(window))
df_with_group = (
    df_with_lag
    .withColumn("group", starts_new_run)
    .withColumn("group_id", F.sum("group").over(running_window))
)

# Size of every run; keep the numbers that have a run of three or more rows.
df_group_count = df_with_group.groupBy("number", "group_id").count()
df_consecutive = (
    df_group_count
    .filter(F.col("count") >= 3)
    .select("number")
    .distinct()
)
df_consecutive.show()


+------+
|number|
+------+
|     1|
|     2|
|     5|
+------+



In [4]:
# Gap-and-islands via row-number difference.
# window_row numbers all rows in id order; window_per_number numbers the rows
# of each distinct value in id order.
window_row = Window.orderBy("id")
window_per_number = Window.partitionBy("number").orderBy("id")
df = df.withColumn("row_num", F.row_number().over(window_row))

# Within an uninterrupted run of the same number both row numbers advance by
# one per row, so their difference stays constant; a different value in
# between bumps only the global row number and therefore starts a new group.
# (The earlier version subtracted F.monotonically_increasing_id(), which is
# unrelated to runs of `number`, so the cell effectively counted total
# occurrences per number and matched the sample data only by coincidence.)
df = df.withColumn(
    "group",
    F.col("row_num") - F.row_number().over(window_per_number),
)

# Group by number and run key, count rows per run, keep runs of length >= 3.
df_grouped = df.groupBy("number", "group").count()
df_result = df_grouped.filter(F.col("count") >= 3).select("number").distinct()
df_result.show()


+------+
|number|
+------+
|     1|
|     2|
|     5|
+------+

