In [1]:
import os

# Working directory for this notebook. Written as a raw string: in a regular
# literal "\p" is an invalid escape sequence, which Python warns about and
# which is slated to become a syntax error.
PROJECT_DIR = r"H:\pyspark_advanced-coding_interview"
os.chdir(PROJECT_DIR)
print(os.getcwd())


from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder
    .appName("OptimizedLocalSpark")
    # Memory / core sizing requested for the driver and executors.
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "4")
    .config("spark.cores.max", "12")
    # Number of partitions used for shuffles (joins, aggregations).
    .config("spark.sql.shuffle.partitions", "28")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)
# Low-level SparkContext handle belonging to the session above.
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


# Find the item pairs most frequently purchased together

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain a Spark session via getOrCreate() under the app name "FrequentItems".
spark = (
    SparkSession.builder
    .appName("FrequentItems")
    .getOrCreate()
)

# Sample purchases: one (transaction_id, item) tuple per purchased item,
# laid out one transaction per line.
data = [
    (1, 'apple'), (1, 'banana'), (1, 'orange'),
    (2, 'banana'), (2, 'apple'),
    (3, 'orange'), (3, 'apple'),
    (4, 'banana'), (4, 'orange'),
    (5, 'apple'), (5, 'banana'),
]

# Build the DataFrame with named columns and print it.
df = spark.createDataFrame(data, schema=["transaction_id", "item"])
df.show()

# Expose the DataFrame to Spark SQL as the temp view "purchases".
df.createOrReplaceTempView("purchases")






+--------------+------+
|transaction_id|  item|
+--------------+------+
|             1| apple|
|             1|banana|
|             1|orange|
|             2|banana|
|             2| apple|
|             3|orange|
|             3| apple|
|             4|banana|
|             4|orange|
|             5| apple|
|             5|banana|
+--------------+------+



----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 54618)
Traceback (most recent call last):
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "c:\Users\lpdda\AppData\Local\Programs\Python\Python311\Lib\socketserver.py", line 755, in __init__
    self.handle()
  File "C:\spark\python\pyspark\accumulators.py", line 281, in handle
    poll(accum_updates)
  File "C:\spark\python\pyspark\accumulators.py", line 253, in poll
    if func():
       ^^^^^^
  File "C:\spark\python\py

In [4]:
# Group items by transaction_id. array_distinct removes repeats of the same
# item inside one transaction, so a basket like [apple, apple, banana] only
# contributes the (apple, banana) pair once instead of inflating its count.
grouped_df = df.groupBy("transaction_id").agg(
    F.array_distinct(F.collect_list("item")).alias("items")
)
grouped_df.show()


from pyspark.sql.functions import explode

# Two exploded views of the same baskets: one row per (transaction, item).
left_items = grouped_df.select("transaction_id", explode("items").alias("item_1"))
right_items = grouped_df.select("transaction_id", explode("items").alias("item_2"))

# Self-join on the transaction to form every item combination in a basket.
# Keeping only item_1 < item_2 drops self-pairs and the mirrored (b, a)
# duplicate of each (a, b) pair.
pair_df = (
    left_items
    .join(right_items, "transaction_id")
    .filter("item_1 < item_2")
)

# Count item pair occurrences. The item columns act as a tie-breaker so pairs
# with equal counts are always listed in the same order.
pair_count_df = (
    pair_df
    .groupBy("item_1", "item_2")
    .count()
    .orderBy(F.desc("count"), "item_1", "item_2")
)
pair_count_df.show()



+--------------+--------------------+
|transaction_id|               items|
+--------------+--------------------+
|             1|[apple, banana, o...|
|             2|     [banana, apple]|
|             3|     [orange, apple]|
|             4|    [banana, orange]|
|             5|     [apple, banana]|
+--------------+--------------------+

+------+------+-----+
|item_1|item_2|count|
+------+------+-----+
| apple|banana|    3|
| apple|orange|    2|
|banana|orange|    2|
+------+------+-----+



In [5]:
# Same pair count expressed in Spark SQL against the "purchases" temp view.
# - The self-join with t1.item < t2.item yields each unordered pair once per
#   matching row combination within a transaction.
# - COUNT(DISTINCT transaction_id) counts transactions rather than joined
#   rows, so a duplicated (transaction_id, item) row cannot inflate a pair.
# - item_1 / item_2 in ORDER BY make the order of tied pairs deterministic.
spark.sql("""
    SELECT t1.item AS item_1, t2.item AS item_2, COUNT(DISTINCT t1.transaction_id) AS count
    FROM purchases t1
    JOIN purchases t2 ON t1.transaction_id = t2.transaction_id AND t1.item < t2.item
    GROUP BY t1.item, t2.item
    ORDER BY count DESC, item_1, item_2
""").show()


+------+------+-----+
|item_1|item_2|count|
+------+------+-----+
| apple|banana|    3|
| apple|orange|    2|
|banana|orange|    2|
+------+------+-----+

