In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [2]:
from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder 
    .appName("OptimizedLocalSpark") 
    .config("spark.driver.memory", "8g")        
    .config("spark.executor.memory", "8g")    
    .config("spark.executor.cores", "4")       
    .config("spark.cores.max", "12")           
    .config("spark.sql.shuffle.partitions", "28")  
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
    .getOrCreate()
)
sc = spark.sparkContext


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, sum



# Extended sample data
data = [
    ("SO71858", "2008-01-04", 15275.1977),
    ("SO71895", "2008-03-22", 272.6468),
    ("SO71897", "2008-04-10", 14017.9083),
    ("SO71920", "2008-05-15", 3293.7761),
    ("SO71774", "2008-06-25", 972.785),
    ("SO71786", "2008-07-03", 87.0851),
    ("SO71782", "2008-08-13", 42452.6519),
    ("SO71783", "2008-08-16", 43962.7901),
    ("SO71793", "2008-09-21", 32663.5609),
    ("SO71796", "2008-10-11", 63686.2708),
    ("SO71797", "2008-10-25", 86222.8072),
    ("SO81758", "2009-01-14", 18275.4871),
    ("SO81895", "2009-03-30", 3172.468),
    ("SO81897", "2009-04-12", 16027.1903),
    ("SO81920", "2009-05-20", 4529.8761),
    ("SO91774", "2009-06-05", 19872.785),
    ("SO91786", "2009-07-13", 5687.0901),
    ("SO91782", "2009-08-19", 30242.6519),
    ("SO91783", "2009-09-25", 50962.7901),
    ("SO91793", "2009-10-18", 41263.7609),
    ("SO91896", "2009-11-05", 73686.2908),
    ("SO71858", "2010-01-04", 17275.1277),
    ("SO71895", "2010-03-22", 3272.7468),
    ("SO71897", "2010-04-10", 14417.9083),
    ("SO71920", "2010-05-15", 1293.7761),
    ("SO71774", "2010-06-25", 1472.785),
    ("SO71786", "2010-07-03", 187.0851),
    ("SO71782", "2010-08-13", 37452.6519),
    ("SO71783", "2010-09-16", 43962.2901),
    ("SO71793", "2010-10-21", 22663.5609)
]

# Define schema for DataFrame
columns = ["SalesOrderNumber", "OrderDate", "TotalDue"]

# Create DataFrame
df = spark.createDataFrame(data, columns)

# Convert OrderDate to proper Date type
df = df.withColumn("OrderDate", col("OrderDate").cast("date"))

# Register DataFrame as a SQL temporary view
df.createOrReplaceTempView("Sales")




df.cache()
# Register DataFrame as a SQL temporary view
df.createOrReplaceTempView("Sales")
df.show()


+----------------+----------+----------+
|SalesOrderNumber| OrderDate|  TotalDue|
+----------------+----------+----------+
|         SO71858|2008-01-04|15275.1977|
|         SO71895|2008-03-22|  272.6468|
|         SO71897|2008-04-10|14017.9083|
|         SO71920|2008-05-15| 3293.7761|
|         SO71774|2008-06-25|   972.785|
|         SO71786|2008-07-03|   87.0851|
|         SO71782|2008-08-13|42452.6519|
|         SO71783|2008-08-16|43962.7901|
|         SO71793|2008-09-21|32663.5609|
|         SO71796|2008-10-11|63686.2708|
|         SO71797|2008-10-25|86222.8072|
|         SO81758|2009-01-14|18275.4871|
|         SO81895|2009-03-30|  3172.468|
|         SO81897|2009-04-12|16027.1903|
|         SO81920|2009-05-20| 4529.8761|
|         SO91774|2009-06-05| 19872.785|
|         SO91786|2009-07-13| 5687.0901|
|         SO91782|2009-08-19|30242.6519|
|         SO91783|2009-09-25|50962.7901|
|         SO91793|2009-10-18|41263.7609|
+----------------+----------+----------+
only showing top

# Spark SQL

In [6]:
res = spark.sql("""
select SalesOrderNumber, OrderDate, TotalDue, 
first_value(SalesOrderNumber) over (partition by year(OrderDate) order by OrderDate) as FIRST_ORDER,
first_value(SalesOrderNumber) over (partition by year(OrderDate) order by OrderDate rows between unbounded preceding and current row) as FIRST_ORDER_FRAME,
last_value(SalesOrderNumber) over (partition by year(OrderDate) order by OrderDate) as LAST_ORDER,
last_value(SalesOrderNumber) over (partition by year(OrderDate) order by OrderDate rows between unbounded preceding and current row ) as LAST_ORDER_FRAME
from Sales             
                
                """)
res.show()

+----------------+----------+----------+-----------+-----------------+----------+----------------+
|SalesOrderNumber| OrderDate|  TotalDue|FIRST_ORDER|FIRST_ORDER_FRAME|LAST_ORDER|LAST_ORDER_FRAME|
+----------------+----------+----------+-----------+-----------------+----------+----------------+
|         SO71858|2008-01-04|15275.1977|    SO71858|          SO71858|   SO71858|         SO71858|
|         SO71895|2008-03-22|  272.6468|    SO71858|          SO71858|   SO71895|         SO71895|
|         SO71897|2008-04-10|14017.9083|    SO71858|          SO71858|   SO71897|         SO71897|
|         SO71920|2008-05-15| 3293.7761|    SO71858|          SO71858|   SO71920|         SO71920|
|         SO71774|2008-06-25|   972.785|    SO71858|          SO71858|   SO71774|         SO71774|
|         SO71786|2008-07-03|   87.0851|    SO71858|          SO71858|   SO71786|         SO71786|
|         SO71782|2008-08-13|42452.6519|    SO71858|          SO71858|   SO71782|         SO71782|
|         

In [8]:


# Get the first and last SalesOrderNumber using Window functions
res2 = spark.sql( """
SELECT 
    FIRST_VALUE(SalesOrderNumber) OVER (ORDER BY OrderDate ASC) AS First_SalesOrder,
    LAST_VALUE(SalesOrderNumber) OVER (ORDER BY OrderDate ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS Last_SalesOrder
FROM Sales
LIMIT 1
""")

res2.show()



+----------------+---------------+
|First_SalesOrder|Last_SalesOrder|
+----------------+---------------+
|         SO71858|        SO71793|
+----------------+---------------+



# PYSPARK

In [10]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, first, last
windowSpec = Window.orderBy("OrderDate")

df1 = df.withColumn("FirstOrder", first("SalesOrderNumber").over(windowSpec))\
        .withColumn("LastOrder", last("SalesOrderNumber").over(windowSpec)) \
        #.select("FirstOrder", "LastOrder").distinct()
df1.show()

+----------------+----------+----------+----------+---------+
|SalesOrderNumber| OrderDate|  TotalDue|FirstOrder|LastOrder|
+----------------+----------+----------+----------+---------+
|         SO71858|2008-01-04|15275.1977|   SO71858|  SO71858|
|         SO71895|2008-03-22|  272.6468|   SO71858|  SO71895|
|         SO71897|2008-04-10|14017.9083|   SO71858|  SO71897|
|         SO71920|2008-05-15| 3293.7761|   SO71858|  SO71920|
|         SO71774|2008-06-25|   972.785|   SO71858|  SO71774|
|         SO71786|2008-07-03|   87.0851|   SO71858|  SO71786|
|         SO71782|2008-08-13|42452.6519|   SO71858|  SO71782|
|         SO71783|2008-08-16|43962.7901|   SO71858|  SO71783|
|         SO71793|2008-09-21|32663.5609|   SO71858|  SO71793|
|         SO71796|2008-10-11|63686.2708|   SO71858|  SO71796|
|         SO71797|2008-10-25|86222.8072|   SO71858|  SO71797|
|         SO81758|2009-01-14|18275.4871|   SO71858|  SO81758|
|         SO81895|2009-03-30|  3172.468|   SO71858|  SO81895|
|       