In [1]:
import os

from pyspark.sql import SparkSession

# Switch the working directory to the project folder and echo it.
# The path is a raw string: in a normal literal "\p" is an invalid escape
# sequence, which recent Python versions warn about.
os.chdir(r"H:\pyspark_advanced-coding_interview")
print(os.getcwd())

# Build (or reuse) the Spark session with explicit resource settings
spark = (
    SparkSession.builder
    .appName("OptimizedLocalSpark")
    .config("spark.driver.memory", "8g")             # memory for the driver process
    .config("spark.executor.memory", "8g")           # memory per executor
    .config("spark.executor.cores", "4")             # cores per executor
    .config("spark.cores.max", "12")                 # upper bound on total cores for the app
    .config("spark.sql.shuffle.partitions", "28")    # partitions used by SQL shuffles
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Handle to the underlying SparkContext
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Obtain the Spark session for the forward-fill example
spark = (
    SparkSession.builder
    .appName("ForwardFillNULLValues")
    .getOrCreate()
)

# Column names for the sample rows below
columns = ["employee_id", "date", "logindate"]

# Sample rows: logindate is None on most rows, leaving gaps to fill
data = [
    # employee 1
    (1, '2024-10-01', None),
    (1, '2024-10-02', None),
    (1, '2024-10-03', '2024-10-03 08:00:00'),
    (1, '2024-10-04', None),
    (1, '2024-10-05', None),
    # employee 2
    (2, '2024-10-01', None),
    (2, '2024-10-02', '2024-10-02 09:00:00'),
    (2, '2024-10-03', None),
    (2, '2024-10-04', None),
    # employee 3
    (3, '2024-10-01', None),
    (3, '2024-10-02', None),
    (3, '2024-10-03', '2024-10-03 10:00:00'),
]

# Build the DataFrame from the rows and column names
df = spark.createDataFrame(data, schema=columns)

# Register it as a temp view so it can be queried with spark.sql
df.createOrReplaceTempView("employee_logins")
df.show()


+-----------+----------+-------------------+
|employee_id|      date|          logindate|
+-----------+----------+-------------------+
|          1|2024-10-01|               null|
|          1|2024-10-02|               null|
|          1|2024-10-03|2024-10-03 08:00:00|
|          1|2024-10-04|               null|
|          1|2024-10-05|               null|
|          2|2024-10-01|               null|
|          2|2024-10-02|2024-10-02 09:00:00|
|          2|2024-10-03|               null|
|          2|2024-10-04|               null|
|          3|2024-10-01|               null|
|          3|2024-10-02|               null|
|          3|2024-10-03|2024-10-03 10:00:00|
+-----------+----------+-------------------+



In [3]:
# Forward-fill logindate per employee with Spark SQL.
# LAST_VALUE(logindate, TRUE) ignores NULLs and the frame ends at the current
# row, so a row that already has a logindate gets its own value back; no
# COALESCE wrapper is needed. Rows before an employee's first non-NULL
# logindate have nothing to carry forward and stay NULL.
res_sql = spark.sql("""
SELECT employee_id,
       date,
       LAST_VALUE(logindate, TRUE) OVER (
           PARTITION BY employee_id ORDER BY date
           ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
       ) AS filled_logindate
FROM employee_logins
ORDER BY employee_id, date
""")
res_sql.show()


+-----------+----------+-------------------+
|employee_id|      date|   filled_logindate|
+-----------+----------+-------------------+
|          1|2024-10-01|               null|
|          1|2024-10-02|               null|
|          1|2024-10-03|2024-10-03 08:00:00|
|          1|2024-10-04|2024-10-03 08:00:00|
|          1|2024-10-05|2024-10-03 08:00:00|
|          2|2024-10-01|               null|
|          2|2024-10-02|2024-10-02 09:00:00|
|          2|2024-10-03|2024-10-02 09:00:00|
|          2|2024-10-04|2024-10-02 09:00:00|
|          3|2024-10-01|               null|
|          3|2024-10-02|               null|
|          3|2024-10-03|2024-10-03 10:00:00|
+-----------+----------+-------------------+



In [4]:
# Running frame per employee, in date order: from the first row of the
# partition through the current row
window_spec = (
    Window.partitionBy("employee_id")
    .orderBy("date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# `last` with `ignorenulls=True` picks the most recent non-null logindate
# inside that frame, which forward-fills the NULL values
df_filled = df.withColumn(
    "filled_logindate",
    F.last("logindate", ignorenulls=True).over(window_spec),
)
df_filled.show()


+-----------+----------+-------------------+-------------------+
|employee_id|      date|          logindate|   filled_logindate|
+-----------+----------+-------------------+-------------------+
|          1|2024-10-01|               null|               null|
|          1|2024-10-02|               null|               null|
|          1|2024-10-03|2024-10-03 08:00:00|2024-10-03 08:00:00|
|          1|2024-10-04|               null|2024-10-03 08:00:00|
|          1|2024-10-05|               null|2024-10-03 08:00:00|
|          2|2024-10-01|               null|               null|
|          2|2024-10-02|2024-10-02 09:00:00|2024-10-02 09:00:00|
|          2|2024-10-03|               null|2024-10-02 09:00:00|
|          2|2024-10-04|               null|2024-10-02 09:00:00|
|          3|2024-10-01|               null|               null|
|          3|2024-10-02|               null|               null|
|          3|2024-10-03|2024-10-03 10:00:00|2024-10-03 10:00:00|
+-----------+----------+-------------------+-------------------+

In [6]:
# Forward-fill logindate again, reusing `window_spec` from cell In [4]:
# take the latest non-null logindate seen so far for each employee
df_filled = df.withColumn(
    "filled_logindate",
    F.last("logindate", ignorenulls=True).over(window_spec),
)
df_filled.show()



+-----------+----------+-------------------+-------------------+
|employee_id|      date|          logindate|   filled_logindate|
+-----------+----------+-------------------+-------------------+
|          1|2024-10-01|               null|               null|
|          1|2024-10-02|               null|               null|
|          1|2024-10-03|2024-10-03 08:00:00|2024-10-03 08:00:00|
|          1|2024-10-04|               null|2024-10-03 08:00:00|
|          1|2024-10-05|               null|2024-10-03 08:00:00|
|          2|2024-10-01|               null|               null|
|          2|2024-10-02|2024-10-02 09:00:00|2024-10-02 09:00:00|
|          2|2024-10-03|               null|2024-10-02 09:00:00|
|          2|2024-10-04|               null|2024-10-02 09:00:00|
|          3|2024-10-01|               null|               null|
|          3|2024-10-02|               null|               null|
|          3|2024-10-03|2024-10-03 10:00:00|2024-10-03 10:00:00|
+-----------+----------+-------------------+-------------------+

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Obtain the Spark session for the currency-rate example
spark = (
    SparkSession.builder
    .appName("CurrencyRateGrp")
    .getOrCreate()
)

# Column names for the sample rows below
columns = ["CurrencyKey", "DateKey", "Rate"]

# Sample rows: Rate is None on some dates, leaving gaps to fill
data = [
    # currency 3
    (3, 20201229, 0.9998),
    (3, 20201230, 1.0009),
    (3, 20201231, 0.9108),
    (3, 20210101, None),
    (3, 20210102, None),
    (3, 20210103, None),
    # currency 4
    (4, 20201229, 0.8500),
    (4, 20201230, 0.8600),
    (4, 20201231, None),
    (4, 20210101, None),
    (4, 20210102, 0.8700),
    (4, 20210103, None),
]

# Build the DataFrame from the rows and column names
df = spark.createDataFrame(data, schema=columns)

# Register it as a temp view so it can be queried with spark.sql
df.createOrReplaceTempView("CurrencyRate")
df.show()


+-----------+--------+------+
|CurrencyKey| DateKey|  Rate|
+-----------+--------+------+
|          3|20201229|0.9998|
|          3|20201230|1.0009|
|          3|20201231|0.9108|
|          3|20210101|  null|
|          3|20210102|  null|
|          3|20210103|  null|
|          4|20201229|  0.85|
|          4|20201230|  0.86|
|          4|20201231|  null|
|          4|20210101|  null|
|          4|20210102|  0.87|
|          4|20210103|  null|
+-----------+--------+------+



In [9]:
# Forward-fill Rate per currency in two steps:
#   1. Grp = running count of non-NULL Rate values. COUNT(Rate) skips NULLs,
#      so a NULL row keeps the Grp of the last row that had a rate.
#   2. MAX(Rate) within each (CurrencyKey, Grp) copies that group's rate onto
#      its NULL rows.
# The ORDER BY belongs on the outer query: a sort inside the CTE does not
# determine the row order of the final result.
res_sql = spark.sql("""
WITH currencyGrp AS (
    SELECT
        CurrencyKey,
        DateKey,
        Rate,
        COUNT(Rate) OVER (PARTITION BY CurrencyKey ORDER BY DateKey) AS Grp
    FROM CurrencyRate
)
SELECT *,
       MAX(Rate) OVER (PARTITION BY CurrencyKey, Grp ORDER BY DateKey) AS currencyRate
FROM currencyGrp
ORDER BY CurrencyKey, DateKey
""")
res_sql.show()


+-----------+--------+------+---+------------+
|CurrencyKey| DateKey|  Rate|Grp|currencyRate|
+-----------+--------+------+---+------------+
|          3|20201229|0.9998|  1|      0.9998|
|          3|20201230|1.0009|  2|      1.0009|
|          3|20201231|0.9108|  3|      0.9108|
|          3|20210101|  null|  3|      0.9108|
|          3|20210102|  null|  3|      0.9108|
|          3|20210103|  null|  3|      0.9108|
|          4|20201229|  0.85|  1|        0.85|
|          4|20201230|  0.86|  2|        0.86|
|          4|20201231|  null|  2|        0.86|
|          4|20210101|  null|  2|        0.86|
|          4|20210102|  0.87|  3|        0.87|
|          4|20210103|  null|  3|        0.87|
+-----------+--------+------+---+------------+



In [10]:
# Running frame per CurrencyKey, ordered by DateKey: from the first row of
# the partition through the current row
window_spec = (
    Window.partitionBy("CurrencyKey")
    .orderBy("DateKey")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Grp is the running count of non-null Rate values: F.count skips nulls, so
# null rows repeat the previous row's count
df_with_grp = df.withColumn(
    "Grp",
    F.count("Rate").over(window_spec),
)
df_with_grp.orderBy("CurrencyKey", "DateKey").show()


+-----------+--------+------+---+
|CurrencyKey| DateKey|  Rate|Grp|
+-----------+--------+------+---+
|          3|20201229|0.9998|  1|
|          3|20201230|1.0009|  2|
|          3|20201231|0.9108|  3|
|          3|20210101|  null|  3|
|          3|20210102|  null|  3|
|          3|20210103|  null|  3|
|          4|20201229|  0.85|  1|
|          4|20201230|  0.86|  2|
|          4|20201231|  null|  2|
|          4|20210101|  null|  2|
|          4|20210102|  0.87|  3|
|          4|20210103|  null|  3|
+-----------+--------+------+---+

