In [0]:
# Demo sales records; each tuple is (customer name, category, subcategory, sales).
sampleDate = [
    ("Tom Prescott", "Furniture", "Chairs", 4000),
    ("Tom Prescott", "Furniture", "Tables", 5000),
    ("Tom Prescott", "Furniture", "Sofas", 6000),
    ("Tom Prescott", "Furniture", "Beds", 7000),
    ("Tom Prescott", "Office Supplies", "Binders", 8000),
    ("Tom Prescott", "Office Supplies", "Supplies", 4000),
    ("Tom Prescott", "Office Supplies", "Storage", 4000),
    ("Tom Prescott", "Office Supplies", "Fasteners", 6000),
    ("Tom Prescott", "Technology", "Machines", 20000),
    ("John Mur", "Technology", "Copiers", 20000),
    ("John Mur", "Technology", "Printers", 20000),
    ("John Mur", "Technology", "Scanners", 15000),
    ("John Mur", "Technology", "Projectors", 10000),
]

# Column headers, listed in the same order as the tuple fields above.
columns = ["Customer Name", "Category", "Subcategory", "Sales"]

# Build the DataFrame from the records and print it.
df = spark.createDataFrame(data=sampleDate, schema=columns)
df.show()

+-------------+---------------+-----------+-----+
|Customer Name|       Category|Subcategory|Sales|
+-------------+---------------+-----------+-----+
| Tom Prescott|      Furniture|     Chairs| 4000|
| Tom Prescott|      Furniture|     Tables| 5000|
| Tom Prescott|      Furniture|      Sofas| 6000|
| Tom Prescott|      Furniture|       Beds| 7000|
| Tom Prescott|Office Supplies|    Binders| 8000|
| Tom Prescott|Office Supplies|   Supplies| 4000|
| Tom Prescott|Office Supplies|    Storage| 4000|
| Tom Prescott|Office Supplies|  Fasteners| 6000|
| Tom Prescott|     Technology|   Machines|20000|
|     John Mur|     Technology|    Copiers|20000|
|     John Mur|     Technology|   Printers|20000|
|     John Mur|     Technology|   Scanners|15000|
|     John Mur|     Technology| Projectors|10000|
+-------------+---------------+-----------+-----+



In [0]:
from pyspark.sql.functions import * # needed for the transformation functions used below (col, row_number, rank, dense_rank, lag, lead, ...)
from pyspark.sql.types import * # needed when assigning data types (StructType, StructField, ...)
from pyspark.sql.window import * # needed when working with window functions (provides Window)
from pyspark.sql.column import *  # brings in the Column class; NOTE(review): col() itself appears to come from pyspark.sql.functions, not from this module - confirm
from pyspark.sql import SparkSession

In [0]:

# To create a window function in PySpark, there are two steps.
# Step 1: define the partitioning and ordering, either at table level (no partition) or partitioned by specific columns.
# Step 2: apply a window function over the window defined in step 1.

In [0]:
# Example, SQL at table level: over(order by sales desc)
# PySpark code at table level: Window.orderBy(col('sales').desc())

# Example, SQL at partition level: over(partition by category order by sales desc)
# PySpark code at partition level: Window.partitionBy("Category").orderBy(col('sales').desc())

In [0]:
# Add a row_number() column to the sales table, ordered by sales descending.

# Step 1: define a table-level window (no partitionBy), highest sales first.
table_partition = Window.orderBy(col('sales').desc())

# Step 2: number the rows over that window.
# The DataFrame is assigned first and displayed separately: show() only prints
# and returns None, so chaining it onto the assignment would leave df2 as None.
df2 = df.withColumn('row_number', row_number().over(table_partition))
df2.show()



+-------------+---------------+-----------+-----+----------+
|Customer Name|       Category|Subcategory|Sales|row_number|
+-------------+---------------+-----------+-----+----------+
| Tom Prescott|     Technology|   Machines|20000|         1|
|     John Mur|     Technology|    Copiers|20000|         2|
|     John Mur|     Technology|   Printers|20000|         3|
|     John Mur|     Technology|   Scanners|15000|         4|
|     John Mur|     Technology| Projectors|10000|         5|
| Tom Prescott|Office Supplies|    Binders| 8000|         6|
| Tom Prescott|      Furniture|       Beds| 7000|         7|
| Tom Prescott|      Furniture|      Sofas| 6000|         8|
| Tom Prescott|Office Supplies|  Fasteners| 6000|         9|
| Tom Prescott|      Furniture|     Tables| 5000|        10|
| Tom Prescott|      Furniture|     Chairs| 4000|        11|
| Tom Prescott|Office Supplies|   Supplies| 4000|        12|
| Tom Prescott|Office Supplies|    Storage| 4000|        13|
+-------------+---------

In [0]:
# Number and rank the rows inside each Category by sales (highest first),
# showing row_number(), rank() and dense_rank() side by side.

# Step 1: window partitioned by Category and ordered by sales descending.
sales_desc = col('sales').desc()
window_cat = Window.partitionBy("Category").orderBy(sales_desc)

# Step 2: attach the three ranking columns, all computed over the same window.
df2 = (
    df
    .withColumn('row_number', row_number().over(window_cat))
    .withColumn('rank', rank().over(window_cat))
    .withColumn('dense_rank', dense_rank().over(window_cat))
)
df2.show()

+-------------+---------------+-----------+-----+----------+----+----------+
|Customer Name|       Category|Subcategory|Sales|row_number|rank|dense_rank|
+-------------+---------------+-----------+-----+----------+----+----------+
| Tom Prescott|      Furniture|       Beds| 7000|         1|   1|         1|
| Tom Prescott|      Furniture|      Sofas| 6000|         2|   2|         2|
| Tom Prescott|      Furniture|     Tables| 5000|         3|   3|         3|
| Tom Prescott|      Furniture|     Chairs| 4000|         4|   4|         4|
| Tom Prescott|Office Supplies|    Binders| 8000|         1|   1|         1|
| Tom Prescott|Office Supplies|  Fasteners| 6000|         2|   2|         2|
| Tom Prescott|Office Supplies|   Supplies| 4000|         3|   3|         3|
| Tom Prescott|Office Supplies|    Storage| 4000|         4|   3|         3|
| Tom Prescott|     Technology|   Machines|20000|         1|   1|         1|
|     John Mur|     Technology|    Copiers|20000|         2|   1|         1|

In [0]:
# Lag: bring a previous row's value onto the current row
# syntax: lag("COLNAME", <NO_OF_ROWS_BACK>).over(<WINDOW SPEC>)
# Lead: bring a following row's value onto the current row
# syntax: lead("COLNAME", <NO_OF_ROWS_AHEAD>).over(<WINDOW SPEC>)

In [0]:
# Bring the sales value of earlier rows onto the current row with lag().

# Step 1: table-level window (no partitionBy), ordered by sales descending.
table_partition = Window.orderBy(col('sales').desc())

# Step 2: lag1 looks one row back, lag2 looks two rows back; rows with no
# such predecessor get NULL.
df2 = (
    df
    .withColumn("lag1", lag("sales", 1).over(table_partition))
    .withColumn("lag2", lag("sales", 2).over(table_partition))
)
df2.show()



+-------------+---------------+-----------+-----+-----+-----+
|Customer Name|       Category|Subcategory|Sales| lag1| lag2|
+-------------+---------------+-----------+-----+-----+-----+
| Tom Prescott|     Technology|   Machines|20000| NULL| NULL|
|     John Mur|     Technology|    Copiers|20000|20000| NULL|
|     John Mur|     Technology|   Printers|20000|20000|20000|
|     John Mur|     Technology|   Scanners|15000|20000|20000|
|     John Mur|     Technology| Projectors|10000|15000|20000|
| Tom Prescott|Office Supplies|    Binders| 8000|10000|15000|
| Tom Prescott|      Furniture|       Beds| 7000| 8000|10000|
| Tom Prescott|      Furniture|      Sofas| 6000| 7000| 8000|
| Tom Prescott|Office Supplies|  Fasteners| 6000| 6000| 7000|
| Tom Prescott|      Furniture|     Tables| 5000| 6000| 6000|
| Tom Prescott|      Furniture|     Chairs| 4000| 5000| 6000|
| Tom Prescott|Office Supplies|   Supplies| 4000| 4000| 5000|
| Tom Prescott|Office Supplies|    Storage| 4000| 4000| 4000|
+-------

In [0]:
# Bring the sales value of following rows onto the current row with lead().

# Step 1: table-level window (no partitionBy), ordered by sales descending.
table_partition = Window.orderBy(col('sales').desc())

# Step 2: lead1 looks one row ahead, lead2 looks two rows ahead; rows with no
# such successor get NULL.
df2 = (
    df
    .withColumn("lead1", lead("sales", 1).over(table_partition))
    .withColumn("lead2", lead("sales", 2).over(table_partition))
)
df2.show()



+-------------+---------------+-----------+-----+-----+-----+
|Customer Name|       Category|Subcategory|Sales|lead1|lead2|
+-------------+---------------+-----------+-----+-----+-----+
| Tom Prescott|     Technology|   Machines|20000|20000|20000|
|     John Mur|     Technology|    Copiers|20000|20000|15000|
|     John Mur|     Technology|   Printers|20000|15000|10000|
|     John Mur|     Technology|   Scanners|15000|10000| 8000|
|     John Mur|     Technology| Projectors|10000| 8000| 7000|
| Tom Prescott|Office Supplies|    Binders| 8000| 7000| 6000|
| Tom Prescott|      Furniture|       Beds| 7000| 6000| 6000|
| Tom Prescott|      Furniture|      Sofas| 6000| 6000| 5000|
| Tom Prescott|Office Supplies|  Fasteners| 6000| 5000| 4000|
| Tom Prescott|      Furniture|     Tables| 5000| 4000| 4000|
| Tom Prescott|      Furniture|     Chairs| 4000| 4000| 4000|
| Tom Prescott|Office Supplies|   Supplies| 4000| 4000| NULL|
| Tom Prescott|Office Supplies|    Storage| 4000| NULL| NULL|
+-------

In [0]:
# Get the top 2 sales rows from each category.
# Explicit imports name exactly what this cell uses, instead of repeating the
# wildcard imports from the imports cell.
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

# Step 1: window partitioned by Category, highest sales first.
partition_cat = Window.partitionBy("Category").orderBy(col('sales').desc())

# Step 2: number the rows within each category once; both filter variants
# below start from this same DataFrame.
ranked = df.withColumn('R_N', row_number().over(partition_cat))

# Variant A: filter with a Column expression.
df2 = ranked.filter(col('R_N') <= 2)
df2.show()

# Variant B: the same filter written as a SQL expression string.
df3 = ranked.filter("R_N <= 2")
df3.show()



[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
File [0;32m<command-8843907571725001>, line 6[0m
[1;32m      4[0m [38;5;66;03m#step 1: create a window partition at table level[39;00m
[1;32m      5[0m partition_cat [38;5;241m=[39m Window[38;5;241m.[39mpartitionBy([38;5;124m"[39m[38;5;124mCategory[39m[38;5;124m"[39m)[38;5;241m.[39morderBy(col([38;5;124m'[39m[38;5;124msales[39m[38;5;124m'[39m)[38;5;241m.[39mdesc())
[0;32m----> 6[0m df2 [38;5;241m=[39m df[38;5;241m.[39mwithColumn([38;5;124m'[39m[38;5;124mR_N[39m[38;5;124m'[39m, row_number()[38;5;241m.[39mover(partition_cat))\
[1;32m      7[0m     [38;5;241m.[39mfilter(col([38;5;124m'[39m[38;5;124mR_N[39m[38;5;124m'[39m) [38;5;241m<[39m[38;5;241m=[39m [38;5;241m2[39m)
[1;32m      8[0m df2[38;5;241m.[39mshow()
[1;32m     10[0m df3 [38;5;241m=[39m df[38

In [0]:

# Get (or reuse) a Spark session for the latest-transaction example.
spark = SparkSession.builder.appName("DuplicatesRemove").getOrCreate()

# Sample transactions: (customer id, customer name, date string, category),
# four rows per customer.
data = [
    (1, "Alice", "2025-01-01", "A"),
    (1, "Alice", "2025-09-02", "B"),
    (1, "Alice", "2025-04-03", "C"),
    (1, "Alice", "2024-01-04", "D"),
    (2, "Bob", "2024-02-01", "E"),
    (2, "Bob", "2024-02-02", "F"),
    (2, "Bob", "2024-02-03", "G"),
    (2, "Bob", "2024-02-04", "H"),
    (3, "Charlie", "2024-03-01", "I"),
    (3, "Charlie", "2024-03-02", "J"),
    (3, "Charlie", "2024-03-03", "K"),
    (3, "Charlie", "2024-03-04", "L"),
    (4, "David", "2024-04-01", "M"),
    (4, "David", "2024-04-02", "N"),
    (4, "David", "2024-04-03", "O"),
    (4, "David", "2024-04-04", "P"),
    (5, "Emma", "2024-05-01", "Q"),
    (5, "Emma", "2024-05-02", "R"),
    (5, "Emma", "2024-05-03", "S"),
    (5, "Emma", "2024-05-04", "T"),
]

# Build the DataFrame and print it.
# NOTE(review): 'CustoemrID' looks like a typo for 'CustomerID'; it is kept
# unchanged here because it is the column name the recorded outputs display.
df = spark.createDataFrame(data, schema=["CustoemrID", "CustomerName", "date", "Category"])
df.show()

+----------+------------+----------+--------+
|CustoemrID|CustomerName|      date|Category|
+----------+------------+----------+--------+
|         1|       Alice|2025-01-01|       A|
|         1|       Alice|2025-09-02|       B|
|         1|       Alice|2025-04-03|       C|
|         1|       Alice|2024-01-04|       D|
|         2|         Bob|2024-02-01|       E|
|         2|         Bob|2024-02-02|       F|
|         2|         Bob|2024-02-03|       G|
|         2|         Bob|2024-02-04|       H|
|         3|     Charlie|2024-03-01|       I|
|         3|     Charlie|2024-03-02|       J|
|         3|     Charlie|2024-03-03|       K|
|         3|     Charlie|2024-03-04|       L|
|         4|       David|2024-04-01|       M|
|         4|       David|2024-04-02|       N|
|         4|       David|2024-04-03|       O|
|         4|       David|2024-04-04|       P|
|         5|        Emma|2024-05-01|       Q|
|         5|        Emma|2024-05-02|       R|
|         5|        Emma|2024-05-0

In [0]:
# Print the current contents of df again.
df.show()

+----------+------------+----------+--------+
|CustoemrID|CustomerName|      date|Category|
+----------+------------+----------+--------+
|         1|       Alice|2025-01-01|       A|
|         1|       Alice|2025-09-02|       B|
|         1|       Alice|2025-04-03|       C|
|         1|       Alice|2024-01-04|       D|
|         2|         Bob|2024-02-01|       E|
|         2|         Bob|2024-02-02|       F|
|         2|         Bob|2024-02-03|       G|
|         2|         Bob|2024-02-04|       H|
|         3|     Charlie|2024-03-01|       I|
|         3|     Charlie|2024-03-02|       J|
|         3|     Charlie|2024-03-03|       K|
|         3|     Charlie|2024-03-04|       L|
|         4|       David|2024-04-01|       M|
|         4|       David|2024-04-02|       N|
|         4|       David|2024-04-03|       O|
|         4|       David|2024-04-04|       P|
|         5|        Emma|2024-05-01|       Q|
|         5|        Emma|2024-05-02|       R|
|         5|        Emma|2024-05-0

In [0]:
# Keep only the most recent transaction of every customer.

# Step 1: window partitioned by CustomerName, newest date first.
# NOTE(review): 'date' is ordered as a value of whatever type df holds; the
# sample rows use 'YYYY-MM-DD' strings, which sort chronologically - confirm
# for real data.
partition_cat = Window.partitionBy("CustomerName").orderBy(col('date').desc())

# Step 2: number each customer's rows and keep the first one (R_N = 1).
df_lastdate = (
    df
    .withColumn('R_N', row_number().over(partition_cat))
    .filter("R_N = 1")
)
df_lastdate.show()

+----------+------------+----------+--------+---+
|CustoemrID|CustomerName|      date|Category|R_N|
+----------+------------+----------+--------+---+
|         1|       Alice|2025-09-02|       B|  1|
|         2|         Bob|2024-02-04|       H|  1|
|         3|     Charlie|2024-03-04|       L|  1|
|         4|       David|2024-04-04|       P|  1|
|         5|        Emma|2024-05-04|       T|  1|
+----------+------------+----------+--------+---+



In [0]:
# Get (or reuse) a Spark session for the ranking example.
spark = SparkSession.builder.appName("listdataframe").getOrCreate()

# Sample rows: (amount, customer name, date string, category).
list1 = [
    (10000, "Alice", "2024-01-01", "A"),
    (10000, "Alice", "2024-01-02", "B"),
    (5000, "Bob", "2024-02-01", "E"),
    (5000, "Bob", "2024-02-02", "F"),
    (5000, "Bob", "2024-02-03", "G"),
    (5000, "Bob", "2024-02-04", "H"),
    (3000, "Charlie", "2024-03-01", "I"),
]

# Build the four-column DataFrame and print it.
# NOTE(review): the first column is labelled 'CustoemrID' (apparent typo) yet
# holds the values that are renamed to SalesAmount below; the label is kept
# unchanged because the recorded output displays it.
df = spark.createDataFrame(list1, schema=["CustoemrID", "CustomerName", "date", "Category"])
df.show()

# Reduce df to that single column, renamed to SalesAmount, and print it.
df = df.select(df["CustoemrID"].alias("SalesAmount"))
df.show()

+----------+------------+----------+--------+
|CustoemrID|CustomerName|      date|Category|
+----------+------------+----------+--------+
|     10000|       Alice|2024-01-01|       A|
|     10000|       Alice|2024-01-02|       B|
|      5000|         Bob|2024-02-01|       E|
|      5000|         Bob|2024-02-02|       F|
|      5000|         Bob|2024-02-03|       G|
|      5000|         Bob|2024-02-04|       H|
|      3000|     Charlie|2024-03-01|       I|
+----------+------------+----------+--------+

+-----------+
|SalesAmount|
+-----------+
|      10000|
|      10000|
|       5000|
|       5000|
|       5000|
|       5000|
|       3000|
+-----------+



In [0]:
# Print the current contents of df again.
df.show()

+-----------+
|SalesAmount|
+-----------+
|      10000|
|      10000|
|       5000|
|       5000|
|       5000|
|       5000|
|       3000|
+-----------+



In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank, dense_rank, row_number

# Compare rank(), dense_rank() and row_number() over one table-level window
# ordered by SalesAmount, highest first.
part_rank = Window.orderBy(col("SalesAmount").desc())

# All three columns are computed over the same window so they can be read
# side by side in the printed table.
df2 = (
    df
    .withColumn("rank", rank().over(part_rank))
    .withColumn("dense_rank", dense_rank().over(part_rank))
    .withColumn("row_number", row_number().over(part_rank))
)

df2.show()



+-----------+----+----------+----------+
|SalesAmount|rank|dense_rank|row_number|
+-----------+----+----------+----------+
|      10000|   1|         1|         1|
|      10000|   1|         1|         2|
|       5000|   3|         2|         3|
|       5000|   3|         2|         4|
|       5000|   3|         2|         5|
|       5000|   3|         2|         6|
|       3000|   7|         3|         7|
+-----------+----+----------+----------+

