In [0]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that the rest of the notebook works with.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Column names, in the same order as the fields of each record below.
columns = ["employee_name", "department", "salary"]

# Sample employee records. ("James", "Sales", 3000) is listed twice and 4100
# occurs twice within Sales, so the data contains ties on salary.
simpleData = (
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
)

# Build the DataFrame, then print its schema and its full (untruncated) contents.
df = spark.createDataFrame(simpleData, columns)
df.printSchema()
df.show(truncate=False)

In [0]:
from pyspark.sql.window import Window as w
from pyspark.sql.functions import *
# Window used for ranking: rows are grouped by department and ordered by salary
# (ascending is the default sort direction).
windowSpec = w.partitionBy("department").orderBy(col("salary"))

# row_number() hands out distinct numbers even to rows with equal salaries, so
# ordering by salary alone leaves the numbering of ties (e.g. the two 4100
# salaries in Sales) up to Spark's execution order. Adding employee_name as a
# tie-breaker makes the numbering reproducible between runs.
rowNumberSpec = w.partitionBy("department").orderBy(col("salary"), col("employee_name"))
df.withColumn("rowid", row_number().over(rowNumberSpec)).show()

In [0]:
# Same ranking window with the ascending sort direction spelled out explicitly.
# It stays ordered by salary only, because the ranking cells that follow rely on
# tied salaries comparing as equal.
windowSpec = w.partitionBy("department").orderBy(col("salary").asc())

# row_number() must still number tied rows differently; without a tie-breaker
# the order of equal salaries (e.g. the two 4100 salaries in Sales) is decided by
# Spark's execution order. Ordering by employee_name as well makes it reproducible.
rowNumberSpec = w.partitionBy("department").orderBy(
    col("salary").asc(), col("employee_name").asc()
)
df.withColumn("rowid", row_number().over(rowNumberSpec)).show()

In [0]:
# rank(): rows with equal salaries share a rank and the rank(s) after a tie are
# skipped, so the sequence within a department can contain gaps.
(
    df
    .withColumn("rnk", rank().over(windowSpec))
    .show()
)

In [0]:
# dense_rank(): rows with equal salaries share a rank and the next rank follows
# immediately, so the sequence has no gaps.
from pyspark.sql import functions

denseRankColumn = functions.dense_rank().over(windowSpec)
df.withColumn("denseRnk", denseRankColumn).show()

In [0]:
from pyspark.sql.functions import dense_rank
# dense_rank(): ties share a rank and no rank is skipped afterwards (no gaps).
(
    df
    .withColumn("dense_rank", dense_rank().over(windowSpec))
    .show()
)

In [0]:
# Print the schema of the input, then add the cumulative distribution of each
# row within the window and show the result.
df.printSchema()
cumeDistColumn = cume_dist().over(windowSpec)
df.withColumn("cume_dist", cumeDistColumn).show()

In [0]:
from pyspark.sql import functions as F

# Yearly sales per product.
# NOTE: this rebinds `df`, replacing the employee DataFrame created earlier in
# the notebook.
data = [
    ("Apples", 2023, 100),
    ("Apples", 2024, 150),
    ("Oranges", 2023, 200),
    ("Oranges", 2024, 250),
]
df = spark.createDataFrame(data, schema=["Product", "Year", "Sales"])
display(df)

# Pivot: one row per Product, one column per distinct Year value, each cell
# holding the summed Sales.
salesByProduct = df.groupBy("Product")
pivoted_df = salesByProduct.pivot("Year").sum("Sales")

display(pivoted_df)

In [0]:
# Two equivalent ways to pivot the Year values into columns and sum Sales per
# Product: an explicit agg() call and the sum() shorthand.
#
# The aggregate is referenced as F.sum so it does not depend on a wildcard
# import having replaced Python's built-in sum(), and the pivot column is
# spelled exactly as in the schema ("Year") so it also resolves when
# spark.sql.caseSensitive is enabled.
df.groupBy("Product").pivot("Year").agg(F.sum("Sales")).show()

df.groupBy("Product").pivot("Year").sum("Sales").show()

In [0]:
# rollup("Product", "Year") aggregates hierarchically and returns:
#   - the total for each (Product, Year) combination
#   - the subtotal for each Product (Year is null)
#   - the grand total (Product and Year are both null)
rolledUp = df.rollup("Product", "Year")
rolledUp.sum("Sales").show()

In [0]:
# cube("Product", "Year") aggregates over every combination of the columns:
#   - the total for each (Product, Year) combination
#   - the subtotal for each Product (Year is null)
#   - the subtotal for each Year (Product is null) — this is the difference from rollup
#   - the grand total (Product and Year are both null)
cubed = df.cube("Product", "Year")
cubed.sum("Sales").show()