In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Obtain the Spark session used by every cell below: an already-active
# session is reused, otherwise a new one is started under the app name "Day2".
spark = (
    SparkSession.builder
    .appName("Day2")
    .getOrCreate()
)

In [0]:
# Ten sample employee records, one (name, dept, salary, city) tuple per person,
# spread across the HR, Engineering and Finance departments.
data = [
    ("Alice", "HR", 50000, "New York"),
    ("Bob", "Engineering", 60000, "San Francisco"),
    ("Charlie", "HR", 55000, "Los Angeles"),
    ("David", "Engineering", 62000, "Seattle"),
    ("Eva", "Finance", 70000, "Chicago"),
    ("Frank", "Finance", 75000, "Houston"),
    ("Grace", "Engineering", 65000, "Boston"),
    ("Hannah", "HR", 48000, "Miami"),
    ("Ian", "Finance", 68000, "Dallas"),
    ("Jessica", "Engineering", 63000, "Atlanta"),
]

# Column names only; the column types are inferred from the tuples above.
schema = ["name", "dept", "salary", "city"]

# Base DataFrame that the remaining cells build on.
df = spark.createDataFrame(data=data, schema=schema)

df.show()

+-------+-----------+------+-------------+
|   name|       dept|salary|         city|
+-------+-----------+------+-------------+
|  Alice|         HR| 50000|     New York|
|    Bob|Engineering| 60000|San Francisco|
|Charlie|         HR| 55000|  Los Angeles|
|  David|Engineering| 62000|      Seattle|
|    Eva|    Finance| 70000|      Chicago|
|  Frank|    Finance| 75000|      Houston|
|  Grace|Engineering| 65000|       Boston|
| Hannah|         HR| 48000|        Miami|
|    Ian|    Finance| 68000|       Dallas|
|Jessica|Engineering| 63000|      Atlanta|
+-------+-----------+------+-------------+



In [0]:
# Average salary per department, attached to every row of that department.
# The window is partitioned by "dept" and deliberately left unordered: with no
# ordering the frame spans the whole partition, so each row receives the same
# department-wide mean. Ordering by the partition key itself would make every
# row a peer and yield the same numbers while adding a pointless sort.
# NOTE: despite the column name "running_avg", this is not a cumulative
# average; the name is kept so the column matches the recorded output.
df.withColumn(
    "running_avg",
    F.avg("salary").over(Window.partitionBy("dept")),
).show()

+-------+-----------+------+-------------+-----------+
|   name|       dept|salary|         city|running_avg|
+-------+-----------+------+-------------+-----------+
|    Bob|Engineering| 60000|San Francisco|    62500.0|
|  David|Engineering| 62000|      Seattle|    62500.0|
|  Grace|Engineering| 65000|       Boston|    62500.0|
|Jessica|Engineering| 63000|      Atlanta|    62500.0|
|    Eva|    Finance| 70000|      Chicago|    71000.0|
|  Frank|    Finance| 75000|      Houston|    71000.0|
|    Ian|    Finance| 68000|       Dallas|    71000.0|
|  Alice|         HR| 50000|     New York|    51000.0|
|Charlie|         HR| 55000|  Los Angeles|    51000.0|
| Hannah|         HR| 48000|        Miami|    51000.0|
+-------+-----------+------+-------------+-----------+



In [0]:
# Running (cumulative) salary total within each department, accumulated in
# ascending salary order.
# The frame is spelled out as ROWS from the start of the partition up to the
# current row. Relying on the implicit frame that orderBy() brings in (a RANGE
# frame) would give rows with equal salaries the same cumulative value instead
# of accumulating row by row. Rows with equal salaries are still visited in an
# unspecified relative order; add a tie-breaking column to orderBy() if that
# matters for your data.
df.withColumn(
    "Running_sum",
    F.sum("salary").over(
        Window.partitionBy("dept")
        .orderBy("salary")
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)
    ),
).show()

+-------+-----------+------+-------------+-----------+
|   name|       dept|salary|         city|Running_sum|
+-------+-----------+------+-------------+-----------+
|    Bob|Engineering| 60000|San Francisco|      60000|
|  David|Engineering| 62000|      Seattle|     122000|
|Jessica|Engineering| 63000|      Atlanta|     185000|
|  Grace|Engineering| 65000|       Boston|     250000|
|    Ian|    Finance| 68000|       Dallas|      68000|
|    Eva|    Finance| 70000|      Chicago|     138000|
|  Frank|    Finance| 75000|      Houston|     213000|
| Hannah|         HR| 48000|        Miami|      48000|
|  Alice|         HR| 50000|     New York|      98000|
|Charlie|         HR| 55000|  Los Angeles|     153000|
+-------+-----------+------+-------------+-----------+



In [0]:
# lead(): for each row, fetch the salary of the next row in the same
# department when rows are ordered by ascending salary. The last row of each
# department has no successor, so it receives the default value 0.
lead_df = df.withColumn(
    "lead_sal",
    F.lead("salary", offset=1, default=0).over(
        Window.partitionBy("dept").orderBy("salary")
    ),
)
lead_df.show()

+-------+-----------+------+-------------+--------+
|   name|       dept|salary|         city|lead_sal|
+-------+-----------+------+-------------+--------+
|    Bob|Engineering| 60000|San Francisco|   62000|
|  David|Engineering| 62000|      Seattle|   63000|
|Jessica|Engineering| 63000|      Atlanta|   65000|
|  Grace|Engineering| 65000|       Boston|       0|
|    Ian|    Finance| 68000|       Dallas|   70000|
|    Eva|    Finance| 70000|      Chicago|   75000|
|  Frank|    Finance| 75000|      Houston|       0|
| Hannah|         HR| 48000|        Miami|   50000|
|  Alice|         HR| 50000|     New York|   55000|
|Charlie|         HR| 55000|  Los Angeles|       0|
+-------+-----------+------+-------------+--------+



In [0]:
# lag(): for each row, fetch the salary of the previous row in the same
# department when rows are ordered by ascending salary. The first row of each
# department has no predecessor, so it receives the default value 0.
lag_df = df.withColumn(
    "lag_sal",
    F.lag("salary", offset=1, default=0).over(
        Window.partitionBy("dept").orderBy("salary")
    ),
)
lag_df.show()

+-------+-----------+------+-------------+-------+
|   name|       dept|salary|         city|lag_sal|
+-------+-----------+------+-------------+-------+
|    Bob|Engineering| 60000|San Francisco|      0|
|  David|Engineering| 62000|      Seattle|  60000|
|Jessica|Engineering| 63000|      Atlanta|  62000|
|  Grace|Engineering| 65000|       Boston|  63000|
|    Ian|    Finance| 68000|       Dallas|      0|
|    Eva|    Finance| 70000|      Chicago|  68000|
|  Frank|    Finance| 75000|      Houston|  70000|
| Hannah|         HR| 48000|        Miami|      0|
|  Alice|         HR| 50000|     New York|  48000|
|Charlie|         HR| 55000|  Los Angeles|  50000|
+-------+-----------+------+-------------+-------+



In [0]:
# Drop a column: drop() returns a new DataFrame without "lag_sal";
# lag_df itself is left unchanged.
(
    lag_df
    .drop("lag_sal")
    .show()
)

+-------+-----------+------+-------------+
|   name|       dept|salary|         city|
+-------+-----------+------+-------------+
|  Alice|         HR| 50000|     New York|
|    Bob|Engineering| 60000|San Francisco|
|Charlie|         HR| 55000|  Los Angeles|
|  David|Engineering| 62000|      Seattle|
|    Eva|    Finance| 70000|      Chicago|
|  Frank|    Finance| 75000|      Houston|
|  Grace|Engineering| 65000|       Boston|
| Hannah|         HR| 48000|        Miami|
|    Ian|    Finance| 68000|       Dallas|
|Jessica|Engineering| 63000|      Atlanta|
+-------+-----------+------+-------------+



In [0]:
# Rename a column: withColumnRenamed() returns a new DataFrame in which
# "lead_sal" is called "lead_salary"; lead_df itself is left unchanged.
(
    lead_df
    .withColumnRenamed("lead_sal", "lead_salary")
    .show()
)

+-------+-----------+------+-------------+-----------+
|   name|       dept|salary|         city|lead_salary|
+-------+-----------+------+-------------+-----------+
|    Bob|Engineering| 60000|San Francisco|      62000|
|  David|Engineering| 62000|      Seattle|      63000|
|Jessica|Engineering| 63000|      Atlanta|      65000|
|  Grace|Engineering| 65000|       Boston|          0|
|    Ian|    Finance| 68000|       Dallas|      70000|
|    Eva|    Finance| 70000|      Chicago|      75000|
|  Frank|    Finance| 75000|      Houston|          0|
| Hannah|         HR| 48000|        Miami|      50000|
|  Alice|         HR| 50000|     New York|      55000|
|Charlie|         HR| 55000|  Los Angeles|          0|
+-------+-----------+------+-------------+-----------+



In [0]:
# Change a column's data type: add "new_lead_sal" as a float copy of
# "lead_sal", keeping the original column alongside it for comparison.
lead_df.withColumn(
    "new_lead_sal",
    F.col("lead_sal").cast("float"),
).show()

+-------+-----------+------+-------------+--------+------------+
|   name|       dept|salary|         city|lead_sal|new_lead_sal|
+-------+-----------+------+-------------+--------+------------+
|    Bob|Engineering| 60000|San Francisco|   62000|     62000.0|
|  David|Engineering| 62000|      Seattle|   63000|     63000.0|
|Jessica|Engineering| 63000|      Atlanta|   65000|     65000.0|
|  Grace|Engineering| 65000|       Boston|       0|         0.0|
|    Ian|    Finance| 68000|       Dallas|   70000|     70000.0|
|    Eva|    Finance| 70000|      Chicago|   75000|     75000.0|
|  Frank|    Finance| 75000|      Houston|       0|         0.0|
| Hannah|         HR| 48000|        Miami|   50000|     50000.0|
|  Alice|         HR| 50000|     New York|   55000|     55000.0|
|Charlie|         HR| 55000|  Los Angeles|       0|         0.0|
+-------+-----------+------+-------------+--------+------------+

