In [79]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("spark_learning")
    .getOrCreate()
)

In [80]:
# Sample employee records. Each tuple is (id, name, department, salary, age),
# matching the order of the `columns` list used to build the DataFrame.
data = [
    (1, "Alice", "HR", 5000, 30),
    (2, "Bob", "IT", 6000, 35),
    (3, "Charlie", "Finance", 7000, 40),
    (4, "David", "IT", 4000, 28),
    (5, "Eve", "HR", 5500, 32)
]

In [81]:
# Column names for the employee records, listed in tuple-field order.
columns = ["id", "name", "department", "salary", "age"]

In [82]:
# Build the employee DataFrame from the in-memory rows and the column names.
df = spark.createDataFrame(data=data, schema=columns)

In [83]:
# Expose the DataFrame to spark.sql() queries under the view name "Employees".
df.createOrReplaceTempView(name="Employees")

In [84]:
# Display the full employee DataFrame as a baseline for the examples that follow.
df.show()

+---+-------+----------+------+---+
| id|   name|department|salary|age|
+---+-------+----------+------+---+
|  1|  Alice|        HR|  5000| 30|
|  2|    Bob|        IT|  6000| 35|
|  3|Charlie|   Finance|  7000| 40|
|  4|  David|        IT|  4000| 28|
|  5|    Eve|        HR|  5500| 32|
+---+-------+----------+------+---+



In [85]:
# Show the table with `name` relabelled as `Employeename`.
# The result is only displayed; `df` is not reassigned.
(
    df
    .withColumnRenamed(existing="name", new="Employeename")
    .show()
)

+---+------------+----------+------+---+
| id|Employeename|department|salary|age|
+---+------------+----------+------+---+
|  1|       Alice|        HR|  5000| 30|
|  2|         Bob|        IT|  6000| 35|
|  3|     Charlie|   Finance|  7000| 40|
|  4|       David|        IT|  4000| 28|
|  5|         Eve|        HR|  5500| 32|
+---+------------+----------+------+---+



In [88]:
# SQL version of the rename: alias `name` as `Employeename`.
# The select list puts the aliased column last, so the column order differs
# from the withColumnRenamed() result, which keeps the column in place.
a=spark.sql("""
  select id, department, salary, age, name as Employeename from Employees
""")
a.show()

+---+----------+------+---+------------+
| id|department|salary|age|Employeename|
+---+----------+------+---+------------+
|  1|        HR|  5000| 30|       Alice|
|  2|        IT|  6000| 35|         Bob|
|  3|   Finance|  7000| 40|     Charlie|
|  4|        IT|  4000| 28|       David|
|  5|        HR|  5500| 32|         Eve|
+---+----------+------+---+------------+



In [94]:
from pyspark.sql.functions import col


In [97]:
# Derive a 10% bonus from salary and show it next to the original columns.
(
    df
    .withColumn("bonus", df["salary"] * 0.10)
    .show()
)

+---+-------+----------+------+---+-----+
| id|   name|department|salary|age|bonus|
+---+-------+----------+------+---+-----+
|  1|  Alice|        HR|  5000| 30|500.0|
|  2|    Bob|        IT|  6000| 35|600.0|
|  3|Charlie|   Finance|  7000| 40|700.0|
|  4|  David|        IT|  4000| 28|400.0|
|  5|    Eve|        HR|  5500| 32|550.0|
+---+-------+----------+------+---+-----+



In [99]:
# SQL version of the 10% bonus column.
# The output renders bonus as 500.00 here versus 500.0 from the DataFrame API,
# presumably because Spark SQL treats the literal 0.10 as a DECIMAL while the
# Python float becomes a DOUBLE -- confirm with printSchema().
spark.sql("""
select *, salary*0.10 as bonus from Employees""").show()

+---+-------+----------+------+---+------+
| id|   name|department|salary|age| bonus|
+---+-------+----------+------+---+------+
|  1|  Alice|        HR|  5000| 30|500.00|
|  2|    Bob|        IT|  6000| 35|600.00|
|  3|Charlie|   Finance|  7000| 40|700.00|
|  4|  David|        IT|  4000| 28|400.00|
|  5|    Eve|        HR|  5500| 32|550.00|
+---+-------+----------+------+---+------+



In [106]:
# IT employees earning more than 5000.
df.where(
    (col("department") == "IT") & (col("salary") > 5000)
).show()

+---+----+----------+------+---+
| id|name|department|salary|age|
+---+----+----------+------+---+
|  2| Bob|        IT|  6000| 35|
+---+----+----------+------+---+



In [108]:
# SQL version of the filter: IT employees earning more than 5000.
# Uses the standard SQL `=` equality operator rather than the Spark-specific
# `==` alias, so the query reads the same in any SQL dialect.
spark.sql("""
 select * from Employees where department = 'IT' and salary > 5000
""").show()

+---+----+----------+------+---+
| id|name|department|salary|age|
+---+----+----------+------+---+
|  2| Bob|        IT|  6000| 35|
+---+----+----------+------+---+



In [111]:
# Employees sorted by salary, highest first.
df.sort(col("salary").desc()).show()

+---+-------+----------+------+---+
| id|   name|department|salary|age|
+---+-------+----------+------+---+
|  3|Charlie|   Finance|  7000| 40|
|  2|    Bob|        IT|  6000| 35|
|  5|    Eve|        HR|  5500| 32|
|  1|  Alice|        HR|  5000| 30|
|  4|  David|        IT|  4000| 28|
+---+-------+----------+------+---+



In [112]:
# SQL version of the salary sort, highest first.
# The view is referenced as "Employees", the exact name it was registered
# under, so the query does not rely on case-insensitive identifier resolution.
spark.sql("""select * from Employees ORDER BY salary desc""").show()

+---+-------+----------+------+---+
| id|   name|department|salary|age|
+---+-------+----------+------+---+
|  3|Charlie|   Finance|  7000| 40|
|  2|    Bob|        IT|  6000| 35|
|  5|    Eve|        HR|  5500| 32|
|  1|  Alice|        HR|  5000| 30|
|  4|  David|        IT|  4000| 28|
+---+-------+----------+------+---+



In [114]:
# Append a 15% bonus column after the existing columns and display the result.
df.select(
    "*",
    (col("salary") * 0.15).alias("bonus"),
).show()

+---+-------+----------+------+---+------+
| id|   name|department|salary|age| bonus|
+---+-------+----------+------+---+------+
|  1|  Alice|        HR|  5000| 30| 750.0|
|  2|    Bob|        IT|  6000| 35| 900.0|
|  3|Charlie|   Finance|  7000| 40|1050.0|
|  4|  David|        IT|  4000| 28| 600.0|
|  5|    Eve|        HR|  5500| 32| 825.0|
+---+-------+----------+------+---+------+



In [119]:
# SQL version of the 15% bonus column.
# The view is referenced as "Employees", the exact name it was registered
# under, so the query does not rely on case-insensitive identifier resolution.
spark.sql("""
 select *, salary * 0.15 as bonus from Employees
""").show()

+---+-------+----------+------+---+-------+
| id|   name|department|salary|age|  bonus|
+---+-------+----------+------+---+-------+
|  1|  Alice|        HR|  5000| 30| 750.00|
|  2|    Bob|        IT|  6000| 35| 900.00|
|  3|Charlie|   Finance|  7000| 40|1050.00|
|  4|  David|        IT|  4000| 28| 600.00|
|  5|    Eve|        HR|  5500| 32| 825.00|
+---+-------+----------+------+---+-------+



In [120]:
# Show the table with `department` relabelled as `branch`.
(
    df
    .withColumnRenamed(existing="department", new="branch")
    .show()
)

+---+-------+-------+------+---+
| id|   name| branch|salary|age|
+---+-------+-------+------+---+
|  1|  Alice|     HR|  5000| 30|
|  2|    Bob|     IT|  6000| 35|
|  3|Charlie|Finance|  7000| 40|
|  4|  David|     IT|  4000| 28|
|  5|    Eve|     HR|  5500| 32|
+---+-------+-------+------+---+



In [121]:
# All employees in the IT department.
df.where(col("department") == "IT").show()

+---+-----+----------+------+---+
| id| name|department|salary|age|
+---+-----+----------+------+---+
|  2|  Bob|        IT|  6000| 35|
|  4|David|        IT|  4000| 28|
+---+-----+----------+------+---+



In [122]:
# Employees sorted by age, oldest first.
df.sort("age", ascending=False).show()

+---+-------+----------+------+---+
| id|   name|department|salary|age|
+---+-------+----------+------+---+
|  3|Charlie|   Finance|  7000| 40|
|  2|    Bob|        IT|  6000| 35|
|  5|    Eve|        HR|  5500| 32|
|  1|  Alice|        HR|  5000| 30|
|  4|  David|        IT|  4000| 28|
+---+-------+----------+------+---+



In [127]:
# Add a new column experience as age - 25, and then rename salary to monthly_salary.
df.withColumn("experience", col("age")-25).show()

+---+-------+----------+------+---+----------+
| id|   name|department|salary|age|experience|
+---+-------+----------+------+---+----------+
|  1|  Alice|        HR|  5000| 30|         5|
|  2|    Bob|        IT|  6000| 35|        10|
|  3|Charlie|   Finance|  7000| 40|        15|
|  4|  David|        IT|  4000| 28|         3|
|  5|    Eve|        HR|  5500| 32|         7|
+---+-------+----------+------+---+----------+



In [131]:
# HR employees earning more than 5000, ordered from lowest to highest salary.
(
    df
    .where((col("department") == "HR") & (col("salary") > 5000))
    .sort(col("salary").asc())
    .show()
)

+---+----+----------+------+---+
| id|name|department|salary|age|
+---+----+----------+------+---+
|  5| Eve|        HR|  5500| 32|
+---+----+----------+------+---+



In [135]:
# Employees aged 30 through 40; Column.between() includes both bounds.
df.where(col("age").between(30, 40)).show()

+---+-------+----------+------+---+
| id|   name|department|salary|age|
+---+-------+----------+------+---+
|  1|  Alice|        HR|  5000| 30|
|  2|    Bob|        IT|  6000| 35|
|  3|Charlie|   Finance|  7000| 40|
|  5|    Eve|        HR|  5500| 32|
+---+-------+----------+------+---+



In [138]:
# Add a bonus column, filter for employees in the IT department, and sort the result by bonus in descending order.
df.withColumn("bonus", col("salary")*10).filter(df.department=="IT").orderBy(col("bonus").desc()).show()


+---+-----+----------+------+---+-----+
| id| name|department|salary|age|bonus|
+---+-----+----------+------+---+-----+
|  2|  Bob|        IT|  6000| 35|60000|
|  4|David|        IT|  4000| 28|40000|
+---+-----+----------+------+---+-----+

