In [25]:
# findspark.init() has to run before the pyspark import below so that the
# pyspark package can be resolved from the local Spark installation.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one,
# so re-executing this cell does not spin up a second session.
spark = (
    SparkSession.builder
    .appName('Manager Employee')
    .getOrCreate()
)

# Field order of every tuple in `data`.
columns = ["empid", "empname", "salary", "dt", "mang_id", "dept"]

# One tuple per employee. `mang_id` holds the empid of the employee's manager;
# employee 101 has no manager, hence None.
data = [
    (101, 'Ashi', 10000, '01-01-2024', None, 'IT'),
    (102, 'Balu', 9000, '02-01-2024', 101, 'IT'),
    (103, 'Chitra', 11000, '02-01-2024', 101, 'IT'),
    (104, 'David', 4000, '02-01-2024', 101, 'IT'),
    (105, 'Geetha', 9000, '03-01-2024', 104, 'IT'),
    (106, 'Sowji', 7000, '03-01-2024', 104, 'IT'),
    (107, 'Raj', 10000, '04-01-2024', 105, 'IT'),
    (108, 'Ravi', 10000, '04-01-2024', 105, 'IT'),
]

# Only column names are supplied, so Spark infers the column types from the
# Python values; `dt` therefore stays a plain string at this point.
dataDf = spark.createDataFrame(data, schema=columns)

# Display the rows and the inferred schema.
dataDf.show()
dataDf.printSchema()

+-----+-------+------+----------+-------+----+
|empid|empname|salary|        dt|mang_id|dept|
+-----+-------+------+----------+-------+----+
|  101|   Ashi| 10000|01-01-2024|   null|  IT|
|  102|   Balu|  9000|02-01-2024|    101|  IT|
|  103| Chitra| 11000|02-01-2024|    101|  IT|
|  104|  David|  4000|02-01-2024|    101|  IT|
|  105| Geetha|  9000|03-01-2024|    104|  IT|
|  106|  Sowji|  7000|03-01-2024|    104|  IT|
|  107|    Raj| 10000|04-01-2024|    105|  IT|
|  108|   Ravi| 10000|04-01-2024|    105|  IT|
+-----+-------+------+----------+-------+----+

root
 |-- empid: long (nullable = true)
 |-- empname: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- dt: string (nullable = true)
 |-- mang_id: long (nullable = true)
 |-- dept: string (nullable = true)



In [26]:
# Add a month-name column derived from `dt`, then self-join the frame to
# attach each employee's manager name.
from pyspark.sql.functions import (
    unix_timestamp,
    from_unixtime,
    to_date,
    col,
    month,
    date_format,
)

# `dt` is parsed with the pattern MM-dd-yyyy and rendered as a three-letter
# month abbreviation ("MMM").
# NOTE(review): the pattern reads the first field as the month; confirm the
# source dates are month-first rather than day-first.
df = dataDf.withColumn(
    "month",
    date_format(to_date(col("dt"), "MM-dd-yyyy"), "MMM"),
)

# Alias 'a' plays the manager role and alias 'b' the employee role.
# The right join keeps every row of 'b', so an employee whose mang_id is null
# still appears, with a null managername.
mgdf = (
    df.alias('a')
    .join(df.alias('b'), col('b.mang_id') == col('a.empid'), how="right")
    .select(
        col('a.empname').alias('managername'),
        col('b.empname'),
        col('b.salary'),
        col('b.month'),
        col('b.dept'),
    )
)
mgdf.show()




+-----------+-------+------+-----+----+
|managername|empname|salary|month|dept|
+-----------+-------+------+-----+----+
|       null|   Ashi| 10000|  Jan|  IT|
|       Ashi|   Balu|  9000|  Feb|  IT|
|       Ashi| Chitra| 11000|  Feb|  IT|
|       Ashi|  David|  4000|  Feb|  IT|
|      David| Geetha|  9000|  Mar|  IT|
|      David|  Sowji|  7000|  Mar|  IT|
|     Geetha|    Raj| 10000|  Apr|  IT|
|     Geetha|   Ravi| 10000|  Apr|  IT|
+-----------+-------+------+-----+----+



In [32]:
# Which employee(s) earn the 3rd highest salary in their department?
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank

# Within each dept, order rows from the highest salary to the lowest.
wdf = Window.partitionBy("dept").orderBy(col("salary").desc())

# dense_rank() assigns tied salaries the same rank and leaves no gaps, so
# rank 3 is the third distinct salary value; every employee on that salary
# is returned.
(
    mgdf
    .withColumn("rank", dense_rank().over(wdf))
    .filter(col("rank") == 3)
    .show()
)


+-----------+-------+------+-----+----+----+
|managername|empname|salary|month|dept|rank|
+-----------+-------+------+-----+----+----+
|       Ashi|   Balu|  9000|  Feb|  IT|   3|
|      David| Geetha|  9000|  Mar|  IT|   3|
+-----------+-------+------+-----+----+----+

