In [71]:
# Load the employee file; the first line supplies the column names.
# NOTE: no schema or inferSchema option is given, so every column
# (salary included) is read as a string.
df = spark.read.csv("employees.txt", header=True)

In [72]:
# Register the DataFrame as the temp view "empsalary" so it can be queried with Spark SQL.
df.createOrReplaceTempView(name="empsalary")

In [73]:
# Sanity check: display the rows of the registered view.
all_employees = sqlContext.sql("select * from empsalary")
all_employees.show()

+-----+---+------+
|  dep|emp|salary|
+-----+---+------+
|sales|  1| 33000|
|sales| 11| 33000|
|sales| 31| 22000|
|sales|331| 98000|
|sales|931| 12000|
|sales|512| 56000|
|sales|115| 91000|
|sales|120| 82000|
|sales|115| 99000|
|   hr|  4| 25000|
|   hr|154| 12000|
|   hr|413| 12000|
|   hr|994| 17000|
|   hr|584| 82000|
+-----+---+------+



Find the difference between each employee's salary and the highest salary in that employee's department.



In [74]:
# Difference between each employee's salary and the highest salary in the same
# department, computed by joining every row to a per-department aggregate.
#
# salary is a string column (the CSV is read without a schema), so it is cast
# to double before max() and the subtraction: max() over strings compares
# text, where e.g. '9000' sorts above '10000'.
sqlContext.sql("""
    select e1.dep,
           e1.emp,
           e1.salary,
           (e2.max_salary - cast(e1.salary as double)) as diff
    from empsalary e1
    inner join (
        select dep, max(cast(salary as double)) as max_salary
        from empsalary
        group by dep
    ) e2
      on e1.dep = e2.dep
""").show()

+-----+---+------+-------+
|  dep|emp|salary|   diff|
+-----+---+------+-------+
|sales|  1| 33000|66000.0|
|sales| 11| 33000|66000.0|
|sales| 31| 22000|77000.0|
|sales|331| 98000| 1000.0|
|sales|931| 12000|87000.0|
|sales|512| 56000|43000.0|
|sales|115| 91000| 8000.0|
|sales|120| 82000|17000.0|
|sales|115| 99000|    0.0|
|   hr|  4| 25000|57000.0|
|   hr|154| 12000|70000.0|
|   hr|413| 12000|70000.0|
|   hr|994| 17000|65000.0|
|   hr|584| 82000|    0.0|
+-----+---+------+-------+



In [75]:
# Same salary gap, expressed with a window function instead of a self-join:
# max() is evaluated over all rows of the employee's department.
#
# salary is a string column (the CSV is read without a schema), so it is cast
# to double first; max() over strings would compare text, not numbers.
sqlContext.sql("""
    select e1.dep,
           e1.emp,
           e1.salary,
           max(cast(e1.salary as double)) over (partition by e1.dep)
               - cast(e1.salary as double) as diff
    from empsalary e1
""").show()

+-----+---+------+-------+
|  dep|emp|salary|   diff|
+-----+---+------+-------+
|sales|  1| 33000|66000.0|
|sales| 11| 33000|66000.0|
|sales| 31| 22000|77000.0|
|sales|331| 98000| 1000.0|
|sales|931| 12000|87000.0|
|sales|512| 56000|43000.0|
|sales|115| 91000| 8000.0|
|sales|120| 82000|17000.0|
|sales|115| 99000|    0.0|
|   hr|  4| 25000|57000.0|
|   hr|154| 12000|70000.0|
|   hr|413| 12000|70000.0|
|   hr|994| 17000|65000.0|
|   hr|584| 82000|    0.0|
+-----+---+------+-------+



In [76]:
from pyspark.sql.functions import *
from pyspark.sql.window import *


In [77]:
# Window spanning every row of a department. No ordering is specified, so the
# frame is the whole partition, which is what a per-department max needs.
# (Despite its name, this window is unordered; the name is kept because the
# cell that builds salDiff refers to it.)
salDesc = Window.partitionBy("dep")

In [78]:
# Gap between each employee's salary and the department maximum, via the
# DataFrame API.
#
# `max` here is pyspark.sql.functions.max (brought in by the wildcard import),
# not the Python builtin.
# salary is a string column (the CSV is read without a schema), so it is cast
# to double before aggregating; max() over strings would compare text.
salary_num = df["salary"].cast("double")
salDiff = df.withColumn("diff", max(salary_num).over(salDesc) - salary_num)

In [79]:
# Display the result (show()'s defaults spelled out: first 20 rows, long values truncated).
salDiff.show(n=20, truncate=True)

+-----+---+------+-------+
|  dep|emp|salary|   diff|
+-----+---+------+-------+
|sales|  1| 33000|66000.0|
|sales| 11| 33000|66000.0|
|sales| 31| 22000|77000.0|
|sales|331| 98000| 1000.0|
|sales|931| 12000|87000.0|
|sales|512| 56000|43000.0|
|sales|115| 91000| 8000.0|
|sales|120| 82000|17000.0|
|sales|115| 99000|    0.0|
|   hr|  4| 25000|57000.0|
|   hr|154| 12000|70000.0|
|   hr|413| 12000|70000.0|
|   hr|994| 17000|65000.0|
|   hr|584| 82000|    0.0|
+-----+---+------+-------+



In [80]:
# rank() vs dense_rank() within each department, lowest salary first.
# After a tie, rank() skips positions while dense_rank() does not.
#
# salary is a string column (the CSV is read without a schema), so the
# ordering key is cast to double; ordering by the raw string would sort
# lexicographically (e.g. '100000' before '12000').
sqlContext.sql("""
    select dep,
           emp,
           salary,
           rank()       over (partition by dep order by cast(salary as double)) as rank_col,
           dense_rank() over (partition by dep order by cast(salary as double)) as dense_rank_col
    from empsalary
""").show()

+-----+---+------+--------+--------------+
|  dep|emp|salary|rank_col|dense_rank_col|
+-----+---+------+--------+--------------+
|sales|931| 12000|       1|             1|
|sales| 31| 22000|       2|             2|
|sales|  1| 33000|       3|             3|
|sales| 11| 33000|       3|             3|
|sales|512| 56000|       5|             4|
|sales|120| 82000|       6|             5|
|sales|115| 91000|       7|             6|
|sales|331| 98000|       8|             7|
|sales|115| 99000|       9|             8|
|   hr|154| 12000|       1|             1|
|   hr|413| 12000|       1|             1|
|   hr|994| 17000|       3|             2|
|   hr|  4| 25000|       4|             3|
|   hr|584| 82000|       5|             4|
+-----+---+------+--------+--------------+



In [84]:
# Per-department window ordered by salary, ascending.
# salary is a string column (the CSV is read without a schema), so the
# ordering key is cast to double; ordering by the raw string would sort
# lexicographically rather than numerically.
w = Window.partitionBy("dep").orderBy(col("salary").cast("double"))

In [87]:
# DataFrame-API counterpart of the SQL ranking query: append dense_rank and
# rank columns, both evaluated over the window `w`.
ranked = df.select(
    "*",
    dense_rank().over(w).alias("dense_rank"),
    rank().over(w).alias("rank"),
)
ranked.show()

+-----+---+------+----------+----+
|  dep|emp|salary|dense_rank|rank|
+-----+---+------+----------+----+
|sales|931| 12000|         1|   1|
|sales| 31| 22000|         2|   2|
|sales|  1| 33000|         3|   3|
|sales| 11| 33000|         3|   3|
|sales|512| 56000|         4|   5|
|sales|120| 82000|         5|   6|
|sales|115| 91000|         6|   7|
|sales|331| 98000|         7|   8|
|sales|115| 99000|         8|   9|
|   hr|154| 12000|         1|   1|
|   hr|413| 12000|         1|   1|
|   hr|994| 17000|         2|   3|
|   hr|  4| 25000|         3|   4|
|   hr|584| 82000|         4|   5|
+-----+---+------+----------+----+

