In [52]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [89]:
# Load the HR employees CSV (first row holds the column names). Columns are
# read as strings, so SALARY is cast to double to allow numeric aggregation.
employees = (
    spark.read
    .option("header", True)
    .csv("HR_employess.csv")
    .withColumn("SALARY", col("SALARY").cast(DoubleType()))
)


In [90]:
# Preview the first five rows of the employees DataFrame.
employees.show(n=5)

+-----------+----------+---------+--------+------------+-------------------+-------+-------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|          HIRE_DATE| JOB_ID| SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+-------------------+-------+-------+--------------+----------+-------------+
|        100|    Steven|     King|   SKING|515.123.4567|1987-06-17 00:00:00|AD_PRES|24000.0|          null|      null|           90|
|        101|     Neena|  Kochhar|NKOCHHAR|515.123.4568|1989-09-21 00:00:00|  AD_VP|17000.0|          null|       100|           90|
|        102|       Lex|  De Haan| LDEHAAN|515.123.4569|1993-01-13 00:00:00|  AD_VP|17000.0|          null|       100|           90|
|        103| Alexander|   Hunold| AHUNOLD|590.423.4567|1990-01-03 00:00:00|IT_PROG| 9000.0|          null|       102|           60|
|        104|     Bruce|    Ernst|  BERNST|590.423.4568|1991-05-21 00

In [99]:
# Print the DataFrame's column names, types and nullability as a tree.
employees.printSchema()

root
 |-- EMPLOYEE_ID: string (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- PHONE_NUMBER: string (nullable = true)
 |-- HIRE_DATE: string (nullable = true)
 |-- JOB_ID: string (nullable = true)
 |-- SALARY: double (nullable = true)
 |-- COMMISSION_PCT: string (nullable = true)
 |-- MANAGER_ID: string (nullable = true)
 |-- DEPARTMENT_ID: string (nullable = true)



In [None]:
# Load the HR departments CSV, using the first row as column names.
departments = spark.read.option("header", True).csv("HR_departments.csv")



In [91]:
# Expose the DataFrame to Spark SQL under the name "employees".
# createOrReplaceTempView supersedes registerTempTable, which has been
# deprecated since Spark 2.0; re-running this cell simply replaces the view.
employees.createOrReplaceTempView("employees")

In [10]:
# 1. Number of distinct jobs present in the employees table (Spark SQL).
distinct_jobs_sql = "select count(distinct(JOB_ID)) from employees"
spark.sql(distinct_jobs_sql).show()

+----------------------+
|count(DISTINCT JOB_ID)|
+----------------------+
|                    19|
+----------------------+



In [20]:
# 1. (DataFrame API) Number of distinct jobs.
# distinct() keeps a NULL JOB_ID as its own value, whereas SQL's
# count(distinct ...) ignores NULLs; dropping nulls first keeps both answers aligned.
employees.select("JOB_ID").dropna().distinct().count()

19

In [23]:
# 2. Total salary payable to all employees (Spark SQL).
total_salary_sql = "select sum(SALARY) from employees"
spark.sql(total_salary_sql).show()

+---------------------------+
|sum(CAST(SALARY AS DOUBLE))|
+---------------------------+
|                   691400.0|
+---------------------------+



In [85]:
# 2. (DataFrame API) Global aggregate: sum of SALARY over every row.
sumOfSalary = employees.groupBy().sum("SALARY")

In [86]:
# Display the single-row total-salary aggregate.
sumOfSalary.show()

+-----------+
|sum(SALARY)|
+-----------+
|   691400.0|
+-----------+



In [92]:
# 3. Minimum salary in the employees table (Spark SQL).
min_salary_sql = "select min(SALARY)  from employees"
spark.sql(min_salary_sql).show()

+-----------+
|min(SALARY)|
+-----------+
|     2100.0|
+-----------+



In [93]:
# 3. (DataFrame API) Minimum salary as a global aggregate.
employees.groupBy().min("SALARY").show()

+-----------+
|min(SALARY)|
+-----------+
|     2100.0|
+-----------+



In [95]:
# 4. Maximum salary among employees whose job is Programmer (JOB_ID 'IT_PROG'), via Spark SQL.
max_programmer_salary_sql = "select max(e.SALARY) from employees e where e.job_id = 'IT_PROG' "
spark.sql(max_programmer_salary_sql).show()

+-----------+
|max(SALARY)|
+-----------+
|     9000.0|
+-----------+



In [97]:
# 4. (DataFrame API) Keep only IT_PROG rows, then take the maximum salary.
(
    employees
    .where(col("JOB_ID") == 'IT_PROG')
    .groupBy()
    .max("SALARY")
    .show()
)

+-----------+
|max(SALARY)|
+-----------+
|     9000.0|
+-----------+



In [102]:
# 5. Average salary and number of employees working in department 90 (Spark SQL).
dept_90_summary_sql = "select avg(e.SALARY), count(DEPARTMENT_ID) from employees e where e.DEPARTMENT_ID = '90' "
spark.sql(dept_90_summary_sql).show()

+------------------+--------------------+
|       avg(SALARY)|count(DEPARTMENT_ID)|
+------------------+--------------------+
|19333.333333333332|                   3|
+------------------+--------------------+



In [105]:
# 5. (DataFrame API) Average salary and head count for department 90.
# Count DEPARTMENT_ID rather than SALARY: the filter guarantees it is non-null,
# so every matching employee is counted even if a salary is missing, and the
# result matches the SQL version of this query.
(
    employees
    .filter(col("DEPARTMENT_ID") == '90')
    .agg(avg("SALARY"), count("DEPARTMENT_ID"))
    .show()
)

+------------------+-------------+
|       avg(SALARY)|count(SALARY)|
+------------------+-------------+
|19333.333333333332|            3|
+------------------+-------------+



In [106]:
# 6. Highest, lowest, total and average salary across all employees (Spark SQL).
salary_stats_sql = "select max(SALARY), min(SALARY), sum(SALARY), avg(SALARY) from employees "
spark.sql(salary_stats_sql).show()

+-----------+-----------+-----------+-----------------+
|max(SALARY)|min(SALARY)|sum(SALARY)|      avg(SALARY)|
+-----------+-----------+-----------+-----------------+
|    24000.0|     2100.0|   691400.0|6461.682242990654|
+-----------+-----------+-----------+-----------------+



In [107]:
# 6. (DataFrame API) Same four statistics computed as one global aggregation.
(
    employees
    .groupBy()
    .agg(
        max("SALARY"),
        min("SALARY"),
        sum("SALARY"),
        avg("SALARY"),
    )
    .show()
)

+-----------+-----------+-----------+-----------------+
|max(SALARY)|min(SALARY)|sum(SALARY)|      avg(SALARY)|
+-----------+-----------+-----------+-----------------+
|    24000.0|     2100.0|   691400.0|6461.682242990654|
+-----------+-----------+-----------+-----------------+



In [110]:
# 7. Number of employees sharing each job ID (Spark SQL).
employees_per_job_sql = "select JOB_ID,count(*) from employees group by JOB_ID "
spark.sql(employees_per_job_sql).show()

+----------+--------+
|    JOB_ID|count(1)|
+----------+--------+
|FI_ACCOUNT|       5|
|    MK_MAN|       1|
|   IT_PROG|       5|
|    FI_MGR|       1|
|AC_ACCOUNT|       1|
|    HR_REP|       1|
|  PU_CLERK|       5|
|    AC_MGR|       1|
|    PR_REP|       1|
|    ST_MAN|       5|
|    MK_REP|       1|
|    SA_REP|      30|
|    SA_MAN|       5|
|    PU_MAN|       1|
|  SH_CLERK|      20|
|   AD_PRES|       1|
|  ST_CLERK|      20|
|   AD_ASST|       1|
|     AD_VP|       2|
+----------+--------+



In [112]:
# 7. (DataFrame API) Row count per JOB_ID, reported in a column named "count".
(
    employees
    .groupBy("JOB_ID")
    .agg(count("*").alias("count"))
    .show()
)

+----------+-----+
|    JOB_ID|count|
+----------+-----+
|FI_ACCOUNT|    5|
|    MK_MAN|    1|
|   IT_PROG|    5|
|    FI_MGR|    1|
|AC_ACCOUNT|    1|
|    HR_REP|    1|
|  PU_CLERK|    5|
|    AC_MGR|    1|
|    PR_REP|    1|
|    ST_MAN|    5|
|    MK_REP|    1|
|    SA_REP|   30|
|    SA_MAN|    5|
|    PU_MAN|    1|
|  SH_CLERK|   20|
|   AD_PRES|    1|
|  ST_CLERK|   20|
|   AD_ASST|    1|
|     AD_VP|    2|
+----------+-----+



In [114]:
# 8. Difference between the highest and the lowest salary (Spark SQL).
salary_range_sql = "select max(SALARY) - min(SALARY) from employees"
spark.sql(salary_range_sql).show()

+---------------------------+
|(max(SALARY) - min(SALARY))|
+---------------------------+
|                    21900.0|
+---------------------------+



In [115]:
# 8. (DataFrame API) Salary spread: max minus min as a single global aggregate.
employees.groupBy().agg(max("SALARY") - min("SALARY")).show()

+---------------------------+
|(max(SALARY) - min(SALARY))|
+---------------------------+
|                    21900.0|
+---------------------------+



In [122]:
# 9. For each manager, the salary of their lowest-paid report (Spark SQL).
# Rows without a manager are excluded; results are sorted by that minimum, highest first.
lowest_paid_per_manager_sql = "select MANAGER_ID, min(SALARY) from employees where MANAGER_ID is not null \
          group by MANAGER_ID order by min(SALARY) DESC"
spark.sql(lowest_paid_per_manager_sql).show()

+----------+-----------+
|MANAGER_ID|min(SALARY)|
+----------+-----------+
|       102|     9000.0|
|       205|     8300.0|
|       146|     7000.0|
|       145|     7000.0|
|       108|     6900.0|
|       147|     6200.0|
|       149|     6200.0|
|       148|     6100.0|
|       201|     6000.0|
|       100|     5800.0|
|       101|     4400.0|
|       103|     4200.0|
|       124|     2500.0|
|       114|     2500.0|
|       123|     2500.0|
|       120|     2200.0|
|       122|     2200.0|
|       121|     2100.0|
+----------+-----------+



In [132]:
# 9. (DataFrame API) Minimum salary per manager, skipping rows with no manager,
# ordered from the highest minimum to the lowest.
(
    employees
    .where(col("MANAGER_ID").isNotNull())
    .groupBy("MANAGER_ID")
    .agg(min("SALARY"))
    .orderBy(col("min(SALARY)").desc())
    .show()
)

+----------+-----------+
|MANAGER_ID|min(SALARY)|
+----------+-----------+
|       102|     9000.0|
|       205|     8300.0|
|       145|     7000.0|
|       146|     7000.0|
|       108|     6900.0|
|       149|     6200.0|
|       147|     6200.0|
|       148|     6100.0|
|       201|     6000.0|
|       100|     5800.0|
|       101|     4400.0|
|       103|     4200.0|
|       124|     2500.0|
|       123|     2500.0|
|       114|     2500.0|
|       122|     2200.0|
|       120|     2200.0|
|       121|     2100.0|
+----------+-----------+



In [133]:
# 10. Department ID together with the total salary payable in that department (Spark SQL).
salary_per_department_sql = "select DEPARTMENT_ID, sum(SALARY) from employees group by DEPARTMENT_ID "
spark.sql(salary_per_department_sql).show()

+-------------+-----------+
|DEPARTMENT_ID|sum(SALARY)|
+-------------+-----------+
|           30|    24900.0|
|          110|    20300.0|
|         null|     7000.0|
|          100|    51600.0|
|           70|    10000.0|
|           90|    58000.0|
|           60|    28800.0|
|           40|     6500.0|
|           20|    19000.0|
|           10|     4400.0|
|           80|   304500.0|
|           50|   156400.0|
+-------------+-----------+



In [134]:
# 10. (DataFrame API) Total salary per department.
(
    employees
    .groupBy("DEPARTMENT_ID")
    .agg(sum("SALARY"))
    .show()
)

+-------------+-----------+
|DEPARTMENT_ID|sum(SALARY)|
+-------------+-----------+
|           30|    24900.0|
|          110|    20300.0|
|         null|     7000.0|
|          100|    51600.0|
|           70|    10000.0|
|           90|    58000.0|
|           60|    28800.0|
|           40|     6500.0|
|           20|    19000.0|
|           10|     4400.0|
|           80|   304500.0|
|           50|   156400.0|
+-------------+-----------+



In [138]:
# 11. Average salary per job ID, leaving out programmers (JOB_ID 'IT_PROG'), via Spark SQL.
avg_salary_non_programmers_sql = "select JOB_ID, avg(SALARY) from employees where JOB_ID != 'IT_PROG' group by JOB_ID "
spark.sql(avg_salary_non_programmers_sql).show()

+----------+-----------+
|    JOB_ID|avg(SALARY)|
+----------+-----------+
|FI_ACCOUNT|     7920.0|
|    MK_MAN|    13000.0|
|    FI_MGR|    12000.0|
|AC_ACCOUNT|     8300.0|
|    HR_REP|     6500.0|
|  PU_CLERK|     2780.0|
|    AC_MGR|    12000.0|
|    PR_REP|    10000.0|
|    ST_MAN|     7280.0|
|    MK_REP|     6000.0|
|    SA_REP|     8350.0|
|    SA_MAN|    12200.0|
|    PU_MAN|    11000.0|
|  SH_CLERK|     3215.0|
|   AD_PRES|    24000.0|
|  ST_CLERK|     2785.0|
|   AD_ASST|     4400.0|
|     AD_VP|    17000.0|
+----------+-----------+



In [140]:
# 11. (DataFrame API) Drop IT_PROG rows, then average the salary per job.
(
    employees
    .where(col("JOB_ID") != 'IT_PROG')
    .groupBy("JOB_ID")
    .agg(avg('SALARY'))
    .show()
)

+----------+-----------+
|    JOB_ID|avg(SALARY)|
+----------+-----------+
|FI_ACCOUNT|     7920.0|
|    MK_MAN|    13000.0|
|    FI_MGR|    12000.0|
|AC_ACCOUNT|     8300.0|
|    HR_REP|     6500.0|
|  PU_CLERK|     2780.0|
|    AC_MGR|    12000.0|
|    PR_REP|    10000.0|
|    ST_MAN|     7280.0|
|    MK_REP|     6000.0|
|    SA_REP|     8350.0|
|    SA_MAN|    12200.0|
|    PU_MAN|    11000.0|
|  SH_CLERK|     3215.0|
|   AD_PRES|    24000.0|
|  ST_CLERK|     2785.0|
|   AD_ASST|     4400.0|
|     AD_VP|    17000.0|
+----------+-----------+



In [142]:
# 12. Total, maximum, minimum and average salary per job ID,
# restricted to department 90 (Spark SQL).
# DEPARTMENT_ID is a string column (see printSchema), so it is compared with
# the string '90' — as in query 5 — instead of relying on an implicit
# string-to-number cast.
dept_90_job_stats_sql = "select JOB_ID, sum(SALARY),max(SALARY),min(SALARY),avg(SALARY) \
          from employees where DEPARTMENT_ID = '90' group by JOB_ID "
spark.sql(dept_90_job_stats_sql).show()

+-------+-----------+-----------+-----------+-----------+
| JOB_ID|sum(SALARY)|max(SALARY)|min(SALARY)|avg(SALARY)|
+-------+-----------+-----------+-----------+-----------+
|AD_PRES|    24000.0|    24000.0|    24000.0|    24000.0|
|  AD_VP|    34000.0|    17000.0|    17000.0|    17000.0|
+-------+-----------+-----------+-----------+-----------+



In [148]:
# 12. (DataFrame API) Salary statistics per job ID for department 90 only.
# DEPARTMENT_ID is a string column, so filter on the string '90' (matching
# query 5) rather than depending on an implicit cast against the integer 90.
(
    employees
    .filter(col("DEPARTMENT_ID") == '90')
    .groupBy("JOB_ID")
    .agg(sum("SALARY"), max("SALARY"), min("SALARY"), avg("SALARY"))
    .show()
)

+-------+-----------+-----------+-----------+-----------+
| JOB_ID|sum(SALARY)|max(SALARY)|min(SALARY)|avg(SALARY)|
+-------+-----------+-----------+-----------+-----------+
|AD_PRES|    24000.0|    24000.0|    24000.0|    24000.0|
|  AD_VP|    34000.0|    17000.0|    17000.0|    17000.0|
+-------+-----------+-----------+-----------+-----------+



In [161]:
# 13. Job ID and maximum salary for jobs whose maximum salary is at least $4000 (Spark SQL).
# The condition is on an aggregate, so it belongs in HAVING (applied after
# grouping); a WHERE clause cannot reference max(SALARY).
max_salary_at_least_4000_sql = "select JOB_ID, max(SALARY) from employees group by JOB_ID having max(SALARY) >= 4000"
spark.sql(max_salary_at_least_4000_sql).show()

+----------+-----------+
|    JOB_ID|max(SALARY)|
+----------+-----------+
|FI_ACCOUNT|     9000.0|
|    MK_MAN|    13000.0|
|   IT_PROG|     9000.0|
|    FI_MGR|    12000.0|
|AC_ACCOUNT|     8300.0|
|    HR_REP|     6500.0|
|    AC_MGR|    12000.0|
|    PR_REP|    10000.0|
|    ST_MAN|     8200.0|
|    MK_REP|     6000.0|
|    SA_REP|    11500.0|
|    SA_MAN|    14000.0|
|    PU_MAN|    11000.0|
|  SH_CLERK|     4200.0|
|   AD_PRES|    24000.0|
|   AD_ASST|     4400.0|
|     AD_VP|    17000.0|
+----------+-----------+



In [166]:
# 13. (DataFrame API) Job ID and maximum salary where that maximum is >= 4000.
# The comparison is inclusive (>=) to match both the task statement and the
# SQL version; a strict > would drop any job whose maximum is exactly 4000.
(
    employees
    .groupBy("JOB_ID")
    .agg(max('SALARY'))
    .filter(col("max(SALARY)") >= 4000)
    .show()
)

+----------+-----------+
|    JOB_ID|max(SALARY)|
+----------+-----------+
|FI_ACCOUNT|     9000.0|
|    MK_MAN|    13000.0|
|   IT_PROG|     9000.0|
|    FI_MGR|    12000.0|
|AC_ACCOUNT|     8300.0|
|    HR_REP|     6500.0|
|    AC_MGR|    12000.0|
|    PR_REP|    10000.0|
|    ST_MAN|     8200.0|
|    MK_REP|     6000.0|
|    SA_REP|    11500.0|
|    SA_MAN|    14000.0|
|    PU_MAN|    11000.0|
|  SH_CLERK|     4200.0|
|   AD_PRES|    24000.0|
|   AD_ASST|     4400.0|
|     AD_VP|    17000.0|
+----------+-----------+



In [168]:
# 14. Average salary for every department employing more than 10 people (Spark SQL).
avg_salary_large_departments_sql = "select avg(SALARY),DEPARTMENT_ID from employees \
          group by DEPARTMENT_ID having count(DEPARTMENT_ID) > 10"
spark.sql(avg_salary_large_departments_sql).show()


+------------------+-------------+
|       avg(SALARY)|DEPARTMENT_ID|
+------------------+-------------+
| 8955.882352941177|           80|
|3475.5555555555557|           50|
+------------------+-------------+



In [175]:
# 14. (DataFrame API) Average salary for departments with more than 10 employees.
# The head count is computed only to filter on and is dropped before display.
(
    employees
    .groupBy("DEPARTMENT_ID")
    .agg(avg("SALARY"), count("DEPARTMENT_ID"))
    .where(col("count(DEPARTMENT_ID)") > 10)
    .drop("count(DEPARTMENT_ID)")
    .show()
)

+-------------+------------------+
|DEPARTMENT_ID|       avg(SALARY)|
+-------------+------------------+
|           80| 8955.882352941177|
|           50|3475.5555555555557|
+-------------+------------------+

