In [52]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F

In [89]:
# Load the HR employees CSV (first row is the header) and convert SALARY to a
# double so that numeric aggregates operate on numbers rather than strings.
# NOTE(review): the file name is spelled "HR_employess.csv" - confirm it matches the file on disk.
employees = (
    spark.read
    .csv("HR_employess.csv", header=True)
    .withColumn("SALARY", col("SALARY").cast(DoubleType()))
)


In [90]:
# Preview the first five rows of the employees DataFrame.
employees.show(n=5)

+-----------+----------+---------+--------+------------+-------------------+-------+-------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|          HIRE_DATE| JOB_ID| SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+-------------------+-------+-------+--------------+----------+-------------+
|        100|    Steven|     King|   SKING|515.123.4567|1987-06-17 00:00:00|AD_PRES|24000.0|          null|      null|           90|
|        101|     Neena|  Kochhar|NKOCHHAR|515.123.4568|1989-09-21 00:00:00|  AD_VP|17000.0|          null|       100|           90|
|        102|       Lex|  De Haan| LDEHAAN|515.123.4569|1993-01-13 00:00:00|  AD_VP|17000.0|          null|       100|           90|
|        103| Alexander|   Hunold| AHUNOLD|590.423.4567|1990-01-03 00:00:00|IT_PROG| 9000.0|          null|       102|           60|
|        104|     Bruce|    Ernst|  BERNST|590.423.4568|1991-05-21 00

In [99]:
# Print the column names and types of the employees DataFrame.
# The CSV is read without schema inference, so only SALARY (cast explicitly
# when the DataFrame is built) is numeric; every other column is a string.
employees.printSchema()

root
 |-- EMPLOYEE_ID: string (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- PHONE_NUMBER: string (nullable = true)
 |-- HIRE_DATE: string (nullable = true)
 |-- JOB_ID: string (nullable = true)
 |-- SALARY: double (nullable = true)
 |-- COMMISSION_PCT: string (nullable = true)
 |-- MANAGER_ID: string (nullable = true)
 |-- DEPARTMENT_ID: string (nullable = true)



In [None]:
# Load the HR departments CSV, treating the first row as the header.
departments = spark.read.csv(
    "HR_departments.csv",
    header=True,
)



In [91]:
# Expose the DataFrame to Spark SQL under the name "employees".
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0; it is also safe to re-run because
# an existing view with the same name is replaced.
employees.createOrReplaceTempView("employees")

In [10]:
# 1. Write a query to list the number of jobs available in the employees table.
# Spark SQL version: count the distinct JOB_ID values.
(
    spark
    .sql("select count(distinct(JOB_ID)) from employees")
    .show()
)

+----------------------+
|count(DISTINCT JOB_ID)|
+----------------------+
|                    19|
+----------------------+



In [20]:
# DataFrame API version of question 1: number of distinct JOB_ID values.
(
    employees
    .select("JOB_ID")
    .distinct()
    .count()
)

19

In [23]:
# 2. Write a query to get the total salaries payable to employees.
# Spark SQL version: sum the SALARY column over the whole table.
(
    spark
    .sql("select sum(SALARY) from employees")
    .show()
)

+---------------------------+
|sum(CAST(SALARY AS DOUBLE))|
+---------------------------+
|                   691400.0|
+---------------------------+



In [85]:
# DataFrame API version of question 2: total salary across all employees.
# The aggregate is referenced as F.sum so it cannot be confused with Python's
# builtin sum, which the wildcard import of pyspark.sql.functions shadows.
sumOfSalary = employees.agg(F.sum("SALARY"))

In [86]:
# Display the single-row DataFrame holding the total salary computed in the previous cell.
sumOfSalary.show()

+-----------+
|sum(SALARY)|
+-----------+
|   691400.0|
+-----------+



In [92]:
# 3. Write a query to get the minimum salary from the employees table.
# Spark SQL version.
(
    spark
    .sql("select min(SALARY)  from employees")
    .show()
)

+-----------+
|min(SALARY)|
+-----------+
|     2100.0|
+-----------+



In [93]:
# DataFrame API version of question 3: minimum salary over all employees.
# F.min is used explicitly rather than the bare name min, which only resolves
# to the Spark function because the wildcard import shadows Python's builtin.
employees.agg(F.min("SALARY")).show()

+-----------+
|min(SALARY)|
+-----------+
|     2100.0|
+-----------+



In [95]:
# 4. Write a query to get the maximum salary of an employee working as a Programmer.
# Spark SQL version: programmers are the rows whose JOB_ID is 'IT_PROG'.
(
    spark
    .sql("select max(e.SALARY) from employees e where e.job_id = 'IT_PROG' ")
    .show()
)

+-----------+
|max(SALARY)|
+-----------+
|     9000.0|
+-----------+



In [97]:
# DataFrame API version of question 4: highest salary among programmers (JOB_ID 'IT_PROG').
# F.max / F.col are referenced through the module alias so the cell does not
# depend on the wildcard import shadowing Python's builtin max.
(
    employees
    .filter(F.col("JOB_ID") == "IT_PROG")
    .agg(F.max("SALARY"))
    .show()
)

+-----------+
|max(SALARY)|
+-----------+
|     9000.0|
+-----------+



In [102]:
# 5. Write a query to get the average salary and number of employees working in department 90.
# Spark SQL version; DEPARTMENT_ID is a string column, so it is compared with '90'.
(
    spark
    .sql("select avg(e.SALARY), count(DEPARTMENT_ID) from employees e where e.DEPARTMENT_ID = '90' ")
    .show()
)

+------------------+--------------------+
|       avg(SALARY)|count(DEPARTMENT_ID)|
+------------------+--------------------+
|19333.333333333332|                   3|
+------------------+--------------------+



In [105]:
# DataFrame API version of question 5: average salary and head count for department 90.
# The head count is taken over DEPARTMENT_ID (non-null for every row that passes
# the filter) instead of SALARY, so an employee with a missing salary is still
# counted and the result column matches the Spark SQL cell for the same question.
# DEPARTMENT_ID is a string column, hence the comparison with "90".
(
    employees
    .filter(F.col("DEPARTMENT_ID") == "90")
    .agg(F.avg("SALARY"), F.count("DEPARTMENT_ID"))
    .show()
)

+------------------+-------------+
|       avg(SALARY)|count(SALARY)|
+------------------+-------------+
|19333.333333333332|            3|
+------------------+-------------+



In [106]:
# 6. Write a query to get the highest, lowest, sum, and average salary of all employees.
# Spark SQL version: all four aggregates in one pass over the table.
(
    spark
    .sql("select max(SALARY), min(SALARY), sum(SALARY), avg(SALARY) from employees ")
    .show()
)

+-----------+-----------+-----------+-----------------+
|max(SALARY)|min(SALARY)|sum(SALARY)|      avg(SALARY)|
+-----------+-----------+-----------+-----------------+
|    24000.0|     2100.0|   691400.0|6461.682242990654|
+-----------+-----------+-----------+-----------------+



In [107]:
# DataFrame API version of question 6: highest, lowest, total and average salary.
# Each aggregate is qualified with F. so that max/min/sum are unambiguously the
# Spark column functions and not Python's builtins of the same name.
employees.agg(
    F.max("SALARY"),
    F.min("SALARY"),
    F.sum("SALARY"),
    F.avg("SALARY"),
).show()

+-----------+-----------+-----------+-----------------+
|max(SALARY)|min(SALARY)|sum(SALARY)|      avg(SALARY)|
+-----------+-----------+-----------+-----------------+
|    24000.0|     2100.0|   691400.0|6461.682242990654|
+-----------+-----------+-----------+-----------------+



In [110]:
# 7. Write a query to get the number of employees with the same job.
# Spark SQL version: one row per JOB_ID with its employee count.
(
    spark
    .sql("select JOB_ID,count(*) from employees group by JOB_ID ")
    .show()
)

+----------+--------+
|    JOB_ID|count(1)|
+----------+--------+
|FI_ACCOUNT|       5|
|    MK_MAN|       1|
|   IT_PROG|       5|
|    FI_MGR|       1|
|AC_ACCOUNT|       1|
|    HR_REP|       1|
|  PU_CLERK|       5|
|    AC_MGR|       1|
|    PR_REP|       1|
|    ST_MAN|       5|
|    MK_REP|       1|
|    SA_REP|      30|
|    SA_MAN|       5|
|    PU_MAN|       1|
|  SH_CLERK|      20|
|   AD_PRES|       1|
|  ST_CLERK|      20|
|   AD_ASST|       1|
|     AD_VP|       2|
+----------+--------+



In [112]:
# DataFrame API version of question 7: number of employees per JOB_ID.
(
    employees
    .groupBy("JOB_ID")
    .count()
    .show()
)

+----------+-----+
|    JOB_ID|count|
+----------+-----+
|FI_ACCOUNT|    5|
|    MK_MAN|    1|
|   IT_PROG|    5|
|    FI_MGR|    1|
|AC_ACCOUNT|    1|
|    HR_REP|    1|
|  PU_CLERK|    5|
|    AC_MGR|    1|
|    PR_REP|    1|
|    ST_MAN|    5|
|    MK_REP|    1|
|    SA_REP|   30|
|    SA_MAN|    5|
|    PU_MAN|    1|
|  SH_CLERK|   20|
|   AD_PRES|    1|
|  ST_CLERK|   20|
|   AD_ASST|    1|
|     AD_VP|    2|
+----------+-----+



In [None]:
# 8. Write a query to get the difference between the highest and lowest salaries