In [0]:
'''
For the project we'll be using OfficeDataProject.csv 
Read the data from the file into a DataFrame and perform the following analytics on it. 
- Print the total number of employees in the company 
- Print the total number of departments in the company 
- Print the department names of the company 
- Print the total number of employees in each department 
- Print the total number of employees in each state 
- Print the total number of employees in each state in each department 
- Print the minimum and maximum salaries in each department and sort salaries in ascending order 
- Print the names of employees working in NY state under Finance department whose bonuses are greater than the average bonuses of employees in NY state 
- Raise the salary by $500 for all employees whose age is greater than 45 
- Create DF of all those employees whose age is greater than 45 and save them in a file

'''


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum,avg,max,min,mean,count,udf,col
from pyspark.sql.types import IntegerType
# Obtain the Spark session for this notebook (getOrCreate reuses a running one).
spark = (
    SparkSession.builder
    .appName("Mini-Project")
    .getOrCreate()
)

# Load the office CSV: the first row is treated as the header and column types are inferred.
dff = (
    spark.read
    .options(header = 'True', inferSchema= 'True')
    .csv('/FileStore/tables/OfficeDataProject.csv')
)


In [0]:
# Preview the loaded data; n=20 and truncate=True are show()'s defaults, spelled out here.
dff.show(n=20, truncate=True)

In [0]:
# Total number of employees in the company. The count is printed explicitly because a
# notebook cell only echoes its last expression, and this cell ends with .show().
print(dff.count())

# Employee rows ordered by salary (sort() is ascending by default).
dff.sort(dff.salary).show()

In [0]:
# - Print the total number of departments in the company 
# Option 1: build one group per department, then count the groups.
# Printed explicitly so both results are visible, not just the cell's last expression.
print(dff.groupby(dff.department).count().count())

# OR BETTER

# Option 2: keep one row per distinct department and count those rows.
# Uses `dff`, the DataFrame loaded above (`df` is never defined in this notebook).
print(dff.select("department").dropDuplicates(["department"]).count())

In [0]:
# Department names of the company, one row per department.
dff.select(dff.department).distinct().show()
# OR
# Same result via dropDuplicates; uses `dff` (`df` is never defined in this notebook).
dff.select("department").dropDuplicates(["department"]).show()

In [0]:
# Number of employees in each department.
(
    dff.groupBy('department')
    .agg(count('*'))
    .show()
)

In [0]:
# Number of employees in each state.
(
    dff.groupBy('state')
    .agg(count('*'))
    .show()
)

In [0]:
# Number of employees for every (state, department) pair, via an explicit aggregate.
(
    dff.groupBy('state', 'department')
    .agg(count('*'))
    .show()
)

# OR

# Same grouping using the GroupedData.count() shortcut.
state_department_groups = dff.groupBy('state', 'department')
state_department_groups.count().show()

In [0]:
# Lowest and highest salary per department.
# `min` / `max` here are the pyspark.sql.functions aggregates imported at the top, not the builtins.
dff1 = dff.groupBy('department').agg(
    min('salary').alias('min'),
    max('salary').alias('max'),
)
# Order by the minimum salary, then by the maximum salary, both ascending.
dff1.sort(dff1['min'].asc(), dff1['max'].asc()).show()

In [0]:
# - Print the names of employees working in NY state under Finance department whose bonuses are greater than the average bonuses of employees in NY state 
# Average bonus across all NY employees (every department), collected to the driver as a plain value.
# Stored in an ordinary variable; the earlier `avg.Bonus = ...` attached an attribute to the
# imported `avg` function instead.
ny_employees = dff.filter(dff.state == 'NY')
avg_bonus_ny = ny_employees.agg(avg('bonus').alias('avg_bonus')).collect()[0]['avg_bonus']

# Strictly greater than the NY average, as the task states (an equal bonus does not qualify).
dff.filter((dff.state == 'NY') & (dff.department == 'Finance') & (dff.bonus > avg_bonus_ny)).show()

In [0]:
# - Raise the salaries $500 of all employees whose age is greater than 45 
def upsal(age, sal):
  """Return the salary after the age-based raise.

  Employees strictly older than 45 get 500 added to ``sal``; everyone else
  keeps ``sal`` unchanged. If either ``age`` or ``sal`` is ``None`` the salary
  is returned as-is, so a missing value does not raise a TypeError.
  """
  if age is None or sal is None:
    return sal
  # "Greater than 45": an employee aged exactly 45 does not get the raise.
  if age > 45:
    return sal + 500
  return sal

# Wrap upsal as a Spark UDF whose declared return type is an integer.
total = udf(lambda emp_age, emp_salary: upsal(emp_age, emp_salary), IntegerType())

# withColumn with an existing column name replaces that column; a different name adds a new column.
# The result is only displayed here; dff itself is not reassigned.
dff.withColumn(
    'salary',
    total(dff['age'], dff['salary']),
).show()

In [0]:
# - Create DF of all those employees whose age is greater than 45 and save them in a file
# Strictly older than 45, matching the task wording (an employee aged exactly 45 is excluded).
dfp = dff.filter(dff.age > 45)
# Save the filtered rows as CSV with a header row.
# NOTE(review): no save mode is set, so a re-run presumably fails if the output path already
# exists; confirm whether mode('overwrite') is wanted.
dfp.write.options(header='True').csv('/FileStore/tables/StudentData/output2')