#Week -**5**

In [7]:
from google.colab import files
uploaded = files.upload()

Saving attendance.csv to attendance.csv
Saving employees.csv to employees.csv
Saving tasks.csv to tasks.csv


#**Creating Spark Session**

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [5]:
spark = SparkSession.builder.appName("DevOps").getOrCreate()
spark

#**Loading data**

In [8]:
dfAtt = spark.read.csv(r"/content/attendance.csv", header=True, inferSchema=True)
dfEmp = spark.read.csv(r"/content/employees.csv", header=True, inferSchema=True)
dfTas = spark.read.csv(r"/content/tasks.csv", header=True, inferSchema=True)

#**Printing Files**

In [5]:
dfAtt.show()

+------------+----------+----------+----------------+----------------+------+---------+
|attendanceID|employeeID|      date|         clockIN|        clockOUT|isLate|isAbscent|
+------------+----------+----------+----------------+----------------+------+---------+
|           1|         1|07-06-2024|            NULL|            NULL|     0|        1|
|           2|         2|07-06-2024|07-06-2024 08:48|07-06-2024 17:05|     0|        0|
|           3|         3|07-06-2024|07-06-2024 08:51|07-06-2024 17:10|     1|        0|
|           4|         4|07-06-2024|07-06-2024 08:54|07-06-2024 17:15|     0|        0|
|           5|         5|07-06-2024|07-06-2024 08:57|07-06-2024 17:00|     1|        0|
|           6|         1|07-06-2024|       45450.375|07-06-2024 17:05|     0|        0|
|           7|         2|07-06-2024|            NULL|            NULL|     0|        1|
|           8|         3|07-06-2024|07-06-2024 08:48|07-06-2024 17:15|     0|        0|
|           9|         4|07-06-2

In [6]:
dfEmp.show()

+----------+----------+-----------+-----------------+--------------------+-------------------+--------+
|employeeid|      name| department|             role|               email|           hiredate|  status|
+----------+----------+-----------+-----------------+--------------------+-------------------+--------+
|         1|arun kumar|engineering|software engineer|arun.kumar@exampl...|2023-02-10 00:00:00|  Active|
|         2|deepa rani|  marketing|   content writer|deepa.rani@exampl...|2022-12-05 00:00:00|  Active|
|         3| vijay raj|         hr|     hr executive|vijay.raj@example...|2021-10-15 00:00:00|  Active|
|         4| karthik s|engineering|  devops engineer|karthik.s@example...|2023-04-20 00:00:00|  Active|
|         5|   meena p|    finance|       accountant| meena.p@example.com|2022-07-25 00:00:00|Resigned|
+----------+----------+-----------+-----------------+--------------------+-------------------+--------+



In [7]:
dfTas.show()

+------+----------+--------------------+----------+---------------+
|taskID|employeeID|            taskName|  taskDate|tasksCompeleted|
+------+----------+--------------------+----------+---------------+
|     1|         1|         Code Review|2024-07-01|              4|
|     2|         2|   Blog Post Writing|2024-07-01|              5|
|     3|         3|     Employee Survey|2024-07-01|              2|
|     4|         4|Server Configuration|2024-07-01|              3|
|     5|         5| Invoice Preparation|2024-07-01|              6|
|     6|         1|     Feature Testing|2024-07-02|              3|
|     7|         2|    SEO Optimization|2024-07-02|              4|
|     8|         3|Recruitment Scree...|2024-07-02|              1|
|     9|         4| Load Balancer Setup|2024-07-02|              5|
|    10|         5|Travel Reimbursem...|2024-07-02|              2|
|    11|         1|  Unit Test Coverage|2024-07-03|              4|
|    12|         2|Campaign Strategy...|2024-07-

## Printing the schemas

In [8]:
dfAtt.printSchema()

root
 |-- attendanceID: integer (nullable = true)
 |-- employeeID: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- clockIN: string (nullable = true)
 |-- clockOUT: string (nullable = true)
 |-- isLate: integer (nullable = true)
 |-- isAbscent: integer (nullable = true)



# **Printing Schemas**

In [9]:
dfEmp.printSchema()

root
 |-- employeeid: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- role: string (nullable = true)
 |-- email: string (nullable = true)
 |-- hiredate: timestamp (nullable = true)
 |-- status: string (nullable = true)



In [10]:
dfTas.printSchema()

root
 |-- taskID: integer (nullable = true)
 |-- employeeID: integer (nullable = true)
 |-- taskName: string (nullable = true)
 |-- taskDate: date (nullable = true)
 |-- tasksCompeleted: integer (nullable = true)



#Cleaning Null values

In [9]:
dfEmp = dfEmp.dropna()
dfTas = dfTas.dropna()

#**Top 5`abscentees`**

In [10]:
Top5Absentees = dfEmp.join(dfAtt, on="employeeID", how="inner").groupBy("employeeID").agg(
                 F.sum("isAbscent").alias("AbscentCount")
             ).join(dfEmp.select(["employeeID", "name"]), how="inner", on="employeeID").sort("AbscentCount", ascending=False).limit(5).select(["employeeID", "name", "AbscentCount"])

Top5Absentees.write.mode("overwrite").csv("abscentees_top_5")

Top5Absentees.show()

+----------+----------+------------+
|employeeID|      name|AbscentCount|
+----------+----------+------------+
|         1|arun kumar|           1|
|         3| vijay raj|           1|
|         5|   meena p|           1|
|         4| karthik s|           1|
|         2|deepa rani|           1|
+----------+----------+------------+



#**Lowest performing departments**

In [11]:
LowestPerformingDept = dfEmp.join(dfAtt, on="employeeID", how="inner").join(dfTas, on="employeeID", how="inner").groupBy("department").agg(
                    F.sum("tasksCompeleted").alias("TasksProductivityScore")
                ).sort("TasksProductivityScore", ascending=True).limit(2)

LowestPerformingDept.write.mode("overwrite").csv("lowest_performing_departments")

LowestPerformingDept.show()

+----------+----------------------+
|department|TasksProductivityScore|
+----------+----------------------+
|        hr|                    42|
|   finance|                    96|
+----------+----------------------+



#**Deliverables**
## Report with top 5 absentees/lowest performing departments

In [12]:
Top5Absentees.coalesce(1) \
    .write.mode("overwrite").option("header", True) \
    .csv("/content/Top5Absentees")

LowestPerformingDept.coalesce(1) \
    .write.mode("overwrite").option("header", True) \
    .csv("/content/LowestPerformingDept")

In [13]:
import shutil, glob

Abs_file = glob.glob("/content/Top5Absentees/part-*.csv")[0]
shutil.move(Abs_file, "/content/Top5Absentees.csv")

low_file = glob.glob("/content/LowestPerformingDept/part-*.csv")[0]
shutil.move(low_file, "/content/LowestPerformingDept.csv")

'/content/LowestPerformingDept.csv'

In [14]:
from google.colab import files

files.download("/content/Top5Absentees.csv")
files.download("/content/LowestPerformingDept.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>