# Importing Necessary Libraries


In [1]:
import pandas as pd
import numpy as np

# Loading 3 Datasets

In [3]:
from google.colab import files
uploaded = files.upload()

Saving attendance.csv to attendance.csv
Saving employees.csv to employees.csv
Saving tasks.csv to tasks.csv


In [4]:
import pandas as pd

dfAttendance = pd.read_csv("attendance.csv")
dfTasks = pd.read_csv("tasks.csv")
dfEmpl = pd.read_csv("employees.csv")

# Check Dataframes By Printing

In [5]:
dfAttendance.head()

Unnamed: 0,attendanceid,employeeid,date,clockin,clockout,islate,isabsent
0,1,1,2024-06-01,2024-06-01 09:05:00,2024-06-01 17:00:00,1,0
1,2,2,2024-06-01,2024-06-01 09:00:00,2024-06-01 17:15:00,0,0
2,3,3,2024-06-01,,,0,1
3,4,4,2024-06-01,2024-06-01 09:12:00,2024-06-01 17:10:00,1,0
4,5,5,2024-06-01,2024-06-01 08:50:00,2024-06-01 16:30:00,0,0


In [6]:
dfTasks.head()

Unnamed: 0,taskid,employeeid,taskname,taskdate,taskscompleted,hoursspent,productivityscore
0,1,1,api development,2024-06-01,6,6.5,0.85
1,2,2,social media campaign,2024-06-01,4,5.0,0.7
2,3,3,employee onboarding,2024-06-01,1,2.0,0.5
3,4,4,infra setup,2024-06-01,5,6.0,0.8
4,5,5,invoice processing,2024-06-01,7,8.0,0.9


In [7]:
dfEmpl.head()

Unnamed: 0,employeeid,name,department,role,email,hiredate,status
0,1,arun kumar,engineering,software engineer,arun.kumar@example.com,2023-02-10 00:00:00,Active
1,2,deepa rani,marketing,content writer,deepa.rani@example.com,2022-12-05 00:00:00,Active
2,3,vijay raj,hr,hr executive,vijay.raj@example.com,2021-10-15 00:00:00,Active
3,4,karthik s,engineering,devops engineer,karthik.s@example.com,2023-04-20 00:00:00,Active
4,5,meena p,finance,accountant,meena.p@example.com,2022-07-25 00:00:00,Resigned


# Clean missing or invalid entries


In [9]:
dfTasks = dfTasks.dropna()
dfAttendance = dfAttendance.dropna()

 # Calculating work hours, break times, and productivity scores

In [13]:
df = dfAttendance.merge(dfTasks, how="inner", on="employeeid").merge(dfEmpl, how="inner", on="employeeid")

In [24]:
df["workinghours"] = round(abs((pd.to_datetime(df["clockin"]) - pd.to_datetime(df["clockout"])).dt.total_seconds() / 3600), 2)
df["productivityscore"] = round(df["taskscompleted"] / df["workinghours"], 2)
df["breaktimes"] = round(df["workinghours"] / 4)

In [25]:
df.head()

Unnamed: 0,attendanceid,employeeid,date,clockin,clockout,islate,isabsent,taskid,taskname,taskdate,...,hoursspent,productivityscore,name,department,role,email,hiredate,status,workinghours,breaktimes
0,1,1,2024-06-01,2024-06-01 09:05:00,2024-06-01 17:00:00,1,0,1,api development,2024-06-01,...,6.5,0.76,arun kumar,engineering,software engineer,arun.kumar@example.com,2023-02-10 00:00:00,Active,7.92,2.0
1,2,2,2024-06-01,2024-06-01 09:00:00,2024-06-01 17:15:00,0,0,2,social media campaign,2024-06-01,...,5.0,0.48,deepa rani,marketing,content writer,deepa.rani@example.com,2022-12-05 00:00:00,Active,8.25,2.0
2,4,4,2024-06-01,2024-06-01 09:12:00,2024-06-01 17:10:00,1,0,4,infra setup,2024-06-01,...,6.0,0.63,karthik s,engineering,devops engineer,karthik.s@example.com,2023-04-20 00:00:00,Active,7.97,2.0
3,5,5,2024-06-01,2024-06-01 08:50:00,2024-06-01 16:30:00,0,0,5,invoice processing,2024-06-01,...,8.0,0.91,meena p,finance,accountant,meena.p@example.com,2022-07-25 00:00:00,Resigned,7.67,2.0


# Finding top performers and frequent absentees

In [41]:
summary = df.groupby("employeeid").agg(
    hoursSpent=pd.NamedAgg(column="workinghours", aggfunc="mean"),
    productivityScore=pd.NamedAgg(column="productivityscore", aggfunc="mean"),
    abscentCount=pd.NamedAgg(column="isabsent", aggfunc="sum")
).reset_index()

In [42]:
summary_final = df[["employeeid", "name"]].drop_duplicates().merge(summary, on="employeeid", how="left")

topPerformer = summary_final.sort_values("productivityScore", ascending=False).iloc[0].rename("TopPerformer")

bottomPerformer = summary_final.sort_values(["abscentCount", "productivityScore"], ascending=[False, True]).iloc[0].rename("BottomPerformer")


# Deliverables:
# - Cleaned datasets for attendance and task performance
# - Report highlighting the top and bottom performers

In [43]:
# 1. cleaned dataset
dfAttendance.to_csv("cleaned_attendance.csv")
dfTasks.to_csv("cleaned_tasks.csv")

In [44]:
# 2. report of top and bottom performer
print("-----------------Top performer report----------------------")
print(f"Top performer: {topPerformer.iloc[1]}")
for i, j in topPerformer.items():
  if i != "name":
    print(f"{i}: {j}")
print("-----------------------------------------------------------\n")

print("-----------------Bottom performer report-------------------")
print(f"Bottom performer: {bottomPerformer.iloc[1]}")
for i, j in bottomPerformer.items():
  if i != "name":
    print(f"{i}: {j}")
print("-----------------------------------------------------------\n")

-----------------Top performer report----------------------
Top performer: meena p
employeeid: 5
hoursSpent: 7.67
productivityScore: 0.91
abscentCount: 0
-----------------------------------------------------------

-----------------Bottom performer report-------------------
Bottom performer: deepa rani
employeeid: 2
hoursSpent: 8.25
productivityScore: 0.48
abscentCount: 0
-----------------------------------------------------------

