In [1]:
# Dataset - https://www.kaggle.com/datasets/ravindrasinghrana/employeedataset?select=employee_engagement_survey_data.csv
import pandas as pd
import numpy as np

In [2]:
# Load the four CSV exports of the employee dataset (paths are relative to
# the notebook's working directory), in the same order as the names on the left.
employeeData, engagementData, recruitmentData, trainingData = (
    pd.read_csv(csv_name)
    for csv_name in (
        'employee_data.csv',
        'employee_engagement_survey_data.csv',
        'recruitment_data.csv',
        'training_and_development_data.csv',
    )
)

# Cap rendered frames at 10 rows so cell outputs stay short.
pd.set_option('display.max_rows', 10)

In [3]:
# Normalise the survey column headers to snake_case names used by every
# later cell. Reassigning the result (instead of inplace=True) keeps the
# step explicit and chain-friendly; keys that are already renamed are
# simply ignored, so re-running the cell is harmless.
engagementData = engagementData.rename(
    columns={
        'Survey Date': 'survey_date',
        'Employee ID': 'emp_id',
        'Engagement Score': 'engagement_score',
        'Satisfaction Score': 'satisfaction_score',
        'Work-Life Balance Score': 'wl_balance_score',
    }
)

In [4]:
# Inspect the column dtypes of the survey table.
engagementData.dtypes

emp_id                 int64
survey_date           object
engagement_score       int64
satisfaction_score     int64
wl_balance_score       int64
dtype: object

In [5]:
# Parse survey_date from DD-MM-YYYY text into pandas datetimes so that the
# .dt accessor and min/max comparisons work on real dates.
engagementData['survey_date'] = pd.to_datetime(
    engagementData.survey_date,
    format='%d-%m-%Y',
)

In [6]:
# Re-check the dtypes to confirm survey_date was converted to a datetime column.
engagementData.dtypes

emp_id                         int64
survey_date           datetime64[ns]
engagement_score               int64
satisfaction_score             int64
wl_balance_score               int64
dtype: object

In [7]:
# Number of survey rows recorded in each calendar year of survey_date.
(
    engagementData
    .groupby(engagementData.survey_date.dt.year)['emp_id']
    .size()
    .reset_index(name='Number of Emp')
)

Unnamed: 0,survey_date,Number of Emp
0,2022,1187
1,2023,1813


In [8]:
# Most recent survey date in the data set.
engagementData.survey_date.max()

Timestamp('2023-08-05 00:00:00')

In [9]:
# Earliest survey date in the data set.
engagementData.survey_date.min()

Timestamp('2022-08-05 00:00:00')

In [10]:
# Count surveyed employees per gender: survey rows are inner-joined to the
# employee table on the employee id to pick up GenderCode.
# NOTE(review): this returns head-counts only, not engagement scores per
# gender - aggregate engagement_score here if a score breakdown is wanted.
(
    engagementData
    .merge(employeeData, how='inner', left_on='emp_id', right_on='EmpID')
    .groupby('GenderCode')
    .size()
    .reset_index(name='Total Employees')
)

Unnamed: 0,GenderCode,Total Employees
0,Female,1682
1,Male,1318


In [11]:
# Distribution of work-life balance scores within each division:
# one row per (Division, score) pair with the number of matching survey rows.
(
    engagementData
    .merge(employeeData, how='inner', left_on='emp_id', right_on='EmpID')
    .groupby(['Division', 'wl_balance_score'])
    .size()
    .reset_index(name='Total Employees')
)

Unnamed: 0,Division,wl_balance_score,Total Employees
0,Aerial,1,37
1,Aerial,2,36
2,Aerial,3,50
3,Aerial,4,45
4,Aerial,5,28
...,...,...,...
114,Yard (Material Handling),1,16
115,Yard (Material Handling),2,10
116,Yard (Material Handling),3,12
117,Yard (Material Handling),4,11


In [12]:
# Department with the most employees at the highest work-life balance score:
# count rows per (DepartmentType, score), order by score and then by count
# (both descending) and keep the top row.
# NOTE(review): this ranks on raw head-count, so larger departments are
# favoured - compare mean scores or shares if a size-neutral answer is needed.
(
    engagementData
    .merge(employeeData, how='inner', left_on='emp_id', right_on='EmpID')
    .groupby(['DepartmentType', 'wl_balance_score'])
    .size()
    .reset_index(name='Total Employees')
    .sort_values(by=['wl_balance_score', 'Total Employees'], ascending=False)
    .head(1)
)

Unnamed: 0,DepartmentType,wl_balance_score,Total Employees
19,Production,5,379


In [13]:
# Survey rows per satisfaction score, plus each score's share of all rows
# expressed as a percentage rounded to two decimals.
groupedDF = (
    engagementData
    .groupby('satisfaction_score')
    .size()
    .reset_index(name='Total Employees')
)
groupedDF['Percentage%'] = (
    groupedDF['Total Employees'] * 100 / groupedDF['Total Employees'].sum()
).round(2)
groupedDF

Unnamed: 0,satisfaction_score,Total Employees,Percentage%
0,1,592,19.73
1,2,574,19.13
2,3,604,20.13
3,4,636,21.2
4,5,594,19.8


In [14]:
# Mean engagement score (2 decimals) for the 'Finance & Accounting' division.
groupedDF = engagementData.merge(
    employeeData,
    how='inner',
    left_on='emp_id',
    right_on='EmpID',
)
# Select rows and column in a single .loc call rather than chained indexing.
round(
    groupedDF.loc[groupedDF['Division'] == 'Finance & Accounting', 'engagement_score'].mean(),
    2,
)

2.84

In [15]:
# Employees whose performance score is 'Fully Meets' and whose engagement
# score is above 4.
groupedDF = engagementData.merge(
    employeeData,
    how='inner',
    left_on='emp_id',
    right_on='EmpID',
)
groupedDF[
    (groupedDF['Performance Score'] == 'Fully Meets')
    & (groupedDF['engagement_score'] > 4)
]

Unnamed: 0,emp_id,survey_date,engagement_score,satisfaction_score,wl_balance_score,EmpID,FirstName,LastName,StartDate,ExitDate,...,Division,DOB,State,JobFunctionDescription,GenderCode,LocationCode,RaceDesc,MaritalDesc,Performance Score,Current Employee Rating
5,1006,2023-05-03,5,2,1,1006,Colby,Andreola,15-Aug-21,20-Nov-21,...,Engineers,19-02-1959,MA,Administrative,Female,2110,White,Widowed,Fully Meets,3
7,1008,2023-06-21,5,2,2,1008,Judith,Carabbio,28-Aug-19,04-May-20,...,General - Con,21-12-1981,MA,Foreman,Female,2132,Hispanic,Married,Fully Meets,3
12,1013,2022-12-13,5,4,3,1013,Jasmin,Shah,13-May-20,,...,Engineers,19-11-1974,MA,Engineer,Male,36398,Asian,Single,Fully Meets,3
20,1021,2023-03-09,5,1,4,1021,Joe,Fletcher,09-Nov-20,17-Jul-23,...,General - Eng,25-01-1989,MA,Billing,Male,51613,Hispanic,Single,Fully Meets,3
23,1024,2022-10-12,5,3,1,1024,Kasey,Boyer,30-Dec-18,23-Jun-21,...,Fielders,30-03-1987,MA,Coordinator,Female,91332,Other,Divorced,Fully Meets,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2971,3972,2022-10-29,5,4,1,3972,Eleanor,Patton,26-Jul-22,,...,General - Con,07-06-1955,TX,Foreman,Female,21727,Black,Divorced,Fully Meets,3
2983,3984,2023-02-24,5,5,4,3984,Christine,Skinner,10-Feb-22,08-Jun-22,...,Wireline Construction,23-07-1963,TX,Laborer,Female,67986,Hispanic,Single,Fully Meets,3
2984,3985,2022-09-28,5,4,2,3985,Zayne,Mccullough,13-Jan-23,19-May-23,...,People Services,18-01-1992,TX,Director,Male,26193,White,Widowed,Fully Meets,3
2987,3988,2023-05-20,5,2,5,3988,Kenzie,Mullins,21-Jul-22,21-Nov-22,...,Shop (Fleet),17-06-1981,MA,Mechanic,Female,73583,White,Widowed,Fully Meets,3


In [16]:
# Display the 'Performance Score' column of the employee table.
employeeData['Performance Score']

0       Fully Meets
1       Fully Meets
2       Fully Meets
3       Fully Meets
4       Fully Meets
           ...     
2995    Fully Meets
2996    Fully Meets
2997    Fully Meets
2998    Fully Meets
2999    Fully Meets
Name: Performance Score, Length: 3000, dtype: object

In [23]:
# Mean work-life balance score (2 decimals) of employees who started in 2023.
# StartDate is parsed from text such as '15-Aug-21' and written back onto
# employeeData, so cells run after this one see it as a datetime column.
employeeData['StartDate'] = pd.to_datetime(
    employeeData['StartDate'],
    format='%d-%b-%y',
)
groupedDF = engagementData.merge(
    employeeData,
    how='inner',
    left_on='emp_id',
    right_on='EmpID',
)
round(
    groupedDF.loc[groupedDF['StartDate'].dt.year == 2023, 'wl_balance_score'].mean(),
    2,
)

2.86

In [27]:
# Distribution of engagement scores within each business unit:
# one row per (BusinessUnit, score) pair with the number of matching survey rows.
# NOTE(review): survey_date is not part of the grouping, so this shows a
# per-unit score distribution rather than a trend over time.
(
    engagementData
    .merge(employeeData, how='inner', left_on='emp_id', right_on='EmpID')
    .groupby(['BusinessUnit', 'engagement_score'])
    .size()
    .reset_index(name='Total Employees')
)

Unnamed: 0,BusinessUnit,engagement_score,Total Employees
0,BPC,1,69
1,BPC,2,62
2,BPC,3,51
3,BPC,4,70
4,BPC,5,51
...,...,...,...
45,WBL,1,67
46,WBL,2,61
47,WBL,3,44
48,WBL,4,58


In [43]:
# Mean satisfaction score and mean 'Current Employee Rating' for every
# gender / race combination, rounded to two decimals.
# NOTE(review): the 'Performance Score' column itself is not averaged here;
# the numeric employee rating is used instead.
(
    engagementData
    .merge(employeeData, how='inner', left_on='emp_id', right_on='EmpID')
    .groupby(['GenderCode', 'RaceDesc'])[['satisfaction_score', 'Current Employee Rating']]
    .mean()
    .reset_index()
    .round(2)
)

Unnamed: 0,GenderCode,RaceDesc,satisfaction_score,Current Employee Rating
0,Female,Asian,2.99,2.98
1,Female,Black,2.93,2.95
2,Female,Hispanic,3.1,2.94
3,Female,Other,3.1,2.99
4,Female,White,2.93,3.05
5,Male,Asian,3.24,2.92
6,Male,Black,2.94,2.97
7,Male,Hispanic,2.95,2.99
8,Male,Other,2.95,3.0
9,Male,White,3.12,2.87
