# IBM HR Analytics: Employee Attrition EDA

The goal of this EDA is to explore and understand the IBM HR Analytics dataset, identify key factors influencing employee attrition, and uncover patterns in the data through visualizations and statistical summaries.

_Importing the necessary libraries required for data loading, analysis, and visualization._

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

_Loading the dataset into a pandas DataFrame_

# Data Loading

In [38]:
# Read the raw HR attrition CSV; the path is relative to the notebook's working directory.
csv_path = "data/WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(csv_path)

_Taking a quick look at the dataset to understand its structure, data types, and sample records_

In [39]:
# Preview the first five rows (the default for head()) to sanity-check the load.
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


_Checking the shape of the dataset to see how many rows (records) and columns (features) it contains_

In [40]:
# (rows, columns) of the freshly loaded frame.
df.shape

(1470, 35)

_The dataset contains 1,470 rows and 35 columns, meaning we have information on 1,470 employees and 35 different features related to their personal and professional details_

_Getting a summary of the dataset, including column names, data types, and non-null value counts_

In [41]:
# verbose=True requests the full per-column summary: name, non-null count and dtype.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

_The dataset consists of 26 numerical features and 9 categorical features_

# Data Preprocessing

_Taking a quick look at the numerical features_

In [42]:
# Keep only the columns stored as int64 so the numerical features can be inspected together.
df.select_dtypes(include=["int64"])

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1102,1,2,1,1,2,94,3,2,...,1,80,0,8,0,1,6,4,0,5
1,49,279,8,1,1,2,3,61,2,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1373,2,2,1,4,4,92,2,1,...,2,80,0,7,3,3,0,0,0,0
3,33,1392,3,4,1,5,4,56,3,1,...,3,80,0,8,3,3,8,7,3,0
4,27,591,2,1,1,7,1,40,3,1,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,884,23,2,1,2061,3,41,4,2,...,3,80,1,17,3,3,5,2,0,3
1466,39,613,6,1,1,2062,4,42,2,3,...,1,80,1,9,5,3,7,7,1,7
1467,27,155,4,3,1,2064,2,87,4,2,...,2,80,1,6,0,3,6,2,0,3
1468,49,1023,2,3,1,2065,4,63,2,2,...,4,80,0,17,3,2,9,6,0,8


_Some of the numerical columns appear to be ordinal categorical features that store numeric values representing categories. Let's identify them by counting the unique values in each numerical column_

In [43]:
# Distinct-value count per int64 column; very low counts point to encoded categories
# (or to constant columns when the count is 1).
df.select_dtypes(include=["int64"]).nunique()

Age                           43
DailyRate                    886
DistanceFromHome              29
Education                      5
EmployeeCount                  1
EmployeeNumber              1470
EnvironmentSatisfaction        4
HourlyRate                    71
JobInvolvement                 4
JobLevel                       5
JobSatisfaction                4
MonthlyIncome               1349
MonthlyRate                 1427
NumCompaniesWorked            10
PercentSalaryHike             15
PerformanceRating              2
RelationshipSatisfaction       4
StandardHours                  1
StockOptionLevel               4
TotalWorkingYears             40
TrainingTimesLastYear          7
WorkLifeBalance                4
YearsAtCompany                37
YearsInCurrentRole            19
YearsSinceLastPromotion       16
YearsWithCurrManager          18
dtype: int64

_For ease of analysis and visualization, we'll convert the ordinal categorical columns stored as numbers into their corresponding labeled categories, making the data more interpretable_

In [44]:
# Low-to-Very-High scale shared by EnvironmentSatisfaction, JobInvolvement,
# JobSatisfaction and RelationshipSatisfaction.
SATISFACTION_LABELS = {1: "Low", 2: "Medium", 3: "High", 4: "Very High"}

# Integer code -> human-readable label for every ordinal column stored as a number.
ORDINAL_LABELS = {
    "Education": {
        1: "Below College",
        2: "College",
        3: "Bachelor",
        4: "Master",
        5: "Doctor",
    },
    "EnvironmentSatisfaction": SATISFACTION_LABELS,
    "JobInvolvement": SATISFACTION_LABELS,
    "JobLevel": {
        1: "Entry Level",
        2: "Junior Level",
        3: "Mid Level",
        4: "Senior Level",
        5: "Executive Level",
    },
    "JobSatisfaction": SATISFACTION_LABELS,
    "PerformanceRating": {1: "Low", 2: "Good", 3: "Excellent", 4: "Outstanding"},
    "RelationshipSatisfaction": SATISFACTION_LABELS,
    "WorkLifeBalance": {1: "Bad", 2: "Good", 3: "Better", 4: "Best"},
}

# Swap the numeric codes for their labels, one column at a time.
for column, labels in ORDINAL_LABELS.items():
    df[column] = df[column].replace(labels)

_We observe that the EmployeeCount column has only one unique value (1), indicating that each row represents a single employee, but it does not add any analytical value, so it can be dropped. Similarly, the StandardHours column contains a constant value of 80 for all records, making it irrelevant for our analysis. Additionally, the EmployeeNumber column is a unique identifier for each employee and does not contribute any meaningful insight toward attrition prediction. Therefore, all three columns will be removed from the dataset to keep only relevant features_

In [45]:
# Remove the two single-valued columns (EmployeeCount, StandardHours) and the
# per-row identifier (EmployeeNumber).
# Rebinding `df` instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises KeyError.
df = df.drop(
    columns=["EmployeeCount", "EmployeeNumber", "StandardHours"],
    errors="ignore",
)

_Let’s take another look at the numerical features after removing the irrelevant columns_

In [46]:
# Show the columns that are still stored as int64 at this point in the notebook.
df.select_dtypes(include=["int64"])

Unnamed: 0,Age,DailyRate,DistanceFromHome,HourlyRate,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1102,1,94,5993,19479,8,11,0,8,0,6,4,0,5
1,49,279,8,61,5130,24907,1,23,1,10,3,10,7,1,7
2,37,1373,2,92,2090,2396,6,15,0,7,3,0,0,0,0
3,33,1392,3,56,2909,23159,1,11,0,8,3,8,7,3,0
4,27,591,2,40,3468,16632,9,12,1,6,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,884,23,41,2571,12290,4,17,1,17,3,5,2,0,3
1466,39,613,6,42,9991,21457,4,15,1,9,5,7,7,1,7
1467,27,155,4,87,6142,5174,1,20,1,6,0,6,2,0,3
1468,49,1023,2,63,5390,13243,2,14,0,17,3,9,6,0,8


_After cleaning the dataset and removing irrelevant columns, we are left with 15 numerical features that will be used for analysis_

_Now, let's clean the categorical features_

In [47]:
# Show the columns stored with the object dtype (the categorical features).
df.select_dtypes(include=["object"])

Unnamed: 0,Attrition,BusinessTravel,Department,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,Over18,OverTime,PerformanceRating,RelationshipSatisfaction,WorkLifeBalance
0,Yes,Travel_Rarely,Sales,College,Life Sciences,Medium,Female,High,Junior Level,Sales Executive,Very High,Single,Y,Yes,Excellent,Low,Bad
1,No,Travel_Frequently,Research & Development,Below College,Life Sciences,High,Male,Medium,Junior Level,Research Scientist,Medium,Married,Y,No,Outstanding,Very High,Better
2,Yes,Travel_Rarely,Research & Development,College,Other,Very High,Male,Medium,Entry Level,Laboratory Technician,High,Single,Y,Yes,Excellent,Medium,Better
3,No,Travel_Frequently,Research & Development,Master,Life Sciences,Very High,Female,High,Entry Level,Research Scientist,High,Married,Y,Yes,Excellent,High,Better
4,No,Travel_Rarely,Research & Development,Below College,Medical,Low,Male,High,Entry Level,Laboratory Technician,Medium,Married,Y,No,Excellent,Very High,Better
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,No,Travel_Frequently,Research & Development,College,Medical,High,Male,Very High,Junior Level,Laboratory Technician,Very High,Married,Y,No,Excellent,High,Better
1466,No,Travel_Rarely,Research & Development,Below College,Medical,Very High,Male,Medium,Mid Level,Healthcare Representative,Low,Married,Y,No,Excellent,Low,Better
1467,No,Travel_Rarely,Research & Development,Bachelor,Life Sciences,Medium,Male,Very High,Junior Level,Manufacturing Director,Medium,Married,Y,Yes,Outstanding,Medium,Better
1468,No,Travel_Frequently,Sales,Bachelor,Medical,Very High,Male,Medium,Junior Level,Sales Executive,Medium,Married,Y,No,Excellent,Very High,Good


In [48]:
# Distinct-value count per object column; a count of 1 marks a constant column.
df.select_dtypes(include=["object"]).nunique()

Attrition                   2
BusinessTravel              3
Department                  3
Education                   5
EducationField              6
EnvironmentSatisfaction     4
Gender                      2
JobInvolvement              4
JobLevel                    5
JobRole                     9
JobSatisfaction             4
MaritalStatus               3
Over18                      1
OverTime                    2
PerformanceRating           2
RelationshipSatisfaction    4
WorkLifeBalance             4
dtype: int64

_The Over18 column has a single unique value ("Y"), and the minimum age in the dataset is 18, which confirms that all employees are adults. Therefore, the Over18 attribute is not needed, so we will drop it as well_

In [49]:
# Remove the Over18 column.
# Rebinding `df` instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
df = df.drop(columns=["Over18"], errors="ignore")

In [50]:
# (rows, columns) of the object-dtype subset, i.e. how many categorical columns remain.
df.select_dtypes(include=["object"]).shape

(1470, 16)

_Now, we have 16 categorical features_

_Let’s quickly glance over the cleaned dataset_

In [51]:
# Preview the first five rows of the frame in its current state.
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,College,Life Sciences,Medium,Female,...,Excellent,Low,0,8,0,Bad,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,Below College,Life Sciences,High,Male,...,Outstanding,Very High,1,10,3,Better,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,College,Other,Very High,Male,...,Excellent,Medium,0,7,3,Better,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,Master,Life Sciences,Very High,Female,...,Excellent,High,0,8,3,Better,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,Below College,Medical,Low,Male,...,Excellent,Very High,1,6,3,Better,2,2,2,2


In [52]:
# (rows, columns) of the frame in its current state.
df.shape

(1470, 31)

_We are now left with 31 meaningful columns in the dataset: 30 features plus the Attrition target_

# Exploratory Data Analysis