# Import Libraries & Data

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap
import statsmodels.api as sm
from scipy.stats import boxcox

## Import Data

In [2]:
info = pd.read_excel("Data/data_dictionary.xlsx")

In [3]:
info

Unnamed: 0,Variable,Meaning,Levels
0,Age,Age of the employee,
1,Attrition,Whether the employee left in the previous year...,
2,BusinessTravel,How frequently the employees travelled for bus...,
3,Department,Department in company,
4,DistanceFromHome,Distance from home in kms,
5,Education,Education Level,1 'Below College'
6,,,2 'College'
7,,,3 'Bachelor'
8,,,4 'Master'
9,,,5 'Doctor'


In [4]:
survey= pd.read_csv("Data/employee_survey_data.csv")
survey.columns= survey.columns.str.lower()
survey

Unnamed: 0,employeeid,environmentsatisfaction,jobsatisfaction,worklifebalance
0,1,3.0,4.0,2.0
1,2,3.0,2.0,4.0
2,3,2.0,2.0,1.0
3,4,4.0,4.0,3.0
4,5,4.0,1.0,3.0
...,...,...,...,...
4405,4406,4.0,1.0,3.0
4406,4407,4.0,4.0,3.0
4407,4408,1.0,3.0,3.0
4408,4409,4.0,1.0,3.0


In [5]:
#check for nan values
survey.isnull().values.any()

True

In [6]:
display(survey.isnull().sum())

employeeid                  0
environmentsatisfaction    25
jobsatisfaction            20
worklifebalance            38
dtype: int64

In [7]:
# Load the general employee data and normalise the column names to lower case.
general = pd.read_csv('Data/general_data.csv').rename(columns=str.lower)
general

Unnamed: 0,age,attrition,businesstravel,department,distancefromhome,education,educationfield,employeecount,employeeid,gender,joblevel,jobrole,maritalstatus,monthlyincome,numcompaniesworked,over18,percentsalaryhike,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,1,Healthcare Representative,Married,131160,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,1,Research Scientist,Single,41890,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,4,Sales Executive,Married,193280,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,3,Human Resources,Married,83210,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,1,Sales Executive,Single,23420,4.0,Y,12,8,2,9.0,2,6,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,42,No,Travel_Rarely,Research & Development,5,4,Medical,1,4406,Female,1,Research Scientist,Single,60290,3.0,Y,17,8,1,10.0,5,3,0,2
4406,29,No,Travel_Rarely,Research & Development,2,4,Medical,1,4407,Male,1,Laboratory Technician,Divorced,26790,2.0,Y,15,8,0,10.0,2,3,0,2
4407,25,No,Travel_Rarely,Research & Development,25,2,Life Sciences,1,4408,Male,2,Sales Executive,Married,37020,0.0,Y,20,8,0,5.0,4,4,1,2
4408,42,No,Travel_Rarely,Sales,18,2,Medical,1,4409,Male,1,Laboratory Technician,Divorced,23980,0.0,Y,14,8,1,10.0,2,9,7,8


In [8]:
general["employeecount"].unique()

array([1])

In [9]:
general["over18"].value_counts()

Y    4410
Name: over18, dtype: int64

In [10]:
manager= pd.read_csv('Data/manager_survey_data.csv')
manager.columns= manager.columns.str.lower()

In [11]:
general["over18"].unique()

array(['Y'], dtype=object)

In [12]:
# merge data sets

In [13]:
general= pd.merge(general, survey, on = "employeeid", how = "inner")
general

Unnamed: 0,age,attrition,businesstravel,department,distancefromhome,education,educationfield,employeecount,employeeid,gender,joblevel,jobrole,maritalstatus,monthlyincome,numcompaniesworked,over18,percentsalaryhike,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,environmentsatisfaction,jobsatisfaction,worklifebalance
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,1,Healthcare Representative,Married,131160,1.0,Y,11,8,0,1.0,6,1,0,0,3.0,4.0,2.0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,1,Research Scientist,Single,41890,0.0,Y,23,8,1,6.0,3,5,1,4,3.0,2.0,4.0
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,4,Sales Executive,Married,193280,1.0,Y,15,8,3,5.0,2,5,0,3,2.0,2.0,1.0
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,3,Human Resources,Married,83210,3.0,Y,11,8,3,13.0,5,8,7,5,4.0,4.0,3.0
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,1,Sales Executive,Single,23420,4.0,Y,12,8,2,9.0,2,6,0,4,4.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,42,No,Travel_Rarely,Research & Development,5,4,Medical,1,4406,Female,1,Research Scientist,Single,60290,3.0,Y,17,8,1,10.0,5,3,0,2,4.0,1.0,3.0
4406,29,No,Travel_Rarely,Research & Development,2,4,Medical,1,4407,Male,1,Laboratory Technician,Divorced,26790,2.0,Y,15,8,0,10.0,2,3,0,2,4.0,4.0,3.0
4407,25,No,Travel_Rarely,Research & Development,25,2,Life Sciences,1,4408,Male,2,Sales Executive,Married,37020,0.0,Y,20,8,0,5.0,4,4,1,2,1.0,3.0,3.0
4408,42,No,Travel_Rarely,Sales,18,2,Medical,1,4409,Male,1,Laboratory Technician,Divorced,23980,0.0,Y,14,8,1,10.0,2,9,7,8,4.0,1.0,3.0


In [14]:
general= pd.merge(general, manager, on = "employeeid", how = "inner")

In [15]:
general

Unnamed: 0,age,attrition,businesstravel,department,distancefromhome,education,educationfield,employeecount,employeeid,gender,joblevel,jobrole,maritalstatus,monthlyincome,numcompaniesworked,over18,percentsalaryhike,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,environmentsatisfaction,jobsatisfaction,worklifebalance,jobinvolvement,performancerating
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,1,Healthcare Representative,Married,131160,1.0,Y,11,8,0,1.0,6,1,0,0,3.0,4.0,2.0,3,3
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,1,Research Scientist,Single,41890,0.0,Y,23,8,1,6.0,3,5,1,4,3.0,2.0,4.0,2,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,4,Sales Executive,Married,193280,1.0,Y,15,8,3,5.0,2,5,0,3,2.0,2.0,1.0,3,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,3,Human Resources,Married,83210,3.0,Y,11,8,3,13.0,5,8,7,5,4.0,4.0,3.0,2,3
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,1,Sales Executive,Single,23420,4.0,Y,12,8,2,9.0,2,6,0,4,4.0,1.0,3.0,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,42,No,Travel_Rarely,Research & Development,5,4,Medical,1,4406,Female,1,Research Scientist,Single,60290,3.0,Y,17,8,1,10.0,5,3,0,2,4.0,1.0,3.0,3,3
4406,29,No,Travel_Rarely,Research & Development,2,4,Medical,1,4407,Male,1,Laboratory Technician,Divorced,26790,2.0,Y,15,8,0,10.0,2,3,0,2,4.0,4.0,3.0,2,3
4407,25,No,Travel_Rarely,Research & Development,25,2,Life Sciences,1,4408,Male,2,Sales Executive,Married,37020,0.0,Y,20,8,0,5.0,4,4,1,2,1.0,3.0,3.0,3,4
4408,42,No,Travel_Rarely,Sales,18,2,Medical,1,4409,Male,1,Laboratory Technician,Divorced,23980,0.0,Y,14,8,1,10.0,2,9,7,8,4.0,1.0,3.0,2,3


In [16]:
general.dtypes

age                          int64
attrition                   object
businesstravel              object
department                  object
distancefromhome             int64
education                    int64
educationfield              object
employeecount                int64
employeeid                   int64
gender                      object
joblevel                     int64
jobrole                     object
maritalstatus               object
monthlyincome                int64
numcompaniesworked         float64
over18                      object
percentsalaryhike            int64
standardhours                int64
stockoptionlevel             int64
totalworkingyears          float64
trainingtimeslastyear        int64
yearsatcompany               int64
yearssincelastpromotion      int64
yearswithcurrmanager         int64
environmentsatisfaction    float64
jobsatisfaction            float64
worklifebalance            float64
jobinvolvement               int64
performancerating   

In [17]:
general['jobsatisfaction'].unique()

array([ 4.,  2.,  1.,  3., nan])

In [18]:
general['totalworkingyears'].unique()

array([ 1.,  6.,  5., 13.,  9., 28., 10., 21., 16., 37.,  7.,  3., 15.,
        8., nan, 12., 17., 19., 22.,  2.,  4., 23.,  0., 11., 24., 25.,
       20., 14., 26., 18., 30., 36., 31., 33., 32., 34., 40., 29., 35.,
       27., 38.])

In [19]:
display(general.isnull().sum())

age                         0
attrition                   0
businesstravel              0
department                  0
distancefromhome            0
education                   0
educationfield              0
employeecount               0
employeeid                  0
gender                      0
joblevel                    0
jobrole                     0
maritalstatus               0
monthlyincome               0
numcompaniesworked         19
over18                      0
percentsalaryhike           0
standardhours               0
stockoptionlevel            0
totalworkingyears           9
trainingtimeslastyear       0
yearsatcompany              0
yearssincelastpromotion     0
yearswithcurrmanager        0
environmentsatisfaction    25
jobsatisfaction            20
worklifebalance            38
jobinvolvement              0
performancerating           0
dtype: int64

In [20]:
# drop_duplicates() returns a new frame; assign it back so the de-duplication
# actually takes effect. The index is reset so row labels stay contiguous.
general = general.drop_duplicates().reset_index(drop=True)
general

Unnamed: 0,age,attrition,businesstravel,department,distancefromhome,education,educationfield,employeecount,employeeid,gender,joblevel,jobrole,maritalstatus,monthlyincome,numcompaniesworked,over18,percentsalaryhike,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,environmentsatisfaction,jobsatisfaction,worklifebalance,jobinvolvement,performancerating
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,1,Healthcare Representative,Married,131160,1.0,Y,11,8,0,1.0,6,1,0,0,3.0,4.0,2.0,3,3
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,1,Research Scientist,Single,41890,0.0,Y,23,8,1,6.0,3,5,1,4,3.0,2.0,4.0,2,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,4,Sales Executive,Married,193280,1.0,Y,15,8,3,5.0,2,5,0,3,2.0,2.0,1.0,3,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,3,Human Resources,Married,83210,3.0,Y,11,8,3,13.0,5,8,7,5,4.0,4.0,3.0,2,3
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,1,Sales Executive,Single,23420,4.0,Y,12,8,2,9.0,2,6,0,4,4.0,1.0,3.0,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,42,No,Travel_Rarely,Research & Development,5,4,Medical,1,4406,Female,1,Research Scientist,Single,60290,3.0,Y,17,8,1,10.0,5,3,0,2,4.0,1.0,3.0,3,3
4406,29,No,Travel_Rarely,Research & Development,2,4,Medical,1,4407,Male,1,Laboratory Technician,Divorced,26790,2.0,Y,15,8,0,10.0,2,3,0,2,4.0,4.0,3.0,2,3
4407,25,No,Travel_Rarely,Research & Development,25,2,Life Sciences,1,4408,Male,2,Sales Executive,Married,37020,0.0,Y,20,8,0,5.0,4,4,1,2,1.0,3.0,3.0,3,4
4408,42,No,Travel_Rarely,Sales,18,2,Medical,1,4409,Male,1,Laboratory Technician,Divorced,23980,0.0,Y,14,8,1,10.0,2,9,7,8,4.0,1.0,3.0,2,3


# Data Cleaning

## Replacing null values in numcompaniesworked with 0

In [21]:
np.where(general['numcompaniesworked'].isnull())[0]

array([ 115,  210,  343,  476,  647,  799,  932, 1103, 1312, 1521, 1711,
       1996, 2365, 2696, 3063, 3533, 3910, 4226, 4395])

In [22]:
general['numcompaniesworked'] = general['numcompaniesworked'].fillna(value = 0)

In [23]:
general['numcompaniesworked'].isnull().sum()

0

In [24]:
general['numcompaniesworked'].value_counts()

1.0    1558
0.0     605
3.0     474
2.0     438
4.0     415
7.0     222
6.0     208
5.0     187
9.0     156
8.0     147
Name: numcompaniesworked, dtype: int64

## Replacing null values in environmentsatisfaction with the mode

In [25]:
np.where(general['environmentsatisfaction'].isnull())[0]

array([  11,  111,  195,  300,  405,  489,  678,  846,  909, 1077, 1224,
       1476, 1812, 2148, 2379, 2568, 2736, 2925, 3114, 3261, 3387, 3534,
       3681, 3996, 4332])

In [26]:
# Impute missing scores with the most frequent value.
# Assign the result back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which may act on a temporary copy
# and leave `general` unchanged.
for column in ['environmentsatisfaction']:
    general[column] = general[column].fillna(general[column].mode()[0])

In [27]:
general['environmentsatisfaction'].isnull().sum()

0

## Replacing null values in jobsatisfaction with the mode

In [28]:
np.where(general['jobsatisfaction'].isnull())[0]

array([  40,  124,  313,  586,  859, 1195, 1468, 1678, 1909, 2182, 2476,
       2707, 2875, 3085, 3295, 3526, 3778, 4030, 4219, 4345])

In [29]:
# Impute missing scores with the most frequent value.
# Assign the result back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which may act on a temporary copy
# and leave `general` unchanged.
for column in ['jobsatisfaction']:
    general[column] = general[column].fillna(general[column].mode()[0])

In [30]:
general['jobsatisfaction'].isnull().sum()

0

## Replacing null values in worklifebalance with the mode

In [31]:
np.where(general['worklifebalance'].isnull())[0]

array([  84,  273,  483,  609,  735,  861,  987, 1092, 1197, 1260, 1323,
       1407, 1470, 1533, 1596, 1659, 1722, 1785, 1848, 1911, 1953, 2016,
       2079, 2142, 2226, 2310, 2394, 2478, 2583, 2709, 2898, 3024, 3339,
       3654, 3885, 4116, 4305, 4409])

In [32]:
# Impute missing scores with the most frequent value.
# Assign the result back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which may act on a temporary copy
# and leave `general` unchanged.
for column in ['worklifebalance']:
    general[column] = general[column].fillna(general[column].mode()[0])

In [33]:
general['worklifebalance'].isnull().sum()

0

## Replacing null values in totalworkingyears with the mode

In [34]:
# Impute missing values with the most frequent value.
# Assign the result back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which may act on a temporary copy
# and leave `general` unchanged.
for column in ['totalworkingyears']:
    general[column] = general[column].fillna(general[column].mode()[0])

In [35]:
general.isnull().sum()

age                        0
attrition                  0
businesstravel             0
department                 0
distancefromhome           0
education                  0
educationfield             0
employeecount              0
employeeid                 0
gender                     0
joblevel                   0
jobrole                    0
maritalstatus              0
monthlyincome              0
numcompaniesworked         0
over18                     0
percentsalaryhike          0
standardhours              0
stockoptionlevel           0
totalworkingyears          0
trainingtimeslastyear      0
yearsatcompany             0
yearssincelastpromotion    0
yearswithcurrmanager       0
environmentsatisfaction    0
jobsatisfaction            0
worklifebalance            0
jobinvolvement             0
performancerating          0
dtype: int64

## Converting float columns to integers

In [36]:
# Cast every float64 column to int64 now that the missing values have been filled.
float_columns = [name for name in general.columns if general[name].dtype == 'float64']
general = general.astype({name: 'int64' for name in float_columns})

In [37]:
general.dtypes

age                         int64
attrition                  object
businesstravel             object
department                 object
distancefromhome            int64
education                   int64
educationfield             object
employeecount               int64
employeeid                  int64
gender                     object
joblevel                    int64
jobrole                    object
maritalstatus              object
monthlyincome               int64
numcompaniesworked          int64
over18                     object
percentsalaryhike           int64
standardhours               int64
stockoptionlevel            int64
totalworkingyears           int64
trainingtimeslastyear       int64
yearsatcompany              int64
yearssincelastpromotion     int64
yearswithcurrmanager        int64
environmentsatisfaction     int64
jobsatisfaction             int64
worklifebalance             int64
jobinvolvement              int64
performancerating           int64
dtype: object

## Dropping uninformative columns

In [38]:
general['over18'].value_counts()

Y    4410
Name: over18, dtype: int64

In [39]:
general['employeecount'].value_counts()

1    4410
Name: employeecount, dtype: int64

In [40]:
general['standardhours'].value_counts()

8    4410
Name: standardhours, dtype: int64

In [41]:
general = general.drop(['over18','employeecount', 'standardhours' ], axis = 1).reset_index(drop = True)


In [42]:
general

Unnamed: 0,age,attrition,businesstravel,department,distancefromhome,education,educationfield,employeeid,gender,joblevel,jobrole,maritalstatus,monthlyincome,numcompaniesworked,percentsalaryhike,stockoptionlevel,totalworkingyears,trainingtimeslastyear,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,environmentsatisfaction,jobsatisfaction,worklifebalance,jobinvolvement,performancerating
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,Female,1,Healthcare Representative,Married,131160,1,11,0,1,6,1,0,0,3,4,2,3,3
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,2,Female,1,Research Scientist,Single,41890,0,23,1,6,3,5,1,4,3,2,4,2,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,3,Male,4,Sales Executive,Married,193280,1,15,3,5,2,5,0,3,2,2,1,3,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,4,Male,3,Human Resources,Married,83210,3,11,3,13,5,8,7,5,4,4,3,2,3
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,5,Male,1,Sales Executive,Single,23420,4,12,2,9,2,6,0,4,4,1,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,42,No,Travel_Rarely,Research & Development,5,4,Medical,4406,Female,1,Research Scientist,Single,60290,3,17,1,10,5,3,0,2,4,1,3,3,3
4406,29,No,Travel_Rarely,Research & Development,2,4,Medical,4407,Male,1,Laboratory Technician,Divorced,26790,2,15,0,10,2,3,0,2,4,4,3,2,3
4407,25,No,Travel_Rarely,Research & Development,25,2,Life Sciences,4408,Male,2,Sales Executive,Married,37020,0,20,0,5,4,4,1,2,1,3,3,3,4
4408,42,No,Travel_Rarely,Sales,18,2,Medical,4409,Male,1,Laboratory Technician,Divorced,23980,0,14,1,10,2,9,7,8,4,1,3,2,3


# Data Processing

## Transforming datetime data (in/out times)

In [43]:
in_time= pd.read_csv('Data/in_time.csv')
in_time.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Data/in_time.csv'

In [None]:
out_time = pd.read_csv('Data/out_time.csv')
out_time.head()

In [None]:
## Concat time data sets

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way (in_time rows first, out_time rows second, row labels kept).
total_time = pd.concat([in_time, out_time])
total_time

In [None]:
# Remove columns that are entirely NaN (days on which the office was closed)


In [None]:
total_time = total_time.dropna(axis=1, how='all')

In [None]:
total_time

In [None]:
# replace individual days that have NaN with 0 ( for future calculations)

In [None]:
total_time.fillna(0, inplace= True)
total_time

In [None]:
## Convert to datetime format

In [None]:
total_time.iloc[:,1:]= total_time.iloc[:,1:].apply(pd.to_datetime, errors='coerce') 

In [None]:
total_time

In [None]:
# total_time is [in_time rows, out_time rows] stacked, so shifting by the number
# of in_time rows subtracts each row's in-time from the matching out-time row.
# Derive the offset from the data instead of hard-coding the row count.
# Assumes in_time and out_time list the same employees in the same order — TODO confirm.
total_time = total_time.diff(periods=len(in_time))

In [None]:
# Keep only the second half of the stacked frame (the rows that now hold the
# out-minus-in differences); the offset is the number of in_time rows rather
# than a hard-coded count.
total_time = total_time.iloc[len(in_time):]

In [None]:
total_time

In [None]:
total_time.reset_index(inplace=True)

In [None]:
total_time

In [None]:
total_time.drop(columns=['index','Unnamed: 0'],axis=1,inplace=True)

In [None]:
total_time.head()

In [None]:
total_time['mean_time'] = total_time.mean(axis=1)

In [None]:
total_time

In [None]:
# convert mean time into hours
# Use the timedelta API instead of parsing str(x): the string form breaks on
# NaT and negative durations ("-1 days +...") and ignores any whole-day part.
# pd.to_timedelta is a no-op for a timedelta64 column and also handles an
# object column of Timedelta values.
total_time['mean_time'] = pd.to_timedelta(total_time['mean_time']).dt.total_seconds() / 3600
total_time.index = range(1,len(total_time)+1)

In [None]:
total_time.reset_index(inplace=True)

In [None]:
total_time

In [None]:
total_time.index = range(1,len(total_time)+1)

In [None]:
# Rename the 'index' column to 'employeeid' so that total_time shares a key
# column name with the general data set.
total_time = total_time.rename(columns = {'index' : 'employeeid'})

In [None]:
# Overwrite total_time, keeping only the two columns needed for general:
# the employee id and the mean time.
total_time = total_time[['employeeid','mean_time']]  


In [None]:
general = pd.merge(general, total_time, on = 'employeeid', how = 'inner')

In [None]:
general

In [None]:
general.to_csv('Data/general_hr_data.csv')

## Data Distribution

In [None]:
general['businesstravel'].value_counts()

In [None]:
general['department'].value_counts()

In [None]:
general['distancefromhome'].value_counts()

In [None]:
general['education'].value_counts()

In [None]:
sns.histplot(general['mean_time'])

In [None]:
general['educationfield'].value_counts()

In [None]:
general['joblevel'].value_counts()

In [None]:
general['jobrole'].value_counts()

In [None]:
general['maritalstatus'].value_counts()

In [None]:
general['monthlyincome'].sort_values(ascending= False)

In [None]:
sns.histplot(general['monthlyincome'],bins= 10)

In [None]:
general['numcompaniesworked'].value_counts()

In [None]:
general['percentsalaryhike'].value_counts()

In [None]:
sns.histplot(general['percentsalaryhike'],bins= 15)

In [None]:
general['stockoptionlevel'].value_counts()

In [None]:
general['totalworkingyears'].value_counts()

In [None]:
general['trainingtimeslastyear'].value_counts()

In [None]:
general['yearsatcompany'].value_counts()

In [None]:
sns.histplot(general['yearsatcompany'],bins= 15)

In [None]:
general['yearssincelastpromotion'].value_counts()

In [None]:
sns.histplot(general['yearssincelastpromotion'],bins= 14)

In [None]:
general['yearswithcurrmanager'].value_counts()

In [None]:
general['environmentsatisfaction'].value_counts()

In [None]:
sns.histplot(general['environmentsatisfaction'])

In [None]:
sns.histplot(general['jobsatisfaction'])

In [None]:
general['worklifebalance'].value_counts()

In [None]:
sns.histplot(general['worklifebalance'])

In [None]:
sns.histplot(general['jobinvolvement'])

In [None]:
sns.histplot(general['performancerating'])

In [None]:
general['performancerating'].value_counts()

In [None]:
sns.histplot(general['mean_time'])

In [None]:
plt.figure(figsize=(35,30))
# Size the subplot grid from the number of columns: a fixed 9x3 grid raises a
# ValueError as soon as the frame has more than 27 columns.
n_plot_cols = 3
n_plot_rows = int(np.ceil(len(general.columns) / n_plot_cols))
for i, column in enumerate(general.columns,1):
    plt.subplot(n_plot_rows, n_plot_cols, i)
    sns.histplot(general[column])

In [None]:
# Change type from integer to category for education feature
general['education'] = general['education'].astype('category') 


In [None]:
general.dtypes

## Change type of categorical features

In [None]:
general[['joblevel','education','stockoptionlevel','environmentsatisfaction', 'jobsatisfaction', 'worklifebalance','jobinvolvement','performancerating']]=general[['joblevel','education','stockoptionlevel','environmentsatisfaction', 'jobsatisfaction', 'worklifebalance','jobinvolvement','performancerating']].astype(object)

In [None]:
general.dtypes

## Numerical features 

In [None]:
numerical = general.select_dtypes(np.number)
numerical = numerical.drop(['employeeid'], axis = 1).reset_index(drop = True)
numerical

## Standardize numerical features

In [None]:
#zscore=(numerical-numerical.mean())/numerical.std()
#zscore
#numerical.mean()

In [None]:
corr=numerical.corr()
corr

In [None]:
import statsmodels.api as sm
from scipy.stats import boxcox
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    f, ax =plt.subplots(figsize=(9, 7))
    multicollinearity_ax = sns.heatmap(corr, mask=mask, cmap='coolwarm', vmin=-1,vmax=1,annot=True, square=True)

In [None]:
numerical_targ = pd.concat([numerical, general['attrition']],axis=1)

In [None]:
numerical_targ

In [None]:
# Encode the target as 0 for "No" and 1 for anything else, reading from the
# untouched general['attrition'] column so the cell is idempotent. Re-applying
# the encoding to already-encoded 0/1 values would turn every row into 1.
numerical_targ['attrition'] = (general['attrition'] != "No").astype(int)
numerical_targ


In [None]:
numerical_targ['attrition'].value_counts()

In [None]:
corr=numerical_targ.corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    f, ax =plt.subplots(figsize=(9, 7))
    multicollinearity_ax = sns.heatmap(corr, mask=mask, cmap='coolwarm', vmin=-1,vmax=1,annot=True, square=True)

## Modeling numerical features : Logistic regression

### Xy split

In [None]:
X= numerical
y= numerical_targ.attrition


### Train/Test split

In [None]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=.30,random_state=123)
X_train

### Standardize

In [None]:

std_scaler = StandardScaler().fit(X_train) 

X_train_scaled = std_scaler.transform(X_train)
X_train_scaled

In [None]:
X_test_scaled=std_scaler.transform(X_test)
X_test_scaled

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
# plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
# is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay
try:
    # Only importable on older scikit-learn; kept so other cells that still call
    # plot_confusion_matrix keep working there.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    pass


# Fit the power transform on the training data only, then apply it to both splits.
log_model = LogisticRegression()
trans = PowerTransformer()

trans.fit(X_train_scaled)

X_train_mod = trans.transform(X_train_scaled)
X_test_mod  = trans.transform(X_test_scaled)

log_model.fit(X_train_mod, y_train)

y_pred_train_log = log_model.predict(X_train_mod)
y_pred_test_log = log_model.predict(X_test_mod)

# Side-by-side train/test metrics.
performance_log = pd.DataFrame({'Error_metric': ['Accuracy','Precision','Recall'],
                               'Train': [accuracy_score(y_train, y_pred_train_log),
                                         precision_score(y_train, y_pred_train_log),
                                         recall_score(y_train, y_pred_train_log)],
                               'Test': [accuracy_score(y_test, y_pred_test_log),
                                        precision_score(y_test, y_pred_test_log),
                                        recall_score(y_test, y_pred_test_log)]})

display(performance_log)
print("Confusion matrix for the train set")
print(confusion_matrix(y_train,y_pred_train_log))
ConfusionMatrixDisplay.from_estimator(log_model, X_train_mod, y_train, values_format = 'd')
plt.show()

print()
print()

print("Confusion matrix for the test set")
print(confusion_matrix(y_test, y_pred_test_log))
ConfusionMatrixDisplay.from_estimator(log_model, X_test_mod, y_test, values_format = 'd')
plt.show()

## Categorical features

In [None]:
categorical = general.select_dtypes(include=object)
categorical = categorical.drop(['attrition'], axis=1).reset_index(drop = True)
categorical

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

### Ordinal encoding

In [None]:
cat_ord = categorical[['businesstravel']]

In [None]:
categories = [['Non-Travel','Travel_Rarely','Travel_Frequently']]

In [None]:
categorical['education'].value_counts()

In [None]:
ordinal_encoder = OrdinalEncoder(categories=categories)
cat_ord_encoded = pd.DataFrame(ordinal_encoder.fit_transform(cat_ord),columns=cat_ord.columns)

In [None]:
cat_ord_2 = categorical[['education','joblevel', 'stockoptionlevel', 'environmentsatisfaction','worklifebalance','jobsatisfaction','jobinvolvement','performancerating']]

In [None]:
cat_ord_encoded

In [None]:
cat_ord = pd.concat([cat_ord_2, cat_ord_encoded], axis=1)

In [None]:
cat_ord

In [None]:
categorical

### Nominal Encoding

In [None]:
cat_nom = categorical[['department','educationfield','gender','jobrole','maritalstatus']]
cat_nom

In [None]:
cat_nom_encoded = pd.get_dummies(cat_nom)
cat_nom_encoded

In [None]:
cat_encoded = pd.concat([cat_nom_encoded, cat_ord], axis=1)

In [None]:
cat_encoded

In [None]:
## General merge (All features)

In [None]:
hr = pd.concat([cat_encoded, numerical_targ], axis=1)
hr

## Modeling numerical and categorical features together

### Xy split

In [None]:
X = hr.iloc[:,:-1]
X

In [None]:
y = hr.iloc[:,-1]
y

### Train/test split

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

print(X.shape)
print(y.shape)

# Fix the seed (same value as the earlier numerical-only split) so the split,
# and every metric computed from it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=123)
model = LogisticRegression()

In [None]:
scores=cross_val_score(model, X_train, y_train, cv=5)
scores

In [None]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

In [None]:
std_scaler = StandardScaler().fit(X_train) 
X_train_scaled = std_scaler.transform(X_train)
X_test_scaled=std_scaler.transform(X_test)

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
# is its replacement. Imported here so this cell does not depend on that name
# being imported elsewhere.
from sklearn.metrics import ConfusionMatrixDisplay

# Fit the power transform on the training data only, then apply it to both splits.
log_model = LogisticRegression()
trans = PowerTransformer()

trans.fit(X_train_scaled)

X_train_mod = trans.transform(X_train_scaled)
X_test_mod  = trans.transform(X_test_scaled)

log_model.fit(X_train_mod, y_train)

y_pred_train_log = log_model.predict(X_train_mod)
y_pred_test_log = log_model.predict(X_test_mod)

# Side-by-side train/test metrics.
performance_log = pd.DataFrame({'Error_metric': ['Accuracy','Precision','Recall'],
                               'Train': [accuracy_score(y_train, y_pred_train_log),
                                         precision_score(y_train, y_pred_train_log),
                                         recall_score(y_train, y_pred_train_log)],
                               'Test': [accuracy_score(y_test, y_pred_test_log),
                                        precision_score(y_test, y_pred_test_log),
                                        recall_score(y_test, y_pred_test_log)]})

display(performance_log)
print("Confusion matrix for the train set")
print(confusion_matrix(y_train,y_pred_train_log))
ConfusionMatrixDisplay.from_estimator(log_model, X_train_mod, y_train, values_format = 'd')
plt.show()

print()
print()

print("Confusion matrix for the test set")
print(confusion_matrix(y_test, y_pred_test_log))
ConfusionMatrixDisplay.from_estimator(log_model, X_test_mod, y_test, values_format = 'd')
plt.show()

## Handling outliers numerical

In [None]:
for column in numerical:
    plt.figure()
    numerical.boxplot([column])

### IQR

In [None]:
def outliers(hr, column):
    """Return the IQR outlier fences for a single column.

    Parameters
    ----------
    hr : pandas.DataFrame
        Frame containing the column to analyse.
    column : str
        Name of the column whose fences are computed.

    Returns
    -------
    tuple
        ``(upper_limit, lower_limit)``, where the limits are
        ``Q3 + 1.5 * IQR`` and ``Q1 - 1.5 * IQR`` of ``hr[column]``.
    """
    # Work on the requested column only. The previous version looped over every
    # column (shadowing the `column` argument) and printed the last column's
    # limits without returning them, so callers could not unpack a result.
    Q1 = hr[column].quantile(0.25)
    Q3 = hr[column].quantile(0.75)
    IQR = Q3 - Q1
    upper_limit = Q3 + 1.5 * IQR
    lower_limit = Q1 - 1.5 * IQR
    return upper_limit, lower_limit

In [None]:
# Name the column explicitly instead of relying on whatever value `column` was
# left with by an earlier for-loop (hidden kernel state).
# NOTE(review): 'monthlyincome' is chosen because it is the column box-plotted
# after filtering — confirm this is the intended column.
column = 'monthlyincome'
upper, lower = outliers(hr, column)

In [None]:
new_hr = hr[(hr[column] > lower) & (hr[column] < upper)]

In [None]:
new_hr

In [None]:
new_hr.boxplot(column=['monthlyincome'])