In [1]:
# importing the relevant libraries
import pandas as pd
import numpy as np

In [2]:
# loading the raw data in csv and displaying first 20 rows of it
raw_csv_data = pd.read_csv('Absenteeism_data.csv')
raw_csv_data.head(20)

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2
5,3,23,10/07/2015,179,51,38,239.554,31,1,0,0,2
6,10,22,17/07/2015,361,52,28,239.554,27,1,1,4,8
7,20,23,24/07/2015,260,50,36,239.554,23,1,4,0,4
8,14,19,06/07/2015,155,12,34,239.554,25,1,2,0,40
9,1,22,13/07/2015,235,11,37,239.554,29,3,1,1,8


### Descriptive statistics

In [3]:
raw_csv_data.describe(include="all") # to see the discriptive statistics

# only for object data there is unique, top, freq 

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
unique,,,432,,,,,,,,,
top,,,17/08/2015,,,,,,,,,
freq,,,5,,,,,,,,,
mean,17.951429,19.411429,,222.347143,29.892857,36.417143,271.801774,26.737143,1.282857,1.021429,0.687143,6.761429
std,11.028144,8.356292,,66.31296,14.804446,6.379083,40.021804,4.254701,0.66809,1.112215,1.166095,12.670082
min,1.0,0.0,,118.0,5.0,27.0,205.917,19.0,1.0,0.0,0.0,0.0
25%,9.0,13.0,,179.0,16.0,31.0,241.476,24.0,1.0,0.0,0.0,2.0
50%,18.0,23.0,,225.0,26.0,37.0,264.249,25.0,1.0,1.0,0.0,3.0
75%,28.0,27.0,,260.0,50.0,40.0,294.217,31.0,1.0,2.0,1.0,8.0


In [4]:
raw_csv_data.info() # using this just to get the datatype of each column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


In [5]:
# Insights from above data -
# 1) As each feature has same Non-null count, so there are no missing values
# 2) Only 'Date' has object datatype

### Deal with 'ID' column

In [6]:
# first we will make a new copy of 'raw_csv_data'
df = raw_csv_data.copy() 
df.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [7]:
print(len(df.loc[:, 'ID'].unique())) # to get number unique values from column "ID"
print(sorted(df.loc[:, 'ID'].unique())) # to get all unique values in in ascending order

34
[1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36]


In [8]:
df['ID'].value_counts() # to get counts of unique values

3     113
28     74
34     48
20     42
22     41
11     39
15     36
36     32
24     30
14     27
33     24
10     22
1      22
17     19
5      18
18     16
13     14
25     10
27      7
30      7
6       7
23      7
7       6
9       6
2       5
29      5
32      5
26      5
12      3
31      3
19      3
21      2
8       1
16      1
Name: ID, dtype: int64

In [9]:
# So there are only 34 unique values in 'ID', and values are repeated ovet the time
# As 'ID' is just a identification number (a nominal data) and it does not carry any numerical information
# So we'll drop it.

In [10]:
df = df.drop(columns = ['ID'])
df

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...
695,10,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,6,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,10,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,23,24/05/2018,235,16,32,237.656,25,3,0,0,2


### Deal with 'Reason for Absence' column

In [11]:
type(df.loc[:,'Reason for Absence'])

pandas.core.series.Series

In [12]:
print(df.loc[:,'Reason for Absence'].min()) # to get min value from this column
print(df.loc[:,'Reason for Absence'].max()) # to get max value from this column

print(len(df.loc[:,'Reason for Absence'].unique())) # to get counts of unique values

0
28
28


In [13]:
# As minimum value is 0 and max in 28 so total unique values count should be 29 but what we got is 28.
# so one reason type which is not given by any employee

In [14]:
print(sorted(df.loc[:,'Reason for Absence'].unique())) # to get sorted list of all unique values
# Using this line found that "reason 20" is not specified by any employee

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28]


In [15]:
print(df.loc[:,'Reason for Absence'].value_counts())

# So persons with '0 reason' are 38.
# '0 reason' means there is no reason behind his/her absency because he/she was present at all time.

23    147
28    110
27     66
13     52
0      38
19     36
22     32
26     31
25     29
11     24
10     22
18     21
14     18
1      16
7      13
12      8
21      6
6       6
8       5
9       4
5       3
16      3
24      3
15      2
4       2
3       1
2       1
17      1
Name: Reason for Absence, dtype: int64


#### Group the reason for absence

In [16]:
pd.options.display.max_columns = 50
reason_column = pd.get_dummies(df['Reason for Absence'], drop_first=True) 
# we dropped reason-0 column to avoid multicollinearity

reason_column.head(10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [17]:
df = df.drop('Reason for Absence', axis=1)
df.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [18]:
# "Reason_0", a.k.a. no reason, which is the baseline model
# "Reason_1", which comprises of various diseases (col 1-14)
# "Reason_2", relating to pregnancy and giving birth (col 15-17)
# "Reason_3", regarding poisoning and peculiar reasons not categorized elsewhere (col 18-21)
# "Reason_4", which relates to light diseases (col 22-28)

In [19]:
# To group the columns
reason_type_1 = reason_column.loc[:,'1':'14'].max(axis=1)
reason_type_2 = reason_column.loc[:,'15':'17'].max(axis=1)
reason_type_3 = reason_column.loc[:,'18':'21'].max(axis=1)
reason_type_4 = reason_column.loc[:,'22':'28'].max(axis=1)

# here we will make the new columns
df['Reason_1'] = reason_type_1
df['Reason_2'] = reason_type_2
df['Reason_3'] = reason_type_3
df['Reason_4'] = reason_type_4
df.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_1,Reason_2,Reason_3,Reason_4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [20]:
df.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason_1',
       'Reason_2', 'Reason_3', 'Reason_4'], dtype=object)

In [21]:
# Rearranging the columns
reordered_cols = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date', 
        'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']

df = df[reordered_cols]
df.head(10)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2
5,0,0,0,1,10/07/2015,179,51,38,239.554,31,1,0,0,2
6,0,0,0,1,17/07/2015,361,52,28,239.554,27,1,1,4,8
7,0,0,0,1,24/07/2015,260,50,36,239.554,23,1,4,0,4
8,0,0,1,0,06/07/2015,155,12,34,239.554,25,1,2,0,40
9,0,0,0,1,13/07/2015,235,11,37,239.554,29,3,1,1,8


In [22]:
df_reason_mod = df.copy() # creating checkpoints at crucial steps

### Deal with 'Date' column

In [23]:
df_reason_mod['Date'].head(10)

0    07/07/2015
1    14/07/2015
2    15/07/2015
3    16/07/2015
4    23/07/2015
5    10/07/2015
6    17/07/2015
7    24/07/2015
8    06/07/2015
9    13/07/2015
Name: Date, dtype: object

In [24]:
# Now will check the type of values in 'Date' column
print(type(df_reason_mod['Date']))
print(type(df_reason_mod['Date'][0]))

<class 'pandas.core.series.Series'>
<class 'str'>


In [25]:
df_reason_mod['Date'] = pd.to_datetime(df_reason_mod['Date'], format= '%d/%m/%Y')
# pd.to_datetime() converts values into timestamp
# format argument is used to read current date format not a required format.
df_reason_mod['Date'].head(10)

0   2015-07-07
1   2015-07-14
2   2015-07-15
3   2015-07-16
4   2015-07-23
5   2015-07-10
6   2015-07-17
7   2015-07-24
8   2015-07-06
9   2015-07-13
Name: Date, dtype: datetime64[ns]

In [26]:
print(type(df_reason_mod['Date']))
print(type(df_reason_mod['Date'][0]))
print(df_reason_mod['Date'].dtypes)
df_reason_mod['Date'][0]

<class 'pandas.core.series.Series'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
datetime64[ns]


Timestamp('2015-07-07 00:00:00')

#### Extract Month

In [27]:
df_reason_mod['Date'][0].month # to extract the month value from date

7

In [28]:
list_months = []
for i in range(len(df_reason_mod['Date'])):
    list_months.append(df_reason_mod['Date'][i].month)
    
list_months

[7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 

In [29]:
df_reason_mod['Month'] = list_months
df_reason_mod.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7


#### Extract the Day of the Week

In [30]:
df_reason_mod['Date'][2].weekday()
# Starts from "Monday" with index '0' [0,1,2,3,4,5,6]=[Mon, Tue, Wed, Thur, Fri, Sat, Sun]

2

In [31]:
def date_to_weekday(date_value):
    return date_value.weekday()

In [32]:
df_reason_mod['Day of the Week'] = df_reason_mod['Date'].apply(date_to_weekday)
# Dataframe.apply(Func_name) --Inside the bracket there is function name only without brackets and arguments.
df_reason_mod.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day of the Week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7,3


In [33]:
df_reason_mod = df_reason_mod.drop(['Date'], axis = 1) # dropping the "Date" column
df_reason_mod.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month,Day of the Week
0,0,0,0,1,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,1,2,1,2,7,3


In [34]:
df_reason_mod.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Month',
       'Day of the Week'], dtype=object)

In [35]:
# Rearrange the columns values
columns_order = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month',
       'Day of the Week', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']

df_reason_mod = df_reason_mod[columns_order]
df_reason_mod.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,1,2,1,2


In [36]:
df_reason_date_mod = df_reason_mod.copy() # creatin a checkpoint
df_reason_date_mod.head(10)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,1,2,1,2
5,0,0,0,1,7,4,179,51,38,239.554,31,1,0,0,2
6,0,0,0,1,7,4,361,52,28,239.554,27,1,1,4,8
7,0,0,0,1,7,4,260,50,36,239.554,23,1,4,0,4
8,0,0,1,0,7,0,155,12,34,239.554,25,1,2,0,40
9,0,0,0,1,7,0,235,11,37,239.554,29,3,1,1,8


### Deal with Transportation Expense, Distance to Work, Age, Daily Work Load Average, BMI

In [37]:
# As data contained from all these columns is purely numerical and there is no hidden meaning behind this
# so we will not do any preprocessing on them

# Note:-
# Transportation expense is a subcategory of Travel expenses.
# Travel expenses are costs related to business travel such as fuel, parking, meals, transportation and other charges
# In our table, this column contains monthly transportation expenses of an individual measured in dollars.

### Deal with Education column

In [38]:
df_reason_date_mod['Education'].unique()

array([1, 3, 2, 4], dtype=int64)

In [39]:
# Even though data is numeric, it represent different categories
# 1 : high school
# 2 : graduate
# 3 : post graduate
# 4 : PHD

In [40]:
df_reason_date_mod['Education'].value_counts()

1    583
3     73
2     40
4      4
Name: Education, dtype: int64

In [41]:
# This shows that nearly 600 persons out of 700 are from category 1
# so it will become less relevant to compare all 4 categories
# Therefore, we will combine the categories 2,3,4 into a single category

In [42]:
df_reason_date_mod['Education'] = df_reason_date_mod['Education'].map({1:0, 2:1, 3:1, 4:1})
df_reason_date_mod['Education'].unique()

array([0, 1], dtype=int64)

In [43]:
df_reason_date_mod['Education'].value_counts()

0    583
1    117
Name: Education, dtype: int64

### Deal with Children, Pets

In [44]:
# Even though these columns are look like categories but they only have numeric meaning

In [45]:
# Let's check the unique values from 'Children' column with their counts
df_reason_date_mod['Children'].value_counts()

0    285
1    213
2    146
4     42
3     14
Name: Children, dtype: int64

In [46]:
# Let's check the unique values from 'Pets' column with their counts
df_reason_date_mod['Pets'].value_counts()

0    439
1    132
2     92
4     29
5      5
8      3
Name: Pets, dtype: int64

### Final Checkpoint

In [47]:
df_preprocessed = df_reason_date_mod.copy()
df_preprocessed.head(100)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,1,11,1,225,26,28,306.345,24,0,1,2,3
96,1,0,0,0,11,2,289,36,33,306.345,30,0,2,1,24
97,0,0,0,1,11,3,291,31,40,306.345,25,0,1,1,3
98,0,0,0,1,12,1,248,25,47,261.306,32,0,2,1,1


In [None]:
# Create the new csv file of preprocessed data
df_preprocessed.to_csv('Absenteeism_preprocessed.csv', index=False)