## --Import the relevant libraries--

In [1]:
# import the relevant libraries
import pandas as pd
import numpy as np

## --Load the data--

In [2]:
# location of the preprocessed absenteeism data (CSV file)
preprocessed_csv_path = 'Data_files/Absenteeism_preprocessed.csv'

# read the CSV into a DataFrame
data_preprocessed = pd.read_csv(preprocessed_csv_path)

In [3]:
# eyeball the first rows of the data to confirm the columns loaded as expected
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## --Create the targets--

In [4]:
# find the median of 'Absenteeism Time in Hours'
# (the next cell uses this median as the cut-off for the classification targets)
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [5]:
# create the targets for our logistic regression
# the targets must be categories: is someone 'being absent too much' (1) or not (0)?
# we use the median of 'Absenteeism Time in Hours' as the cut-off line
# a median cut-off keeps the dataset roughly balanced (about as many 0s as 1s),
# which matters because unbalanced classes are a common problem in ML
# alternatively, with more data, we could have picked some other (arbitrary) cut-off value

# note: the median shown above is 3.0, so this assigns 1 to anyone absent for more than 3 hours
# (4 hours or more), which is the equivalent of taking half a day off
absence_hours = data_preprocessed['Absenteeism Time in Hours']
median_hours = absence_hours.median()

targets = np.where(absence_hours > median_hours, 1, 0)

In [6]:
# eyeball the targets (a NumPy array of 0s and 1s at this point)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [7]:
# convert the targets from a NumPy array to a pandas Series
# give the Series the same index as data_preprocessed: pandas aligns on index labels when a
# Series is assigned as a DataFrame column, so sharing the index pairs every target with its own row
targets = pd.Series(targets, index=data_preprocessed.index)
targets

0      1
1      0
2      0
3      1
4      0
      ..
695    1
696    0
697    1
698    0
699    0
Length: 700, dtype: int32

In [8]:
# add the targets to the original data frame as a new column, 'Excessive Absenteeism'
# note: this modifies data_preprocessed in place
data_preprocessed['Excessive Absenteeism'] = targets

In [9]:
# check that the new 'Excessive Absenteeism' column was added
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [10]:
# check whether the dataset is balanced: what share of the targets are 1s?
# the targets are 0/1, so their sum is the number of 1s
positive_count = targets.sum()

# dividing by the total number of targets gives the share of 1s
positive_count / len(targets)

0.45571428571428574

In [11]:
# Checkpoint: a new frame without the raw 'Absenteeism Time in Hours' column
# (it was only needed to derive the targets).
# Originally this step dropped just that one column:
#     data_preprocessed.drop(['Absenteeism Time in Hours'], axis = 1)
# For backward elimination we additionally drop three more features.
columns_to_drop = [
    'Absenteeism Time in Hours',
    'Day of the Week',
    'Distance to Work',
    'Daily Work Load Average',
]

data_with_targets = data_preprocessed.drop(columns=columns_to_drop)

In [12]:
# check what's inside the checkpoint (the dropped columns should be gone)
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


## --Select the inputs for the regression--

In [13]:
# the inputs are all rows and every column except the target
# drop the target by name rather than by position, so the selection stays correct
# even if the column order of data_with_targets changes
inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])
inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


## --Split the data into train & test and shuffle--

### i) Import the relevant module

In [14]:
# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split

### ii) Split

In [15]:
# split inputs and targets: 80% of the rows go to train, the rest to test
# random_state is fixed so that the split is the same on every run
split_parts = train_test_split(inputs, targets, train_size=0.8, random_state=20)

# unpack the 4 resulting pieces
x_train, x_test, y_train, y_test = split_parts

In [16]:
# check the shape of the train inputs and targets (the row counts should match)
print (x_train.shape, y_train.shape)

(560, 11) (560,)


In [17]:
# check the shape of the test inputs and targets (the row counts should match)
print (x_test.shape, y_test.shape)

(140, 11) (140,)


In [18]:
# check the type of each of the four split pieces, in the order:
# x_train, x_test, y_train, y_test
for split_piece in (x_train, x_test, y_train, y_test):
    print(type(split_piece))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


## --Standardize the Data--

In [19]:
# Standardization is one of the most common preprocessing steps: it keeps features with larger magnitudes from dominating the model
# However, standardizing the dummy variables would make them harder to interpret
# So before standardizing (scaling) we split the DataFrame into two DataFrames: dummy and non-dummy columns
# After scaling the non-dummy columns, we merge the two DataFrames again

In [20]:
# dummy columns: these are left out of the scaling step
columns_to_omit = [
    'Reason_1',
    'Reason_2',
    'Reason_3',
    'Reason_4',
    'Education',
]

In [21]:
# flag which columns of x_train are dummy columns (one boolean per column)
is_dummy_column = x_train.columns.isin(columns_to_omit)

# split x_train column-wise: dummy columns in one DataFrame, all the others in another
dummy_x_train = x_train.loc[:, is_dummy_column]
non_dummy_x_train = x_train.loc[:, ~is_dummy_column]

In [22]:
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# define absenteeism_scaler as a StandardScaler object
# (it is fitted on the train data below and pickled at the end of the notebook)
absenteeism_scaler = StandardScaler()

In [23]:
# fit the scaler on the non-dummy columns of the train data only
# (the dummy columns are left unscaled)
absenteeism_scaler.fit(non_dummy_x_train)

StandardScaler()

In [24]:
# show the mean and the variance stored on the fitted scaler
# (one value per non-dummy column; mean first, then variance)
print(absenteeism_scaler.mean_, absenteeism_scaler.var_, sep='\n')

[  6.35892857 222.09107143  36.36428571  26.64821429   1.03214286
   0.69464286]
[1.22979560e+01 4.53320421e+03 4.01458673e+01 1.82923182e+01
 1.27039541e+00 1.44425702e+00]


In [25]:
# scale the non-dummy train inputs with the fitted scaler
scaled_train_values = absenteeism_scaler.transform(non_dummy_x_train)

# wrap the scaled values in a DataFrame that reuses the original column names
# (the new DataFrame gets a fresh 0..n-1 row index)
scaled_non_dummy_x_train = pd.DataFrame(scaled_train_values, columns=non_dummy_x_train.columns)

# Let's check the scaled non-dummy x_train
scaled_non_dummy_x_train

Unnamed: 0,Month,Transportation Expense,Age,Body Mass Index,Children,Pets
0,1.608589,-0.640007,0.258159,1.017497,-0.915736,-0.578015
1,1.323432,-0.640007,0.573812,-1.086805,0.858701,-0.578015
2,1.038276,-0.640007,-1.004452,-1.788239,-0.915736,-0.578015
3,-1.528134,-0.640007,-1.004452,-1.788239,-0.915736,-0.578015
4,-0.387508,-0.640007,0.573812,-1.086805,0.858701,-0.578015
...,...,...,...,...,...,...
555,-0.387508,-1.546005,2.152075,1.017497,-0.028518,-0.578015
556,-0.102351,1.023465,0.573812,-0.385371,-0.028518,0.254089
557,0.753119,-0.640007,0.258159,1.017497,-0.915736,-0.578015
558,0.182806,2.063134,-1.320105,0.082251,-0.028518,2.750402


In [26]:
# the two pieces to join side by side: dummy columns and scaled non-dummy columns
# the dummy df's index is reset first so that its rows line up with the scaled df
train_frames = [dummy_x_train.reset_index(drop=True), scaled_non_dummy_x_train]
joined_x_train = pd.concat(train_frames, axis=1)

# restore the original column order of x_train
transformed_x_train = joined_x_train[x_train.columns]

In [27]:
# Let's check the joined train inputs (dummy columns unscaled, the rest scaled)
transformed_x_train

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,1.608589,-0.640007,0.258159,1.017497,0,-0.915736,-0.578015
1,0,0,1,0,1.323432,-0.640007,0.573812,-1.086805,1,0.858701,-0.578015
2,1,0,0,0,1.038276,-0.640007,-1.004452,-1.788239,1,-0.915736,-0.578015
3,0,0,1,0,-1.528134,-0.640007,-1.004452,-1.788239,1,-0.915736,-0.578015
4,1,0,0,0,-0.387508,-0.640007,0.573812,-1.086805,1,0.858701,-0.578015
...,...,...,...,...,...,...,...,...,...,...,...
555,1,0,0,0,-0.387508,-1.546005,2.152075,1.017497,0,-0.028518,-0.578015
556,0,0,0,1,-0.102351,1.023465,0.573812,-0.385371,0,-0.028518,0.254089
557,0,0,0,1,0.753119,-0.640007,0.258159,1.017497,0,-0.915736,-0.578015
558,0,0,0,1,0.182806,2.063134,-1.320105,0.082251,0,-0.028518,2.750402


## --Logistic regression with sklearn--

In [28]:
# import the LogisticRegression model from sklearn
from sklearn.linear_model import LogisticRegression

# import the 'metrics' module, which includes important metrics we may want to use
from sklearn import metrics

### i) Training the model

In [29]:
# create a logistic regression object (with its default settings)
log_reg = LogisticRegression()

In [30]:
# fit the model on the transformed train inputs and the train targets
# that is basically the whole training part of the machine learning
log_reg.fit(transformed_x_train, y_train)

LogisticRegression()

In [31]:
# assess the train accuracy of the model (scored on the same data it was fitted on)
log_reg.score(transformed_x_train,y_train)

0.7732142857142857

### ii) Finding the intercept and coefficients

In [32]:
# print the intercept (bias) of our model first,
# then the coefficients (weights) of our model on the following line(s)
print(log_reg.intercept_, log_reg.coef_, sep='\n')

[-1.65271136]
[[ 2.80020222  0.952197    3.11550396  0.83909169  0.15914852  0.61462715
  -0.16878688  0.2813875  -0.21061502  0.3531512  -0.28576094]]


In [33]:
# summary table of the model: one row per feature, with its coefficient
# (the coefficients from this table will be exported later and used in Tableau)
# log_reg.coef_ holds a single row of weights here, so it is flattened to one value per feature
summary_table = pd.DataFrame({
    'Feature name': transformed_x_train.columns,
    'Coefficient': log_reg.coef_.ravel(),
})

# display the summary table
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.800202
1,Reason_2,0.952197
2,Reason_3,3.115504
3,Reason_4,0.839092
4,Month,0.159149
5,Transportation Expense,0.614627
6,Age,-0.168787
7,Body Mass Index,0.281388
8,Education,-0.210615
9,Children,0.353151


In [34]:
# do a little Python trick to move the intercept to the top of the summary table
# the guard makes this cell safe to re-run: the index is shifted and the intercept row
# is added only if the table does not contain an 'Intercept' row yet
if not (summary_table['Feature name'] == 'Intercept').any():
    # move all indices by 1 to free up index 0
    summary_table.index = summary_table.index + 1

    # add the intercept at index 0
    summary_table.loc[0] = ['Intercept', log_reg.intercept_[0]]

    # sort the df by index so that the intercept row comes first
    summary_table = summary_table.sort_index()

summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.652711
1,Reason_1,2.800202
2,Reason_2,0.952197
3,Reason_3,3.115504
4,Reason_4,0.839092
5,Month,0.159149
6,Transportation Expense,0.614627
7,Age,-0.168787
8,Body Mass Index,0.281388
9,Education,-0.210615


## --Interpreting the coefficients--

In [35]:
# add a new column, 'Odds_ratio': the exponential of each coefficient
# (this is computed for every row of the table, including the intercept row if present)
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])

In [36]:
# display the df with the new 'Odds_ratio' column
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.652711,0.19153
1,Reason_1,2.800202,16.447973
2,Reason_2,0.952197,2.591397
3,Reason_3,3.115504,22.544789
4,Reason_4,0.839092,2.314264
5,Month,0.159149,1.172512
6,Transportation Expense,0.614627,1.848967
7,Age,-0.168787,0.844689
8,Body Mass Index,0.281388,1.324967
9,Education,-0.210615,0.810086


In [37]:
# show the table ordered by odds ratio, largest first
# sort_values sorts in ascending order by default, hence ascending=False
# the sorted copy is only displayed; summary_table itself is left unchanged
summary_table.sort_values(by='Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,3.115504,22.544789
1,Reason_1,2.800202,16.447973
2,Reason_2,0.952197,2.591397
4,Reason_4,0.839092,2.314264
6,Transportation Expense,0.614627,1.848967
10,Children,0.353151,1.423546
8,Body Mass Index,0.281388,1.324967
5,Month,0.159149,1.172512
7,Age,-0.168787,0.844689
9,Education,-0.210615,0.810086


In [38]:
# If a coefficient is around zero (equivalently, its odds ratio is close to one),
# the corresponding feature is not particularly important.

# Backward Elimination

# When the weight of a feature is small enough, it won't make a difference anyway,
# so we drop such features and re-run the notebook from the start.

# Looking at the summary table, we found that the following features had small weights, so we drop them:
# 'Day of the Week', 'Distance to Work', 'Daily Work Load Average'

## --Testing the model--

In [39]:
# Before testing the model, we need to scale the test data
# For scaling, we again split the DataFrame into two different DataFrames: dummy and non-dummy
# After scaling, we merge the two DataFrames

### i) Scaling the test data before testing

In [40]:
# flag which columns of x_test are dummy columns (one boolean per column)
is_dummy_test_column = x_test.columns.isin(columns_to_omit)

# split x_test column-wise: dummy columns in one DataFrame, all the others in another
dummy_x_test = x_test.loc[:, is_dummy_test_column]
non_dummy_x_test = x_test.loc[:, ~is_dummy_test_column]

In [41]:
# scale the non-dummy test inputs with the scaler that was fitted on the train data
# (transform only - the scaler is not re-fitted here)
scaled_test_values = absenteeism_scaler.transform(non_dummy_x_test)

# wrap the scaled values in a DataFrame that reuses the original column names
scaled_non_dummy_x_test = pd.DataFrame(scaled_test_values, columns=non_dummy_x_test.columns)

# Let's check the scaled non-dummy x_test
scaled_non_dummy_x_test

Unnamed: 0,Month,Transportation Expense,Age,Body Mass Index,Children,Pets
0,1.323432,-0.640007,0.258159,1.017497,-0.915736,-0.578015
1,0.753119,1.023465,0.573812,-0.385371,-0.028518,0.254089
2,1.323432,0.191729,1.047291,2.654177,-0.028518,-0.578015
3,-0.957821,-0.640007,0.258159,1.017497,-0.915736,-0.578015
4,0.182806,0.563040,-0.057494,-0.852994,2.633137,-0.578015
...,...,...,...,...,...,...
135,-1.528134,0.993760,-0.530973,0.783686,0.858701,0.254089
136,-0.387508,2.063134,-1.320105,0.082251,-0.028518,2.750402
137,0.467962,0.191729,0.100332,0.549874,-0.028518,0.254089
138,-0.102351,0.355106,0.731638,-0.852994,-0.915736,-0.578015


In [42]:
# the two pieces to join side by side: dummy columns and scaled non-dummy columns
# the dummy df's index is reset first so that its rows line up with the scaled df
test_frames = [dummy_x_test.reset_index(drop=True), scaled_non_dummy_x_test]
joined_x_test = pd.concat(test_frames, axis=1)

# restore the original column order of x_test
transformed_x_test = joined_x_test[x_test.columns]

### ii) Testing the Model using Test Data

In [43]:
# assess the test accuracy of the model (scored on the transformed test inputs)
log_reg.score(transformed_x_test, y_test)

0.75

In [44]:
# find the predicted probabilities of each class for the test inputs
# the first column shows the probability of a particular observation to be 0, while the second one - to be 1
predicted_proba = log_reg.predict_proba(transformed_x_test)

# let's check that out
predicted_proba

array([[0.71337755, 0.28662245],
       [0.58726805, 0.41273195],
       [0.44027878, 0.55972122],
       [0.78158092, 0.21841908],
       [0.0841389 , 0.9158611 ],
       [0.33487727, 0.66512273],
       [0.29973759, 0.70026241],
       [0.13105372, 0.86894628],
       [0.78619522, 0.21380478],
       [0.74901654, 0.25098346],
       [0.49394305, 0.50605695],
       [0.22502433, 0.77497567],
       [0.07132243, 0.92867757],
       [0.73178847, 0.26821153],
       [0.30926944, 0.69073056],
       [0.54737273, 0.45262727],
       [0.55049783, 0.44950217],
       [0.53924413, 0.46075587],
       [0.40192018, 0.59807982],
       [0.05363776, 0.94636224],
       [0.70021442, 0.29978558],
       [0.78158092, 0.21841908],
       [0.42026665, 0.57973335],
       [0.42026665, 0.57973335],
       [0.2477931 , 0.7522069 ],
       [0.74559033, 0.25440967],
       [0.51025841, 0.48974159],
       [0.85687964, 0.14312036],
       [0.203499  , 0.796501  ],
       [0.78158092, 0.21841908],
       [0.

In [45]:
# check the shape: one row per test observation, one column per class
predicted_proba.shape

(140, 2)

In [46]:
# select ONLY the probabilities referring to 1s (the second column)
predicted_proba[:,1]

array([0.28662245, 0.41273195, 0.55972122, 0.21841908, 0.9158611 ,
       0.66512273, 0.70026241, 0.86894628, 0.21380478, 0.25098346,
       0.50605695, 0.77497567, 0.92867757, 0.26821153, 0.69073056,
       0.45262727, 0.44950217, 0.46075587, 0.59807982, 0.94636224,
       0.29978558, 0.21841908, 0.57973335, 0.57973335, 0.7522069 ,
       0.25440967, 0.48974159, 0.14312036, 0.796501  , 0.21841908,
       0.36953503, 0.67912058, 0.68502714, 0.52871674, 0.21841908,
       0.53497706, 0.22153223, 0.7369366 , 0.40498459, 0.60498626,
       0.21077083, 0.45225614, 0.23757967, 0.39823937, 0.82755591,
       0.56805562, 0.6911646 , 0.28662245, 0.21933776, 0.20332071,
       0.57631053, 0.32927259, 0.66512273, 0.26948516, 0.83319296,
       0.43489216, 0.88374142, 0.23131102, 0.3340719 , 0.34424283,
       0.69902281, 0.65493997, 0.29250529, 0.79205029, 0.20759656,
       0.26842867, 0.08710365, 0.22153223, 0.7324096 , 0.30529973,
       0.22153223, 0.28996775, 0.90435024, 0.46062496, 0.60167

## --Save the model--

In [47]:
import pickle

In [48]:
# save the trained "log_reg" object to a file named "model" using pickle
# ('wb' = write in binary mode)
with open('model', 'wb') as model_file:
    pickle.dump(log_reg, model_file)

In [49]:
# save the fitted "absenteeism_scaler" object to a file named "scaler" using pickle
# ('wb' = write in binary mode)
with open('scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)