## --Import the relevant libraries--

In [1]:
# import the relevant libraries
import pandas as pd
import numpy as np

## --Load the data--

In [2]:
# load the preprocessed CSV data
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
# eyeball the data
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## --Create the targets--

In [4]:
# find the median of 'Absenteeism Time in Hours'
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [5]:
# create the targets for our logistic regression
# they must be categories, so we need a rule that says whether someone is 'being absent too much' or not
# the rule we use: take the median of 'Absenteeism Time in Hours' as the cut-off line
# this keeps the dataset roughly balanced (a similar number of 0s and 1s for the logistic regression)
# class balance matters a lot in ML, so this works well for us
# alternatively, if we had more data, we could have dealt with the issue in other ways
# for instance, by choosing some arbitrary value as the cut-off line instead of the median

# with a median of 3, this line assigns 1 to anyone who has been absent for more than 3 hours (4 hours or more)
# that is the equivalent of taking half a day off; everyone else gets 0

targets = np.where(
    data_preprocessed['Absenteeism Time in Hours'].gt(
        data_preprocessed['Absenteeism Time in Hours'].median()
    ),
    1,
    0,
)

In [6]:
# eyeball the targets
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [7]:
targets = pd.Series(targets) # convert the targets from array to Series
targets

0      1
1      0
2      0
3      1
4      0
      ..
695    1
696    0
697    1
698    0
699    0
Length: 700, dtype: int32

In [8]:
# create a Series in the original data frame that will contain the targets for the regression
data_preprocessed['Excessive Absenteeism'] = targets

In [9]:
# let's check what happened
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [10]:
# check whether the dataset is balanced, i.e. what share of the targets are 1s
# targets.sum() counts the 1s (the targets only contain 0s and 1s)
# len(targets) is the total number of observations
targets.sum() / len(targets)

0.45571428571428574

In [11]:
# Previous version (dropped only the raw target column):
# data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'], axis = 1)

# For backward elimination the line above is commented out and some more features are dropped as well.
# The result is stored under a new name, so it also acts as a checkpoint.

data_with_targets = data_preprocessed.drop(
    columns = [
        'Absenteeism Time in Hours',
        'Day of the Week',
        'Distance to Work',
        'Daily Work Load Average',
    ]
)

In [12]:
# check what's inside
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


## --Select the inputs for the regression--

In [13]:
# Select all rows and every column except the last one (the target, 'Excessive Absenteeism')
inputs = data_with_targets.iloc[:,:-1]
inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


## --Split the data into train & test and shuffle--

### i) Import the relevant module

In [14]:
# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split

### ii) Split

In [15]:
# declare 4 variables for the split
x_train, x_test, y_train, y_test = train_test_split(inputs, targets, train_size = 0.8, random_state = 20)

In [16]:
# check the shape of the train inputs and targets
print (x_train.shape, y_train.shape)

(560, 11) (560,)


In [17]:
# check the shape of the test inputs and targets
print (x_test.shape, y_test.shape)

(140, 11) (140,)


In [18]:
# check the type 
print(type(x_train))
print(type(x_test))
print(type(y_train))
print(type(y_test))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


## --Standardize the Data--

In [19]:
# Standardization is one of the most common preprocessing steps: it keeps features with larger magnitudes from dominating the model
# But standardizing the dummy variables would make them harder to interpret
# So before standardizing (scaling) we split the DataFrame in two - dummy and non-dummy columns
# After scaling the non-dummy columns we merge the two DataFrames again

### i) Creating a Function for Separating Dummy and Non-dummy Variables

In [21]:
# Creating a list of dummy column named as columns to omit
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

In [24]:
# non_dummy_x_train = x_train.loc[:, ~x_train.columns.isin(columns_to_omit)]

In [None]:
# The function will take two arguments, i) DataFrame of input_data ii) List of columns
def separate_dummy_and_nondummy(unscaled_x, columns_to_omit):
    """Split a feature DataFrame into its dummy and non-dummy columns.

    Parameters
    ----------
    unscaled_x : pandas.DataFrame
        Input features before any scaling.
    columns_to_omit : list
        Labels of the dummy columns (the ones that must not be scaled).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(dummy, nondummy)``: first the columns listed in
        ``columns_to_omit`` (in that order), then all remaining columns in
        their original order. Both frames keep the index of ``unscaled_x``.
    """
    # the dummy variables, selected exactly as listed by the caller
    dummy_frame = unscaled_x.loc[:, columns_to_omit]

    # every other column is a candidate for scaling; keep the original order
    labels_to_scale = []
    for label in unscaled_x.columns.values:
        if label not in columns_to_omit:
            labels_to_scale.append(label)

    # the non-dummy variables
    nondummy_frame = unscaled_x.loc[:, labels_to_scale]

    return dummy_frame, nondummy_frame

In [None]:
# Store the outputs of 'separate_dummy_and_nondummy()' function
unscaled_x_train_dummy, unscaled_x_train_nondummy = separate_dummy_and_nondummy(x_train, columns_to_omit)

In [None]:
unscaled_x_train_dummy # or separate_dummy_and_nondummy(x_train, columns_to_omit)[0]

In [None]:
unscaled_x_train_nondummy # or separate_dummy_and_nondummy(x_train, columns_to_omit)[1]

### ii) Standardising only Non-Dummy variables

In [None]:
# Import StandardScaler
from sklearn.preprocessing import StandardScaler
absenteeism_scaler = StandardScaler()

In [None]:
# fit the scaler on the non-dummy training columns; the learned mean and variance are stored inside the object
absenteeism_scaler.fit(unscaled_x_train_nondummy)

In [None]:
# display the per-feature mean and variance stored in the fitted scaler
print(absenteeism_scaler.mean_)
print(absenteeism_scaler.var_)

In [None]:
# transform the data
scaled_x_train_nondummy = absenteeism_scaler.transform(unscaled_x_train_nondummy)

# After transformation, the output is in array form, so we'll convert it into DataFrame
scaled_x_train_nondummy = pd.DataFrame(scaled_x_train_nondummy, columns = unscaled_x_train_nondummy.columns.values)
# Let's check it
scaled_x_train_nondummy

### iii) Creating a function for Re-joining the Dummy and Non-Dummy 

In [None]:
# the function will take 3 inputs: 1st and 2nd are DataFrames and 3rd is List of column values
# the function will return a single combined ordered output

def rejoin_dummy_and_nondummy(unscaled_x_dummy, scaled_x_nondummy, columns_values):
    """Recombine the dummy columns with the scaled non-dummy columns.

    Rows are matched by position, not by index label: both frames get a
    fresh 0..n-1 index before they are joined. This matters because the
    dummy frame typically still carries the shuffled index produced by the
    train/test split, and joining on labels would misalign the rows and
    introduce NaNs.

    Parameters
    ----------
    unscaled_x_dummy : pandas.DataFrame
        The dummy columns (left unscaled).
    scaled_x_nondummy : pandas.DataFrame
        The non-dummy columns after scaling, with rows in the same order as
        ``unscaled_x_dummy``.
    columns_values : list-like
        Column labels in the order the combined frame should have.

    Returns
    -------
    pandas.DataFrame
        A single frame with the columns ordered as in ``columns_values`` and
        a default 0..n-1 index.

    Raises
    ------
    ValueError
        If the two frames do not have the same number of rows.
    """
    if len(unscaled_x_dummy) != len(scaled_x_nondummy):
        raise ValueError(
            'Cannot rejoin frames with different numbers of rows: '
            f'{len(unscaled_x_dummy)} dummy rows vs {len(scaled_x_nondummy)} non-dummy rows'
        )

    # Join the two DataFrames side by side; resetting BOTH indexes guarantees
    # a purely positional match regardless of the labels either frame carries
    transformed_x = pd.concat(
        [
            scaled_x_nondummy.reset_index(drop = True),
            unscaled_x_dummy.reset_index(drop = True),
        ],
        axis = 1,
    )
    # arrange the columns of the DataFrame as per the given list of column values
    transformed_x = transformed_x[columns_values]
    # Return the output
    return transformed_x

In [None]:
# Store the output of 'rejoin_dummy_and_nondummy()' function
transformed_x_train = rejoin_dummy_and_nondummy(unscaled_x_train_dummy, scaled_x_train_nondummy, x_train.columns.values)

## --Logistic regression with sklearn--

In [None]:
# import the LogisticRegression model from sklearn
from sklearn.linear_model import LogisticRegression

# import the 'metrics' module, which includes important metrics we may want to use
from sklearn import metrics

### i) Training the model

In [None]:
# create a logistic regression object
log_reg = LogisticRegression()

In [None]:
# fit our train inputs
# that is basically the whole training part of the machine learning
log_reg.fit(transformed_x_train, y_train)

In [None]:
# assess the train accuracy of the model
log_reg.score(transformed_x_train,y_train)

### ii) Finding the intercept and coefficients

In [None]:
print(log_reg.intercept_) # get the intercept (bias) of our model
print(log_reg.coef_) # get the coefficients (weights) of our model

In [None]:
# build a table of the model coefficients (it will be exported later and used in Tableau)
# one row per feature: the feature names come from the columns of the training inputs
summary_table = pd.DataFrame({'Feature name': list(transformed_x_train.columns)})

# log_reg.coef_ holds one row of weights; transpose it so there is one weight per feature row
summary_table['Coefficient'] = log_reg.coef_.T

# display the summary table
summary_table

In [None]:
# move the intercept to the top of the summary table
# the guard makes this cell safe to re-run: the intercept row is inserted only once,
# so running the cell again neither shifts the index nor adds a duplicate row
if 'Intercept' not in summary_table['Feature name'].values:
    # move all indices by 1 so that index 0 becomes free
    summary_table.index = summary_table.index + 1

    # add the intercept at index 0
    summary_table.loc[0] = ['Intercept', log_reg.intercept_[0]]

    # sort the df by index so the intercept becomes the first row
    summary_table = summary_table.sort_index()

summary_table

## --Interpreting the coefficients--

In [None]:
# create a new Series called: 'Odds ratio' which will show the.. odds ratio of each feature
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [None]:
# display the df
summary_table

In [None]:
# sort the table according to odds ratio
# note that by default, the sort_values method sorts values by 'ascending'
summary_table.sort_values('Odds_ratio', ascending=False)

In [None]:
# If a coefficient is around zero (equivalently, its odds ratio is close to one),
# the corresponding feature is not particularly important.

# Backward Elimination

# When the weight of a feature is small enough, it makes practically no difference to the model,
# so we drop such features and re-run the notebook from the start.

# Looking at the summary table, the following features have small weights, so we drop them:
# 'Day of the Week', 'Distance to Work', 'Daily Work Load Average'

## --Testing the model--

In [None]:
# Before testing the model, we need to scale the test data
# For scaling we separate the DataFrame into two different DataFrames - dummy and non-dummy
# After scaling we'll merge those two DataFrames

### i) Scaling the test data before testing

In [None]:
# Separate the x_test
# Store the 2 outputs of 'separate_dummy_and_nondummy()' function
unscaled_x_test_dummy, unscaled_x_test_nondummy = separate_dummy_and_nondummy(x_test, columns_to_omit)

In [None]:
# scale the non-dummy columns using the scaler that was already fitted on the training data
scaled_x_test_nondummy = absenteeism_scaler.transform(unscaled_x_test_nondummy)

scaled_x_test_nondummy = pd.DataFrame(scaled_x_test_nondummy, columns = unscaled_x_test_nondummy.columns.values)
scaled_x_test_nondummy

In [None]:
# Re-joining the dummy and non-dummy df
transformed_x_test = rejoin_dummy_and_nondummy(unscaled_x_test_dummy, scaled_x_test_nondummy, x_test.columns.values)

### ii) Testing the Model using Test Data

In [None]:
# assess the test accuracy of the model
log_reg.score(transformed_x_test, y_test)

In [None]:
# find the predicted probabilities of each class
# the first column shows the probability of a particular observation to be 0, while the second one - to be 1
predicted_proba = log_reg.predict_proba(transformed_x_test)

# let's check that out
predicted_proba

In [None]:
predicted_proba.shape

In [None]:
# select ONLY the probabilities referring to 1s
predicted_proba[:,1]

## --Save the model--

In [None]:
import pickle

In [None]:
# pickle the "log_reg" object as "model" file
with open('model', 'wb') as file1:
    pickle.dump(log_reg, file1)

In [None]:
# pickle the "absenteeism_scaler" object as "scaler" file
with open('scaler', 'wb') as file2:
    pickle.dump(absenteeism_scaler, file2)   