# Absenteeism Exercise

In [1]:
# import libraries
import pandas as pd
import numpy as np

## Load the data

In [2]:
# Read the preprocessed absenteeism records and preview the first five rows
data_preprocessed = pd.read_csv(filepath_or_buffer='Absenteeism-preprocessed.csv')
data_preprocessed.head(n=5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


Note that the dummy variable 'Reason_0', which corresponds to the worker having no particular reason for being absent, was removed from the dataset in preprocessing. Dropping one category as the baseline is standard practice with dummy variables, because it avoids perfect multicollinearity among them.

## Create the targets

In [3]:
# Median absence duration; used below as the cut-off for "excessive" absence
data_preprocessed.loc[:, 'Absenteeism Time in Hours'].median()

3.0

We will consider anyone who has been absent > 3 hours to be excessively absent.

In [4]:
# Label an observation 1 when its absence is strictly above the median duration, otherwise 0
absence_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [5]:
# Attach the binary targets as a new column; this modifies data_preprocessed in place
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


Note that by using the median as the cut-off, we have implicitly (roughly) balanced the dataset!

In [6]:
# share of observations labelled 1 (excessively absent)
targets.sum() / len(targets)

0.45571428571428574

In [7]:
# Remove the raw hours column now that the binary target column has been added
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])

# Checkpoint check: False means drop() produced a separate object,
# so data_preprocessed itself was left unchanged
data_with_targets is data_preprocessed

False

## Select the inputs for the regression

In [8]:
# Select the model inputs by excluding the target column by name, so the
# selection does not silently depend on the target being the last column.
unscaled_inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])
unscaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1


## Standardize the data

In [9]:
#import sklearn
from sklearn.preprocessing import StandardScaler

# create scaler object
#absenteeism_scaler = StandardScaler()

### Standardizing only the Numerical Variables

If we standardize all the inputs with a plain StandardScaler, the dummy variables get standardized as well. This is bad practice, because a standardized dummy no longer takes the values 0 and 1 and so loses its interpretability. To avoid this, we use a CustomScaler that standardizes only the columns we choose.

In [10]:
# custom scaler
from sklearn.base import BaseEstimator, TransformerMixin

# define CustomScaler class
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    A StandardScaler is fitted on ``columns`` only; every other column of the
    input frame is returned unchanged, and the original column order is kept.

    Parameters
    ----------
    columns : list of column labels
        Columns of the input DataFrame to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to the wrapped StandardScaler.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``columns`` by ``fit``.
    mean : per-column mean of ``columns`` seen in ``fit`` (None before fitting)
    var_ : per-column population variance (ddof=0) of ``columns`` seen in
        ``fit`` (None before fitting)
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # StandardScaler's options must be passed by keyword; positional use
        # is rejected by current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        # BaseEstimator.get_params() (used by repr() and clone()) looks up every
        # constructor argument as an attribute of the same name.
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Use the DataFrame reductions with an explicit axis: np.mean/np.var on a
        # DataFrame reduce over both axes (a single scalar) in pandas 2.x.
        self.mean = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized and all others untouched.

        ``y`` and ``copy`` are accepted for signature compatibility and ignored.
        """
        init_col_order = X.columns
        # Reuse X's index: without it the scaled frame gets a fresh RangeIndex and
        # the concat below misaligns rows (introducing NaNs) whenever X has a
        # non-default index, e.g. a shuffled train/test subset.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [11]:
# list every input column; the subset to scale is chosen from these in the next cell
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [12]:
# dummy columns that must keep their 0/1 values
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

# every other input column gets standardized
columns_to_scale = [
    column_name
    for column_name in unscaled_inputs.columns.values
    if column_name not in columns_to_omit
]

# scaler restricted to the non-dummy columns
absenteeism_scaler = CustomScaler(columns=columns_to_scale)



In [13]:
# learn the scaling statistics from the inputs, then apply them to the same inputs
# (fit() returns the scaler itself, so the two calls can be chained)
scaled_inputs = absenteeism_scaler.fit(unscaled_inputs).transform(unscaled_inputs)
scaled_inputs.shape

(700, 14)

In [14]:
scaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet
0,0,0,0,1,0.030796,-0.80095,1.005844,0.412816,-0.536062,-0.806331,0.767431,-0.44798,0.880469,0.268487
1,0,0,0,0,0.030796,-0.80095,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.44798,-0.01928,-0.58969
2,0,0,0,1,0.030796,-0.2329,-0.654143,1.426749,0.24831,-0.806331,1.002633,-0.44798,-0.91903,-0.58969
3,1,0,0,0,0.030796,0.335149,0.854936,-1.682647,0.405184,-0.806331,-0.643782,-0.44798,0.880469,-0.58969
4,0,0,0,1,0.030796,0.335149,1.005844,0.412816,-0.536062,-0.806331,0.767431,-0.44798,0.880469,0.268487


## Split the data into train & test and shuffle

In [15]:
# import module
from sklearn.model_selection import train_test_split

# 80% of the rows go to training, the rest to testing;
# rows are shuffled (the default) with a fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

In [16]:
# sanity check: train/test shapes of the inputs and of the targets
print("x: ", *(part.shape for part in (x_train, x_test)))
print("y: ", *(part.shape for part in (y_train, y_test)))

x:  (560, 14) (140, 14)
y:  (560,) (140,)


## Logistic Regression

In [17]:
# import libraries and modules
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [18]:
# estimator with scikit-learn's default settings
reg = LogisticRegression()

# train on the training split; fit() returns the estimator, which the cell displays
reg.fit(X=x_train, y=y_train)

LogisticRegression()

In [19]:
# accuracy of the fitted model on the training split
reg.score(X=x_train, y=y_train)

0.7660714285714286

### Manually check the accuracy

In [20]:
# predicted class (0 or 1) for every training row
model_outputs = reg.predict(X=x_train)
model_outputs

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [21]:
# true training labels, shown for comparison with model_outputs above
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [22]:
# element-wise agreement between the predictions and the true labels
train_results = np.equal(model_outputs, y_train)
train_results

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True, False,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [23]:
# numerator: number of correct predictions (True counts as 1)
train_results.sum()

429

In [24]:
# denominator: total number of predictions
len(model_outputs)

560

In [25]:
# accuracy = correct predictions / all predictions
train_results.sum() / len(model_outputs)

0.7660714285714286

### Create a Summary Table

In [26]:
# intercept (bias term) of the fitted logistic regression
reg.intercept_

array([-1.65467097])

In [27]:
# fitted coefficients, one per input column, in the column order of x_train
reg.coef_

array([[ 2.78862535,  0.92769133,  3.10680325,  0.81612674,  0.013678  ,
        -0.07597497,  0.62827389, -0.03204259, -0.17407337, -0.02362769,
         0.27897256, -0.10925701,  0.35896925, -0.27524358]])

In [28]:
# confirm that the custom scaler returned a DataFrame
type(scaled_inputs)

pandas.core.frame.DataFrame

In [29]:
# the unscaled inputs are a DataFrame too, so column names can be read from them
type(unscaled_inputs)

pandas.core.frame.DataFrame

In [30]:
# column labels of the inputs as a NumPy array
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [31]:
# names of the input features, in column order
feature_name = unscaled_inputs.columns.to_numpy()

In [32]:
# one row per input feature, paired with its fitted coefficient
summary_table = pd.DataFrame({'Independent Variable': feature_name})
# reg.coef_ has a single row for this binary problem; take it as a flat vector
summary_table['Coefficient'] = reg.coef_[0]
summary_table

Unnamed: 0,Independent Variable,Coefficient
0,Reason_1,2.788625
1,Reason_2,0.927691
2,Reason_3,3.106803
3,Reason_4,0.816127
4,Month Value,0.013678
5,Day of the Week,-0.075975
6,Transportation Expense,0.628274
7,Distance to Work,-0.032043
8,Age,-0.174073
9,Daily Work Load Average,-0.023628


#### Add the intercept

In [33]:
# shift every index label up by one so that label 0 is free for the intercept row
summary_table.index += 1
summary_table

Unnamed: 0,Independent Variable,Coefficient
1,Reason_1,2.788625
2,Reason_2,0.927691
3,Reason_3,3.106803
4,Reason_4,0.816127
5,Month Value,0.013678
6,Day of the Week,-0.075975
7,Transportation Expense,0.628274
8,Distance to Work,-0.032043
9,Age,-0.174073
10,Daily Work Load Average,-0.023628


In [34]:
# Add the intercept as the row labelled 0, then sort by index so it appears first
summary_table.loc[0] = ['Intercept', reg.intercept_[0]] # assigning to a new label with .loc appends a row
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Independent Variable,Coefficient
0,Intercept,-1.654671
1,Reason_1,2.788625
2,Reason_2,0.927691
3,Reason_3,3.106803
4,Reason_4,0.816127
5,Month Value,0.013678
6,Day of the Week,-0.075975
7,Transportation Expense,0.628274
8,Distance to Work,-0.032043
9,Age,-0.174073


#### Interpret the summary table

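All the coefficients are expressed in log-odds: each one is the change in the log-odds of excessive absenteeism for a one-unit increase in the corresponding variable.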

In [35]:
# exponentiate each log-odds coefficient to obtain its odds ratio
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])
summary_table

Unnamed: 0,Independent Variable,Coefficient,Odds_ratio
0,Intercept,-1.654671,0.191155
1,Reason_1,2.788625,16.258654
2,Reason_2,0.927691,2.528665
3,Reason_3,3.106803,22.349484
4,Reason_4,0.816127,2.261723
5,Month Value,0.013678,1.013772
6,Day of the Week,-0.075975,0.926839
7,Transportation Expense,0.628274,1.874372
8,Distance to Work,-0.032043,0.968465
9,Age,-0.174073,0.840235


In [36]:
# largest odds ratios first
summary_table.sort_values(by='Odds_ratio', ascending=False)

Unnamed: 0,Independent Variable,Coefficient,Odds_ratio
3,Reason_3,3.106803,22.349484
1,Reason_1,2.788625,16.258654
2,Reason_2,0.927691,2.528665
4,Reason_4,0.816127,2.261723
7,Transportation Expense,0.628274,1.874372
13,Children,0.358969,1.431853
11,Body Mass Index,0.278973,1.321771
5,Month Value,0.013678,1.013772
10,Daily Work Load Average,-0.023628,0.976649
8,Distance to Work,-0.032043,0.968465


For a unit change in the standardized feature, the odds increase by a multiple equal to the odds ratio (1 = no change).

For instance, the odds of excessive absence for someone with Reason_3 are about 22 times the odds for someone with no reason for being absent (i.e. 'Reason_0' = 1).

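The 'Transportation Expense' variable was standardized for higher accuracy of prediction, but in doing so, we lost direct interpretability: its odds ratio now refers to a change of one standard deviation rather than one unit of the original measure.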

## Backward Elimination

You can remove variables whose coefficients are close to 0 (odds ratio close to 1) from the model, since they barely change the prediction.

## Test the model

Testing is done only once at the end of the machine learning process.

In [37]:
# accuracy of the fitted model on the held-out test split
reg.score(X=x_test, y=y_test)

0.75

In [38]:
# per-row predicted probabilities, one column for class 0 and one for class 1
predicted_proba = reg.predict_proba(X=x_test)
predicted_proba

array([[0.75860887, 0.24139113],
       [0.62409894, 0.37590106],
       [0.45511724, 0.54488276],
       [0.77428775, 0.22571225],
       [0.07513669, 0.92486331],
       [0.28649954, 0.71350046],
       [0.29986092, 0.70013908],
       [0.11162113, 0.88837887],
       [0.75263651, 0.24736349],
       [0.75772395, 0.24227605],
       [0.49745939, 0.50254061],
       [0.18040561, 0.81959439],
       [0.06963821, 0.93036179],
       [0.68255992, 0.31744008],
       [0.27764405, 0.72235595],
       [0.50074571, 0.49925429],
       [0.51572453, 0.48427547],
       [0.55104397, 0.44895603],
       [0.38140894, 0.61859106],
       [0.05651624, 0.94348376],
       [0.73276549, 0.26723451],
       [0.76665612, 0.23334388],
       [0.44195989, 0.55804011],
       [0.45927099, 0.54072901],
       [0.22345683, 0.77654317],
       [0.7529486 , 0.2470514 ],
       [0.4927068 , 0.5072932 ],
       [0.87619335, 0.12380665],
       [0.23568562, 0.76431438],
       [0.75884681, 0.24115319],
       [0.

In [39]:
# one row per test observation, one column per class
predicted_proba.shape

(140, 2)

## Save the model

Saving the model is equivalent to saving the reg object.

In [40]:
# import library
import pickle

# use pickle to serialize the fitted model into a byte stream written to the file 'model'
with open('model', mode='wb') as model_file:
    pickle.dump(reg, model_file)

In [41]:
# use pickle to save the fitted custom scaler to the file 'scaler'
# NOTE: pickle stores classes by reference, so the CustomScaler class must be
# defined or importable wherever this file is loaded again
with open('scaler', mode='wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)