# Create a logistic regression to predict absenteeism

### Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd

### Load the data

In [2]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
data_preprocessed.head(20)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3
5,0,0,0,1,2015-10-07,179,51,38,239.554,31,0,0,0,2,10,2
6,0,0,1,0,2015-07-17,361,52,28,239.554,27,0,1,4,8,7,4
7,0,0,0,1,2015-07-24,260,50,36,239.554,23,0,4,0,4,7,4
8,0,0,1,0,2015-06-07,155,12,34,239.554,25,0,2,0,40,6,6
9,0,0,1,0,2015-07-13,235,11,37,239.554,29,1,1,1,8,7,0


### Create the targets

In [3]:
data_preprocessed['Absenteeism Time in Hours'].median()
# those that went above 3 hours suggests theyre excessively absent

3.0

In [4]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 3, 1, 0)

In [5]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2,0
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3,1
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3,0


In [7]:
targets.sum() / targets.shape[0]
# About 46% of targets are 1s

0.45571428571428574

In [8]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Date', 'Day of the Week',
                                            'Daily Work Load Average', 'Distance to Work'], axis=1)
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value,Excessive Absenteeism
0,0,0,0,1,289,33,30,0,2,1,7,1
1,0,0,0,0,118,50,31,0,1,0,7,0
2,0,0,0,1,179,38,31,0,0,0,7,0
3,1,0,0,0,279,39,24,0,2,0,7,1
4,0,0,0,1,289,33,30,0,2,1,7,0


In [9]:
# check if these two datasets are the same (should be false)
data_with_targets is data_preprocessed

False

### Select the inputs for regression

In [10]:
data_with_targets.shape

(700, 12)

In [11]:
data_with_targets.iloc[:,0:14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value,Excessive Absenteeism
0,0,0,0,1,289,33,30,0,2,1,7,1
1,0,0,0,0,118,50,31,0,1,0,7,0
2,0,0,0,1,179,38,31,0,0,0,7,0
3,1,0,0,0,279,39,24,0,2,0,7,1
4,0,0,0,1,289,33,30,0,2,1,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,40,22,1,2,0,5,1
696,1,0,0,0,225,28,24,0,1,2,5,0
697,1,0,0,0,330,28,25,1,0,0,5,1
698,0,0,0,1,235,32,25,1,0,0,5,0


In [12]:
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,0,0,0,1,289,33,30,0,2,1,7
1,0,0,0,0,118,50,31,0,1,0,7
2,0,0,0,1,179,38,31,0,0,0,7
3,1,0,0,0,279,39,24,0,2,0,7
4,0,0,0,1,289,33,30,0,2,1,7
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,40,22,1,2,0,5
696,1,0,0,0,225,28,24,0,1,2,5
697,1,0,0,0,330,28,25,1,0,0,5
698,0,0,0,1,235,32,25,1,0,0,5


In [13]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

### Standardize the data

In [14]:
#from sklearn.preprocessing import StandardScaler

#absenteeism_scaler = StandardScaler()

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns],y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [15]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value'], dtype=object)

In [16]:
# columns_to_scale = ['Month Value', 'Day of the Week', 'Transportation Expense', 'Distance to Work', 'Age',
# 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets', ]

columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [17]:
absenteeism_scaler = CustomScaler(columns_to_scale)



In [18]:
#calculate and store mean and standard deviation
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Transportation Expense', 'Age', 'Body Mass Index',
                      'Children', 'Pets', 'Month Value'],
             copy=None, with_mean=None, with_std=None)

In [19]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,0,0,0,1,1.005844,-0.536062,0.767431,0,0.880469,0.268487,0.030796
1,0,0,0,0,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690,0.030796
2,0,0,0,1,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690,0.030796
3,1,0,0,0,0.854936,0.405184,-0.643782,0,0.880469,-0.589690,0.030796
4,0,0,0,1,1.005844,-0.536062,0.767431,0,0.880469,0.268487,0.030796
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690,-0.568019
696,1,0,0,0,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663,-0.568019
697,1,0,0,0,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690,-0.568019
698,0,0,0,1,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690,-0.568019


In [20]:
scaled_inputs.shape

(700, 11)

## Split the data into train and test and shuffle

### Import the relevant modules

In [21]:
from sklearn.model_selection import train_test_split

### Split

In [22]:
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Transportation Expense       Age  \
 186         0         0         0         1               -1.016322 -0.379188   
 449         0         0         0         1                0.356940  0.718933   
 281         0         0         0         1                1.036026  0.562059   
 318         0         0         0         1                0.387122  1.660180   
 354         0         0         0         1                0.568211 -0.065439   
 ..        ...       ...       ...       ...                     ...       ...   
 300         0         0         0         0                0.190942  1.032682   
 412         0         0         0         1               -0.654143  0.248310   
 152         0         0         0         1                1.624567 -1.320435   
 656         0         0         1         0                2.092381 -1.320435   
 418         0         0         0         1               -0.654143  0.248310   
 
      Body Mas

In [23]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size= 0.8, random_state=20)
print("\n x_train, y_train shape:", x_train.shape, y_train.shape)
print("\n x_test, y_test shape:", x_test.shape, y_test.shape)


 x_train, y_train shape: (560, 11) (560,)

 x_test, y_test shape: (140, 11) (140,)


The train input contain 560 observations along 14 features, while the targets are a vector of 560. 
The test input contain 140 observations 14 input variable and 1 target variable.
The split is 80% of the observations help us with training and 20% will serve for testing.

## Logistic Regression with sklearn

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [25]:
reg = LogisticRegression()
reg.fit(x_train, y_train)

LogisticRegression()

In [26]:
# evaluate the accuracy of the model
reg.score(x_train, y_train)

0.7910714285714285

### Manually check the accuracy

In [27]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [28]:
# check the total number of correct predictions
print(np.sum(model_outputs == y_train))
# accuracy
print(np.sum(model_outputs == y_train) / model_outputs.shape[0])

443
0.7910714285714285


We get the same result as we did using the sklearn method 'score'

### Find the intercept and coefficients

In [29]:
reg.intercept_

array([-1.53914505])

In [30]:
reg.coef_

array([[ 2.70473146,  0.93004987,  3.34447163,  0.55576017,  0.56911805,
        -0.16658704,  0.26732643, -0.40768235,  0.31718133, -0.41291177,
         0.0552093 ]])

In [31]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value'], dtype=object)

In [32]:
feature_name = unscaled_inputs.columns.values
summary_table = pd.DataFrame(columns=['Feature name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.704731
1,Reason_2,0.93005
2,Reason_3,3.344472
3,Reason_4,0.55576
4,Transportation Expense,0.569118
5,Age,-0.166587
6,Body Mass Index,0.267326
7,Education,-0.407682
8,Children,0.317181
9,Pets,-0.412912


In [33]:
summary_table.index = summary_table.index+1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.539145
1,Reason_1,2.704731
2,Reason_2,0.93005
3,Reason_3,3.344472
4,Reason_4,0.55576
5,Transportation Expense,0.569118
6,Age,-0.166587
7,Body Mass Index,0.267326
8,Education,-0.407682
9,Children,0.317181


### Interpreting the coefficients

In [34]:
summary_table['Odds Ratio'] = np.exp(summary_table.Coefficient)
summary_table

Unnamed: 0,Feature name,Coefficient,Odds Ratio
0,Intercept,-1.539145,0.214564
1,Reason_1,2.704731,14.950301
2,Reason_2,0.93005,2.534636
3,Reason_3,3.344472,28.345595
4,Reason_4,0.55576,1.743266
5,Transportation Expense,0.569118,1.766708
6,Age,-0.166587,0.846549
7,Body Mass Index,0.267326,1.306467
8,Education,-0.407682,0.66519
9,Children,0.317181,1.373252


In [35]:
summary_table.sort_values('Odds Ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds Ratio
3,Reason_3,3.344472,28.345595
1,Reason_1,2.704731,14.950301
2,Reason_2,0.93005,2.534636
5,Transportation Expense,0.569118,1.766708
4,Reason_4,0.55576,1.743266
9,Children,0.317181,1.373252
7,Body Mass Index,0.267326,1.306467
11,Month Value,0.055209,1.056762
6,Age,-0.166587,0.846549
8,Education,-0.407682,0.66519


If the coefficient is around 0 (or if its odds ratio is around 1) then the feature is not particularly important. This is because a weight of 0 implies that no matter the feature value, we will multiply it by 0 in the model. Looking at our table, 'Daily Work Load Average', 'Distance to Work', 'Day of the Week' meets the criteria and should be removed from the model, because the result will be likely the same. 

## Test the model

In [36]:
reg.score(x_test, y_test)

0.7714285714285715

Based on the data that the model has never seen before, in 77% of the cases, the model will predict corectly if the person
is going to be excessivley absent. Often the test accuracy is 10-20% lower than the train accuracy (due to overfitting)

In [37]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.75504872, 0.24495128],
       [0.66347884, 0.33652116],
       [0.50834728, 0.49165272],
       [0.77581343, 0.22418657],
       [0.08626494, 0.91373506],
       [0.28750394, 0.71249606],
       [0.3373738 , 0.6626262 ],
       [0.09970528, 0.90029472],
       [0.79936995, 0.20063005],
       [0.76410405, 0.23589595],
       [0.11033109, 0.88966891],
       [0.01876862, 0.98123138],
       [0.04972388, 0.95027612],
       [0.18969968, 0.81030032],
       [0.24433805, 0.75566195],
       [0.59918319, 0.40081681],
       [0.66481313, 0.33518687],
       [0.11699049, 0.88300951],
       [0.37617128, 0.62382872],
       [0.05093223, 0.94906777],
       [0.78855561, 0.21144439],
       [0.77581343, 0.22418657],
       [0.42867256, 0.57132744],
       [0.41256074, 0.58743926],
       [0.20476738, 0.79523262],
       [0.80200789, 0.19799211],
       [0.55279374, 0.44720626],
       [0.89205526, 0.10794474],
       [0.16611734, 0.83388266],
       [0.77581343, 0.22418657],
       [0.

In [38]:
predicted_proba[:,1]

array([0.24495128, 0.33652116, 0.49165272, 0.22418657, 0.91373506,
       0.71249606, 0.6626262 , 0.90029472, 0.20063005, 0.23589595,
       0.88966891, 0.98123138, 0.95027612, 0.81030032, 0.75566195,
       0.40081681, 0.33518687, 0.88300951, 0.62382872, 0.94906777,
       0.21144439, 0.22418657, 0.57132744, 0.58743926, 0.79523262,
       0.19799211, 0.44720626, 0.10794474, 0.83388266, 0.22418657,
       0.88531509, 0.66829636, 0.73560486, 0.89287247, 0.22418657,
       0.94906777, 0.20329423, 0.84187711, 0.3801227 , 0.55259129,
       0.22132466, 0.4927361 , 0.19279482, 0.41468911, 0.76915347,
       0.71646814, 0.73095263, 0.229989  , 0.24384692, 0.21848899,
       0.52577288, 0.29075856, 0.71249606, 0.22893164, 0.81307796,
       0.34394208, 0.89444335, 0.23757149, 0.34927841, 0.35304471,
       0.68856376, 0.72254642, 0.24667037, 0.80836334, 0.19657632,
       0.24190694, 0.09965072, 0.20598467, 0.8353389 , 0.82025603,
       0.18770202, 0.32675944, 0.89398507, 0.37236421, 0.51753

## Save the model

In [39]:
import pickle

In [40]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [41]:
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)