In [1]:
import pandas as pd
import numpy as np

In [2]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3


In [4]:
## create the targets

In [5]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [6]:
# The median is 3 hours, so anyone absent for more than 3 hours is classed as 'high absenteeism'. I will create targets of 0 for anyone absent for 3 hours or less and 1 for everyone else. The final task is to predict whether someone will fall into the 0 or 1 category

In [7]:
# Use the median of the absenteeism column as the cut-off (3.0 for this dataset)
# instead of a hard-coded 3, so the threshold follows the data if the file changes.
absenteeism_median = data_preprocessed['Absenteeism Time in Hours'].median()
# 1 = absent for longer than the median ("excessive"), 0 = at or below the median.
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > absenteeism_median, 1, 0)

In [8]:
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2,0
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3,1
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3,0


In [10]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'], axis=1)

In [11]:
data_with_targets.head(2)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,7,1,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,7,1,0


In [30]:
# Drop the raw date column from the modelling frame.
# errors='ignore' keeps this cell idempotent: re-running it after 'Date' has
# already been removed no longer raises a KeyError.
data_with_targets = data_with_targets.drop(columns=['Date'], errors='ignore')

In [34]:
data_with_targets.iloc[:,0:14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,1,2,0,5,2
696,1,0,0,0,225,26,28,237.656,24,0,1,2,5,2
697,1,0,0,0,330,16,28,237.656,25,1,0,0,5,3
698,0,0,0,1,235,16,32,237.656,25,1,0,0,5,3


In [35]:
# Inputs are every column except the target. Selecting by name rather than by
# the positional slice 0:14 keeps this correct if columns are added or reordered.
unscaled_inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])

In [None]:
# standardize the data - without standardizing the dummy variables

In [71]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
    
class CustomScaler(BaseEstimator,TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the rest untouched.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, only the
    columns listed in ``columns``. All other columns are passed through
    unchanged, and the original column order of the input is preserved.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to the underlying ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_ : pandas.Series or None
        Per-column mean of the scaled columns, set by ``fit``.
    var_ : pandas.Series or None
        Per-column population variance (ddof=0) of the scaled columns, set by ``fit``.
    """

    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        # Keep every constructor argument as an attribute of the same name so
        # BaseEstimator.get_params(), clone() and repr() report the real values.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Pass the options by keyword: current scikit-learn releases do not
        # accept them positionally.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Use the DataFrame reductions explicitly so the result is always
        # per-column; np.mean/np.var on a DataFrame depend on the pandas version.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for API compatibility and are not used.
        """
        init_col_order = X.columns
        # Reuse X's index: without it the scaled frame gets a fresh RangeIndex
        # and the column-wise concat below misaligns rows (producing NaNs)
        # whenever X has a non-default index, e.g. a shuffled train/test split.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [72]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value', 'Day of the Week'], dtype=object)

In [73]:
columns_to_scale = ['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index',
       'Children', 'Pets', 'Month Value', 'Day of the Week']

In [74]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [75]:
# Learn the per-column statistics of the columns listed in columns_to_scale.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set statistics influence the scaling; consider fitting
# on the training split only.
absenteeism_scaler.fit(unscaled_inputs)



CustomScaler(columns=['Transportation Expense', 'Distance to Work', 'Age',
                      'Daily Work Load Average', 'Body Mass Index', 'Children',
                      'Pets', 'Month Value', 'Day of the Week'],
             copy=None, with_mean=None, with_std=None)

In [76]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [77]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week
0,0,0,0,1,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487,0.182726,-0.683704
1,0,0,0,0,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690,0.182726,-0.683704
2,0,0,0,1,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690,0.182726,-0.007725
3,1,0,0,0,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690,0.182726,0.668253
4,0,0,0,1,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487,0.182726,0.668253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690,-0.388293,-0.007725
696,1,0,0,0,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663,-0.388293,-0.007725
697,1,0,0,0,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690,-0.388293,0.668253
698,0,0,0,1,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690,-0.388293,0.668253


In [39]:
# train test split

In [79]:
from sklearn.model_selection import train_test_split

In [80]:
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Transportation Expense  \
 550         1         0         0         0                0.387122   
 135         0         0         0         1                0.040034   
 57          0         0         0         1               -0.654143   
 360         0         0         0         1               -1.016322   
 389         0         0         0         1               -0.654143   
 ..        ...       ...       ...       ...                     ...   
 425         1         0         0         0                0.190942   
 483         0         0         0         1               -1.574681   
 229         1         0         0         0                0.190942   
 471         1         0         0         0                0.040034   
 369         0         0         0         1               -1.574681   
 
      Distance to Work       Age  Daily Work Load Average  Body Mass Index  \
 550         -0.330735  1.660180                -0.08208

In [81]:
# 80/20 train/test split; the fixed random_state makes the shuffle reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)

In [82]:
print (x_train.shape, y_train.shape)

(560, 14) (560,)


In [83]:
print (x_test.shape, y_test.shape)

(140, 14) (140,)


In [84]:
#Logistic regression - sklearn

In [85]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [86]:
# training

In [87]:
reg = LogisticRegression()

In [88]:
reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [89]:
reg.score(x_train, y_train)

0.775

In [90]:
# this shows that the model correctly classified about 78% (0.775) of the training observations

In [91]:
# intercept and coefficients

In [92]:
reg.intercept_

array([-1.6561092])

In [93]:
reg.coef_

array([[ 2.80096498e+00,  9.34857518e-01,  3.09561645e+00,
         8.56587468e-01,  6.12732578e-01, -7.79685996e-03,
        -1.65922708e-01, -1.47005122e-04,  2.71811477e-01,
        -2.05738037e-01,  3.61989880e-01, -2.85510745e-01,
         1.66248119e-01, -8.43703301e-02]])

In [94]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value', 'Day of the Week'], dtype=object)

In [95]:
feature_name = unscaled_inputs.columns.values

In [96]:
# Pair each input column name with its fitted coefficient. reg.coef_ holds one
# row of coefficients, so its transpose supplies one value per feature row.
summary_table = pd.DataFrame(data=feature_name, columns=['Feature name']).assign(
    Coefficient=reg.coef_.T
)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.800965
1,Reason_2,0.934858
2,Reason_3,3.095616
3,Reason_4,0.856587
4,Transportation Expense,0.612733
5,Distance to Work,-0.007797
6,Age,-0.165923
7,Daily Work Load Average,-0.000147
8,Body Mass Index,0.271811
9,Education,-0.205738


In [97]:
# Shift the existing rows down by one so that index 0 is free for the intercept.
summary_table.index = summary_table.index + 1
# Add the intercept as row 0 (reg.intercept_ is a one-element array here).
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
# Sorting by index moves the newly added row to the top of the table.
summary_table = summary_table.sort_index()
# NOTE: this cell is not idempotent - re-running it adds another 'Intercept' row.
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.656109
1,Reason_1,2.800965
2,Reason_2,0.934858
3,Reason_3,3.095616
4,Reason_4,0.856587
5,Transportation Expense,0.612733
6,Distance to Work,-0.007797
7,Age,-0.165923
8,Daily Work Load Average,-0.000147
9,Body Mass Index,0.271811


In [98]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [99]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.656109,0.19088
1,Reason_1,2.800965,16.460523
2,Reason_2,0.934858,2.546851
3,Reason_3,3.095616,22.100858
4,Reason_4,0.856587,2.35511
5,Transportation Expense,0.612733,1.845467
6,Distance to Work,-0.007797,0.992233
7,Age,-0.165923,0.847112
8,Daily Work Load Average,-0.000147,0.999853
9,Body Mass Index,0.271811,1.31234


In [100]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,3.095616,22.100858
1,Reason_1,2.800965,16.460523
2,Reason_2,0.934858,2.546851
4,Reason_4,0.856587,2.35511
5,Transportation Expense,0.612733,1.845467
11,Children,0.36199,1.436184
9,Body Mass Index,0.271811,1.31234
13,Month Value,0.166248,1.180866
8,Daily Work Load Average,-0.000147,0.999853
6,Distance to Work,-0.007797,0.992233


In [68]:
# a variable/feature is not important to the model (has little predictive power) if its coefficient is around 0 and its odds ratio is around 1

In [101]:
# These results show that if a person reports being sick for reason 3, their odds of being excessively absent from work are about 22 times higher (compared to the baseline of Reason_0)

In [102]:
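# Transportation expense is one of the non-dummy variables in the model. Because it was standardized, its odds ratio of about 1.85 means that a one-standard-deviation increase in transportation expense (not a 1-unit increase in the raw value) multiplies the odds of excessive absence by roughly 1.85, i.e. almost doubles them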

In [103]:
# Standardized values tend to produce more accurate models. However, they are less interpretable and don't fully explain the reasons behind the figures

In [107]:
# Could use backward elimination to remove - daily work load, distance to work and day of week

In [None]:
# testing the model

In [105]:
reg.score(x_test, y_test)

0.7428571428571429

In [106]:
 # this shows that based on data that the model has never seen before, it correctly predicts 74% of new observations 

In [108]:
# save the model

In [109]:
import pickle

In [110]:
# Serialize the fitted LogisticRegression to a file named 'model' in the working directory.
# Only unpickle this file from a trusted source: loading a pickle can execute arbitrary code.
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [111]:
# Serialize the fitted CustomScaler to a file named 'scaler' in the working directory.
# pickle stores the class by reference, so the CustomScaler definition must be
# available in whatever process later loads this file.
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler, file)