# Logistic regression to predict excessive absenteeism (above-median hours absent)

# Importing Libraries

In [3]:
import numpy as np
import pandas as pd

# Loading Data

In [5]:
# Load the preprocessed absenteeism data; the path is relative to the notebook's working directory
pp_data = pd.read_csv('Absenteeism_preprocessed.csv')

In [6]:
pp_data.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Values,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## The median absenteeism time is 3 hours: rows above the median are labelled 1 (excessive absenteeism), all others 0

In [8]:
# Binary target: 1 when the hours absent are strictly above the median, else 0
hours_absent = pp_data['Absenteeism Time in Hours']
median_hours = hours_absent.median()
targets = np.where(hours_absent > median_hours, 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [9]:
# Store the binary target next to the features (adds a column to pp_data in place)
pp_data['Excessive Absenteeism'] = targets

In [10]:
targets.sum() / len(targets)

0.45571428571428574

In [11]:
# Remove the raw hours column (now replaced by the binary target) together with
# three predictors that are left out of the model inputs
columns_to_drop = ['Absenteeism Time in Hours','Daily Work Load Average','Distance to Work','Day of Week']
target_data_pp = pp_data.drop(columns=columns_to_drop)
target_data_pp.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Values,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [12]:
# Every column except the last one (the 'Excessive Absenteeism' target) is a model input
unscaled_inputs = target_data_pp.iloc[:,:-1]
unscaled_inputs.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Values,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1


# Standardizing the data

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# Plain StandardScaler instance: standardizes every column it is given
scaler = StandardScaler()
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the rest untouched.

    Wraps a ``StandardScaler`` that is fitted on ``columns`` only, so that
    columns outside that list (for example 0/1 dummy variables) pass through
    ``transform`` with their original values.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to the inner ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The inner scaler, fitted on ``columns`` by ``fit``.
    mean_ : pandas.Series or None
        Per-column mean of ``columns`` seen in ``fit`` (``None`` before fitting).
    var_ : pandas.Series or None
        Per-column population variance (ddof=0) of ``columns`` seen in ``fit``
        (``None`` before fitting).
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # BaseEstimator.get_params() (used by clone() and by the estimator repr)
        # looks up every constructor argument as an attribute of the same name,
        # so each argument has to be stored, not only `columns`.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the inner scaler on ``X[self.columns]`` and return ``self``.

        ``X`` must be a DataFrame containing every name in ``self.columns``.
        ``y`` is passed through to the inner scaler's ``fit``.
        """
        # Rebuild the inner scaler from the current parameters so that values
        # changed through set_params() after construction are honoured.
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean, with_std=self.with_std)
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions: np.mean/np.var applied to a DataFrame
        # give different results depending on the pandas version.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a DataFrame like ``X`` with ``self.columns`` standardized.

        The column order and the row index of ``X`` are preserved. ``y`` and
        ``copy`` are accepted for signature compatibility and are not used.
        """
        # record the initial order of the columns
        init_col_order = X.columns

        # Scale the selected columns. Reuse X's index: without it the scaled
        # frame gets a fresh 0..n-1 index and the concat below misaligns rows
        # (producing NaNs) whenever X has any other index, e.g. after a split.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # everything that was not selected for scaling
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # stitch both parts together and restore the original column order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [15]:
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month Values',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [16]:
# The columns listed in columns_to_omit are left unscaled;
# every other input column is standardized.
columns_to_omit = ['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Education']
columns_to_scale = [
    column_name
    for column_name in unscaled_inputs.columns.values
    if column_name not in columns_to_omit
]

In [17]:
# Scaler that standardizes only the columns named in columns_to_scale
absenteeism_scaler = CustomScaler(columns_to_scale)

In [18]:
scaler.fit(unscaled_inputs)

In [19]:
scaled_inputs = scaler.transform(unscaled_inputs)

In [20]:
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035,  0.26848661]])

### Splitting data for training and testing

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Keep 80% of the rows for training and hold out the rest for testing;
# random_state fixes the shuffle so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)

In [24]:
x_train.shape

(560, 11)

In [25]:
x_test.shape

(140, 11)

In [26]:
y_train.shape

(560,)

In [27]:
y_test.shape

(140,)

## Modeling

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [30]:
reg = LogisticRegression()

In [31]:
reg.fit(x_train,y_train)

In [32]:
reg.score(x_train, y_train)

0.7839285714285714

### The same accuracy can be computed manually: predict on x_train, store the predictions in a variable, and compare them with the true labels in y_train.

In [34]:
reg.intercept_

array([-0.22415067])

In [35]:
reg.coef_

array([[ 2.07060602,  0.33597918,  1.56203203,  1.31392372,  0.18468965,
         0.69080918, -0.19858992,  0.32509576, -0.12584894,  0.37082428,
        -0.32495752]])

In [36]:
# save the names of the columns in an ad-hoc variable
feature_name = unscaled_inputs.columns.values

In [37]:
# Build a vertical table with one row per feature: its name and fitted coefficient.
# reg.coef_ has shape (1, n_features), so it is transposed to line up with the rows.
coefficient_column = np.transpose(reg.coef_)

summary_table = pd.DataFrame(data=feature_name, columns=['Feature name'])
summary_table['Coefficient'] = coefficient_column

# show the table
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason 1,2.070606
1,Reason 2,0.335979
2,Reason 3,1.562032
3,Reason 4,1.313924
4,Month Values,0.18469
5,Transportation Expense,0.690809
6,Age,-0.19859
7,Body Mass Index,0.325096
8,Education,-0.125849
9,Children,0.370824


In [38]:
# Put the intercept in the first row (index 0) of the summary table.
# The table is rebuilt with concat instead of shifting the index in place, so
# re-running this cell neither shifts the index again nor adds a second
# 'Intercept' row.
intercept_row = pd.DataFrame({'Feature name': ['Intercept'], 'Coefficient': [reg.intercept_[0]]})

# drop any intercept row left over from a previous run of this cell
feature_rows = summary_table[summary_table['Feature name'] != 'Intercept']

# intercept first, features after it, renumbered 0..n
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.224151
1,Reason 1,2.070606
2,Reason 2,0.335979
3,Reason 3,1.562032
4,Reason 4,1.313924
5,Month Values,0.18469
6,Transportation Expense,0.690809
7,Age,-0.19859
8,Body Mass Index,0.325096
9,Education,-0.125849


## Interpreting the coefficients

In [40]:
# Add an 'Odds_ratio' column: exp(coefficient) is the multiplicative change in the
# odds for a one-unit increase of the input (one standard deviation for a
# standardized input)
coefficients = summary_table['Coefficient']
summary_table['Odds_ratio'] = np.exp(coefficients)

In [41]:
# display the df
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.224151,0.799195
1,Reason 1,2.070606,7.929627
2,Reason 2,0.335979,1.39931
3,Reason 3,1.562032,4.768501
4,Reason 4,1.313924,3.720744
5,Month Values,0.18469,1.202845
6,Transportation Expense,0.690809,1.995329
7,Age,-0.19859,0.819886
8,Body Mass Index,0.325096,1.384163
9,Education,-0.125849,0.881748


In [42]:
# show the table sorted by odds ratio, largest first
# (sort_values sorts ascending by default, hence ascending=False);
# the result is only displayed - summary_table itself is not reordered
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
1,Reason 1,2.070606,7.929627
3,Reason 3,1.562032,4.768501
4,Reason 4,1.313924,3.720744
6,Transportation Expense,0.690809,1.995329
10,Children,0.370824,1.448928
2,Reason 2,0.335979,1.39931
8,Body Mass Index,0.325096,1.384163
5,Month Values,0.18469,1.202845
9,Education,-0.125849,0.881748
7,Age,-0.19859,0.819886


# Testing the Model

In [79]:
reg.score(x_test, y_test)

0.7357142857142858

In [95]:
predicted_probability = reg.predict_proba(x_test)
predicted_probability

array([[0.70811655, 0.29188345],
       [0.5721039 , 0.4278961 ],
       [0.3987536 , 0.6012464 ],
       [0.78719503, 0.21280497],
       [0.0668946 , 0.9331054 ],
       [0.3111248 , 0.6888752 ],
       [0.28565469, 0.71434531],
       [0.08108162, 0.91891838],
       [0.79928258, 0.20071742],
       [0.74973079, 0.25026921],
       [0.46768065, 0.53231935],
       [0.18458351, 0.81541649],
       [0.04095243, 0.95904757],
       [0.7555815 , 0.2444185 ],
       [0.23782308, 0.76217692],
       [0.54134575, 0.45865425],
       [0.53349868, 0.46650132],
       [0.520355  , 0.479645  ],
       [0.40543527, 0.59456473],
       [0.02764621, 0.97235379],
       [0.7015122 , 0.2984878 ],
       [0.78719503, 0.21280497],
       [0.40550972, 0.59449028],
       [0.40550972, 0.59449028],
       [0.17426979, 0.82573021],
       [0.75364779, 0.24635221],
       [0.48844947, 0.51155053],
       [0.88051837, 0.11948163],
       [0.13089896, 0.86910104],
       [0.78719503, 0.21280497],
       [0.

In [93]:
predicted_probability[:,1]

array([0.29188345, 0.4278961 , 0.6012464 , 0.21280497, 0.9331054 ,
       0.6888752 , 0.71434531, 0.91891838, 0.20071742, 0.25026921,
       0.53231935, 0.81541649, 0.95904757, 0.2444185 , 0.76217692,
       0.45865425, 0.46650132, 0.479645  , 0.59456473, 0.97235379,
       0.2984878 , 0.21280497, 0.59449028, 0.59449028, 0.82573021,
       0.24635221, 0.51155053, 0.11948163, 0.86910104, 0.21280497,
       0.37722038, 0.69563785, 0.71101679, 0.55846122, 0.21280497,
       0.56307123, 0.20931063, 0.80979261, 0.41300737, 0.6300506 ,
       0.20410533, 0.42159529, 0.22729926, 0.10466249, 0.85724567,
       0.65408304, 0.70636669, 0.29188345, 0.21168449, 0.19567294,
       0.56557265, 0.07759079, 0.6888752 , 0.26925275, 0.85966686,
       0.45388672, 0.93036835, 0.21790894, 0.08520077, 0.0894016 ,
       0.70577652, 0.67746261, 0.28724979, 0.86048417, 0.1899959 ,
       0.27057335, 0.01399431, 0.20931063, 0.81221789, 0.28543027,
       0.20931063, 0.06838131, 0.92866479, 0.47804084, 0.63876

In [97]:
import pickle

In [99]:
# Serialize the fitted logistic regression to the file 'model' in the working directory
with open('model','wb') as file:
    pickle.dump(reg,file)

In [101]:
# Serialize the scaler object to the file 'scaler' in the working directory.
# NOTE(review): confirm this is the same fitted scaler instance that produced
# scaled_inputs, otherwise new data cannot be preprocessed like the training data.
with open('scaler','wb') as file:
    pickle.dump(absenteeism_scaler, file)