# Machine learning on Absenteeism data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Show every column and every row when a DataFrame is displayed (no truncation)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Load the preprocessed absenteeism data (path is relative to the working directory)
data_processed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
data_processed['Absenteeism Time in Hours'].median()

3.0

In [4]:
# Binary target built with np.where: 1 when the absence is strictly longer than the
# median number of hours, 0 otherwise (absences equal to the median map to 0)
targets = np.where(
    data_processed['Absenteeism Time in Hours'].gt(
        data_processed['Absenteeism Time in Hours'].median()
    ),
    1,
    0,
)
data_processed['Excessive Absenteeism'] = targets

In [5]:
data_processed.columns

Index(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date', 'Age',
       'Transportation Expense', 'Distance to Work', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Absenteeism Time in Hours', 'Month Value', 'Day of the week',
       'Excessive Absenteeism'],
      dtype='object')

In [48]:
# Drop the raw hours column (the target is now 'Excessive Absenteeism') together with
# the other columns that are not used as model inputs.
# DataFrame.drop returns a new frame, so data_processed is left untouched and no
# explicit .copy() is needed beforehand.
data_with_targets = data_processed.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Day of the week',
        'Daily Work Load Average',
        'Distance to Work',
        'Education',
    ]
)




In [49]:
# Desired column order (a list of labels, not a DataFrame): reason dummies first,
# the remaining inputs next, and the target column last
data_with_targets_arranged = [
    'Reason_1',
    'Reason_2',
    'Reason_3',
    'Reason_4',
    'Month Value',
    'Transportation Expense',
    'Age',
    'Body Mass Index',
    'Children',
    'Pets',
    'Excessive Absenteeism',
]

In [50]:
# Reorder the columns as listed in data_with_targets_arranged, then preview the first five rows
data_with_targets = data_with_targets.loc[:, data_with_targets_arranged]

data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,2,1,1
1,0,0,0,0,7,118,50,31,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,2,0,1
4,0,0,0,1,7,289,33,30,2,1,0


In [51]:
# `is` tests object identity: it is True only when both names refer to the very same object
data_with_targets is data_processed


False

In [52]:
# Share of 1s among all targets (shows how balanced the two classes are)
targets.sum()/targets.shape[0]

0.45571428571428574

In [53]:
data_with_targets.shape

(700, 11)

In [54]:
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Children,Pets
0,0,0,0,1,7,289,33,30,2,1
1,0,0,0,0,7,118,50,31,1,0
2,0,0,0,1,7,179,38,31,0,0
3,1,0,0,0,7,279,39,24,2,0
4,0,0,0,1,7,289,33,30,2,1
5,0,0,0,1,7,179,38,31,0,0
6,0,0,0,1,7,361,28,27,1,4
7,0,0,0,1,7,260,36,23,4,0
8,0,0,1,0,7,155,34,25,2,0
9,0,0,0,1,7,235,37,29,1,1


In [55]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

# Standardize the data

# absenteeism_scaler will be used to subtract the mean and divide by the standard deviation, feature by feature (column-wise)

In [56]:
#from sklearn.preprocessing import StandardScaler

#absenteeism_scaler = StandardScaler() 


In [57]:
unscaled_inputs.columns

Index(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Children', 'Pets'],
      dtype='object')

In [16]:
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is actually the StandardScaler module, 
# so you can imagine that the Custom Scaler is built on it

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of DataFrame columns.

    A StandardScaler is fitted on, and applied to, the columns named in
    ``columns``; every other column is passed through unchanged and the
    original column order is restored in the output.

    Parameters
    ----------
    columns : list
        Labels of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to the underlying StandardScaler.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the scaled
        columns, set by ``fit``; ``None`` before fitting.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store every constructor argument under its own name: BaseEstimator
        # reads them back from attributes for get_params()/repr/clone. Without
        # this the repr reports copy=None, with_mean=None, with_std=None.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

        # The scaler is a plain StandardScaler. Its options are passed by
        # keyword because recent scikit-learn releases reject positional ones.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``.

        ``y`` is accepted for pipeline compatibility and handed to the
        wrapped scaler's ``fit``.
        """
        # Re-sync the wrapped scaler so values changed through set_params()
        # on this object take effect at fit time.
        self.scaler.set_params(
            copy=self.copy, with_mean=self.with_mean, with_std=self.with_std
        )

        selected = X[self.columns]
        self.scaler.fit(selected, y)

        # Explicit column-wise statistics; ddof=0 is the population variance
        # that np.var uses. Avoids relying on how np.mean/np.var dispatch on a
        # whole DataFrame, which differs between pandas versions.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized.

        The result is a DataFrame with the same index and the same column
        order as ``X``. ``y`` is ignored; ``copy`` is forwarded to the wrapped
        scaler's ``transform``.
        """
        # record the initial order of the columns
        init_col_order = X.columns

        # Scale the selected columns. Reusing X.index is essential: without it
        # the scaled frame gets a fresh 0..n-1 index and pd.concat would
        # misalign rows (introducing NaNs) whenever X has any other index,
        # e.g. a shuffled or filtered frame.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns], copy=copy),
            columns=self.columns,
            index=X.index,
        )

        # everything that was not selected for scaling is passed through as is
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # put both parts side by side and restore the original column order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [58]:
# Columns handed to the scaler; the Reason_* dummy columns are not listed here
columns_to_scale = [
    'Month Value',
    'Transportation Expense',
    'Age',
    'Body Mass Index',
    'Children',
    'Pets',
]

In [59]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [60]:
absenteeism_scaler.fit(unscaled_inputs)

CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [61]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [62]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,-0.01928,-0.58969
2,0,0,0,1,0.182726,-0.654143,0.24831,1.002633,-0.91903,-0.58969
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0.880469,-0.58969
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0.880469,0.268487
5,0,0,0,1,0.182726,-0.654143,0.24831,1.002633,-0.91903,-0.58969
6,0,0,0,1,0.182726,2.092381,-1.320435,0.061825,-0.01928,2.843016
7,0,0,0,1,0.182726,0.568211,-0.065439,-0.878984,2.679969,-0.58969
8,0,0,1,0,0.182726,-1.016322,-0.379188,-0.40858,0.880469,-0.58969
9,0,0,0,1,0.182726,0.190942,0.091435,0.532229,-0.01928,0.268487


In [63]:
scaled_inputs.shape

(700, 10)

In [64]:
# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split

In [65]:
train_test_split(scaled_inputs,targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 59          0         0         0         1     0.753746   
 443         0         0         0         1    -0.102784   
 689         0         0         0         1    -0.388293   
 337         0         0         0         0     1.324766   
 6           0         0         0         1     0.182726   
 317         1         0         0         0     1.324766   
 586         0         0         0         1    -1.244823   
 365         0         0         0         1    -1.530333   
 445         0         0         0         1    -0.102784   
 166         0         0         0         1    -0.959313   
 521         0         0         0         0     1.039256   
 190         0         0         0         1    -0.673803   
 496         0         0         0         1     0.753746   
 572         0         0         0         1    -1.530333   
 542         1         0         0         0     1.324766   
 607         0         0

In [66]:
# Keep 80% of the rows for training and hold out the rest for testing.
# Shuffling is on by default; fixing random_state makes the shuffle, and therefore
# the reported accuracy, reproducible when the notebook is rerun.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.80,
    random_state=20,
)

In [67]:
print(x_train.shape,y_train.shape)

(560, 10) (560,)


In [68]:
print(x_test.shape,y_test.shape)

(140, 10) (140,)


In [69]:
##Logistic Regression on Absenteeism Data

In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [71]:
#Training the model

In [72]:
reg = LogisticRegression()

In [73]:
reg.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [74]:
reg.score(x_train,y_train)

0.7446428571428572

In [75]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [76]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [77]:
model_outputs == y_train #true is match

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False, False,  True,  True,
       False,  True, False,  True,  True,  True,  True, False,  True,
        True, False, False, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [78]:
np.sum(model_outputs == y_train)

417

In [79]:
# Manual accuracy: matches / total, i.e. the mean of the boolean "prediction == target" array
np.mean(model_outputs == y_train)

0.7446428571428572

In [80]:
# Inspect the fitted intercept and coefficients

reg.intercept_

array([-0.65260707])

In [81]:
reg.coef_

array([[ 1.62966944,  0.32503751,  2.17180065, -0.15666004,  0.14866102,
         0.59428203, -0.15202094,  0.25118914,  0.30779917, -0.28796242]])

In [82]:
feature_name = unscaled_inputs.columns.values

In [83]:
# Pair every feature name with its fitted coefficient (the intercept is added in the next step)
summary_table = pd.DataFrame({'Feature name': feature_name})

# reg.coef_ holds one row of coefficients; transpose it so there is one value per feature row
summary_table['Coefficient'] = reg.coef_.T

summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,1.629669
1,Reason_2,0.325038
2,Reason_3,2.171801
3,Reason_4,-0.15666
4,Month Value,0.148661
5,Transportation Expense,0.594282
6,Age,-0.152021
7,Body Mass Index,0.251189
8,Children,0.307799
9,Pets,-0.287962


In [84]:
# Put the intercept in front of the feature rows: it becomes row 0 and every
# existing row moves down by one position
summary_table = pd.concat(
    [
        pd.DataFrame({'Feature name': ['Intercept'], 'Coefficient': [reg.intercept_[0]]}),
        summary_table,
    ],
    ignore_index=True,
)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.652607
1,Reason_1,1.629669
2,Reason_2,0.325038
3,Reason_3,2.171801
4,Reason_4,-0.15666
5,Month Value,0.148661
6,Transportation Expense,0.594282
7,Age,-0.152021
8,Body Mass Index,0.251189
9,Children,0.307799


In [85]:
# Logistic regression models log(odds) = intercept + c1*x1 + c2*x2 + ...; exponentiating each coefficient therefore gives its odds ratio.

In [86]:
# exp(coefficient) is the odds ratio; list the rows from the largest ratio to the smallest
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])
summary_table.sort_values(by='Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,2.171801,8.774069
1,Reason_1,1.629669,5.102188
6,Transportation Expense,0.594282,1.81173
2,Reason_2,0.325038,1.384083
9,Children,0.307799,1.360428
8,Body Mass Index,0.251189,1.285553
5,Month Value,0.148661,1.16028
7,Age,-0.152021,0.85897
4,Reason_4,-0.15666,0.854995
10,Pets,-0.287962,0.74979


In [46]:
# A feature is not particularly important:
# 1. If its coefficient is around 0 (whatever the feature's value, it is multiplied by roughly 0).
# 2. If its odds ratio is around 1 (for a one-unit change in the standardized feature, the odds are multiplied by the odds ratio; 1 means no change).




# Testing the model 

In [87]:
reg.score(x_test,y_test)

0.7214285714285714

In [None]:
# The test score is usually a little lower than the training score.
# So, based on data that the model has never seen before, in 72% of the cases the model will predict correctly whether the person is going to be excessively absent.

In [88]:
# We can also get the probability of each outcome.

# For every test observation we obtain the probability of class 0 and of class 1.

# predict() returns the class labels directly; predict_proba() is used here because it returns the probabilities instead.

predicted_proba = reg.predict_proba(x_test)
predicted_proba


array([[0.71095713, 0.28904287],
       [0.58730776, 0.41269224],
       [0.45674734, 0.54325266],
       [0.77549276, 0.22450724],
       [0.37845939, 0.62154061],
       [0.36662137, 0.63337863],
       [0.34737534, 0.65262466],
       [0.12995832, 0.87004168],
       [0.78656199, 0.21343801],
       [0.74456153, 0.25543847],
       [0.51552065, 0.48447935],
       [0.24047239, 0.75952761],
       [0.0730632 , 0.9269368 ],
       [0.69037326, 0.30962674],
       [0.28673897, 0.71326103],
       [0.48973889, 0.51026111],
       [0.56815402, 0.43184598],
       [0.55771148, 0.44228852],
       [0.42941078, 0.57058922],
       [0.05602557, 0.94397443],
       [0.706798  , 0.293202  ],
       [0.77549276, 0.22450724],
       [0.45367314, 0.54632686],
       [0.45367314, 0.54632686],
       [0.24022101, 0.75977899],
       [0.74877773, 0.25122227],
       [0.51412644, 0.48587356],
       [0.81803679, 0.18196321],
       [0.19334443, 0.80665557],
       [0.77549276, 0.22450724],
       [0.

In [89]:
predicted_proba.shape

# The 1st column is the probability the model assigns to outcome 0; the 2nd column is the probability of outcome 1

(140, 2)

In [None]:
# Excessive absenteeism corresponds to class 1, so we keep only the second column

In [90]:
predicted_proba[:,1]

array([0.28904287, 0.41269224, 0.54325266, 0.22450724, 0.62154061,
       0.63337863, 0.65262466, 0.87004168, 0.21343801, 0.25543847,
       0.48447935, 0.75952761, 0.9269368 , 0.30962674, 0.71326103,
       0.51026111, 0.43184598, 0.44228852, 0.57058922, 0.94397443,
       0.293202  , 0.22450724, 0.54632686, 0.54632686, 0.75977899,
       0.25122227, 0.48587356, 0.18196321, 0.80665557, 0.22450724,
       0.37223894, 0.63804589, 0.65285832, 0.50569621, 0.22450724,
       0.51783947, 0.22065028, 0.74716347, 0.39481106, 0.58405264,
       0.21720404, 0.485922  , 0.23559338, 0.62154061, 0.79724675,
       0.59590183, 0.65164958, 0.28904287, 0.2345296 , 0.21007405,
       0.60122198, 0.62124115, 0.63337863, 0.28328119, 0.80744337,
       0.43340836, 0.88201931, 0.23103511, 0.5609986 , 0.57142316,
       0.71554173, 0.62346804, 0.28794898, 0.79635863, 0.20295058,
       0.27191554, 0.21412972, 0.22065028, 0.75869752, 0.34703721,
       0.22065028, 0.54402782, 0.88102974, 0.44647774, 0.58079

In [None]:
# In reality, the logistic regression model calculates these probabilities in the background.
# If the probability of class 1 is below 0.5 it predicts 0; otherwise it predicts 1.

# Saving the model

In [None]:
# pickle is a module used to convert a Python object into a byte stream.

In [93]:
import pickle

In [94]:
# Serialize the fitted logistic regression to a binary file named 'model'
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [95]:
# The fitted scaler has to be pickled as well, so new data can be scaled the same way
# before it is passed to the saved model.
# NOTE: pickle stores classes by reference, so the CustomScaler class definition must
# be available wherever this file is loaded.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)