# Creating a logistic regression to predict absenteeism

In [1]:
import pandas as pd
import numpy as np

### Load data

In [2]:
# Read the preprocessed absenteeism table from the working directory and preview its first rows.
data_preprocessed= pd.read_csv('Absenteeism_preprocessed.csv')
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7,3


### Create the targets

In [3]:
# Median of the hours absent, shown for reference only; the targets below are thresholded on the mean.
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [4]:
# Add the target column: 1 where the hours absent exceed the column mean, 0 otherwise.
hours_absent = data_preprocessed['Absenteeism Time in Hours']
data_preprocessed['Excessive Absenteeism'] = np.where(hours_absent > hours_absent.mean(), 1, 0)


In [5]:
# Same mean-threshold rule as the 'Excessive Absenteeism' column, kept as a standalone NumPy array.
absence_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.mean(), 1, 0)
# Share of observations labelled 1 (class balance check).
targets.mean()

0.36428571428571427

In [6]:
# Capture the current column labels as a NumPy array and display them.
column_names=data_preprocessed.columns.values
column_names

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Month Value',
       'Day of the Week', 'Excessive Absenteeism'], dtype=object)

In [7]:
# Desired column order: reason dummies first, then the date-derived features,
# the remaining predictors, and finally the raw hours and the target (target last).
column_names = [
    'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
    'Month Value', 'Day of the Week', 'Date',
    'Transportation Expense', 'Distance to Work', 'Age',
    'Daily Work Load Average', 'Body Mass Index', 'Education',
    'Children', 'Pets',
    'Absenteeism Time in Hours', 'Excessive Absenteeism',
]

In [8]:
# Checkpoint: a reordered frame, independent of data_preprocessed, without the raw
# hours column (the target was derived from it) and without the Date column.
df_with_targets = (
    data_preprocessed[column_names]
    .drop(columns=['Absenteeism Time in Hours', 'Date'])
)

In [9]:
# Simplify the model by removing features that are not kept as predictors.
# Each label is listed once (the original list repeated 'Day of the Week').
df_with_targets = df_with_targets.drop(
    columns=['Day of the Week', 'Daily Work Load Average', 'Distance to Work']
)
df_with_targets.head(10)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,1,2,1,0
1,0,0,0,0,7,118,50,31,1,1,0,0
2,0,0,0,1,7,179,38,31,1,0,0,0
3,1,0,0,0,7,279,39,24,1,2,0,0
4,0,0,0,1,7,289,33,30,1,2,1,0
5,0,0,0,1,7,179,38,31,1,0,0,0
6,0,0,0,1,7,361,28,27,1,1,4,1
7,0,0,0,1,7,260,36,23,1,4,0,0
8,0,0,1,0,7,155,34,25,1,2,0,1
9,0,0,0,1,7,235,37,29,3,1,1,1


### Select the inputs for regression

In [10]:
# (rows, columns) of the frame after the drops above; the target is still included.
df_with_targets.shape

(700, 12)

In [11]:
# Every column except the last one is a model input; the last column is
# 'Excessive Absenteeism' because of the explicit column order set earlier.
unscaled_inputs=df_with_targets.iloc[:,:-1]

### Standardize the data

In [12]:
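# Superseded by the CustomScaler defined in the next cell, which leaves the dummy columns unscaled:
#from sklearn.preprocessing import StandardScaler
#absenteeism_scaler=  StandardScaler()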

In [13]:
#adding this code to correct model by not touching dummies
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the others through.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, only the
    columns named in ``columns``; every other column of the input frame is
    returned unchanged. The output keeps the input's column order and index.

    Parameters
    ----------
    columns : list of column labels
        Columns of the input DataFrame to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the selected
        columns, set by ``fit``; ``None`` before fitting.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Pass by keyword: StandardScaler's constructor parameters are keyword-only
        # in current scikit-learn, so positional arguments raise a TypeError.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        # Store every constructor argument under its own name so that
        # BaseEstimator.get_params(), repr() and sklearn.base.clone() work.
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions, independent of how np.mean / np.var
        # dispatch on a DataFrame in the installed pandas version.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        # Returning self follows the estimator convention and enables
        # fit(...).transform(...) as well as TransformerMixin.fit_transform.
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are not used.
        """
        init_col_order = X.columns
        # Reuse X's index: without it the scaled frame gets a fresh 0..n-1 index
        # and concat(axis=1) would misalign rows for any non-default index
        # (for example a shuffled train/test subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # Re-select in the original order so the output columns match the input.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]


In [14]:
# List the input column names to decide which ones to standardize.
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [15]:
# Edit this list to change which columns are processed: the Reason_* dummies and
# Education are left as they are, every other input column is standardized.
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']
columns_to_scale = [
    name
    for name in unscaled_inputs.columns.values
    if name not in columns_to_omit
]

In [16]:
# Create the scaler restricted to the columns selected in columns_to_scale.
absenteeism_scaler=CustomScaler(columns_to_scale)



In [17]:
# Fit the scaler on the full set of unscaled inputs.
# NOTE(review): this happens before the train/test split, so the scaling statistics
# also reflect the rows that later end up in the test set — confirm this is intended.
absenteeism_scaler.fit(unscaled_inputs)

In [18]:
# Apply the fitted scaler; columns not in columns_to_scale pass through unchanged.
scaled_inputs= absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,1,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,1,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,1,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,1,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,1,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,2,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,1,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,2,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,3,-0.919030,-0.589690


In [19]:
# Sanity check: scaling must not change the number of rows or columns.
scaled_inputs.shape

(700, 11)

## Split data into train and test and shuffle

### Import the relevant module

In [20]:
from sklearn.model_selection import train_test_split

### Split

In [21]:
# Exploratory call only: the result is displayed but not stored, and no random_state
# is passed, so the split is not reproducible. The split actually used is made in the next cell.
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 182         1         0         0         0    -0.959313   
 631         0         0         0         1    -0.959313   
 283         0         0         0         1     0.753746   
 618         0         0         0         1    -0.959313   
 440         0         0         0         1    -0.388293   
 ..        ...       ...       ...       ...          ...   
 136         0         0         0         1    -1.530333   
 377         1         0         0         0    -1.244823   
 192         0         0         0         1    -0.673803   
 388         0         0         0         1    -1.244823   
 67          0         0         0         1     1.039256   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 182                0.387122  1.660180         1.237836          1  0.880469   
 631               -0.654143  0.248310         1.002633          1 -0.919030   
 283                0.1909

In [22]:
# 80/20 train/test split; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

In [23]:
# Shapes of the training inputs and training targets (row counts must match).
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [24]:
# Shapes of the test inputs and test targets (row counts must match).
print(x_test.shape, y_test.shape)

(140, 11) (140,)


### Logistic regression with sklearn

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [26]:
# Logistic regression classifier created with scikit-learn's default settings.
reg= LogisticRegression()

In [27]:
# Train the classifier on the training split only.
reg.fit(x_train, y_train)

LogisticRegression()

In [28]:
# Accuracy on the training data; the manual check below reproduces the same number.
reg.score(x_train, y_train)

0.7928571428571428

### Manually check the accuracy

In [29]:
# Predicted classes for the training rows, compared element-wise with the true targets
# (True where the prediction is correct).
model_outputs=reg.predict(x_train)
model_outputs == y_train

array([ True,  True, False,  True, False,  True,  True,  True,  True,
       False, False,  True, False, False,  True,  True,  True,  True,
       False, False, False,  True,  True, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True, False,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True, False, False,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,

In [30]:
# Number of training rows predicted correctly.
(model_outputs == y_train).sum()

444

In [31]:
# Manual accuracy: correct predictions divided by the number of training rows.
correct_predictions = (model_outputs == y_train).sum()
correct_predictions / len(model_outputs)

0.7928571428571428

### Finding the Intercept and coefficients

In [32]:
# Fitted intercept (bias) of the model, returned as a one-element array.
reg.intercept_

array([-1.64686418])

In [33]:
# Fitted coefficients as a single-row 2-D array, one value per input column.
reg.coef_

array([[ 2.54151482,  1.13494972,  3.19387025,  0.14448281,  0.01669096,
         0.67482812, -0.16814539,  0.44941565, -0.0791048 ,  0.16432508,
        -0.18385346]])

In [34]:
# Input column names, used to label the coefficients in the summary table.
feature_name=unscaled_inputs.columns.values

In [35]:
# Pair each input column with its fitted coefficient.
summary_table = pd.DataFrame({'Feature name': feature_name})
# coef_ is a single row; transpose it into one value per feature row.
summary_table['Coefficient'] = reg.coef_.T
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.541515
1,Reason_2,1.13495
2,Reason_3,3.19387
3,Reason_4,0.144483
4,Month Value,0.016691
5,Transportation Expense,0.674828
6,Age,-0.168145
7,Body Mass Index,0.449416
8,Education,-0.079105
9,Children,0.164325


In [36]:
# Put the intercept in row 0, ahead of the feature coefficients.
# The table is rebuilt from the feature rows only, so re-running this cell never
# adds a second 'Intercept' row or shifts the index again. It also still works
# after extra columns (such as the odds ratio) have been added to summary_table.
feature_rows = summary_table.loc[
    summary_table['Feature name'] != 'Intercept',
    ['Feature name', 'Coefficient'],
]
intercept_row = pd.DataFrame(
    {'Feature name': ['Intercept'], 'Coefficient': [reg.intercept_[0]]}
)
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.646864
1,Reason_1,2.541515
2,Reason_2,1.13495
3,Reason_3,3.19387
4,Reason_4,0.144483
5,Month Value,0.016691
6,Transportation Expense,0.674828
7,Age,-0.168145
8,Body Mass Index,0.449416
9,Education,-0.079105


### Interpreting the coefficients

In [37]:
# Exponentiate each coefficient to express it as an odds ratio.
coefficients = summary_table['Coefficient']
summary_table['Odds_ratio'] = np.exp(coefficients)
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.646864,0.192653
1,Reason_1,2.541515,12.698893
2,Reason_2,1.13495,3.111017
3,Reason_3,3.19387,24.382612
4,Reason_4,0.144483,1.155442
5,Month Value,0.016691,1.016831
6,Transportation Expense,0.674828,1.963695
7,Age,-0.168145,0.845231
8,Body Mass Index,0.449416,1.567396
9,Education,-0.079105,0.923943


In [38]:
# Show the rows from the highest to the lowest odds ratio; the result is only
# displayed, summary_table itself keeps its original order.
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,3.19387,24.382612
1,Reason_1,2.541515,12.698893
2,Reason_2,1.13495,3.111017
6,Transportation Expense,0.674828,1.963695
8,Body Mass Index,0.449416,1.567396
10,Children,0.164325,1.178597
4,Reason_4,0.144483,1.155442
5,Month Value,0.016691,1.016831
9,Education,-0.079105,0.923943
7,Age,-0.168145,0.845231


### Testing the model

In [39]:
# Accuracy on the held-out test split, which was not used to train the model.
reg.score(x_test, y_test)

0.7642857142857142

In [40]:
# Class probabilities for every test row: one row per sample, one column per class.
predicted_proba=reg.predict_proba(x_test)
predicted_proba

array([[0.83679399, 0.16320601],
       [0.76860059, 0.23139941],
       [0.5764923 , 0.4235077 ],
       [0.84193389, 0.15806611],
       [0.20313454, 0.79686546],
       [0.32643715, 0.67356285],
       [0.41993133, 0.58006867],
       [0.11003656, 0.88996344],
       [0.86501963, 0.13498037],
       [0.83938056, 0.16061944],
       [0.60658128, 0.39341872],
       [0.43512884, 0.56487116],
       [0.11869188, 0.88130812],
       [0.81085383, 0.18914617],
       [0.36719508, 0.63280492],
       [0.64100628, 0.35899372],
       [0.61225273, 0.38774727],
       [0.61112082, 0.38887918],
       [0.52937049, 0.47062951],
       [0.06274342, 0.93725658],
       [0.85935807, 0.14064193],
       [0.84193389, 0.15806611],
       [0.53729826, 0.46270174],
       [0.53729826, 0.46270174],
       [0.23039878, 0.76960122],
       [0.86221328, 0.13778672],
       [0.75135593, 0.24864407],
       [0.94581926, 0.05418074],
       [0.19547449, 0.80452551],
       [0.84193389, 0.15806611],
       [0.

In [41]:
# (number of test rows, number of classes).
predicted_proba.shape

(140, 2)

In [42]:
# Second column only: presumably the probability of class 1 (excessive absenteeism),
# assuming the classes are ordered 0, 1 — verify against reg.classes_.
predicted_proba[:,1]

array([0.16320601, 0.23139941, 0.4235077 , 0.15806611, 0.79686546,
       0.67356285, 0.58006867, 0.88996344, 0.13498037, 0.16061944,
       0.39341872, 0.56487116, 0.88130812, 0.18914617, 0.63280492,
       0.35899372, 0.38774727, 0.38887918, 0.47062951, 0.93725658,
       0.14064193, 0.15806611, 0.46270174, 0.46270174, 0.76960122,
       0.13778672, 0.24864407, 0.05418074, 0.80452551, 0.15806611,
       0.22802658, 0.63388963, 0.67565499, 0.39569547, 0.15806611,
       0.40296202, 0.13553775, 0.70883905, 0.2770745 , 0.26304248,
       0.15743296, 0.37734858, 0.13665834, 0.23600694, 0.85823575,
       0.61414307, 0.56142017, 0.16320601, 0.12840423, 0.15680186,
       0.3897418 , 0.33384198, 0.67356285, 0.13163836, 0.76792043,
       0.23309885, 0.93007426, 0.10387704, 0.3864157 , 0.38754618,
       0.66150245, 0.67251418, 0.10702352, 0.77379898, 0.10931045,
       0.16190856, 0.06169995, 0.13553775, 0.75662145, 0.192087  ,
       0.13553775, 0.3239438 , 0.89026074, 0.28187244, 0.40985

### Save the model

In [43]:
import pickle

In [44]:
# Serialize the trained classifier to the file 'model' in the working directory.
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [45]:
# Serialize the fitted scaler to the file 'scaler' so new data can be scaled the same way.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)