In [145]:
import pandas as pd
import numpy as np

In [146]:
# Load the preprocessed absenteeism data and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Absenteeism_preprocessed.csv")
df.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,0,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,0,7,3,289,36,33,239.554,30,0,2,1,2


<b>Create targets</b>

In [147]:
# Median of the absenteeism hours: used as the cut-off that splits the
# observations into the two classes the logistic regression will predict.

df.loc[:, "Absenteeism Time in Hours"].median()

3.0

In [148]:
# 1 when the hours are strictly above the median, 0 otherwise.
targets = np.where(
    df["Absenteeism Time in Hours"].gt(df["Absenteeism Time in Hours"].median()),
    1,
    0,
)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [149]:
# Attach the binary target, then discard the raw hours column it was derived
# from together with the features that are not used as model inputs.
df["Excessive Absentiesm"] = targets
df = df.drop(
    columns=[
        "Absenteeism Time in Hours",
        "Distance to Work",
        "Daily Work Load Average",
        "Day of the Week",
    ]
)
df.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absentiesm
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,0,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,0,7,289,33,30,0,2,1,0


In [150]:
# Share of positive (1) targets: a value near 0.5 means the classes are roughly balanced.

targets.mean()

0.45571428571428574

<b>Select the inputs for the regression</b>

In [152]:
# Keep only the model inputs: every column except the binary target.
# The target is excluded by name instead of by a hard-coded positional slice,
# so the selection stays correct when the set of dropped columns changes,
# and re-running the cell leaves the frame unchanged.
df = df.loc[:, df.columns != "Excessive Absentiesm"]
df

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,0,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,0,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,0,5,235,32,25,1,0,0


In [153]:
# import the libraries needed to create the Custom Scaler

# note that all of them are a part of the sklearn package

# moreover, one of them is actually the StandardScaler module,

# so you can imagine that the Custom Scaler is built on top of it



from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler



# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin):
    """Standardise a chosen subset of DataFrame columns and pass the rest through.

    A ``StandardScaler`` is fitted on ``columns`` only; ``transform`` returns a
    DataFrame with the same columns, in the same order and with the same index
    as its input, in which only ``columns`` have been scaled.

    Parameters
    ----------
    columns : list
        Labels of the columns to standardise.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler, fitted by ``fit``.
    mean_ : pandas.Series
        Per-column mean of the scaled columns, set by ``fit``.
    var_ : pandas.Series
        Per-column (population) variance of the scaled columns, set by ``fit``.
    """

    def __init__(self, columns):
        # The scaler is nothing but a StandardScaler object ...
        self.scaler = StandardScaler()
        # ... restricted to the chosen columns.
        self.columns = columns

    def fit(self, X, y=None):
        """Fit the underlying scaler on ``X[self.columns]`` and return ``self``."""
        self.scaler.fit(X[self.columns], y)
        # Take the statistics from the fitted scaler instead of calling
        # np.mean / np.var on the DataFrame: those pass axis=None, which,
        # depending on the pandas version, can collapse the whole frame into a
        # single scalar rather than giving one value per column.
        self.mean_ = pd.Series(self.scaler.mean_, index=self.columns)
        self.var_ = pd.Series(self.scaler.var_, index=self.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` in which ``self.columns`` are standardised.

        ``y`` is accepted for API compatibility and ignored.
        """
        # Record the initial order of the columns so it can be restored.
        init_col_order = X.columns

        # Scale the selected columns. The result must carry X's own index:
        # pd.concat(axis=1) aligns on index labels, so a fresh 0..n-1 index
        # would misalign rows (and introduce NaNs) whenever X is not indexed
        # 0..n-1, e.g. a filtered frame or a train/test subset.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # Everything that was not selected for scaling is passed through as is.
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # Stitch both parts together and restore the original column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [154]:
# standardization subtracts the mean from the data and divides by the standard deviation

#columns_to_scale = ["Month Value",'Day of the Week','Transportation Expense', 'Distance to Work', 'Age',
#       'Daily Work Load Average', 'Body Mass Index','Children', 'Pets']

# Columns excluded from standardisation; every other column of df is scaled.
columns_to_omit = ["Reason_1","Reason_2", "Reason_3", "Reason_4","Education"]
columns_to_scale = [col for col in df.columns.values if col not in columns_to_omit]

# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
scaling_obj = CustomScaler(columns_to_scale)
scaled_df = scaling_obj.fit(df).transform(df)
print(scaled_df.shape)

(700, 11)


In [155]:
scaled_df

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,0,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,0,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,0,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


<b>Train Test Split</b>

In [156]:
from sklearn.model_selection import train_test_split

# 80/20 split of inputs and targets; the fixed random_state makes the partition reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_df,
    targets,
    train_size=0.8,
    random_state=20,
)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(560, 11) (560,)
(140, 11) (140,)


<b>Logistic Regression</b>

In [157]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [158]:
# Logistic regression with scikit-learn's default settings, fitted on the training split.
logreg = LogisticRegression()
logreg.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [159]:
# Score of the fitted model on the training split.
logreg.score(x_train,y_train)

0.7464285714285714

<b>Finding Intercepts & Coefficients</b>

In [160]:
# Fitted intercept (bias) term of the logistic regression.
logreg.intercept_

array([-0.81258831])

In [161]:
# Fitted coefficients, one per input column, in the column order of the training frame.
logreg.coef_

array([[ 1.75317707, -0.02276885,  2.51573816, -0.03021215,  0.19268815,
         0.51251671, -0.07460196,  0.18151168, -0.01484993,  0.28750931,
        -0.3433442 ]])

In [162]:
# Pair every input column name with its fitted coefficient.
feature_name = scaled_df.columns.values

summary_table = pd.DataFrame(
    {
        "Feature name": feature_name,
        # coef_ has a single row for this two-class model; take it as a flat vector.
        "Coefficient": logreg.coef_[0],
    }
)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,1.753177
1,Reason_2,-0.022769
2,Reason_3,2.515738
3,Reason_4,-0.030212
4,Month Value,0.192688
5,Transportation Expense,0.512517
6,Age,-0.074602
7,Body Mass Index,0.181512
8,Education,-0.01485
9,Children,0.287509


In [163]:
# Put the intercept in row 0 and keep the features in rows 1..n.
# The rows are rebuilt rather than shifted in place: an 'Intercept' row left
# over from an earlier run of this cell is filtered out first, so re-running
# the cell no longer bumps the index again or adds a duplicate intercept row.
summary_table = pd.concat(
    [
        pd.DataFrame({"Feature name": ["Intercept"], "Coefficient": [logreg.intercept_[0]]}),
        summary_table.loc[summary_table["Feature name"] != "Intercept", ["Feature name", "Coefficient"]],
    ],
    ignore_index=True,
)
# Odds ratio = exp(coefficient).
summary_table["Odds Ratio"] = np.exp(summary_table["Coefficient"])
summary_table.sort_values("Odds Ratio", ascending=False)

# Features whose coefficient is close to 0 (i.e. whose odds ratio is close to 1) have little influence on the prediction

Unnamed: 0,Feature name,Coefficient,Odds Ratio
3,Reason_3,2.515738,12.375741
1,Reason_1,1.753177,5.772915
6,Transportation Expense,0.512517,1.669488
10,Children,0.287509,1.333103
5,Month Value,0.192688,1.212505
8,Body Mass Index,0.181512,1.199029
9,Education,-0.01485,0.98526
2,Reason_2,-0.022769,0.977488
4,Reason_4,-0.030212,0.97024
7,Age,-0.074602,0.928113


In [164]:
# Reason_3 has the highest odds ratio: when a person suffers poisoning, excessive absence is most likely; the same holds, to a lesser degree, for Reason_1 (diseases).
# Distance to Work, Daily Work Load Average, Day of the Week and Education have low impact; the first three were removed from the table and the steps were run again.

<b>Model Testing</b>

In [165]:
# Score of the fitted model on the held-out test split.
logreg.score(x_test,y_test)

0.7214285714285714

<b>Model Saving</b>

In [166]:
# Persist the fitted model and the fitted scaler with pickle, one file each.
# NOTE: loading 'scaler' later requires the CustomScaler class to be defined
# or importable in the process that unpickles it.

import pickle

for path, obj in (('model', logreg), ('scaler', scaling_obj)):
    with open(path, 'wb') as file:
        pickle.dump(obj, file)