# ***Using Logistic Regression To Predict Absenteeism***

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('1.1 Absenteeism_preprocessed.csv.csv')

In [3]:
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [4]:
data['Absenteeism Time in Hours'].value_counts()

8      195
2      149
3      106
1       87
4       57
0       39
16      18
24      15
40       7
5        7
32       6
64       3
112      2
56       2
80       2
120      2
48       1
7        1
104      1
Name: Absenteeism Time in Hours, dtype: int64

In [5]:
data['Absenteeism Time in Hours'].median()

3.0

In [6]:
targets = np.where(data['Absenteeism Time in Hours']>3, 1, 0)

In [7]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [8]:
targets.sum()/targets.shape[0]

0.45571428571428574

In [9]:
data['Excessive Absentism'] = targets

In [10]:
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours,Excessive Absentism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [11]:
data['Excessive Absentism'].value_counts()

0    381
1    319
Name: Excessive Absentism, dtype: int64

In [12]:
data_preprocessed = data.drop(['Absenteeism Time in Hours'], axis=1)
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Excessive Absentism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


## Feature Scaling(Excluding Dummy Vaiables)

In [13]:
data_preprocessed.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pet', 'Excessive Absentism'], dtype=object)

In [14]:
columns_to_scale = ['Month Value', 'Day of the Week', 'Transportation Expense', 'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pet']

In [15]:
pd.options.display.max_rows = None
pd.options.display.max_columns = None

data_preprocessed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Excessive Absentism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0
5,0,0,0,1,10,2,179,51,38,239.554,31,0,0,0,0
6,0,0,0,1,7,4,361,52,28,239.554,27,0,1,4,1
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0,1
8,0,0,1,0,6,6,155,12,34,239.554,25,0,2,0,1
9,0,0,0,1,7,0,235,11,37,239.554,29,1,1,1,1


In [16]:
sc = StandardScaler()
data_preprocessed[columns_to_scale] = sc.fit_transform(data_preprocessed[columns_to_scale])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [17]:
data_preprocessed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Excessive Absentism
0,0,0,0,1,0.030796,-0.80095,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487,1
1,0,0,0,0,0.030796,-0.80095,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.01928,-0.58969,0
2,0,0,0,1,0.030796,-0.2329,-0.654143,1.426749,0.24831,-0.806331,1.002633,0,-0.91903,-0.58969,0
3,1,0,0,0,0.030796,0.335149,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.58969,1
4,0,0,0,1,0.030796,0.335149,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487,0
5,0,0,0,1,0.929019,-0.2329,-0.654143,1.426749,0.24831,-0.806331,1.002633,0,-0.91903,-0.58969,0
6,0,0,0,1,0.030796,0.903199,2.092381,1.494345,-1.320435,-0.806331,0.061825,0,-0.01928,2.843016,1
7,0,0,0,1,0.030796,0.903199,0.568211,1.359154,-0.065439,-0.806331,-0.878984,0,2.679969,-0.58969,1
8,0,0,1,0,-0.268611,2.039298,-1.016322,-1.209478,-0.379188,-0.806331,-0.40858,0,0.880469,-0.58969,1
9,0,0,0,1,0.030796,-1.368999,0.190942,-1.277074,0.091435,-0.806331,0.532229,1,-0.01928,0.268487,1


## Selecting Dependent And Independent Variables

In [18]:

X = data_preprocessed.iloc[:, :-1].values
Y = data_preprocessed.iloc[:, 14].values

## For Printing Whole Array In NumPy

In [19]:
np.set_printoptions(threshold=np.nan)

In [20]:
X

array([[ 0.        ,  0.        ,  0.        ,  1.        ,  0.03079619,
        -0.80094984,  1.00584437,  0.41281584, -0.53606239, -0.80633129,
         0.76743118,  0.        ,  0.88046927,  0.26848661],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.03079619,
        -0.80094984, -1.57468098, -1.1418824 ,  2.13080317, -0.80633129,
         1.00263338,  0.        , -0.01928035, -0.58968976],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  0.03079619,
        -0.23290031, -0.6541427 ,  1.42674947,  0.24830984, -0.80633129,
         1.00263338,  0.        , -0.91902997, -0.58968976],
       [ 1.        ,  0.        ,  0.        ,  0.        ,  0.03079619,
         0.33514923,  0.85493646, -1.68264701,  0.40518428, -0.80633129,
        -0.64378202,  0.        ,  0.88046927, -0.58968976],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  0.03079619,
         0.33514923,  1.00584437,  0.41281584, -0.53606239, -0.80633129,
         0.76743118,  0.  

## Splitting into test and train data

In [21]:
X_train, X_test, Y_train, Y_test =train_test_split(X, Y, test_size=0.2, random_state=0)

In [22]:
X_test

array([[ 0.        ,  0.        ,  0.        ,  1.        ,  0.92901852,
         0.33514923, -1.57468098, -1.1418824 ,  2.13080317, -0.16964817,
         1.00263338,  0.        , -0.01928035, -0.58968976],
       [ 0.        ,  0.        ,  0.        ,  1.        , -1.46624102,
        -0.23290031, -0.6541427 , -0.26313992, -1.00668572, -0.18885143,
        -1.81979303,  1.        , -0.91902997, -0.58968976],
       [ 0.        ,  0.        ,  0.        ,  1.        , -1.76564846,
         0.90319876, -0.57868874,  0.81838929, -1.47730906, -0.75827313,
        -1.34938863,  0.        , -0.91902997, -0.58968976],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  0.33020364,
         0.33514923,  1.03602595,  0.07483796,  0.56205873, -0.5502128 ,
        -0.40857982,  0.        , -0.01928035,  0.26848661],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  0.03079619,
        -0.23290031, -0.6541427 ,  1.42674947,  0.24830984, -0.80633129,
         1.00263338,  0.  

## Fitting Logistic Regression

In [23]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [24]:
classifier.score(X_train, Y_train)

0.7625

## Predicting With Test Set

In [25]:
Y_Pred = classifier.predict(X_test)

## Confusin Matrix

In [26]:
cm = confusion_matrix(Y_test, Y_Pred)
cm

array([[57, 15],
       [19, 49]], dtype=int64)

In [27]:
accuracy_score(Y_test, Y_Pred)

0.7571428571428571

## Finding Intercept And Coefficients

In [28]:
classifier.intercept_

array([-1.4911198])

In [29]:
classifier.coef_

array([[ 2.51219149,  0.39426425,  2.96624754,  0.66194842,  0.00678983,
        -0.10678799,  0.60797506,  0.0341006 , -0.16141509,  0.02764361,
         0.20772946,  0.1160823 ,  0.47524168, -0.29905387]])

## Creating Summary Table

In [30]:
X1 = data_preprocessed.iloc[:, :-1]

In [31]:
X1.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [32]:
feature_name = X1.columns.values

In [33]:
summary_table = pd.DataFrame(columns=['Feature name'], data = feature_name)
summary_table['Coefficients'] = np.transpose(classifier.coef_)
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', classifier.intercept_[0]]

In [34]:
summary_table = summary_table.sort_index()

In [35]:
summary_table

Unnamed: 0,Feature name,Coefficients
0,Intercept,-1.49112
1,Reason_1,2.512191
2,Reason_2,0.394264
3,Reason_3,2.966248
4,Reason_4,0.661948
5,Month Value,0.00679
6,Day of the Week,-0.106788
7,Transportation Expense,0.607975
8,Distance to Work,0.034101
9,Age,-0.161415


In [36]:
summary_table['Odds_Ratio'] = np.exp(summary_table.Coefficients)
summary_table
summary_table.sort_values('Odds_Ratio', ascending=False)

Unnamed: 0,Feature name,Coefficients,Odds_Ratio
3,Reason_3,2.966248,19.418914
1,Reason_1,2.512191,12.331926
4,Reason_4,0.661948,1.938566
7,Transportation Expense,0.607975,1.836708
13,Children,0.475242,1.608403
2,Reason_2,0.394264,1.483292
11,Body Mass Index,0.207729,1.23088
12,Education,0.116082,1.123088
8,Distance to Work,0.034101,1.034689
10,Daily Work Load Average,0.027644,1.028029


In [43]:
summary_table

Unnamed: 0,Feature name,Coefficients,Odds_Ratio
0,Intercept,-1.49112,0.22512
1,Reason_1,2.512191,12.331926
2,Reason_2,0.394264,1.483292
3,Reason_3,2.966248,19.418914
4,Reason_4,0.661948,1.938566
5,Month Value,0.00679,1.006813
6,Day of the Week,-0.106788,0.898716
7,Transportation Expense,0.607975,1.836708
8,Distance to Work,0.034101,1.034689
9,Age,-0.161415,0.850939


## Saving Model

In [39]:
import pickle
with open('model', 'wb') as file:
    pickle.dump(classifier, file)

## Loading Model

In [41]:
with open('model', 'rb') as file:
    load_model = pickle.load(file)
    

## Using Load_model

In [42]:
load_model.predict(X_test)

array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1])