### We will apply logistic regression to predict absenteeism

In [1]:
import pandas as pd
import numpy as np

In [2]:
data1 = pd.read_csv('Preprocessed_data.csv')
data1.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


### Grouping target values

In [3]:
targets = np.where(
    data1['Absenteeism Time in Hours'] > data1['Absenteeism Time in Hours'].median(), 1, 0)

data1['Excessive Absenteeism'] = targets

data1.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


#### A comment on excessive absenteeism

As can be seen below, by using the median value as a measure for threshold to split data into 1s and 0s; we have succeeded to get a balanced data. That will prevent the model to be trained mostly for one target. So, we will be getting healthier outputs.

In [4]:
data1['Excessive Absenteeism'].value_counts()

0    381
1    319
Name: Excessive Absenteeism, dtype: int64

In [5]:
targets.sum()/targets.shape[0]

0.45571428571428574

Around 46 % of targets are 1s. Usually 40-60 splits would work for fine for logistic regressions.

In [6]:
data2 = data1.drop(['Absenteeism Time in Hours'], axis=1) 

### Scaling Inputs

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler(copy,with_mean,with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method, which, again based on StandardScale
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # the transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [8]:
# data2.iloc[:, :14]
# data2.iloc[:, :-1]

unscaled = data2.iloc[:, :-1]

columns_not_to_standardize = ['Reason_1','Reason_2','Reason_3','Reason_4','Education']
columns_to_standardize = [x for x in unscaled.columns.values if x not in columns_not_to_standardize]

scaler = CustomScaler(columns_to_standardize)

scaler.fit(unscaled)

data_scaled = scaler.transform(unscaled)

### Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data_scaled, targets, train_size = 0.8, random_state = 20) 
#shuffle=True by default 

x_train.shape, y_train.shape, x_test.shape, y_test.shape

((560, 14), (560,), (140, 14), (140,))

### Building Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [11]:
logreg = LogisticRegression()

logreg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
logreg.score(x_test, y_test)

0.7428571428571429

### Manually checking the accuracy

In [13]:
outputs = logreg.predict(x_test)
outputs == y_test

array([ True, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False,  True, False,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
       False, False,  True, False,  True, False,  True,  True,  True,
        True,  True, False,  True,  True, False,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True, False,  True, False,  True,
        True,  True, False, False,  True, False, False,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True, False,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True, False,  True,  True,
        True,  True,

In [14]:
np.sum(outputs == y_test) / outputs.shape[0]

0.7428571428571429

### intercepts and coefficients

In [15]:
logreg.intercept_, logreg.coef_

(array([-1.6561092]),
 array([[ 2.80096498e+00,  9.34857518e-01,  3.09561645e+00,
          8.56587468e-01,  1.66248119e-01, -8.43703301e-02,
          6.12732578e-01, -7.79685996e-03, -1.65922708e-01,
         -1.47005122e-04,  2.71811477e-01, -2.05738037e-01,
          3.61989880e-01, -2.85510745e-01]]))

In [16]:
feature_names = unscaled.columns.values

summary_table = pd.DataFrame(columns=['Feature Name'], data = feature_names)

summary_table['Coefficient'] = np.transpose(logreg.coef_)
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Reason_1,2.800965
1,Reason_2,0.934858
2,Reason_3,3.095616
3,Reason_4,0.856587
4,Month,0.166248
5,Day of the Week,-0.08437
6,Transportation Expense,0.612733
7,Distance to Work,-0.007797
8,Age,-0.165923
9,Daily Work Load Average,-0.000147


In [17]:
summary_table.index = summary_table.index +1
summary_table.loc[0] = ['Intercept', logreg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-1.656109
1,Reason_1,2.800965
2,Reason_2,0.934858
3,Reason_3,3.095616
4,Reason_4,0.856587
5,Month,0.166248
6,Day of the Week,-0.08437
7,Transportation Expense,0.612733
8,Distance to Work,-0.007797
9,Age,-0.165923


In [18]:
summary_table['odds ratio'] = np.exp(summary_table.Coefficient)
summary_table

Unnamed: 0,Feature Name,Coefficient,odds ratio
0,Intercept,-1.656109,0.19088
1,Reason_1,2.800965,16.460523
2,Reason_2,0.934858,2.546851
3,Reason_3,3.095616,22.100858
4,Reason_4,0.856587,2.35511
5,Month,0.166248,1.180866
6,Day of the Week,-0.08437,0.919091
7,Transportation Expense,0.612733,1.845467
8,Distance to Work,-0.007797,0.992233
9,Age,-0.165923,0.847112


In [19]:
summary_table.sort_values('odds ratio', ascending = False) 

Unnamed: 0,Feature Name,Coefficient,odds ratio
3,Reason_3,3.095616,22.100858
1,Reason_1,2.800965,16.460523
2,Reason_2,0.934858,2.546851
4,Reason_4,0.856587,2.35511
7,Transportation Expense,0.612733,1.845467
13,Children,0.36199,1.436184
11,Body Mass Index,0.271811,1.31234
5,Month,0.166248,1.180866
10,Daily Work Load Average,-0.000147,0.999853
8,Distance to Work,-0.007797,0.992233


A feature is not particularlu important (statistically insignificant for predicting the targets). 

__IN CASE:__
    * coefficient is around 0
    * odds ratio is around 1
    
For a unit change in the standardized feature, the odds increase by a multiple equal to the odds ratio. __(1= no change)__

### Dropping redundant features

In [20]:
# I drop the useless columns
data2 = data1.drop(['Daily Work Load Average', 
                    'Distance to Work', 
                    'Day of the Week'], axis=1) 

# code below is running all standardization, train-test splitting and logistic model building again.
unscaled = data2.iloc[:, :-1]
columns_not_to_standardize = ['Reason_1','Reason_2','Reason_3','Reason_4','Education']
columns_to_standardize = [x for x in unscaled.columns.values if x not in columns_not_to_standardize]
scaler = CustomScaler(columns_to_standardize)
scaler.fit(unscaled)
data_scaled = scaler.transform(unscaled)

x_train, x_test, y_train, y_test = train_test_split(data_scaled, targets, train_size = 0.8, random_state = 20) 

logreg = LogisticRegression()

logreg.fit(x_train, y_train)

logreg.score(x_test, y_test)

0.9285714285714286

Seems like model did not improve much. It is not affected by dropping colmns. That mean it was a good decision to drop redundant columns.

### Testing

In [21]:
logreg.score(x_test, y_test)

0.9285714285714286

In [22]:
pp = logreg.predict_proba(x_test)
pp[:,1]

array([0.06255056, 0.45782212, 0.9413477 , 0.14379319, 0.98151239,
       0.37961743, 0.93775989, 0.9485677 , 0.08238497, 0.29985076,
       0.93429911, 0.70748725, 1.        , 0.86042296, 0.92417928,
       0.4198498 , 0.13765545, 0.91814332, 1.        , 0.97931978,
       0.24632736, 0.14379319, 0.91737738, 0.25220296, 0.9277763 ,
       0.10776764, 0.21807215, 0.85634442, 0.10767382, 0.14379319,
       0.91597635, 0.92381673, 0.07806004, 0.94121775, 0.14379319,
       0.93535202, 0.1608087 , 0.92830491, 0.38388704, 0.64165415,
       0.07294632, 0.99944468, 0.0968785 , 0.09010654, 0.10897445,
       0.8733556 , 0.99999905, 0.06255056, 0.09266326, 0.06903469,
       0.99971076, 0.05716231, 0.99999861, 0.06758341, 1.        ,
       0.10461725, 0.94740949, 0.48283697, 0.04595025, 0.04862218,
       0.92494531, 0.90420181, 0.148062  , 0.3442335 , 0.86077376,
       0.19338166, 0.02523625, 0.08698236, 0.99969488, 0.88656335,
       0.1608087 , 0.0468594 , 0.96860316, 0.09338934, 0.95381

### Saving the model

In [23]:
import pickle

In [24]:
with open('my_model', 'wb') as file:
    pickle.dump(logreg, file)

In [25]:
with open('my_scaler', 'wb') as file:
    pickle.dump(scaler, file)

The pickled objcets 'my_model' and 'my_scaler' will be used as objects in the next notebook.