# Absenteeism Exercise - Logistic Regression #

## Import the relevant Libraries ##

In [1]:
import pandas as pd
import numpy as np

## Load the Data ##

In [2]:
# Load 'Absenteeism_preprocessed.csv' (path is relative to the working directory)
# and preview its first rows.
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week
0,False,False,False,True,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1
1,False,False,False,False,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1
2,False,False,False,True,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2
3,True,False,False,False,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3
4,False,False,False,True,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3


In [3]:
# Remove the 'Date' column from the table.
data_preprocessed = data_preprocessed.drop(columns=['Date'])

## Create the Targets ##

In [4]:
# Median absence duration; the next code cell uses it as the cut-off between the two classes.
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

So, we now split the absenteeism time into two classes around the median:

  Moderately absent -- (<= 3 hours)  
  Excessively absent -- (>= 4 hours)

In [5]:
# Binary label: 1 when the absence is strictly longer than the median duration, else 0.
targets = np.where(
    data_preprocessed['Absenteeism Time in Hours'].gt(
        data_preprocessed['Absenteeism Time in Hours'].median()
    ),
    1,
    0,
)

# Store the label as a column of the table as well.
data_preprocessed['Excessive Absenteeism'] = targets

In [6]:
# Fraction of observations labelled 1 (quick class-balance check).
targets.mean()

0.45571428571428574

*This shows that around 46% of the targets are 1s*

In [7]:
# Drop the raw hours column (the binary label replaces it) together with the
# three features that the backward-elimination notes further down decide to omit.
data_with_targets = data_preprocessed.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Daily Work Load Average',
        'Distance to Work',
        'Day of the Week',
    ]
)

## Select the Inputs for the Regression ##

In [8]:
# Keep every column except the last one; the last column is the
# 'Excessive Absenteeism' label that was appended when the targets were created.
unscaled_inputs = data_with_targets.iloc[: , :-1]

## Standardize the Data ##

In [9]:
#from sklearn.preprocessing import StandardScaler

#absenteeism_scaler = StandardScaler()

In [10]:
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is actually the StandardScaler class,
# so you can imagine that the Custom Scaler is built on it

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    The selected columns are scaled with an internal ``StandardScaler``; all
    other columns (e.g. dummy variables) are passed through unchanged. The
    output keeps the column order and the row index of the input.

    Parameters
    ----------
    columns : list
        Labels of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler, fitted on ``columns`` only.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (``ddof=0``) of the selected
        columns, set by ``fit``; ``None`` before fitting.
    """

    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):

        # scaler is nothing but a Standard Scaler object
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    # the fit method, which, again, is based on StandardScaler

    def fit(self, X, y=None):
        """Fit the internal scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Reduce explicitly along the rows. np.mean / np.var on a DataFrame go
        # through pandas' axis=None handling, which is deprecated/changed in
        # recent pandas versions and is the source of the warning on fit.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    # the transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with the selected columns standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """

        # record the initial order of the columns
        init_col_order = X.columns

        # scale all features that you chose when creating the instance of the class;
        # reuse X's index so the concat below aligns row by row even when X does
        # not carry a default 0..n-1 index (e.g. a train/test subset)
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [11]:
# List the input column names so we can decide which of them should be scaled.
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value'], dtype=object)

In [12]:
# choose the columns to scale
# we later augmented this code and put it in comments
# columns_to_scale = ['Month Value','Day of the Week', 'Transportation Expense', 'Distance to Work',
       #'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pet']
    
# columns that must stay unscaled (the reason dummies and 'Education')
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

# everything that is not explicitly omitted gets scaled;
# the original column order is preserved
columns_to_scale = [
    column
    for column in unscaled_inputs.columns.values
    if column not in columns_to_omit
]

In [13]:
# build the scaler and tell it which columns it is responsible for
absenteeism_scaler = CustomScaler(columns=columns_to_scale)

In [14]:
# Fit the scaler on the full input table (this happens before the train/test split below).
absenteeism_scaler.fit(unscaled_inputs)

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [15]:
# Standardize the selected columns; the omitted columns are passed through unchanged.
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [16]:
# Display the scaled table (pandas truncates the middle rows when rendering).
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Month Value
0,False,False,False,True,1.005844,-0.536062,0.767431,0,0.880469,0.268487,0.182726
1,False,False,False,False,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690,0.182726
2,False,False,False,True,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690,0.182726
3,True,False,False,False,0.854936,0.405184,-0.643782,0,0.880469,-0.589690,0.182726
4,False,False,False,True,1.005844,-0.536062,0.767431,0,0.880469,0.268487,0.182726
...,...,...,...,...,...,...,...,...,...,...,...
695,True,False,False,False,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690,-0.388293
696,True,False,False,False,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663,-0.388293
697,True,False,False,False,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690,-0.388293
698,False,False,False,True,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690,-0.388293


## Splitting the Data ##

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

## Logistic Regression with SkLearn ##

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [20]:
# Logistic regression with scikit-learn's default settings.
reg = LogisticRegression()

# Train on the training split, then report accuracy on that same training data.
reg.fit(x_train , y_train)
reg.score(x_train , y_train)

0.7732142857142857

In [21]:
# Recompute the training accuracy by hand as a cross-check on reg.score
model_outputs = reg.predict(x_train)

# share of predictions that match the true labels
accuracy = np.mean(model_outputs == y_train)
accuracy

0.7732142857142857

We can see the exact same result from two approaches

Now, we want to create a function that can be integrated in Python, SQL and Tableau so that the model can be displayed to the users in a more efficient way. 

To do this, we first need to become familiar with the coefficients (weights) and the intercept (bias) of the logistic function.

## Finding the Intercept and Coefficients ##

In [22]:
# Intercept (bias) of the fitted model, returned as a one-element array.
reg.intercept_

array([-1.6469898])

In [23]:
# Coefficients (weights) of the fitted model: a single row with one value per input column.
reg.coef_

array([[ 2.80000644,  0.95174778,  3.1140605 ,  0.83835931,  0.60513709,
        -0.16990589,  0.27998236, -0.21017416,  0.34842434, -0.27721907,
         0.15897713]])

In [24]:
# Pair every input column with the coefficient the model learned for it.
feature_name = unscaled_inputs.columns.values

summary_table = pd.DataFrame({'Feature name': feature_name})

# reg.coef_ is a single row; transpose it so there is one coefficient per table row
summary_table['Coefficient'] = reg.coef_.T

summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.800006
1,Reason_2,0.951748
2,Reason_3,3.114061
3,Reason_4,0.838359
4,Transportation Expense,0.605137
5,Age,-0.169906
6,Body Mass Index,0.279982
7,Education,-0.210174
8,Children,0.348424
9,Pets,-0.277219


In [25]:
# Shift every feature row down by one so that index 0 is free for the intercept.
# NOTE: this cell is not idempotent - running it again without first re-running the
# previous cell shifts the index once more and adds a second 'Intercept' row.
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept' , reg.intercept_[0]]
# Sorting by index moves the newly added row 0 to the top of the table.
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.64699
1,Reason_1,2.800006
2,Reason_2,0.951748
3,Reason_3,3.114061
4,Reason_4,0.838359
5,Transportation Expense,0.605137
6,Age,-0.169906
7,Body Mass Index,0.279982
8,Education,-0.210174
9,Children,0.348424


In [26]:
# Interpreting the coefficients: exponentiate each one to express it as an odds ratio
summary_table['Odds_Ratio'] = np.exp(summary_table['Coefficient'])
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_Ratio
0,Intercept,-1.64699,0.192629
1,Reason_1,2.800006,16.444753
2,Reason_2,0.951748,2.590233
3,Reason_3,3.114061,22.51227
4,Reason_4,0.838359,2.31257
5,Transportation Expense,0.605137,1.831503
6,Age,-0.169906,0.843744
7,Body Mass Index,0.279982,1.323106
8,Education,-0.210174,0.810443
9,Children,0.348424,1.416833


A feature is not particularly important:
   - if its coefficient is around 0

     A weight of 0 implies that no matter the feature value, we will multiply it by 0 (in the model)
   - if its odds ratio is around 1

     For a unit change in the standardized feature, the odds increase by a multiple equal to the odds ratio (1 = no change)

**Problem** : 

1) When we standardized the inputs, we also standardized the dummy variables, which is bad practice. To fix this, we replace the plain `StandardScaler` in the standardization section above with a custom scaler that only scales the columns we choose.

2) The columns 'Daily Work Load Average', 'Distance to Work' and 'Day of the Week' have the least impact on the model (coefficients very close to 0), even though intuitively they seem like they could be somewhat helpful.

To solve this issue, we will use **Backward Elimination**:

    -The idea is that we can simplify our model by removing all features which have close to no contribution to the model.
    - When we have the p-values, we get rid of all coefficients with p-values > 0.05.
    - If the weight is small enough, it won't make a difference anyway......

**So, we omit these features in the same cell where we dropped the 'Absenteeism Time in Hours' column.**

*We observed that the model's accuracy remained the same, which shows that these 3 columns contributed essentially nothing; dropping them makes the model simpler.*

## Testing the Model ##

In [27]:
# Accuracy on the held-out test split, which the model did not see during fitting.
reg.score(x_test , y_test)

0.75

In [28]:
# Class probabilities for each test row: one row per observation, one column per class.
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.71342516, 0.28657484],
       [0.5873216 , 0.4126784 ],
       [0.44016153, 0.55983847],
       [0.78163061, 0.21836939],
       [0.08407928, 0.91592072],
       [0.3348226 , 0.6651774 ],
       [0.29971206, 0.70028794],
       [0.13112385, 0.86887615],
       [0.78627908, 0.21372092],
       [0.74906578, 0.25093422],
       [0.49395555, 0.50604445],
       [0.22492002, 0.77507998],
       [0.07135527, 0.92864473],
       [0.73173354, 0.26826646],
       [0.30957854, 0.69042146],
       [0.54726422, 0.45273578],
       [0.55051921, 0.44948079],
       [0.53926379, 0.46073621],
       [0.40197149, 0.59802851],
       [0.05365482, 0.94634518],
       [0.70030387, 0.29969613],
       [0.78163061, 0.21836939],
       [0.42028246, 0.57971754],
       [0.42028246, 0.57971754],
       [0.24801464, 0.75198536],
       [0.74567806, 0.25432194],
       [0.51026557, 0.48973443],
       [0.8569309 , 0.1430691 ],
       [0.20365204, 0.79634796],
       [0.78163061, 0.21836939],
       [0.

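The left column is the predicted probability of class 0 (moderate absenteeism) and the right column is the predicted probability of class 1 (excessive absenteeism).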

Now, we only want the probability of excessive absenteeism, i.e. the right column.

In [29]:
# Second column only: the predicted probability of class 1 (excessive absenteeism).
predicted_proba[: ,1]

array([0.28657484, 0.4126784 , 0.55983847, 0.21836939, 0.91592072,
       0.6651774 , 0.70028794, 0.86887615, 0.21372092, 0.25093422,
       0.50604445, 0.77507998, 0.92864473, 0.26826646, 0.69042146,
       0.45273578, 0.44948079, 0.46073621, 0.59802851, 0.94634518,
       0.29969613, 0.21836939, 0.57971754, 0.57971754, 0.75198536,
       0.25432194, 0.48973443, 0.1430691 , 0.79634796, 0.21836939,
       0.36947677, 0.67913195, 0.68508325, 0.52870791, 0.21836939,
       0.53505228, 0.22144744, 0.73673169, 0.40500758, 0.60504297,
       0.21072119, 0.45227108, 0.23749326, 0.39847178, 0.82763577,
       0.56771922, 0.69120847, 0.28657484, 0.2192347 , 0.2032712 ,
       0.57634482, 0.32954238, 0.6651774 , 0.26937528, 0.83323682,
       0.43484145, 0.88365871, 0.23125087, 0.33433749, 0.34451397,
       0.69915101, 0.6549938 , 0.29244583, 0.79186052, 0.20752232,
       0.26838009, 0.08710411, 0.22144744, 0.73215219, 0.30536526,
       0.22144744, 0.2900789 , 0.90443841, 0.46065771, 0.60175

## Save the Model ##

In [31]:
import pickle

In [33]:
# Saving the model: serialize the fitted logistic regression to a file named 'model'
# in the current working directory (an existing file of that name is overwritten).
with open('model' , 'wb') as file:
    pickle.dump(reg , file)

In [34]:
# Saving the absenteeism_scaler: serialize the fitted CustomScaler to a file named 'scaler'.
# NOTE: pickle stores the class by reference, so code that loads this file needs a
# matching CustomScaler definition available - TODO confirm the loading module provides it.
with open('scaler' , 'wb') as file:
    pickle.dump(absenteeism_scaler , file)