# Creating a Logistic Regression

## Import Libraries

In [2]:
import pandas as pd
import numpy as np

## Load the Data

In [3]:
data_preprocessed = pd.read_csv("Absenteeism_preprocessed.csv")

## Create the Targets

In [4]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [5]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 3, 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head(10)

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0
5,0,0,0,1,7,4,179,51,38,239.554,31,0,0,0,2,0
6,0,0,0,1,7,4,361,52,28,239.554,27,0,1,4,8,1
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0,4,1
8,0,0,1,0,7,0,155,12,34,239.554,25,0,2,0,40,1
9,0,0,0,1,7,0,235,11,37,239.554,29,1,1,1,8,1


In [7]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Day of the Week', 'Distance to Work', 'Daily Work Load Average'], axis = 1)
data_with_targets.head(10)

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
5,0,0,0,1,7,179,38,31,0,0,0,0
6,0,0,0,1,7,361,28,27,0,1,4,1
7,0,0,0,1,7,260,36,23,0,4,0,1
8,0,0,1,0,7,155,34,25,0,2,0,1
9,0,0,0,1,7,235,37,29,1,1,1,1


## Inputs for regression

In [8]:
data_with_targets.shape

(700, 12)

In [9]:
unscaled_inputs = data_with_targets.iloc[:,:-1]
unscaled_inputs.head(10)

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
5,0,0,0,1,7,179,38,31,0,0,0
6,0,0,0,1,7,361,28,27,0,1,4
7,0,0,0,1,7,260,36,23,0,4,0
8,0,0,1,0,7,155,34,25,0,2,0
9,0,0,0,1,7,235,37,29,1,1,1


## Standarize the data

In [10]:
# from sklearn.preprocessing import StandardScaler

# absenteeism_scaler = StandardScaler()
# absenteeism_scaler.fit(unscaled_inputs)

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomerScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self,columns):   
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler()
        # with some columns 'twist'
        self.columns = columns

    # the fit method, which, again based on StandardScale
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self

    # the transform method which does the actual scaling
    def transform(self, X, y=None):
        # record the initial order of the columns
        init_col_order = X.columns

        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
       
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]

        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]



In [12]:
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [13]:
columns_to_scale = ['Month Value', 'Transportation Expense', 'Body Mass Index','Children', 'Pets']

In [14]:
absenteeism_scaler = CustomerScaler(columns_to_scale)
absenteeism_scaler.fit(unscaled_inputs)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [15]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,33,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,50,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,38,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,39,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,33,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,40,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,28,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,28,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,32,-0.408580,1,-0.919030,-0.589690


## Train Test Split

In [16]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=42)

In [17]:
x_train.shape, y_train.shape

((560, 11), (560,))

In [18]:
x_test.shape, y_test.shape

((140, 11), (140,))

## Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [20]:
reg = LogisticRegression()
reg.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
reg.score(x_train, y_train)

0.7732142857142857

### Manually check the accuracy

In [22]:
model_outputs = reg.predict(x_train)
model_outputs

array([1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,

In [23]:
y_train

array([1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,

In [24]:
model_outputs == y_train

array([ True,  True,  True,  True, False, False, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True, False,  True, False,  True,
        True, False, False,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True, False, False,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [25]:
model_outputs.shape

(560,)

In [26]:
# Same accuracy as using LogisticRegression().score()

np.sum(model_outputs == y_train) / model_outputs.shape[0]

0.7732142857142857

## Finding the intercepts

In [27]:
reg.intercept_

array([-0.14245804])

In [28]:
reg.coef_

array([[ 2.83703405,  0.69242775,  2.95844212,  0.90586181,  0.07897978,
         0.63561281, -0.04089584,  0.265626  , -0.24384376,  0.38791732,
        -0.28982151]])

In [29]:
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [30]:
features_names = unscaled_inputs.columns.values

In [31]:
summary_table = pd.DataFrame(columns = ['Feature name'], data = features_names)
summary_table["Coefficient"] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason 1,2.837034
1,Reason 2,0.692428
2,Reason 3,2.958442
3,Reason 4,0.905862
4,Month Value,0.07898
5,Transportation Expense,0.635613
6,Age,-0.040896
7,Body Mass Index,0.265626
8,Education,-0.243844
9,Children,0.387917


In [32]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()

In [33]:
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.142458
1,Reason 1,2.837034
2,Reason 2,0.692428
3,Reason 3,2.958442
4,Reason 4,0.905862
5,Month Value,0.07898
6,Transportation Expense,0.635613
7,Age,-0.040896
8,Body Mass Index,0.265626
9,Education,-0.243844


## Interpreting the coefficients

In [34]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.142458,0.867224
1,Reason 1,2.837034,17.065076
2,Reason 2,0.692428,1.998562
3,Reason 3,2.958442,19.267931
4,Reason 4,0.905862,2.474063
5,Month Value,0.07898,1.082182
6,Transportation Expense,0.635613,1.888179
7,Age,-0.040896,0.959929
8,Body Mass Index,0.265626,1.304247
9,Education,-0.243844,0.78361


In [35]:
summary_table.sort_values('Odds_ratio', ascending= False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason 3,2.958442,19.267931
1,Reason 1,2.837034,17.065076
4,Reason 4,0.905862,2.474063
2,Reason 2,0.692428,1.998562
6,Transportation Expense,0.635613,1.888179
10,Children,0.387917,1.473908
8,Body Mass Index,0.265626,1.304247
5,Month Value,0.07898,1.082182
7,Age,-0.040896,0.959929
0,Intercept,-0.142458,0.867224


## Testing the model

In [36]:
reg.score(x_test, y_test)

0.7785714285714286

In [37]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.83268237, 0.16731763],
       [0.84973871, 0.15026129],
       [0.80080357, 0.19919643],
       [0.59425944, 0.40574056],
       [0.57520274, 0.42479726],
       [0.08156587, 0.91843413],
       [0.67119987, 0.32880013],
       [0.35369545, 0.64630455],
       [0.69645053, 0.30354947],
       [0.73076705, 0.26923295],
       [0.86357441, 0.13642559],
       [0.67073471, 0.32926529],
       [0.26720729, 0.73279271],
       [0.44637682, 0.55362318],
       [0.72303475, 0.27696525],
       [0.49544438, 0.50455562],
       [0.88490034, 0.11509966],
       [0.21286303, 0.78713697],
       [0.86620933, 0.13379067],
       [0.58618333, 0.41381667],
       [0.72908449, 0.27091551],
       [0.73518044, 0.26481956],
       [0.6756978 , 0.3243022 ],
       [0.67569545, 0.32430455],
       [0.85729394, 0.14270606],
       [0.16409606, 0.83590394],
       [0.59707892, 0.40292108],
       [0.58068274, 0.41931726],
       [0.75236493, 0.24763507],
       [0.6024917 , 0.3975083 ],
       [0.

In [38]:
predicted_proba[:,1]

array([0.16731763, 0.15026129, 0.19919643, 0.40574056, 0.42479726,
       0.91843413, 0.32880013, 0.64630455, 0.30354947, 0.26923295,
       0.13642559, 0.32926529, 0.73279271, 0.55362318, 0.27696525,
       0.50455562, 0.11509966, 0.78713697, 0.13379067, 0.41381667,
       0.27091551, 0.26481956, 0.3243022 , 0.32430455, 0.14270606,
       0.83590394, 0.40292108, 0.41931726, 0.24763507, 0.3975083 ,
       0.12151086, 0.14770637, 0.59866099, 0.56473949, 0.28150354,
       0.64220492, 0.3243022 , 0.13939148, 0.8503496 , 0.20205386,
       0.51582521, 0.26045273, 0.63592843, 0.12864988, 0.23121499,
       0.72543969, 0.75733321, 0.88611735, 0.31449898, 0.1311989 ,
       0.26923295, 0.31938062, 0.42479726, 0.92923189, 0.14489012,
       0.23121499, 0.97512052, 0.28150354, 0.8668144 , 0.22643967,
       0.57991946, 0.12393854, 0.50757707, 0.64630455, 0.13642559,
       0.43033666, 0.68456002, 0.06304092, 0.27247237, 0.50635737,
       0.26923295, 0.24763507, 0.69370979, 0.31938062, 0.13727

## Save the model

In [39]:
import pickle

with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [40]:
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)

In [None]:
# END.