#                Creating a logistic regression to predict absenteeism

## importing the relevant libraries

In [1]:
import pandas as pd
import numpy as np

## load the pre-processed data

In [2]:
data_preprocessed = pd.read_csv("Absenteeism_preprocessed.csv")

In [3]:
data_preprocessed.head(10)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
5,0,0,0,1,7,4,179,51,38,239.554,31,0,0,0,2
6,0,0,1,0,7,4,361,52,28,239.554,27,0,1,4,8
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0,4
8,0,0,1,0,7,0,155,12,34,239.554,25,0,2,0,40
9,0,0,1,0,7,0,235,11,37,239.554,29,1,1,1,8


## create targets

In [4]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

    We are going to categorize into 2 categories:
    <= 3 hrs --> moderately absent
    > 3 hrs --> excessively absent

In [5]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours']>data_preprocessed['Absenteeism Time in Hours'].median(),1,0)
                        
                            ##### excessively absent = 1, moderately absent = 0 #####

In [6]:
targets


array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [7]:
len(targets)

700

In [8]:
data_preprocessed['Excessive Absenteeism'] = targets

In [9]:
data_preprocessed.head(20)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0
5,0,0,0,1,7,4,179,51,38,239.554,31,0,0,0,2,0
6,0,0,1,0,7,4,361,52,28,239.554,27,0,1,4,8,1
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0,4,1
8,0,0,1,0,7,0,155,12,34,239.554,25,0,2,0,40,1
9,0,0,1,0,7,0,235,11,37,239.554,29,1,1,1,8,1


In [10]:
data_preprocessed['Excessive Absenteeism'].value_counts()

0    381
1    319
Name: Excessive Absenteeism, dtype: int64

In [11]:
targets.sum()/targets.shape[0]  ## fraction of Excessive absenteeism

0.45571428571428574

In [12]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours','Day of the Week','Daily Work Load Average','Distance to Work'],axis = 1)


In [13]:
data_with_targets is data_preprocessed

False

In [14]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [15]:
data_with_targets.iloc[:,:14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0,1
696,1,0,0,0,5,225,28,24,0,1,2,0
697,1,0,0,0,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


In [16]:
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [17]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

## standardize the data  -- like normalize , subtract the mean and divide each data point by the standard deviation




In [18]:
# from sklearn.preprocessing import StandardScaler

In [19]:
# absenteeism_scaler = StandardScaler()

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator,TransformerMixin):
    def __init__(self,columns,copy=True, with_mean=True,with_std =True):
        self.scaler=StandardScaler(copy,with_mean, with_std)
        self.columns=columns
        self.mean_=None
        self.var_=None
    def fit(self,X,y=None):
        self.scaler.fit (X[self.columns],y)
        self.mean_= np.mean(X[self.columns])
        self.var_= np.var(X[self.columns])
        return self
    def transform(self, X, y=None,copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]),columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled],axis=1)[init_col_order]
    
            

In [21]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [22]:
columns_to_scale = [ 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets']
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [23]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [24]:
absenteeism_scaler.fit(unscaled_inputs)

CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [25]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [26]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [27]:
scaled_inputs.shape

(700, 11)

## split the data into train and test and shuffle

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
## split ##


train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 615         0         0         0         1    -1.244823   
 394         0         0         0         1    -0.959313   
 397         0         0         1         0    -0.959313   
 426         0         0         0         1    -0.388293   
 624         0         0         0         1    -0.959313   
 ..        ...       ...       ...       ...          ...   
 505         0         0         0         1     0.753746   
 676         0         0         0         1    -0.388293   
 515         0         0         0         1     1.039256   
 632         0         0         0         1    -0.959313   
 564         0         0         1         0     1.610276   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 615               -0.654143  0.248310         1.002633          0 -0.919030   
 394               -0.654143  0.248310         1.002633          0 -0.919030   
 397                2.3489

In [30]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)

In [31]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [32]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


## Logistic Regression from sklearn

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


## training the model

In [34]:
reg = LogisticRegression()
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
reg.score(x_train,y_train)

0.7946428571428571

## manually calculating accuracy, buildng score method from scratch


In [36]:
model_outputs = reg.predict(x_train)

In [37]:
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [38]:
y_train


array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [39]:
np.sum(model_outputs == y_train)

445

In [40]:
model_outputs.shape[0]

560

In [41]:
accuracy_score = np.sum(model_outputs == y_train)/ model_outputs.shape[0]

In [42]:
accuracy_score



0.7946428571428571

In [43]:
##Finding the intercept and coefficients for the columns in the dataframe x_train


In [44]:
reg.intercept_

array([-1.40677819])

In [45]:
reg.coef_

array([[ 2.57106488,  0.8661636 ,  3.2491809 ,  0.41091309,  0.20838244,
         0.54246904, -0.16221097,  0.24897038, -0.39633308,  0.30160675,
        -0.42233869]])

In [46]:
## now what columns weights does these coefficients signify, we don't know yet

In [47]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [48]:
feature_name = unscaled_inputs.columns.values


In [49]:
## now storing in a table the features and their respective coefficients
summary_table = pd.DataFrame( columns=['Feature Name'], data=feature_name)
summary_table['Coeffs'] = reg.coef_.transpose()
summary_table

Unnamed: 0,Feature Name,Coeffs
0,Reason_1,2.571065
1,Reason_2,0.866164
2,Reason_3,3.249181
3,Reason_4,0.410913
4,Month Value,0.208382
5,Transportation Expense,0.542469
6,Age,-0.162211
7,Body Mass Index,0.24897
8,Education,-0.396333
9,Children,0.301607


In [50]:
summary_table.index=summary_table.index+1
summary_table.loc[0]= ['Intercept' ,reg.intercept_[0]]
summary_table= summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coeffs
0,Intercept,-1.406778
1,Reason_1,2.571065
2,Reason_2,0.866164
3,Reason_3,3.249181
4,Reason_4,0.410913
5,Month Value,0.208382
6,Transportation Expense,0.542469
7,Age,-0.162211
8,Body Mass Index,0.24897
9,Education,-0.396333


In [51]:
summary_table['Odds Ratio']=np.exp(summary_table.Coeffs)

# summary_table=summary_table.drop(index=[0,1,2,3],axis=0)
summary_table.sort_values('Odds Ratio',ascending=False)


Unnamed: 0,Feature Name,Coeffs,Odds Ratio
3,Reason_3,3.249181,25.769224
1,Reason_1,2.571065,13.079745
2,Reason_2,0.866164,2.377771
6,Transportation Expense,0.542469,1.720249
4,Reason_4,0.410913,1.508194
10,Children,0.301607,1.352029
8,Body Mass Index,0.24897,1.282704
5,Month Value,0.208382,1.231684
7,Age,-0.162211,0.850262
9,Education,-0.396333,0.672783


# Testing the model

In [52]:
reg.score(x_test,y_test)

0.7642857142857142

In [53]:
predicted_proba =  reg.predict_proba(x_test)
predicted_proba

array([[0.70960135, 0.29039865],
       [0.64311027, 0.35688973],
       [0.47027787, 0.52972213],
       [0.79728493, 0.20271507],
       [0.08629627, 0.91370373],
       [0.31200957, 0.68799043],
       [0.33085348, 0.66914652],
       [0.10880133, 0.89119867],
       [0.84822989, 0.15177011],
       [0.75610265, 0.24389735],
       [0.10362566, 0.89637434],
       [0.02355373, 0.97644627],
       [0.0542    , 0.9458    ],
       [0.20616226, 0.79383774],
       [0.24354982, 0.75645018],
       [0.63807109, 0.36192891],
       [0.72674514, 0.27325486],
       [0.12790705, 0.87209295],
       [0.41673074, 0.58326926],
       [0.04094254, 0.95905746],
       [0.75506892, 0.24493108],
       [0.79728493, 0.20271507],
       [0.35043875, 0.64956125],
       [0.35043875, 0.64956125],
       [0.21484424, 0.78515576],
       [0.80585655, 0.19414345],
       [0.55117401, 0.44882599],
       [0.87078291, 0.12921709],
       [0.12511938, 0.87488062],
       [0.79728493, 0.20271507],
       [0.

In [54]:
predicted_proba[:,1]

array([0.29039865, 0.35688973, 0.52972213, 0.20271507, 0.91370373,
       0.68799043, 0.66914652, 0.89119867, 0.15177011, 0.24389735,
       0.89637434, 0.97644627, 0.9458    , 0.79383774, 0.75645018,
       0.36192891, 0.27325486, 0.87209295, 0.58326926, 0.95905746,
       0.24493108, 0.20271507, 0.64956125, 0.64956125, 0.78515576,
       0.19414345, 0.44882599, 0.12921709, 0.87488062, 0.20271507,
       0.88198855, 0.63607243, 0.71294523, 0.90691736, 0.20271507,
       0.92816441, 0.15958897, 0.80947054, 0.3240494 , 0.54972713,
       0.1932692 , 0.44430482, 0.17620132, 0.44735958, 0.80149984,
       0.69408034, 0.70836325, 0.29039865, 0.2033095 , 0.18416181,
       0.60605006, 0.36321244, 0.68799043, 0.26722156, 0.82796351,
       0.38464019, 0.85822216, 0.20879585, 0.39870239, 0.41304794,
       0.68420425, 0.6750786 , 0.28582828, 0.83110026, 0.14763137,
       0.26650277, 0.13681965, 0.15958897, 0.81344178, 0.83008032,
       0.15958897, 0.3394673 , 0.91202991, 0.39227776, 0.51937

# Save the model

In [55]:
import pickle

In [56]:
with open('linear_reg_model','wb') as file:
    pickle.dump(reg,file)

In [57]:
with open('absenteeism_scaler_object','wb') as file:
    pickle.dump(absenteeism_scaler,file)