In [1]:
import pandas as pd 
import numpy as np 

In [2]:
data_preprocessed=pd.read_csv("C:/Users/Yasmine Mohsen/Desktop/DataScience-Python/Absenteeism_preprocessed.csv")
data_preprocessed.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


#### Create the targets for the logistic regression model

In [3]:
# we will take the median value of the "Absenteeism Time in Hours" column as the threshold to classify employees --> we will use it as a cut-off line.
# everything above the median (>=4) will be considered as 'excessively absent' (will be assigned a value of 1) , 
# and everything below the median (<=3) will be considered as 'moderately absent' (will be assigned a value of 0)
# these are the targets for our logistic regression model
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [4]:
targets=np.where(data_preprocessed['Absenteeism Time in Hours']>3,1,0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [5]:
# we can perform the same operation with the "Absenteeism Time in Hours" column to create a binary classification problem in this way:
# this way we minimize the risk of any mistakes 
targets=np.where(data_preprocessed['Absenteeism Time in Hours']>
                 data_preprocessed['Absenteeism Time in Hours'].median(),1,0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
# let's add the targets to our preprocessed data
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head()
# now the dataset is balanced since the targets are now equally distributed (around 50% 0s and 50% 1s)
# this will prevent our model from learning to output only 0s or only 1s

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


#### A comment on targets

In [7]:
# we'll divide the nb of targets that are 1s on the total number of targets
targets.sum() / targets.shape[0]
# targets.sum() gets the number of 1s in the targets array
# targets.shape[0] gets the total number of targets in the array
# 45% --> the two groups have been distributed roughly equally

0.45571428571428574

#### Drop the absenteeism time in hours column as it's not needed anymore

In [8]:
data_with_targets=data_preprocessed.drop(['Absenteeism Time in Hours','Day of the week','Daily Work Load','Distance to Work'],axis=1)
data_with_targets is data_preprocessed
# ^ to find out if data_with_targets is the same as data_preprocessed, it should return True 
# otherwise it will return False  ( --> it's a checkpoint)
# data_with_targets is the same as data_preprocessed but with the "Absenteeism Time in Hours", ,'Day of the week','Daily Work Load','Distance to Work' column dropped

False

In [9]:
data_with_targets.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


#### Select the inputs for the regression model

In [10]:
data_with_targets.shape

(700, 12)

In [11]:
# the iloc method is used for integer-location based indexing / selection by position when given the wanted rows/columns
# to select the inputs (the features) for the regression model, we use all columns except the last one ( which is the target)

inputs=data_with_targets.iloc[:,0:14] # 14 is the index of the last column (Excessive Absenteeism)
inputs.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [12]:
# if we don't exactly know the index of the target column, we can use the following code:
inputs=data_with_targets.iloc[:,:-1] # :-1 means all columns except the last one
inputs.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1


In [13]:
# store the results
unscaled_inputs=data_with_targets.iloc[:,:-1]

#### Standardize the data

###### first method that we have tried but there's a better one

In [14]:
#from sklearn.preprocessing import StandardScaler
# we create the absenteeism_scaler: an empty StandardScaler object
#absenteeism_scaler = StandardScaler()
# we will use absenteeism_scaler to standardize our inputs --> all features will have a mean of 0 and a standard deviation of 1
# --> we will use this scaler to substract the mean and divide by the standard deviation of each feature

In [15]:
# fit our input data
# => calculate and store the mean and standard deviation of each feature
#absenteeism_scaler.fit(inputs)

In [16]:
# to finish scaling our inputs, we will use the transform method of the absenteeism_scaler
# transform() does the actual scaling
#scaled_inputs=absenteeism_scaler.transform(inputs)
#scaled_inputs

In [17]:
#scaled_inputs.shape
# 700 observations and 14 features (after dropping the target) have been standardized

###### Second method which is preferred

In [18]:
# when we standardized the features, we also standardized the dummies which is not best practice --> we lose the whole interpretability of a dummy
# so now we will do some corrections to our code
# we no longer need these two lines of code which we have written earlier in the Standardization of the Data steps
# from sklearn.preprocessing import StandardScaler
# absenteeism_scaler = StandardScaler() 
# so now we will do it all over again ( so we will keep everything written before those two lines and change what we have written after them)

# Create a CustomScaler class :
# this will not standardize all the inputs, but only the specified ones so we can preserve the dummies untouched

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

absenteeism_scaler = StandardScaler()

class CustomScaler(BaseEstimator, TransformerMixin):
    def __init__(self, columns, with_mean=True, with_std=True):
        self.scaler = StandardScaler(with_mean=with_mean, with_std=with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns])
        self.mean_ = np.mean(X[self.columns], axis=0)  # Specifying axis=0 for column-wise mean
        self.var_ = np.var(X[self.columns], axis=0)  # Specifying axis=0 for column-wise variance
        return self
    
    def transform(self, X):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
    
    def get_params(self, deep=True):
        return {"columns": self.columns, "with_mean": self.scaler.with_mean, "with_std": self.scaler.with_std}
    
    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
        return self

In [19]:
unscaled_inputs.columns.values

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [20]:
# columns_to_scale = ['Month Value','Day of the Week', 'Transportation Expense', 'Distance to Work',
#                     'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

In [21]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [22]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [23]:
absenteeism_scaler.fit(unscaled_inputs)

In [24]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,-0.577350,-0.092981,-0.314485,0.821365,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,-0.577350,-0.092981,-0.314485,-1.217485,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,-0.577350,-0.092981,-0.314485,0.821365,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1.732051,-0.092981,-0.314485,-1.217485,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,-0.577350,-0.092981,-0.314485,0.821365,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1.732051,-0.092981,-0.314485,-1.217485,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1.732051,-0.092981,-0.314485,-1.217485,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1.732051,-0.092981,-0.314485,-1.217485,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,-0.577350,-0.092981,-0.314485,0.821365,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


### Split the data into train and test sets and shuffle the data

In [25]:
from sklearn.model_selection import train_test_split

Split

In [26]:
train_test_split(scaled_inputs, targets)
# array1 : a training dataset with inputs
# array2 : a training dataset with targets
# array3 : a testing dataset with inputs
# array4 : a testing dataset with targets

[     Reason 1   Reason 2  Reason 3  Reason 4  Month Value  \
 173  1.732051  -0.092981 -0.314485 -1.217485    -0.959313   
 275 -0.577350  -0.092981 -0.314485  0.821365     0.753746   
 498 -0.577350  -0.092981 -0.314485  0.821365     0.753746   
 488 -0.577350  -0.092981 -0.314485  0.821365     0.468236   
 78  -0.577350  -0.092981 -0.314485  0.821365     1.039256   
 ..        ...        ...       ...       ...          ...   
 124 -0.577350  -0.092981 -0.314485  0.821365    -1.530333   
 29  -0.577350  -0.092981 -0.314485  0.821365     0.468236   
 335 -0.577350  10.754844 -0.314485 -1.217485     1.324766   
 483 -0.577350  -0.092981 -0.314485  0.821365     0.468236   
 438 -0.577350  -0.092981 -0.314485  0.821365    -0.388293   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 173               -0.654143  0.248310         1.002633          0 -0.919030   
 275                1.036026  0.562059        -0.408580          0 -0.019280   
 498          

In [27]:
# declare 4 variables to store the results
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs,targets)

In [28]:
print(x_train.shape, y_train.shape)
# the inputs contain 525 observations along 14 features
# targets are vectors of length 525 ( the column 'Excessive Absenteeism' which is our target)
# --> 14 inputs and 1 target value for each observation
# 525 is 75% of the observations that will help us with training (default in sklearn)

(525, 11) (525,)


In [29]:
print(x_test.shape, y_test.shape)
# this method split the scaled inputs and targets into matching forms that we can now use in the machine learning part
# 175 is 25% of the observations that will help us with testing (default in sklearn)

(175, 11) (175,)


80% training and 20% testing

In [30]:
# to change the train-split ratio, we can adjust the train_size parameter in the train_test_split function
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)
# train_size=0.8 means that 80% of the data will be used for training
# random_state=20 ensures that the split will be the same every time we run --> therefore same accuracy
# ^ shuffles the data

In [31]:
print(x_train.shape, y_train.shape)
# now, 80% of the data (560 observations) is used for training

(560, 11) (560,)


In [32]:
print(x_test.shape, y_test.shape)
# the remaining 20% of the data (140 observations) is used for testing

(140, 11) (140,)


#### Logistic Regression Model with sklearn

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#### Training the model

In [34]:
reg=LogisticRegression()

In [35]:
reg.fit(x_train,y_train)
# this method does all the work of training the logistic regression model (basically all the machine learning)

In [36]:
reg.score(x_train,y_train)
# this method returns the accuracy of the model on the training data (how well it learned the patterns)
# the model is trained well (around 80% accuracy)
# based on the datawe used for training, the model classifies around 80% of the observations correctly

0.7875

#### Now, manually check the accuracy of the model 

In [37]:
# build the score method manually from scratch
# find the outputs of the model and compare them to the actual targets
model_outputs=reg.predict(x_train)
model_outputs
# model_outputs contains the predicted outputs (0s and 1s) for the training data

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [38]:
y_train
# targets

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [39]:
model_outputs==y_train
# model_outputs==y_train returns a boolean array where True means the model predicted the correct target

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [40]:
np.sum(model_outputs==y_train)
# this number represents the number of correct predictions made by the model on the training data
# the model correctly predicted 439 observations 

441

In [41]:
# this is the total number of observations in the training data
y_train.shape[0]

560

In [42]:
reg.score(x_train,y_train)
# this is the default method in sklearn that finds the accuracy of the model (80% in this case)

0.7875

In [43]:
# to find the accuracy, we divide the number of correct predictions by the total number of observations

model_accuracy=np.sum(model_outputs==y_train) / y_train.shape[0]
model_accuracy
# this is the manually calculated accuracy of the model on the training data

0.7875

### Finding the coefficients and the intercept
#### to use this logistic regression model outside of Python (ex. in tableau, SQL...), we must know the coefficients and the intercept of the model

In [44]:
reg.intercept_

array([-0.17130612])

In [45]:
reg.coef_

array([[ 2.0693389 ,  0.33451181,  1.56099711,  1.31403939,  0.18534521,
         0.69064428, -0.19811719,  0.32822984, -0.31414434,  0.37219726,
        -0.32452265]])

In [46]:
# we want to know which variables these coefficients refer to
# we can get the coefficients from the name of our inputs, column values
feature_names=unscaled_inputs.columns.values
feature_names

array(['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [47]:
# create a dataframe that will contain the intercept,the feature names, and the corresponding coefficients

summary_table=pd.DataFrame(columns=['Feature Name'],data=feature_names)

# match the coefficients to the feature names
summary_table['Coefficients']=np.transpose(reg.coef_)
# ^by default, ND arrays are rows and not columns so we need to transpose the coefficients using np.transpose()
# ^ this will create a new column in the summary_table called 'Coefficients' and fill it with the coefficients
summary_table

Unnamed: 0,Feature Name,Coefficients
0,Reason 1,2.069339
1,Reason 2,0.334512
2,Reason 3,1.560997
3,Reason 4,1.314039
4,Month Value,0.185345
5,Transportation Expense,0.690644
6,Age,-0.198117
7,Body Mass Index,0.32823
8,Education,-0.314144
9,Children,0.372197


In [48]:
# now we need the intercept
summary_table.index=summary_table.index+1
# in this way the index will start from 1 instead of 0 ( we shifted the index by 1)
# so now we can add the intercept to the beginning of the summary_table
# if we didn't do this step, the intercept would be in the end of the summary_table
summary_table.loc[0]=['Intercept',reg.intercept_[0]]
# reg.intercept_[0] returns the intercept as a single value (a float) rather than a 1D array
summary_table=summary_table.sort_index()
# finally, we sort the summary_table by the index (the feature names) in ascending order
summary_table

Unnamed: 0,Feature Name,Coefficients
0,Intercept,-0.171306
1,Reason 1,2.069339
2,Reason 2,0.334512
3,Reason 3,1.560997
4,Reason 4,1.314039
5,Month Value,0.185345
6,Transportation Expense,0.690644
7,Age,-0.198117
8,Body Mass Index,0.32823
9,Education,-0.314144


### Interpreting the coefficients
##### the coefficents are also called weights or importance of the features in the model
##### the intercept is also called bias which is the predicted output when all the features are 0
#### Standardized coefficients: the coefficients of a regression where all variables have been standardized to have a mean of 0 and a standard deviation of 1
#### the bigger the coefficient, the more the corresponding feature is important in predicting the target variable

In [49]:
# all the coefficients that we have refer to the log(odds)
# create a new series in our dataframe that will contain the odds ratio
summary_table['Odds Ratio']=np.exp(summary_table['Coefficients'])
summary_table

Unnamed: 0,Feature Name,Coefficients,Odds Ratio
0,Intercept,-0.171306,0.842564
1,Reason 1,2.069339,7.919586
2,Reason 2,0.334512,1.397258
3,Reason 3,1.560997,4.763569
4,Reason 4,1.314039,3.721175
5,Month Value,0.185345,1.203634
6,Transportation Expense,0.690644,1.995
7,Age,-0.198117,0.820274
8,Body Mass Index,0.32823,1.388508
9,Education,-0.314144,0.730414


In [50]:
summary_table.sort_values('Odds Ratio',ascending=False)
# the odds ratio of a feature tells us how much more likely the target variable is to be 1 (excessive absenteeism) compared to 0 (no excessive absenteeism) when that feature is present in the model 
# .sort_values() sorts the summary_table by the odds ratio in descending order (from most important to least important)
# if a coef is around 0, the odds ratio is around 1, meaning that the feature does not have much impact on the target variable --> it's not important
# if a coef is around 1, the odds ratio is around 2, meaning that the feature has a big impact on the target variable --> it's important
# if the odds ratio is 1, it means that the odds don't change at all --> if we have odds 5:1 (5 to 1) and odds ratio is 1 => new odds=odds x odds ratio => new odds=5:1 --> nothing changes
# for example, the Daily Work Load has odds ratio=0,.99 so it's almost 1 --> it's not important --> the result with or without it would be the same
# the same goes for other features (like day of the week, distance to work, ... )
# this means that the features (Daily Work Load, Distance to Work, Day of the week ... ) are the least important features out of all the other features we have in predicting excessive absenteeism
# we will consider dropping them later on

Unnamed: 0,Feature Name,Coefficients,Odds Ratio
1,Reason 1,2.069339,7.919586
3,Reason 3,1.560997,4.763569
4,Reason 4,1.314039,3.721175
6,Transportation Expense,0.690644,1.995
10,Children,0.372197,1.450919
2,Reason 2,0.334512,1.397258
8,Body Mass Index,0.32823,1.388508
5,Month Value,0.185345,1.203634
0,Intercept,-0.171306,0.842564
7,Age,-0.198117,0.820274


In [51]:
from sklearn.model_selection import train_test_split
train_test_split(scaled_inputs, targets)

[     Reason 1  Reason 2  Reason 3  Reason 4  Month Value  \
 353  1.732051 -0.092981 -0.314485 -1.217485     1.610276   
 338 -0.577350 -0.092981 -0.314485  0.821365     1.324766   
 92   1.732051 -0.092981 -0.314485 -1.217485     1.324766   
 295  1.732051 -0.092981 -0.314485 -1.217485     1.039256   
 274  1.732051 -0.092981 -0.314485 -1.217485     0.753746   
 ..        ...       ...       ...       ...          ...   
 446 -0.577350 -0.092981 -0.314485 -1.217485    -0.102784   
 78  -0.577350 -0.092981 -0.314485  0.821365     1.039256   
 479 -0.577350 -0.092981 -0.314485  0.821365     0.182726   
 34  -0.577350 -0.092981 -0.314485  0.821365     0.468236   
 0   -0.577350 -0.092981 -0.314485  0.821365     0.182726   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 353                0.190942  0.091435         0.532229          1 -0.019280   
 338               -0.654143  0.248310         1.002633          0 -0.919030   
 92                 0.0400

In [52]:
# 80% training, 20% testing
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)

In [53]:
print(x_train.shape, y_train.shape)


(560, 11) (560,)


In [54]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


#### Logistic Regression Model with sklearn


In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
reg=LogisticRegression()
reg.fit(x_train,y_train)

In [56]:
reg.score(x_train,y_train)
# we can see that the accuracy has dropped down a bit, it's 77% now  

0.7875

#### Now, manually check the accuracy of the model 

In [57]:
model_outputs=reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [58]:
y_train
# targets

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [59]:
model_outputs==y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [60]:
np.sum(model_outputs==y_train)

441

In [61]:
model_accuracy=np.sum(model_outputs==y_train) / y_train.shape[0]
model_accuracy

0.7875

#### The coefficience table

In [62]:
summary_table.sort_values('Odds Ratio',ascending=False)
# the odds ratio of a feature tells us how much more likely the target variable is to be 1 (excessive absenteeism) compared to 0 (no excessive absenteeism) when that feature is present in the model 
# .sort_values() sorts the summary_table by the odds ratio in descending order (from most important to least important)
# if a coef is around 0, the odds ratio is around 1, meaning that the feature does not have much impact on the target variable --> it's not important
# if a coef is around 1, the odds ratio is around 2, meaning that the feature has a big impact on the target variable --> it's important
# if the odds ratio is 1, it means that the odds don't change at all --> if we have odds 5:1 (5 to 1) and odds ratio is 1 => new odds=odds x odds ratio => new odds=5:1 --> nothing changes
# for example, the Daily Work Load has odds ratio=0,.99 so it's almost 1 --> it's not important --> the result with or without it would be the same
# the same goes for other features (like day of the week, distance to work, ... )
# this means that the features (Daily Work Load, Distance to Work, Day of the week ... ) are the least important features out of all the other features we have in predicting excessive absenteeism
# we will consider dropping them later on

Unnamed: 0,Feature Name,Coefficients,Odds Ratio
1,Reason 1,2.069339,7.919586
3,Reason 3,1.560997,4.763569
4,Reason 4,1.314039,3.721175
6,Transportation Expense,0.690644,1.995
10,Children,0.372197,1.450919
2,Reason 2,0.334512,1.397258
8,Body Mass Index,0.32823,1.388508
5,Month Value,0.185345,1.203634
0,Intercept,-0.171306,0.842564
7,Age,-0.198117,0.820274


#### Test the model

In [63]:
reg.score(x_test, y_test)
# accuracy of the model on the test set is 73.5%
# the test set is data that the model has never seen before, 
# in 73.5% of the cases, the model will predict correctly if the person is going to be excessively absent or not
# the test accuracy is always less than the training accuracy

0.7357142857142858

In [64]:
predict_proba = reg.predict_proba(x_test)
predict_proba
# first column: the probability of being 0 (not going to be absent)
# second column: the probability of being 1 (going to be excessively absent)

array([[0.70801877, 0.29198123],
       [0.57280047, 0.42719953],
       [0.39705265, 0.60294735],
       [0.78736658, 0.21263342],
       [0.0671913 , 0.9328087 ],
       [0.31202277, 0.68797723],
       [0.28668409, 0.71331591],
       [0.08112727, 0.91887273],
       [0.80015278, 0.19984722],
       [0.74978251, 0.25021749],
       [0.46800836, 0.53199164],
       [0.18496701, 0.81503299],
       [0.04118853, 0.95881147],
       [0.75148546, 0.24851454],
       [0.23908175, 0.76091825],
       [0.53717628, 0.46282372],
       [0.53405914, 0.46594086],
       [0.52087036, 0.47912964],
       [0.40703101, 0.59296899],
       [0.02769275, 0.97230725],
       [0.70225738, 0.29774262],
       [0.78736658, 0.21263342],
       [0.40667291, 0.59332709],
       [0.40667291, 0.59332709],
       [0.17552869, 0.82447131],
       [0.75448179, 0.24551821],
       [0.48923399, 0.51076601],
       [0.87905692, 0.12094308],
       [0.13128385, 0.86871615],
       [0.78736658, 0.21263342],
       [0.

In [65]:
predict_proba.shape

(140, 2)

In [66]:
predict_proba[:,1]
# extracting the second column which represents the probability of being 1 (going to be excessively absent)

array([0.29198123, 0.42719953, 0.60294735, 0.21263342, 0.9328087 ,
       0.68797723, 0.71331591, 0.91887273, 0.19984722, 0.25021749,
       0.53199164, 0.81503299, 0.95881147, 0.24851454, 0.76091825,
       0.46282372, 0.46594086, 0.47912964, 0.59296899, 0.97230725,
       0.29774262, 0.21263342, 0.59332709, 0.59332709, 0.82447131,
       0.24551821, 0.51076601, 0.12094308, 0.86871615, 0.21263342,
       0.37637627, 0.6938945 , 0.71023288, 0.55822895, 0.21263342,
       0.56313291, 0.20844377, 0.80864401, 0.41379297, 0.62968194,
       0.2039086 , 0.42383034, 0.2264448 , 0.10449233, 0.85750221,
       0.6524952 , 0.70504258, 0.29198123, 0.21054105, 0.1954529 ,
       0.56832683, 0.07887842, 0.68797723, 0.26812434, 0.85894047,
       0.45327413, 0.92995967, 0.21732995, 0.08570795, 0.08994664,
       0.70874539, 0.6765062 , 0.286822  , 0.85954661, 0.18956337,
       0.27059283, 0.01389535, 0.20844377, 0.81057774, 0.29010402,
       0.20844377, 0.06802061, 0.92893757, 0.47908278, 0.63908

### Save the model 

In [67]:
import pickle 
# pickling is the process of converting a Python object into a byte stream
with open('model', 'wb') as file: # wb is write bytes
    pickle.dump(reg, file) # dump the info in a file

# we pickled the reg object because it contains the information of our trained logistic regression model
# when we want to unpickle the model we use rb : read bytes     

In [68]:
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)
# the absenteeism_scaler is the scaler that we used to scale the inputs before feeding them into the model