#This project is a data science project to analyze a csv file that contains employee absenteeism. It will use a logistic regression Machine Learning model to predict the absenteeism of employees.

##This part of the project focuses on building the ML model using logistic regression to predict absenteeism

In [1]:
import pandas as pd
import numpy as np

In [2]:
#Load the preprocessed absenteeism data
#NOTE(review): absolute path (looks like a Colab location) - adjust it when running elsewhere
data_preprocessed = pd.read_csv('/content/Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


##Create the targets

In [4]:
#We will create 2 classes: moderately absent and excessively absent
#The median is the cut-off: values at or below it are moderate, values above it are excessive

data_preprocessed['Absenteeism Time in Hours'].median()
#moderate: <= 3 hours, will have the value of 0 (target)
#excessive: > 3 hours, will have the value of 1 (target)

3.0

In [5]:
#np.where(condition, value if true, value if false)
#1 = hours above the median (excessive), 0 = otherwise (moderate)
absence_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
#Attach the 0/1 targets to the data frame as a new column and display the result
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2,0


###Validating targets and making sure they're balanced

In [7]:
#Finding the share of 1s (excessive absenteeism) among the targets
targets.sum()/targets.shape[0]
#About 45.6% are 1s and 54.4% are 0s

0.45571428571428574

In [8]:
#As the targets are balanced, we drop 'Absenteeism Time in Hours' (the column the targets were built from)
#'Day of the Week', 'Daily Work Load Average' and 'Distance to Work' are dropped in the same call
#(see the backward elimination notes further down)
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours', 'Day of the Week',
                                            'Daily Work Load Average', 'Distance to Work'], axis =1)
data_with_targets

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0,1
696,1,0,0,0,5,225,28,24,0,1,2,0
697,1,0,0,0,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


##Selecting inputs for regression

In [9]:
data_with_targets.shape

(700, 12)

In [10]:
#All rows, all columns except the last one, which holds the targets ('Excessive Absenteeism')
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [11]:
#Keep every column except the last one (the targets) as the model inputs, before any scaling
unscaled_inputs = data_with_targets.iloc[:,:-1]

##Standardizing the data
####We need to create a custom scaler to avoid scaling the dummy variables (Reasons) and Education

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler


class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    The selected columns are scaled by an internal ``StandardScaler``; all
    other columns are returned untouched. The output keeps the column order
    and the row index of the input DataFrame.

    Parameters
    ----------
    columns : list of column labels
        The columns of ``X`` that should be standardized.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``X[columns]`` by ``fit``.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the fitted
        columns; ``None`` until ``fit`` has been called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Initialize StandardScaler with keyword arguments
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Reduce explicitly along axis 0 so the statistics are always
        # per-column, independent of how np.mean/np.var dispatch on a
        # DataFrame in the installed pandas version. ddof=0 matches np.var.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility but are
        not used.
        """
        # Record the initial order of the columns
        init_col_order = X.columns

        # Scale only the specified columns. The result reuses X's index:
        # pd.concat(axis=1) aligns rows on the index, so a fresh 0..n-1 index
        # would mismatch any X whose index is not the default range (for
        # example a shuffled train/test subset) and introduce NaN rows.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # Declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # Return a DataFrame containing all scaled and not scaled features in the original order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [13]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [14]:
#Columns that must stay unscaled: the Reason dummies and Education
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

In [15]:
#Every input column that is not explicitly omitted gets scaled
columns_to_scale = [
    col for col in unscaled_inputs.columns.values
    if col not in columns_to_omit
]

In [16]:
#NOTE(review): this instance is only fitted in the next cell; the scaling and the pickling
#further down use a second instance named absenteeism_scalar
absenteeism_scaler = CustomScaler(columns_to_scale)

In [17]:
absenteeism_scaler.fit(unscaled_inputs)

In [18]:
absenteeism_scalar = CustomScaler(columns_to_scale)
#fit() computes the mean and standard deviation of the columns_to_scale columns of unscaled_inputs
absenteeism_scalar.fit(unscaled_inputs)

In [19]:
#transform() does the actual scaling: for each column in columns_to_scale it subtracts the mean
#and divides by the standard deviation; the omitted columns are passed through unchanged
scaled_inputs = absenteeism_scalar.transform(unscaled_inputs)
scaled_inputs


Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


###The above output shows all columns scaled in respect to themselves, except for the columns we omitted

In [20]:
scaled_inputs.shape

(700, 11)

##Splitting the data

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
#train_test_split(inputs, targets)
train_test_split(scaled_inputs, targets)

#The output is a list of 4 arrays, in this order:
# array1: training data set with inputs
# array2: test data set with inputs
# array3: training data set with targets
# array4: test data set with targets

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 490         0         0         0         1     0.468236   
 121         0         0         0         1    -1.530333   
 67          0         0         0         1     1.039256   
 191         1         0         0         0    -0.673803   
 288         1         0         0         0     0.753746   
 ..        ...       ...       ...       ...          ...   
 621         0         0         0         1    -0.959313   
 437         0         0         0         1    -0.388293   
 245         0         0         0         1     0.182726   
 295         1         0         0         0     1.039256   
 644         0         0         0         1    -0.959313   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 490                0.190942  0.091435         0.532229          1 -0.019280   
 121               -1.574681  0.091435         0.297027          0 -0.919030   
 67                 0.0400

In [23]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)
print (x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

#The x (input) train and test are a matrix
#The y (target) train and test are a vector (the target column)

#Training is 80% and testing is 20%
#random_state is used to ensure the shuffling always happens in the same random way
#NOTE(review): the scaler was fitted on all rows before this split, so the test rows
#contributed to the scaling statistics

(560, 11) (560,)
(140, 11) (140,)


##Logistic regression with sklearn

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

##Training the model

In [25]:
#Create a logistic regression model with its default settings
reg = LogisticRegression()
reg.fit(x_train, y_train) #train using the training inputs and targets

In [26]:
#score(inputs, targets) returns the mean accuracy
reg.score(x_train, y_train)

0.7732142857142857

##Manually checking the accuracy

In [27]:
#Finding the predicted output of the training input
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [28]:
#Printing the target output
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [29]:
#Comparing the predicted output from input, and the targets
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [30]:
#Finding the number of correct predictions (True entries) through the sum
np.sum(model_outputs == y_train)

433

In [31]:
#Finding the accuracy

#Number of correct predictions: each True entry counts as 1 when summed
correctOutputs = np.sum(model_outputs == y_train)

#Total number of outputs
totalOutputs = model_outputs.shape[0]

#Accuracy = correct predictions / all predictions
accuracy = correctOutputs/totalOutputs
accuracy

0.7732142857142857

##Finding the Intercept and Coefficients

In [32]:
#Finding the intercept
reg.intercept_

array([-1.6474549])

In [33]:
#Finding the coefficient
reg.coef_

array([[ 2.80019733,  0.95188356,  3.11555338,  0.83900082,  0.1589299 ,
         0.60528415, -0.16989096,  0.27981088, -0.21053312,  0.34826214,
        -0.27739602]])

In [34]:
feature_name = unscaled_inputs.columns.values
feature_name

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [35]:
#Building a data frame that pairs every feature with its coefficient
summary_table = pd.DataFrame({'Feature name': feature_name})

#reg.coef_ is a single row, so it is transposed (.T) to become a column
summary_table['Coefficient'] = reg.coef_.T
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.800197
1,Reason_2,0.951884
2,Reason_3,3.115553
3,Reason_4,0.839001
4,Month Value,0.15893
5,Transportation Expense,0.605284
6,Age,-0.169891
7,Body Mass Index,0.279811
8,Education,-0.210533
9,Children,0.348262


In [36]:
#Adding the intercept at index 0: shift every existing index up by one to free index 0
summary_table.index = summary_table.index +1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]] #[0] takes the value out of the one-element intercept array
summary_table = summary_table.sort_index() #sort rows by index so the intercept comes first
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.647455
1,Reason_1,2.800197
2,Reason_2,0.951884
3,Reason_3,3.115553
4,Reason_4,0.839001
5,Month Value,0.15893
6,Transportation Expense,0.605284
7,Age,-0.169891
8,Body Mass Index,0.279811
9,Education,-0.210533


###Because the features are on comparable scales (the non-dummy ones are standardized), the bigger a weight is in absolute value (positive or negative), the more important the feature is

------------------------

###The output of the logistic regression is a log(odds) value that gets interpreted as a 0 or 1

In [37]:
#The exponential of a coefficient (weight) gives us the odds ratio (OR)
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.647455,0.192539
1,Reason_1,2.800197,16.447892
2,Reason_2,0.951884,2.590585
3,Reason_3,3.115553,22.545903
4,Reason_4,0.839001,2.314054
5,Month Value,0.15893,1.172256
6,Transportation Expense,0.605284,1.831773
7,Age,-0.169891,0.843757
8,Body Mass Index,0.279811,1.32288
9,Education,-0.210533,0.810152


In [38]:
#Sorting the rows by odds ratio, largest first
#sort_values() returns a sorted copy; summary_table itself is not modified
summary_table.sort_values('Odds_ratio', ascending =False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,3.115553,22.545903
1,Reason_1,2.800197,16.447892
2,Reason_2,0.951884,2.590585
4,Reason_4,0.839001,2.314054
6,Transportation Expense,0.605284,1.831773
10,Children,0.348262,1.416604
8,Body Mass Index,0.279811,1.32288
5,Month Value,0.15893,1.172256
7,Age,-0.169891,0.843757
9,Education,-0.210533,0.810152


###The further the OR is from 1, the more important the feature/variable is
####If the coefficient is around 0, or equivalently the OR is around 1 --> not an important feature

####If a weight is near 0, multiplying it by the variable gives a result near 0 too, so the feature has very little effect

#### ODDS * ODDS RATIO = NEW ODDS
#### 5:1  *     2      =   10:1
#### 5:1  *     0.2    =   1:1
#### 5:1  *     1      =   5:1 (no change)

_______________
###Features that are not important and may be dropped are:
* Daily Work Load Average
* Distance to Work
* Day of the Week

________________
###As we deleted Reason_0 (No given reason for absence), the base model is when there is no reason.

##Example: dummy (non-standardized) feature

####For example, a person with Reason_3 (poisoning) has odds of being excessively absent that are about 22.5 times higher than those of a person with no given reason.

##### Reason_1: various diseases
##### Reason_2: pregnancy and giving birth
##### Reason_3: poisoning
##### Reason_4: light diseases

##Example: standardized feature with a positive coefficient

####For a standardized feature, one unit corresponds to one standard deviation. For example, for each standard deviation increase in Transportation Expense, the odds of being excessively absent are multiplied by about 1.8, i.e. they almost double.

##Example: standardized feature with a negative coefficient

####For example, for the Pets feature (OR ≈ 0.76), each additional standardized unit of Pets makes the odds 1-OR = 24% lower than the base model (no pet). One possible explanation: if a person has several pets, another person is probably also taking care of them, not just this individual.



###The intercept calibrates the model to give more accurate predictions

###Standardized models almost always yield higher accuracy because the optimization algorithm works better this way.

---------------

##Backward Elimination: simplifying the model through removing all features with minimal contribution

###With respect to p-values, we get rid of all coefficients with p-values > 0.05

##3 columns were dropped, 'Day of the Week','Daily Work Load Average', 'Distance to Work' in cell 8.

###By simply removing those 3 columns from the .drop() call and running the whole kernel again, we can see that their coefficients are close to 0 and their ORs are close to 1.
--------------


#TESTING THE MODEL






In [39]:
reg.score(x_test, y_test)

0.75

###Based on the testing data (data the model has never seen before), in 75% of the cases the model correctly predicts whether the person is going to be excessively absent

##Test accuracy is usually lower than the train accuracy, because the model was fitted on the training data; this is typical but not guaranteed

In [None]:
#.predict_proba(x) returns the probability for all possible outputs (classes)

In [41]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.71340413, 0.28659587],
       [0.58724228, 0.41275772],
       [0.44020821, 0.55979179],
       [0.78159464, 0.21840536],
       [0.08410854, 0.91589146],
       [0.33487603, 0.66512397],
       [0.29984576, 0.70015424],
       [0.13103971, 0.86896029],
       [0.78625404, 0.21374596],
       [0.74903632, 0.25096368],
       [0.49397598, 0.50602402],
       [0.22484913, 0.77515087],
       [0.07129151, 0.92870849],
       [0.73178133, 0.26821867],
       [0.30934135, 0.69065865],
       [0.5471671 , 0.4528329 ],
       [0.55052275, 0.44947725],
       [0.5392707 , 0.4607293 ],
       [0.40201117, 0.59798883],
       [0.05361575, 0.94638425],
       [0.7003009 , 0.2996991 ],
       [0.78159464, 0.21840536],
       [0.42037128, 0.57962872],
       [0.42037128, 0.57962872],
       [0.24783565, 0.75216435],
       [0.74566259, 0.25433741],
       [0.51017274, 0.48982726],
       [0.85690195, 0.14309805],
       [0.20349733, 0.79650267],
       [0.78159464, 0.21840536],
       [0.

In [42]:
predicted_proba.shape
#The first column is the probability of the observation being 0
#The second column is the probability of it being 1
#The two numbers in each row sum to 1

(140, 2)

In [43]:
#As we are only interested in the probability of 1 (excessive absenteeism), we keep only the second column
predicted_proba[:,1]

array([0.28659587, 0.41275772, 0.55979179, 0.21840536, 0.91589146,
       0.66512397, 0.70015424, 0.86896029, 0.21374596, 0.25096368,
       0.50602402, 0.77515087, 0.92870849, 0.26821867, 0.69065865,
       0.4528329 , 0.44947725, 0.4607293 , 0.59798883, 0.94638425,
       0.2996991 , 0.21840536, 0.57962872, 0.57962872, 0.75216435,
       0.25433741, 0.48982726, 0.14309805, 0.79650267, 0.21840536,
       0.36956558, 0.67906035, 0.68502567, 0.52868083, 0.21840536,
       0.53506551, 0.22147081, 0.73692105, 0.40498044, 0.60505988,
       0.21075848, 0.45224466, 0.23751292, 0.39833498, 0.82755447,
       0.56797575, 0.69113325, 0.28659587, 0.21935267, 0.2033097 ,
       0.57628256, 0.3294664 , 0.66512397, 0.26949499, 0.83321968,
       0.43491525, 0.88374612, 0.23127072, 0.33415858, 0.34432939,
       0.69909345, 0.65494263, 0.29244941, 0.79200758, 0.20750276,
       0.26840558, 0.08708566, 0.22147081, 0.73245417, 0.30530219,
       0.22147081, 0.29014408, 0.90438191, 0.46061297, 0.60174

#Saving the model

###Using the pickle module, we convert a Python object into a byte stream. The byte stream contains sufficient information to reconstruct the object in a new notebook

In [44]:
import pickle

In [45]:
#open(file name, 'wb' = write bytes)
with open('logisticRegression_Model', 'wb') as file:

  #dump = save, reg = object to be saved
  pickle.dump(reg, file)

###We also need to pickle the scaler, so that all new data is preprocessed with the same rules that were applied to the training data.

In [46]:
#Save the fitted scaler (the instance that produced scaled_inputs) to its own file
with open('logisticRegression_Scaler', 'wb') as file:
  pickle.dump(absenteeism_scalar, file)