### Creating a logistic regression to predict absenteeism

#### Import the relevant libraries

In [1]:
import pandas as pd
import numpy as np

#### Load the data

In [2]:
data_preprocessed = pd.read_csv('C:/Users/Samuel Mwaniki/Downloads/Compressed/Data Scie/part_8_case_study/S53_L379/df-preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


#### Create the targets

np.where(condition,value if True, value if False)
-.checks if a condition has been satisfied and assigns a value accordingly

In [4]:
# the median value of absenteeism time in hours to use it as a cut-off line
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [5]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median(), 1,0)

In [6]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [7]:
data_preprocessed['Excessive Absenteeism'] = targets

In [8]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


#### A comment on the targets

In [9]:
#the dataset should be balanced for the two outputs: it will prevent the model from learning to output only 0s or only 1s
# we check by dividing the no_ of targets equal to 1 with the sum of all targets
targets.sum() / targets.shape[0]

0.45571428571428574

In [10]:
# dropping the absenteeism time in hours column
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours','Day of the Week','Daily Work Load Average','Distance to Work'],axis=1)

In [11]:
#checking if the data_with_targets is the same as data_preprocessed (False/True?)
data_with_targets is data_preprocessed

False

In [12]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


#### Select the inputs for regression

In [13]:
data_with_targets.shape

(700, 12)

In [14]:
#to select just the first 14 columns
data_with_targets.iloc[:,:14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0,1
696,1,0,0,0,5,225,28,24,0,1,2,0
697,1,0,0,0,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


In [15]:
# select all columns until -1 (same as above) (by putting a - it shows how many columns we want to skip from the end)
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [16]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

#### Standardize the data

In [17]:
#from sklearn.preprocessing import StandardScaler

#absenteeism_scaler = StandardScaler()

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler(copy,with_mean,with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method, which, again based on StandardScale
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # the transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]  

In [19]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [20]:
columns_to_scale = ['Month Value', 'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
      'Children', 'Pets']


In [21]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [22]:
#this will calculate and store the mean and the standard deviation
#whenever you get new data you will know that the standardization information is contained in absenteeism_scaler
absenteeism_scaler.fit(unscaled_inputs)


CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Education', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [23]:
# .transform() does the actual scaling
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [24]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,-0.447980,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,-0.447980,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,2.232242,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,-0.447980,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,2.232242,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,2.232242,-0.919030,-0.589690


In [25]:
scaled_inputs.shape

(700, 11)

### Split the data into Train and Test  and Shuffle

#### Import the relevant module

In [26]:
# this module splits arrays or matrices into train and test subsets
from sklearn.model_selection import train_test_split

#### Split

In [27]:
#sklearn.mode_selection.train_test_split(inputs,targets)
train_test_split(scaled_inputs, targets)

#array 1: a training dataset with inputs
#array 2: a training dataset with targets
#array 3: a trest dataset with inputs
#array 4: a test dataset with targets

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 593         0         0         0         1    -1.244823   
 66          0         0         0         1     1.039256   
 375         0         0         0         1    -1.244823   
 65          0         0         0         1     1.039256   
 337         0         0         0         0     1.324766   
 ..        ...       ...       ...       ...          ...   
 24          0         0         1         0     0.468236   
 329         0         0         0         1     1.324766   
 120         0         0         0         1    -1.530333   
 4           0         0         0         1     0.182726   
 163         1         0         0         0    -0.959313   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 593               -0.654143  0.248310         1.002633  -0.447980 -0.919030   
 66                -0.654143  0.248310         1.002633  -0.447980 -0.919030   
 375               -0.6541

In [28]:
##sklearn.mode_selection.train_test_split(inputs,targets, train_size, shuffle=True, random_state
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state=20)

In [29]:
print (x_train.shape, y_train.shape)

(560, 11) (560,)


In [30]:
print (x_test.shape, y_test.shape)

(140, 11) (140,)


### Logistic regression with sklearn

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#### Train the model

In [32]:
reg = LogisticRegression()

In [33]:
#sklearn.linear_model.LogisticRegression.fit(x,y) fits the model according to the given training data
reg.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
#to evaluate the model's accuracy
#sklearn.linear_model.LogisticRegression.score(inputs,targets) returns the mean accuracy on the given test data and labels
reg.score(x_train,y_train)

0.775

#### Manually check the accuracy

sklearn.linear_model.LogisticRegression.predict(inputs)
-> predicts class labels(logistic regression outputs) for given input samples

In [35]:
#finding the predicted outputs of the regression
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [36]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [37]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [38]:
#total number  of correct predictions(True entries)
np.sum((model_outputs==y_train))

434

In [39]:
# to get accuracy (Accuracy = Correct predictions /#observations)
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.775

#### Finding the intercept and coefficients


In [40]:
reg.intercept_

array([-1.50051987])

In [41]:
reg.coef_

array([[ 2.62325342,  0.86052885,  2.95627765,  0.65997736,  0.15494934,
         0.59957868, -0.17245729,  0.27613841, -0.08613388,  0.34267721,
        -0.27736372]])

In [42]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [43]:
feature_name = unscaled_inputs.columns.values

In [44]:
summary_table = pd.DataFrame (columns=['Feature name'],data = feature_name)

summary_table['Coefficient'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.623253
1,Reason_2,0.860529
2,Reason_3,2.956278
3,Reason_4,0.659977
4,Month Value,0.154949
5,Transportation Expense,0.599579
6,Age,-0.172457
7,Body Mass Index,0.276138
8,Education,-0.086134
9,Children,0.342677


In [45]:
#adding the intercept
summary_table.index = summary_table.index + 1
# +1 will shift all indices by 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.50052
1,Reason_1,2.623253
2,Reason_2,0.860529
3,Reason_3,2.956278
4,Reason_4,0.659977
5,Month Value,0.154949
6,Transportation Expense,0.599579
7,Age,-0.172457
8,Body Mass Index,0.276138
9,Education,-0.086134


#### Interpreting the coefficients

In [46]:
#adding exponentials of the coefficients to our table
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [47]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.50052,0.223014
1,Reason_1,2.623253,13.780484
2,Reason_2,0.860529,2.364411
3,Reason_3,2.956278,19.226271
4,Reason_4,0.659977,1.934749
5,Month Value,0.154949,1.167599
6,Transportation Expense,0.599579,1.821351
7,Age,-0.172457,0.841594
8,Body Mass Index,0.276138,1.31803
9,Education,-0.086134,0.917471


In [48]:
#DataFrame.sort_values(Series) sorts the values in a data frame with respect to a given column(Series)
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,2.956278,19.226271
1,Reason_1,2.623253,13.780484
2,Reason_2,0.860529,2.364411
4,Reason_4,0.659977,1.934749
6,Transportation Expense,0.599579,1.821351
10,Children,0.342677,1.408714
8,Body Mass Index,0.276138,1.31803
5,Month Value,0.154949,1.167599
9,Education,-0.086134,0.917471
7,Age,-0.172457,0.841594


#### Testing the model

In [49]:
#test accuracy is normally lower than the train accuracy (due to overfitting)
reg.score(x_test,y_test)

0.75

In [56]:
#to get the output
#model_outputs = reg.predict(x_test)
#model_outputs
#or

# use sklearn.linear_model.LogisticRegression.predict_proba(x) -(it returns the probability estimates for all possible outputs(classes))

predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.71222685, 0.28777315],
       [0.58778491, 0.41221509],
       [0.44320565, 0.55679435],
       [0.7790503 , 0.2209497 ],
       [0.08464799, 0.91535201],
       [0.33111741, 0.66888259],
       [0.29801904, 0.70198096],
       [0.12956608, 0.87043392],
       [0.78316959, 0.21683041],
       [0.74709573, 0.25290427],
       [0.49524568, 0.50475432],
       [0.22654899, 0.77345101],
       [0.0703639 , 0.9296361 ],
       [0.73435854, 0.26564146],
       [0.30555167, 0.69444833],
       [0.5497454 , 0.4502546 ],
       [0.55037353, 0.44962647],
       [0.53940324, 0.46059676],
       [0.40143356, 0.59856644],
       [0.05322408, 0.94677592],
       [0.69885218, 0.30114782],
       [0.7790503 , 0.2209497 ],
       [0.41645917, 0.58354083],
       [0.41645917, 0.58354083],
       [0.24144286, 0.75855714],
       [0.74327028, 0.25672972],
       [0.51085227, 0.48914773],
       [0.85674429, 0.14325571],
       [0.19939557, 0.80060443],
       [0.7790503 , 0.2209497 ],
       [0.

In [53]:
predicted_proba.shape

(140, 2)

In [54]:
predicted_proba[:,1]

array([0.28777315, 0.41221509, 0.55679435, 0.2209497 , 0.91535201,
       0.66888259, 0.70198096, 0.87043392, 0.21683041, 0.25290427,
       0.50475432, 0.77345101, 0.9296361 , 0.26564146, 0.69444833,
       0.4502546 , 0.44962647, 0.46059676, 0.59856644, 0.94677592,
       0.30114782, 0.2209497 , 0.58354083, 0.58354083, 0.75855714,
       0.25672972, 0.48914773, 0.14325571, 0.80060443, 0.2209497 ,
       0.3701032 , 0.68298449, 0.68817792, 0.5268484 , 0.2209497 ,
       0.53490698, 0.224437  , 0.74377823, 0.40335639, 0.60289237,
       0.21342877, 0.4553322 , 0.24021247, 0.43968366, 0.82623819,
       0.57845486, 0.69448135, 0.28777315, 0.22189347, 0.20609612,
       0.57627163, 0.36587777, 0.66888259, 0.27106447, 0.83319751,
       0.43380666, 0.88593556, 0.23391037, 0.37278339, 0.38318415,
       0.69848331, 0.65901233, 0.29386538, 0.79673039, 0.20953592,
       0.26998412, 0.10428008, 0.224437  , 0.73916521, 0.30155762,
       0.224437  , 0.32742148, 0.90340794, 0.4575272 , 0.59995

### Save the model
###### pickle [module]
is a python module used to convert a python object into a character stream

In [57]:
import pickle 

In [60]:
#model-file name wb- write bytes  dump -'save' reg - object to be dumped
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [61]:
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)