In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Load the dataset from a CSV located in the current working directory
# (the path is relative, so the notebook must be run from that folder).
df_preprocessed = pd.read_csv('df_preprocessed.csv')

In [3]:
# Preview the first three rows to sanity-check the loaded columns.
df_preprocessed.head(3)

Unnamed: 0,reason_type_1,reason_type_2,reason_type_3,reason_type_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month value,Day of Week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2


### Create the targets

In [4]:
# Median absence hours; this value is the cut-off between the two target classes.
df_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [5]:
# Label a row 1 ("excessive") when its absence hours are strictly above the
# median, otherwise 0. The cut-off is computed from the data instead of being
# hard-coded, so the labels stay consistent if the dataset changes.
absence_hours = df_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [6]:
# Attach the binary labels as a new column (mutates df_preprocessed in place;
# later cells rely on this column being present).
df_preprocessed['Excessive Absenteeism'] = targets

### Check the balance of the targets

In [7]:
# Share of rows labelled 1: for 0/1 labels the mean equals sum / count.
targets.mean()

0.45571428571428574

In [8]:
# Checkpoint: keep the binary target and discard the raw hours column it was
# derived from, so that quantity is not part of the model inputs.
data_with_target = df_preprocessed.drop(columns=['Absenteeism Time in Hours'])

In [9]:
# Confirm the hours column is gone and the target column is present.
data_with_target.head(2)

Unnamed: 0,reason_type_1,reason_type_2,reason_type_3,reason_type_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month value,Day of Week,Excessive Absenteeism
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1,0


In [10]:
# Model inputs are every column except the target. Selecting by name rather
# than by position keeps this correct even if the column order changes upstream.
unscaled_data = data_with_target.drop(columns=['Excessive Absenteeism'])

### Standardize the data

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Scaler with default settings (centres each column and scales it to unit variance).
absenteeism_scaler = StandardScaler()

In [13]:
# Learn the per-column mean and standard deviation of the inputs.
# NOTE(review): the scaler is fitted on ALL rows before the train/test split,
# so held-out rows influence the scaling statistics (data leakage). Consider
# splitting first and fitting on the training rows only.
# NOTE(review): every column is standardized, including the reason_type_*
# columns, which look like 0/1 indicators — confirm this is intended, since it
# changes how their coefficients should be interpreted.
absenteeism_scaler.fit(unscaled_data)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [14]:
# Apply the fitted scaling; the result is what the model is trained on.
scaled_inputs = absenteeism_scaler.transform(unscaled_data)

### Shuffle the data and split it into train and test sets

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

### Logistic regression with sklearn

In [20]:
from sklearn.linear_model import LogisticRegression 

In [21]:
from sklearn import metrics

### Training the model

In [22]:
# Logistic regression classifier with scikit-learn's default hyper-parameters.
reg = LogisticRegression()

In [23]:
# Fit the model on the training portion only.
reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
# Accuracy on the TRAINING data (not a measure of out-of-sample performance).
reg.score(x_train,y_train)

0.7803571428571429

### Manually check the accuracy

In [25]:
# Predicted class for every training row, to compare against y_train by hand.
model_output = reg.predict(x_train)
model_output

array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [26]:
# True labels of the training rows, shown for comparison with the predictions.
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [27]:
# Element-wise comparison: True where the prediction equals the true label.
model_output == y_train

array([ True,  True, False,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True, False,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [28]:
# Number of correct predictions (True counts as 1 when summed).
(model_output == y_train).sum()

437

In [29]:
# Total number of training predictions.
len(model_output)

560

In [30]:
# Accuracy = correct / total, i.e. the mean of the boolean match array.
# This should equal the value returned by reg.score on the training data.
(model_output == y_train).mean()

0.7803571428571429

### Finding the intercept and coefficients

In [31]:
# Fitted intercept (bias) term, returned as a one-element array.
reg.intercept_

array([-0.21150898])

In [33]:
# Fitted coefficients as a 2-D array with a single row: one weight per input
# column, in the same order as the columns of unscaled_data.
reg.coef_

array([[ 2.07192269,  0.33075027,  1.56390046,  1.31283678,  0.72326969,
        -0.06149037, -0.20628304, -0.02865604,  0.3259002 , -0.16141702,
         0.38153429, -0.32129837,  0.02577323, -0.08622837]])

In [35]:
# Column labels of the inputs, in the order the model saw them.
feature_name = unscaled_data.columns.to_numpy()

In [36]:
# Start the summary with one row per input feature.
summary_table = pd.DataFrame({'Feature name': feature_name})

In [37]:
# reg.coef_ has a single row holding one weight per feature; take that row so
# the weights line up with the feature names.
summary_table['Coeficient'] = reg.coef_[0]

In [38]:
# Display feature names alongside their fitted coefficients.
summary_table

Unnamed: 0,Feature name,Coeficient
0,reason_type_1,2.071923
1,reason_type_2,0.33075
2,reason_type_3,1.5639
3,reason_type_4,1.312837
4,Transportation Expense,0.72327
5,Distance to Work,-0.06149
6,Age,-0.206283
7,Daily Work Load Average,-0.028656
8,Body Mass Index,0.3259
9,Education,-0.161417


In [39]:
# Put the intercept in row 0, ahead of the feature rows.
# The guard makes the cell safe to re-run: without it, every extra execution
# would shift the index again and append another 'Intercept' row (or fail once
# more columns have been added to the table).
if 'Intercept' not in summary_table['Feature name'].values:
    # Shift the feature rows down by one to free index 0 ...
    summary_table.index = summary_table.index + 1
    # ... add the intercept there, then restore ascending index order.
    summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
    summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coeficient
0,Intercept,-0.211509
1,reason_type_1,2.071923
2,reason_type_2,0.33075
3,reason_type_3,1.5639
4,reason_type_4,1.312837
5,Transportation Expense,0.72327
6,Distance to Work,-0.06149
7,Age,-0.206283
8,Daily Work Load Average,-0.028656
9,Body Mass Index,0.3259


In [40]:
# Exponentiating a logistic-regression coefficient gives the multiplicative
# change in the odds. The model was trained on standardized inputs, so each
# value is per one standard deviation of the feature. For the 'Intercept' row
# the value is the baseline odds rather than a ratio.
summary_table['Odds_ratio'] = np.exp(summary_table['Coeficient'])
summary_table

Unnamed: 0,Feature name,Coeficient,Odds_ratio
0,Intercept,-0.211509,0.809362
1,reason_type_1,2.071923,7.940075
2,reason_type_2,0.33075,1.392012
3,reason_type_3,1.5639,4.777419
4,reason_type_4,1.312837,3.716702
5,Transportation Expense,0.72327,2.061162
6,Distance to Work,-0.06149,0.940362
7,Age,-0.206283,0.813603
8,Daily Work Load Average,-0.028656,0.971751
9,Body Mass Index,0.3259,1.385277


In [41]:
# Rank the rows by odds ratio, smallest first (returns a sorted copy;
# summary_table itself is left unchanged).
summary_table.sort_values(by='Odds_ratio')

Unnamed: 0,Feature name,Coeficient,Odds_ratio
12,Pets,-0.321298,0.725207
0,Intercept,-0.211509,0.809362
7,Age,-0.206283,0.813603
10,Education,-0.161417,0.850937
14,Day of Week,-0.086228,0.917385
6,Distance to Work,-0.06149,0.940362
8,Daily Work Load Average,-0.028656,0.971751
13,Month value,0.025773,1.026108
9,Body Mass Index,0.3259,1.385277
2,reason_type_2,0.33075,1.392012


In [42]:
# NOTE(review): this repeats the same ascending sort as the previous cell and
# shows no new information. Consider removing it, or passing ascending=False
# to list the largest odds ratios first.
summary_table.sort_values('Odds_ratio')

Unnamed: 0,Feature name,Coeficient,Odds_ratio
12,Pets,-0.321298,0.725207
0,Intercept,-0.211509,0.809362
7,Age,-0.206283,0.813603
10,Education,-0.161417,0.850937
14,Day of Week,-0.086228,0.917385
6,Distance to Work,-0.06149,0.940362
8,Daily Work Load Average,-0.028656,0.971751
13,Month value,0.025773,1.026108
9,Body Mass Index,0.3259,1.385277
2,reason_type_2,0.33075,1.392012
