## Creating Logistic Regression

## Import relevant libraries

In [1]:
import pandas as pd
import numpy as np

## Load the data

In [15]:
# Load the preprocessed absenteeism data from a CSV file in the working directory.
data_preprocessed = pd.read_csv('preprocessed.csv')

In [16]:
# Preview the first five rows of the loaded data.
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Month Value,Day of the Week,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,2015-07-07,289,36,33,239.554,7,1,30,0,2,1,4
1,0,0,0,0,2015-07-14,118,13,50,239.554,7,1,31,0,1,0,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,7,2,31,0,0,0,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,7,3,24,0,2,0,4
4,0,0,0,1,2015-07-23,289,36,33,239.554,7,3,30,0,2,1,2


## Create Targets/Labels

In [17]:
# Median absence duration; used below as the cut-off between the two classes.
absence_hours = data_preprocessed['Absenteeism Time in Hours']
median = absence_hours.median()

In [18]:
# Label a row 1 when its absence is strictly above the median, 0 otherwise.
exceeds_median = data_preprocessed['Absenteeism Time in Hours'] > median
targets = np.where(exceeds_median, 1, 0)

In [19]:
# Attach the 0/1 targets to the frame as a new column (modifies data_preprocessed).
data_preprocessed['Excessive Absenteeism'] = targets

In [20]:
# Confirm that the 'Excessive Absenteeism' column is now part of the frame.
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Month Value,Day of the Week,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,2015-07-07,289,36,33,239.554,7,1,30,0,2,1,4,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,7,1,31,0,1,0,0,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,7,2,31,0,0,0,2,0
3,1,0,0,0,2015-07-16,279,5,39,239.554,7,3,24,0,2,0,4,1
4,0,0,0,1,2015-07-23,289,36,33,239.554,7,3,30,0,2,1,2,0


## A comment on the targets
The dataset needs to be balanced. Ideally, each target class makes up roughly 45-55% of the observations.

Normally:
- a 60-40 split will usually still work for logistic regression;
- other models may need a different balance.

In [14]:
# Share of rows labelled 1 (the targets are 0/1, so the mean is the proportion).
targets.mean()

0.45571428571428574

In [25]:
# Drop the raw hours (now encoded in the target column) and the date,
# which is no longer needed.
data_with_targets = data_preprocessed.drop(
    columns=['Absenteeism Time in Hours', 'Date']
)

In [26]:
# Sanity check: drop() returned a new object, so data_preprocessed itself
# was not replaced by the reduced frame.
data_with_targets is data_preprocessed

False

In [27]:
# Preview the frame after the two columns were dropped.
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Month Value,Day of the Week,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,289,36,33,239.554,7,1,30,0,2,1,1
1,0,0,0,0,118,13,50,239.554,7,1,31,0,1,0,0
2,0,0,0,1,179,51,38,239.554,7,2,31,0,0,0,0
3,1,0,0,0,279,5,39,239.554,7,3,24,0,2,0,1
4,0,0,0,1,289,36,33,239.554,7,3,30,0,2,1,0


## Select the input for the regression

In [28]:
# (rows, columns) - the last column is the target, the rest are inputs.
data_with_targets.shape

(700, 15)

In [31]:
# Select the first 14 columns (every column except the target) by explicit count.
data_with_targets.iloc[:,:14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Month Value,Day of the Week,Body Mass Index,Education,Children,Pets
0,0,0,0,1,289,36,33,239.554,7,1,30,0,2,1
1,0,0,0,0,118,13,50,239.554,7,1,31,0,1,0
2,0,0,0,1,179,51,38,239.554,7,2,31,0,0,0
3,1,0,0,0,279,5,39,239.554,7,3,24,0,2,0
4,0,0,0,1,289,36,33,239.554,7,3,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,5,2,22,1,2,0
696,1,0,0,0,225,26,28,237.656,5,2,24,0,1,2
697,1,0,0,0,330,16,28,237.656,5,3,25,1,0,0
698,0,0,0,1,235,16,32,237.656,5,3,25,1,0,0


In [35]:
# Same selection without hard-coding the column count: everything but the last column.
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Month Value,Day of the Week,Body Mass Index,Education,Children,Pets
0,0,0,0,1,289,36,33,239.554,7,1,30,0,2,1
1,0,0,0,0,118,13,50,239.554,7,1,31,0,1,0
2,0,0,0,1,179,51,38,239.554,7,2,31,0,0,0
3,1,0,0,0,279,5,39,239.554,7,3,24,0,2,0
4,0,0,0,1,289,36,33,239.554,7,3,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,5,2,22,1,2,0
696,1,0,0,0,225,26,28,237.656,5,2,24,0,1,2
697,1,0,0,0,330,16,28,237.656,5,3,25,1,0,0
698,0,0,0,1,235,16,32,237.656,5,3,25,1,0,0


In [36]:
# `is` compares object identity, not contents. Each .iloc call returns a new
# DataFrame object, so this is False even though both slices select the same
# columns. Use .equals() to compare the contents instead.
data_with_targets.iloc[:,:14] is data_with_targets.iloc[:,:-1]

False

In [37]:
# Inputs for the model: every column except the last one (the target).
unscaled_inputs = data_with_targets.iloc[:,:-1]

## Standardize the Data
Subtract the mean and divide by the standard deviation, feature-wise.

In [38]:
from sklearn.preprocessing import StandardScaler

# Scaler object with default settings; it holds no statistics until fit() is called.
absenteeism_scaler = StandardScaler()

In [39]:
# fit() stores the per-feature mean and standard deviation inside
# absenteeism_scaler, so no new variable is needed for them.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test-set statistics leak into the preprocessing - consider
# fitting on the training rows only.
# NOTE(review): this also standardizes the 0/1 columns (e.g. Reason_1..Reason_4),
# which makes their coefficients harder to interpret - confirm this is intended.
absenteeism_scaler.fit(unscaled_inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [40]:
# Apply the statistics stored in absenteeism_scaler to standardize the inputs.
# The result is a NumPy array, not a DataFrame, so column labels are not kept.
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [42]:
# Inspect the standardized values.
scaled_inputs

array([[-0.57735027, -0.09298136, -0.39626354, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-0.57735027, -0.09298136, -0.39626354, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-0.57735027, -0.09298136, -0.39626354, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.73205081, -0.09298136, -0.39626354, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.39626354, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.39626354, ..., -0.44798003,
        -0.01928035,  0.26848661]])

In [43]:
# Scaling must not change the dimensions: same rows, same number of input columns.
scaled_inputs.shape

(700, 14)

## Data Splitting

### Import relevant module

In [45]:
from sklearn.model_selection import train_test_split

### Split

In [46]:
# Trial call only: the returned arrays are displayed but not stored. No
# random_state is passed, so the shuffle (and this output) differs on every run.
train_test_split(scaled_inputs,targets)

[array([[-0.57735027, -0.09298136,  2.52357307, ...,  2.23224237,
         -0.01928035,  0.26848661],
        [-0.57735027, -0.09298136, -0.39626354, ..., -0.44798003,
         -0.91902997, -0.58968976],
        [ 1.73205081, -0.09298136, -0.39626354, ..., -0.44798003,
         -0.01928035,  0.26848661],
        ...,
        [-0.57735027, -0.09298136, -0.39626354, ..., -0.44798003,
         -0.01928035,  2.8430157 ],
        [-0.57735027, -0.09298136, -0.39626354, ...,  2.23224237,
         -0.91902997, -0.58968976],
        [-0.57735027, -0.09298136, -0.39626354, ...,  2.23224237,
         -0.91902997, -0.58968976]]),
 array([[ 1.73205081, -0.09298136, -0.39626354, ..., -0.44798003,
         -0.01928035,  0.26848661],
        [-0.57735027, -0.09298136, -0.39626354, ...,  2.23224237,
         -0.91902997, -0.58968976],
        [-0.57735027, -0.09298136, -0.39626354, ..., -0.44798003,
         -0.01928035, -0.58968976],
        ...,
        [-0.57735027, -0.09298136,  2.52357307, ..., -

In [51]:
# Hold out 20% of the rows for testing (without test_size the split would be
# 75:25). Shuffling is on by default; a fixed random_state locks the shuffle
# so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    shuffle=True,
    random_state=20,
)

In [52]:
# Training inputs and targets must have the same number of rows.
print(x_train.shape, y_train.shape)

(560, 14) (560,)


In [53]:
# Test inputs and targets must have the same number of rows.
print(x_test.shape, y_test.shape)

(140, 14) (140,)


## Logistic regression with sklearn

In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Training the model 

In [56]:
# Logistic regression with scikit-learn's default hyperparameters.
reg = LogisticRegression()

In [57]:
# Train on the training split only; the test split stays untouched.
reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [58]:
# Accuracy on the training data (not on the held-out test set).
reg.score(x_train,y_train)

0.8035714285714286

## Manually check the accuracy

In [61]:
# Predicted class for every training row.
prediction = reg.predict(x_train)

In [62]:
# 1 where the prediction matches the true label, 0 where it does not.
sum_of_true = (prediction == y_train).astype(int)

In [71]:
# Fraction of correct predictions; should equal reg.score(x_train, y_train).
sum_of_true.mean()

0.8035714285714286

## Finding the intercept and coefficients

In [74]:
# Fitted intercept (bias); returned as a one-element array.
reg.intercept_

array([-0.18646174])

In [76]:
# Fitted coefficients (weights), one per input column, as a 2-D array with one row.
reg.coef_

array([[ 2.02082027,  0.33194596,  1.93874068,  1.18117989,  0.64363094,
        -0.05109749, -0.19810551, -0.02456936,  0.25031647, -0.05723313,
         0.29321923, -0.21811357,  0.33491771, -0.51178099]])

In [77]:
# scikit-learn returns a plain NumPy array, so the column labels are gone and
# .columns does not exist. The error is caught and printed so the notebook
# still runs from top to bottom (Restart & Run All).
try:
    scaled_inputs.columns.values
except AttributeError as err:
    print('AttributeError:', err)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [79]:
# Take the column labels from the unscaled DataFrame, which still has them.
feature_name = unscaled_inputs.columns.values

In [80]:
# One row per input feature, paired with its fitted coefficient.
# reg.coef_ has a single row, so ravel() flattens it to one value per feature.
summary_table = pd.DataFrame({'Feature name': feature_name})
summary_table['coeffiecient'] = reg.coef_.ravel()

summary_table

Unnamed: 0,Feature name,coeffiecient
0,Reason_1,2.02082
1,Reason_2,0.331946
2,Reason_3,1.938741
3,Reason_4,1.18118
4,Transportation Expense,0.643631
5,Distance to Work,-0.051097
6,Age,-0.198106
7,Daily Work Load Average,-0.024569
8,Month Value,0.250316
9,Day of the Week,-0.057233


In [81]:
# Add the intercept as row 0 by shifting the existing rows down by one.
# The guard keeps the cell idempotent: re-running it would otherwise shift the
# index again and leave a second 'Intercept' row in the table.
if not summary_table['Feature name'].eq('Intercept').any():
    summary_table.index = summary_table.index + 1
    summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
    summary_table = summary_table.sort_index()

In [82]:
# log(odds) = b0 + b1*x1 + b2*x2 + b3*x3 + ... + bn*xn
# where b0 is the intercept (bias) and b1..bn are the coefficients (weights).
# The closer a coefficient is to 0, the smaller the weight of that feature.
summary_table

Unnamed: 0,Feature name,coeffiecient
0,Intercept,-0.186462
1,Reason_1,2.02082
2,Reason_2,0.331946
3,Reason_3,1.938741
4,Reason_4,1.18118
5,Transportation Expense,0.643631
6,Distance to Work,-0.051097
7,Age,-0.198106
8,Daily Work Load Average,-0.024569
9,Month Value,0.250316


## Interpreting the Coefficient