In [1]:
#Importing the required libraries 
import pickle

import numpy as np 
import pandas as pd
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 

In [2]:
#Defining the CustomScaler 

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns, passing the rest through.

    A ``StandardScaler`` is fitted on ``columns`` only; every other column of
    the input frame is returned untouched, and the original column order is
    restored in the output.

    Parameters
    ----------
    columns : list-like of column labels
        Columns of the input DataFrame that should be standardized.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to the underlying ``StandardScaler``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Fit the wrapped ``StandardScaler`` on ``X[self.columns]``.

        Also stores the per-column mean (``mean_``) and population variance
        (``var_``) of the selected columns. Returns ``self``.
        """
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean, with_std=self.with_std)
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Reduce explicitly along the rows: np.mean(DataFrame) emits a warning
        # and, in newer pandas, collapses the whole frame to a single scalar.
        self.mean_ = selected.mean(axis=0)
        # ddof=0 matches numpy's default for np.var (population variance).
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used. The result keeps the column order and the index of ``X``.
        """
        init_col_order = X.columns
        # Reuse X's index: with a default RangeIndex the concat below would
        # align on mismatched labels and produce NaN-filled rows whenever X
        # is a subset or otherwise carries a non-default index.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

# Loading the preprocessed data 

In [3]:
# Read the preprocessed absenteeism table from the working directory and
# preview its first five rows.
preprocessed_data = pd.read_csv(filepath_or_buffer='Absenteesim Preprocessed.csv')
preprocessed_data.head(n=5)

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [4]:
# List the distinct values that occur in the absenteeism-hours column.
preprocessed_data.loc[:, 'Absenteeism Time in Hours'].unique()

array([  4,   0,   2,   8,  40,   1,   7,   3,  32,   5,  16,  24,  64,
        56,  80, 120, 112, 104,  48], dtype=int64)

In [5]:
# Median of the absenteeism hours; used below as the cut-off between the two
# target classes.
median = preprocessed_data.loc[:, 'Absenteeism Time in Hours'].median()
median

3.0

In [6]:
# Build the binary targets: 1 ("excessively absent") when the absenteeism
# hours are strictly above the median, 0 otherwise (a value equal to the
# median is labelled 0).
targets = np.where(
    preprocessed_data['Absenteeism Time in Hours'].gt(median),
    1,
    0,
)

In [7]:
# Store the binary targets as the last column of the frame.
# The column label deliberately has no trailing space, so it can be looked up
# as preprocessed_data['Excessive Absenteeism'] without a KeyError.
preprocessed_data['Excessive Absenteeism'] = targets

In [8]:
# Remove the raw 'Absenteeism Time in Hours' column together with the
# 'Month', 'Children', 'Reason 2' and 'Distance to Work' features, then
# preview the first eight rows.
# Note: running this cell a second time raises a KeyError, because the
# listed columns no longer exist in the reassigned frame.
preprocessed_data = preprocessed_data.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Month',
        'Children',
        'Reason 2',
        'Distance to Work',
    ]
)
preprocessed_data.head(8)

Unnamed: 0,Reason 1,Reason 3,Reason 4,Day of the week,Transportation Expense,Age,Daily Work Load Average,Body Mass Index,Education,Pets,Excessive Absenteeism
0,0,0,1,1,289,33,239.554,30,0,1,1
1,0,0,0,1,118,50,239.554,31,0,0,0
2,0,0,1,2,179,38,239.554,31,0,0,0
3,1,0,0,3,279,39,239.554,24,0,0,1
4,0,0,1,3,289,33,239.554,30,0,1,0
5,0,0,1,4,179,38,239.554,31,0,0,0
6,0,0,0,4,361,28,239.554,27,0,4,1
7,0,0,1,4,260,36,239.554,23,0,0,1


In [9]:
# Select the model inputs: every column except the last one. The targets are
# already held separately in `targets`, so only the inputs are created here.
unscaled_inputs = preprocessed_data.iloc[:, 0:-1]


In [10]:
# Columns that must NOT be standardized. A name that is absent from the
# inputs (such as an already-dropped column) is harmless here, because the
# list is only used for membership tests.
columns_to_omit = [
    'Reason 1',
    'Reason 2',
    'Reason 3',
    'Reason 4',
    'Education',
]
# All input column names, as a NumPy array.
columns = unscaled_inputs.columns.values
columns

array(['Reason 1', 'Reason 3', 'Reason 4', 'Day of the week',
       'Transportation Expense', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Pets'], dtype=object)

In [11]:
# Keep every input column that is not explicitly excluded from scaling.
columns_to_scale = [name for name in columns if name not in columns_to_omit]
columns_to_scale

['Day of the week',
 'Transportation Expense',
 'Age',
 'Daily Work Load Average',
 'Body Mass Index',
 'Pets']

# Scaling the inputs 

In [12]:
# Create a CustomScaler that standardizes only the columns in columns_to_scale.
absenteesim_scaler = CustomScaler(columns=columns_to_scale)

In [13]:
# Fit the scaler on the unscaled inputs.
# NOTE(review): the fit uses all rows, before the train/test split further
# down, so test rows contribute to the scaling statistics. Confirm this is
# acceptable.
absenteesim_scaler.fit(X=unscaled_inputs)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [14]:
# Apply the fitted scaler; columns outside columns_to_scale pass through
# unchanged.
scaled_inputs = absenteesim_scaler.transform(X=unscaled_inputs)
scaled_inputs

Unnamed: 0,Reason 1,Reason 3,Reason 4,Day of the week,Transportation Expense,Age,Daily Work Load Average,Body Mass Index,Education,Pets
0,0,0,1,-0.683704,1.005844,-0.536062,-0.806331,0.767431,0,0.268487
1,0,0,0,-0.683704,-1.574681,2.130803,-0.806331,1.002633,0,-0.589690
2,0,0,1,-0.007725,-0.654143,0.248310,-0.806331,1.002633,0,-0.589690
3,1,0,0,0.668253,0.854936,0.405184,-0.806331,-0.643782,0,-0.589690
4,0,0,1,0.668253,1.005844,-0.536062,-0.806331,0.767431,0,0.268487
...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,-0.007725,-0.654143,0.562059,-0.853789,-1.114186,1,-0.589690
696,1,0,0,-0.007725,0.040034,-1.320435,-0.853789,-0.643782,0,1.126663
697,1,0,0,0.668253,1.624567,-1.320435,-0.853789,-0.408580,1,-0.589690
698,0,0,1,0.668253,0.190942,-0.692937,-0.853789,-0.408580,1,-0.589690


# Splitting the data into training and test sets

In [15]:
# Split inputs and targets with a fixed random_state for reproducibility.
# NOTE(review): train_size=0.25 trains on only a quarter of the rows and
# evaluates on the remaining three quarters. Confirm this is intended rather
# than test_size=0.25.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.25,
    random_state=25,
)

# Using Logistic Regression 

In [16]:
# Logistic-regression classifier, created with the library's default
# hyperparameters (no arguments are passed).
reg = LogisticRegression()

In [17]:
# Train the classifier on the training split.
reg.fit(X=x_train, y=y_train)

In [18]:
# Mean accuracy of the fitted model on the training split.
reg.score(X=x_train, y=y_train)

0.76

In [19]:
# Display the fitted coefficients (one weight per input column of x_train).
reg.coef_

array([[ 1.69347871,  1.68259816, -0.46991378, -0.19625023,  0.75110839,
        -0.13701147, -0.17436269,  0.34081776, -0.07438762, -0.2943552 ]])

In [20]:
# Build a table that pairs every input feature with its fitted coefficient.

feature_name = unscaled_inputs.columns.values

summary_table = pd.DataFrame({"Feature Name": feature_name})

# reg.coef_ holds one row of weights; transpose it into a single column.
summary_table['coefficient'] = reg.coef_.T
summary_table

Unnamed: 0,Feature Name,coefficient
0,Reason 1,1.693479
1,Reason 3,1.682598
2,Reason 4,-0.469914
3,Day of the week,-0.19625
4,Transportation Expense,0.751108
5,Age,-0.137011
6,Daily Work Load Average,-0.174363
7,Body Mass Index,0.340818
8,Education,-0.074388
9,Pets,-0.294355


In [21]:
# Insert the model intercept as the first row of the summary table:
# shift every existing row down by one, write the intercept at index 0,
# then sort by index so the intercept comes first.
# NOTE: this cell is not idempotent. Each re-run shifts the index again and
# adds another 'Intercept' row; rebuild summary_table before re-running.
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table 

Unnamed: 0,Feature Name,coefficient
0,Intercept,-0.656214
1,Reason 1,1.693479
2,Reason 3,1.682598
3,Reason 4,-0.469914
4,Day of the week,-0.19625
5,Transportation Expense,0.751108
6,Age,-0.137011
7,Daily Work Load Average,-0.174363
8,Body Mass Index,0.340818
9,Education,-0.074388


In [22]:
# Exponentiate each coefficient and store the result in a 'ratio' column
# (for a logistic-regression weight, exp(coefficient) is the odds ratio).
summary_table['ratio'] = np.exp(summary_table['coefficient'])
summary_table

Unnamed: 0,Feature Name,coefficient,ratio
0,Intercept,-0.656214,0.518812
1,Reason 1,1.693479,5.438366
2,Reason 3,1.682598,5.379515
3,Reason 4,-0.469914,0.625056
4,Day of the week,-0.19625,0.821807
5,Transportation Expense,0.751108,2.119348
6,Age,-0.137011,0.87196
7,Daily Work Load Average,-0.174363,0.839992
8,Body Mass Index,0.340818,1.406097
9,Education,-0.074388,0.928312


In [23]:
# Show the table ordered from the largest to the smallest ratio (the stored
# table itself is left unchanged).
summary_table.sort_values(by='ratio', ascending=False)


Unnamed: 0,Feature Name,coefficient,ratio
1,Reason 1,1.693479,5.438366
2,Reason 3,1.682598,5.379515
5,Transportation Expense,0.751108,2.119348
8,Body Mass Index,0.340818,1.406097
9,Education,-0.074388,0.928312
6,Age,-0.137011,0.87196
7,Daily Work Load Average,-0.174363,0.839992
4,Day of the week,-0.19625,0.821807
10,Pets,-0.294355,0.745012
3,Reason 4,-0.469914,0.625056


# Testing the Model 

In [24]:
# Mean accuracy of the fitted model on the held-out test split.
reg.score(X=x_test, y=y_test)

0.7161904761904762

In [25]:
# Predicted class probabilities for every test row: one column per class,
# in the order given by reg.classes_.
predict_proba = reg.predict_proba(X=x_test)
predict_proba

array([[0.86839885, 0.13160115],
       [0.67797244, 0.32202756],
       [0.59070819, 0.40929181],
       ...,
       [0.8959609 , 0.1040391 ],
       [0.64231222, 0.35768778],
       [0.78879975, 0.21120025]])

In [26]:
# Persist the fitted classifier and the fitted scaler as pickle files named
# 'model' and 'scaler' in the working directory. `pickle` is imported with
# the other dependencies at the top of the notebook.
# NOTE: pickle files can execute arbitrary code when loaded; only ever load
# them from a trusted source.
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

with open('scaler', 'wb') as scaler_file:
    pickle.dump(absenteesim_scaler, scaler_file)