In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Load the data

In [2]:
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file location exists.
DATA_PATH = 'C:/Users/youss/Desktop/Absenteeism project/Absenteeism_preprocessed.csv'
data_preprocessed = pd.read_csv(DATA_PATH)

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,month,day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


# Create the targets

In [4]:
# Median of the absence hours; it is used as the cutoff when building the binary targets.
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [5]:
# Binary target: 1 when the absence hours are strictly above the median, else 0.
absence_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [6]:
 data_preprocessed['excessive absenteeism']  = targets
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,month,day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,excessive absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


# Check that the targets are balanced

In [7]:
# Share of rows labelled 1 (the mean of a 0/1 array is the fraction of ones).
targets.mean()

0.45571428571428574

In [8]:
# In the first model, the coefficients of 'Daily Work Load Average',
# 'Distance to Work' and 'day of the week' were close to 0 (odds ratios close
# to 1), meaning they contribute little to the model, so they are dropped here.
# The raw hours column is dropped as well.
data_with_targets = data_preprocessed.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Daily Work Load Average',
        'Distance to Work',
        'day of the week',
    ]
)

# Select the inputs for the regression 

In [9]:
data_with_targets.shape

(700, 12)

In [10]:
# Inputs are every column except the target. Dropping the target by name
# (instead of slicing off "the last column") keeps this correct even if the
# column order of data_with_targets ever changes.
unscaled_inputs = data_with_targets.drop(columns=['excessive absenteeism'])

# Standardize the data

In [11]:
# This class exists because an earlier version of this notebook standardized the dummy variables as well,
# which is wrong: they encode categorical data, not numerical quantities. The custom scaler below therefore
# standardizes only the numerical columns. (This step could be avoided by scaling the numerical values before creating the dummy variables.)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
class CustomScaler(BaseEstimator,TransformerMixin): 
    """Standardize only a chosen subset of a DataFrame's columns.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, the columns
    listed in ``columns``; every other column is passed through unchanged.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler.
    mean_ : pandas.Series or None
        Per-column mean of the scaled columns (``None`` before ``fit``).
    var_ : pandas.Series or None
        Per-column population variance of the scaled columns
        (``None`` before ``fit``).
    """

    def __init__(self,columns):
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the underlying scaler on ``X[self.columns]`` and return ``self``."""
        self.scaler.fit(X[self.columns], y)
        # Reuse the statistics the scaler just computed rather than calling
        # np.mean / np.var on a DataFrame, whose result depends on the
        # pandas / NumPy version and which emits a warning.
        self.mean_ = pd.Series(self.scaler.mean_, index=self.columns)
        self.var_ = pd.Series(self.scaler.var_, index=self.columns)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        The original column order and the original row index are preserved.
        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        init_col_order = X.columns

        # Keep X's own index on the scaled part. Without it the scaled frame
        # gets a fresh 0..n-1 index and pd.concat(axis=1) aligns on mismatched
        # labels, producing NaN rows for any input whose index is not 0..n-1.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [12]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [13]:
# Columns that must NOT be standardized; every other input column is scaled.
columns_to_ommit = [
    'Reason_1',
    'Reason_2',
    'Reason_3',
    'Reason_4',
    'Education',
]

In [14]:
# Every input column that is not explicitly omitted, in its original order.
input_columns = unscaled_inputs.columns
columns_to_scale = input_columns[~input_columns.isin(columns_to_ommit)].tolist()


In [15]:
# NOTE(review): the scaler is fitted on every row, before the train/test split
# is made, so the rows that later become the test set influence the scaling
# statistics.
scaler = CustomScaler(columns_to_scale).fit(unscaled_inputs)
scaled_inputs = scaler.transform(unscaled_inputs)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [16]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,month,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [17]:
scaled_inputs.shape

(700, 11)

# Split the data into train & test and shuffle

In [18]:
# Keep 80% of the rows for training and hold out the rest for testing.
# shuffle=True is already the function's default; random_state is fixed so
# that every run of the notebook produces the same shuffled split.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    shuffle=True,
    random_state=20,
)

In [19]:
# Logistic regression with default hyper-parameters, trained on the training split.
reg = LogisticRegression()
reg.fit(X=x_train, y=y_train)

LogisticRegression()

In [20]:
reg.score(x_train , y_train)

0.7696428571428572

### manually check the accuracy

In [21]:
# Predicted classes on the training split, compared element-wise with the true targets.
model_outputs = reg.predict(x_train)
np.equal(model_outputs, y_train)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [22]:
# Accuracy = fraction of correct predictions. The mean of the boolean match
# array is computed in NumPy; the built-in sum() would iterate element by element.
np.mean(model_outputs == y_train)

0.7696428571428572

### Finding the intercept and coefficients

In [23]:
reg.intercept_

array([-1.53693319])

In [24]:
reg.coef_

array([[ 2.69504616, -0.09539573,  3.00829099,  0.72845146,  0.1578702 ,
         0.60285245, -0.16149935,  0.27215729, -0.18687332,  0.3551367 ,
        -0.27856875]])

In [25]:
# Column names of the model inputs. CustomScaler.transform returns a DataFrame
# with the original column order, so these names line up with reg.coef_.
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'month',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [26]:
feature_name = unscaled_inputs.columns.values

In [27]:
# One row per input feature: its name and its fitted coefficient.
# reg.coef_ has shape (1, n_features), so it is flattened to one value per feature.
summary_table = pd.DataFrame({
    'feature name': feature_name,
    'Cofficient': reg.coef_.ravel(),
})
summary_table

Unnamed: 0,feature name,Cofficient
0,Reason_1,2.695046
1,Reason_2,-0.095396
2,Reason_3,3.008291
3,Reason_4,0.728451
4,month,0.15787
5,Transportation Expense,0.602852
6,Age,-0.161499
7,Body Mass Index,0.272157
8,Education,-0.186873
9,Children,0.355137


In [28]:
# Put the intercept in row 0, followed by the feature rows (index 0..n).
# Any existing 'Intercept' row is removed first, so re-running this cell does
# not shift the index again or overwrite a feature row with a second Intercept.
intercept_row = pd.DataFrame({
    'feature name': ['Intercept'],
    'Cofficient': [reg.intercept_[0]],
})
feature_rows = summary_table[summary_table['feature name'] != 'Intercept']
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,feature name,Cofficient
0,Intercept,-1.536933
1,Reason_1,2.695046
2,Reason_2,-0.095396
3,Reason_3,3.008291
4,Reason_4,0.728451
5,month,0.15787
6,Transportation Expense,0.602852
7,Age,-0.161499
8,Body Mass Index,0.272157
9,Education,-0.186873


### Interpreting the coefficients

In [29]:
# The odds ratio of each term is the exponential of its coefficient.
summary_table = summary_table.assign(
    odds_ratio=np.exp(summary_table['Cofficient'])
)
summary_table

Unnamed: 0,feature name,Cofficient,odds_ratio
0,Intercept,-1.536933,0.21504
1,Reason_1,2.695046,14.806202
2,Reason_2,-0.095396,0.909013
3,Reason_3,3.008291,20.252758
4,Reason_4,0.728451,2.07187
5,month,0.15787,1.171014
6,Transportation Expense,0.602852,1.827324
7,Age,-0.161499,0.850867
8,Body Mass Index,0.272157,1.312793
9,Education,-0.186873,0.829549


In [30]:
summary_table.sort_values('odds_ratio' , ascending = False)

Unnamed: 0,feature name,Cofficient,odds_ratio
3,Reason_3,3.008291,20.252758
1,Reason_1,2.695046,14.806202
4,Reason_4,0.728451,2.07187
6,Transportation Expense,0.602852,1.827324
10,Children,0.355137,1.426376
8,Body Mass Index,0.272157,1.312793
5,month,0.15787,1.171014
2,Reason_2,-0.095396,0.909013
7,Age,-0.161499,0.850867
9,Education,-0.186873,0.829549


# Testing the model

In [31]:
reg.score(x_test , y_test)

0.75

In [32]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.71565953, 0.28434047],
       [0.58626012, 0.41373988],
       [0.44340077, 0.55659923],
       [0.78306539, 0.21693461],
       [0.08189466, 0.91810534],
       [0.33559776, 0.66440224],
       [0.29780059, 0.70219941],
       [0.12954083, 0.87045917],
       [0.78727916, 0.21272084],
       [0.75088282, 0.24911718],
       [0.4992094 , 0.5007906 ],
       [0.22411268, 0.77588732],
       [0.0694672 , 0.9305328 ],
       [0.72775228, 0.27224772],
       [0.299571  , 0.700429  ],
       [0.54547777, 0.45452223],
       [0.55532383, 0.44467617],
       [0.54416748, 0.45583252],
       [0.3903242 , 0.6096758 ],
       [0.05389689, 0.94610311],
       [0.70221484, 0.29778516],
       [0.78306539, 0.21693461],
       [0.42020963, 0.57979037],
       [0.42020963, 0.57979037],
       [0.24853184, 0.75146816],
       [0.74710562, 0.25289438],
       [0.5069808 , 0.4930192 ],
       [0.85389909, 0.14610091],
       [0.20475968, 0.79524032],
       [0.78306539, 0.21693461],
       [0.

In [33]:
predicted_proba.shape

(140, 2)

In [34]:
# Keep only the second column: the predicted probability of class 1,
# i.e. of excessive absenteeism.
predicted_proba[:,1]

array([0.28434047, 0.41373988, 0.55659923, 0.21693461, 0.91810534,
       0.66440224, 0.70219941, 0.87045917, 0.21272084, 0.24911718,
       0.5007906 , 0.77588732, 0.9305328 , 0.27224772, 0.700429  ,
       0.45452223, 0.44467617, 0.45583252, 0.6096758 , 0.94610311,
       0.29778516, 0.21693461, 0.57979037, 0.57979037, 0.75146816,
       0.25289438, 0.4930192 , 0.14610091, 0.79524032, 0.21693461,
       0.37079211, 0.67877317, 0.68419574, 0.52331045, 0.21693461,
       0.53362522, 0.22036704, 0.73963426, 0.40737811, 0.61070668,
       0.20937551, 0.46020712, 0.23624427, 0.43090302, 0.82670716,
       0.5682144 , 0.69444235, 0.28434047, 0.22059752, 0.20201187,
       0.58329141, 0.35540777, 0.66440224, 0.27056722, 0.82093173,
       0.4357622 , 0.8821096 , 0.23313421, 0.35635336, 0.36675728,
       0.70342227, 0.65427881, 0.29417698, 0.79114242, 0.20847061,
       0.26635814, 0.09579943, 0.22036704, 0.73451495, 0.30939393,
       0.22036704, 0.3159065 , 0.90370091, 0.46270756, 0.59992

# Save the model

In [35]:
import pickle

In [36]:
# Serialize the fitted logistic regression to the file 'model' in the working directory.
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [37]:
# Serialize the fitted CustomScaler to the file 'scaler' in the working directory.
# NOTE(review): pickle stores the class by reference, so the CustomScaler class
# must be importable under the same name wherever this file is loaded.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)