In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
# Load the preprocessed data set from a CSV file in the working directory.
data = pd.read_csv('my_preprocessed_data.csv')

In [4]:
# Preview the first five rows of the loaded frame.
data.head()

Unnamed: 0,Absenteeism Time in Hours,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Reason_1,Reason_2,Reason_3,Reason_4,Day,Month
0,4,289,36,33,239.554,30,1,2,1,0,0,0,1,7,7
1,0,118,13,50,239.554,31,1,1,0,0,0,0,0,14,7
2,2,179,51,38,239.554,31,1,0,0,0,0,0,1,15,7
3,4,279,5,39,239.554,24,1,2,0,1,0,0,0,16,7
4,2,289,36,33,239.554,30,1,2,1,0,0,0,1,23,7


In [5]:
# Every column except the first one; the first column is used as the target further down.
input_data = data.iloc[:,1:]

In [6]:
# Column labels of the candidate predictors, as a NumPy array.
all_cols = input_data.columns.values

In [7]:
# Predictor columns that are excluded from the model.
cols_to_remove = [
    'Day',
    'Month',
    'Daily Work Load Average',
    'Education',
]

In [40]:
# Retained predictor columns, in their original order.
to_include_col = [col for col in all_cols if col not in cols_to_remove]

In [9]:
# Keep only the retained predictor columns.
# NOTE(review): this rebinds `input_data`, replacing the wider frame built from data.iloc[:,1:] earlier.
input_data = data[to_include_col]

In [10]:
# Columns that must not be standardized.
# 'Education' is listed here although it is also in cols_to_remove, so it has no effect.
columns_to_omit = [
    'Reason_1',
    'Reason_2',
    'Reason_3',
    'Reason_4',
    'Education',
]

In [11]:
# Columns to standardize: every retained predictor that is not explicitly omitted.
columns_to_scale = [
    name
    for name in input_data.columns.values
    if name not in columns_to_omit
]

In [12]:
# Target series: the first column of `data` ('Absenteeism Time in Hours' in the preview above).
target_data = data.iloc[:,0]

In [13]:
# The CustomScaler defined below is built on top of sklearn's StandardScaler



# create the Custom Scaler class

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a selected subset of DataFrame columns and pass the rest through.

    A ``StandardScaler`` is applied only to ``columns``; every other column of
    the input frame is returned unchanged, and the original column order and
    row index are preserved.

    Parameters
    ----------
    columns : list of str
        Labels of the columns to standardize.
    copy, with_mean, with_std : bool, default=True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler; fitted on ``columns`` by ``fit``.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of ``columns`` as
        seen by ``fit``; ``None`` until ``fit`` has been called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Store every constructor argument under its own name: BaseEstimator
        # reads them back through get_params() for repr() and clone().
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

        # Pass the options by keyword: recent scikit-learn releases make the
        # StandardScaler options keyword-only.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit axis/ddof so the per-column population statistics do not
        # depend on how np.mean / np.var dispatch to the installed pandas.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` in which ``self.columns`` are standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        # Remember the incoming column order so it can be restored at the end.
        init_col_order = X.columns

        # Reuse X's index for the scaled frame: without it the frame gets a
        # fresh RangeIndex and pd.concat(axis=1) misaligns rows whenever X
        # carries a non-default index (e.g. a shuffled train/test subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # Everything that was not selected for scaling passes through as-is.
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [14]:
# Scaler that standardizes only the columns listed in columns_to_scale.
scaler = CustomScaler(columns=columns_to_scale)



In [15]:
# Fit the scaler on the predictor frame.
# NOTE(review): this uses all rows, before the train/test split further down,
# so the rows that end up in the test set also contribute to the scaling statistics.
scaler.fit(input_data)



CustomScaler(columns=['Transportation Expense', 'Distance to Work', 'Age',
                      'Body Mass Index', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [16]:
# Apply the fitted scaler; columns not listed in columns_to_scale pass through unchanged.
scaled_data = scaler.transform(input_data)

In [17]:
# Binarize the target: 0 when the value is below the column mean, 1 otherwise.
# The threshold is computed from the raw first column of `data` rather than
# from `target_data` itself, so re-running this cell yields the same labels
# (the previous form re-thresholded the already-binarized array on a re-run).
absence_hours = data.iloc[:, 0]
target_data = np.where(absence_hours < absence_hours.mean(), 0, 1)

In [18]:
# Display the scaled feature frame (pandas elides the middle rows in the rendered output).
scaled_data


Unnamed: 0,Transportation Expense,Distance to Work,Age,Body Mass Index,Children,Pets,Reason_1,Reason_2,Reason_3,Reason_4
0,1.005844,0.412816,-0.536062,0.767431,0.880469,0.268487,0,0,0,1
1,-1.574681,-1.141882,2.130803,1.002633,-0.019280,-0.589690,0,0,0,0
2,-0.654143,1.426749,0.248310,1.002633,-0.919030,-0.589690,0,0,0,1
3,0.854936,-1.682647,0.405184,-0.643782,0.880469,-0.589690,1,0,0,0
4,1.005844,0.412816,-0.536062,0.767431,0.880469,0.268487,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
695,-0.654143,-0.533522,0.562059,-1.114186,0.880469,-0.589690,1,0,0,0
696,0.040034,-0.263140,-1.320435,-0.643782,-0.019280,1.126663,1,0,0,0
697,1.624567,-0.939096,-1.320435,-0.408580,-0.919030,-0.589690,1,0,0,0
698,0.190942,-0.939096,-0.692937,-0.408580,-0.919030,-0.589690,0,0,0,1


In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# 80/20 train/test split; the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_data,
    target_data,
    test_size=0.2,
    train_size=0.8,
    random_state=32,
)

In [21]:
# (rows, columns) of the training and test feature frames.
x_train.shape,x_test.shape

((560, 10), (140, 10))

In [39]:
# Display the training features; the row labels are the original row positions picked by the split.
x_train

Unnamed: 0,Transportation Expense,Distance to Work,Age,Body Mass Index,Children,Pets,Reason_1,Reason_2,Reason_3,Reason_4
15,0.356940,-0.330735,0.718933,-0.878984,-0.919030,-0.589690,1,0,0,0
94,0.387122,-0.330735,1.660180,1.237836,0.880469,0.268487,0,0,0,1
570,-0.654143,-0.263140,-1.006686,-1.819793,-0.919030,-0.589690,1,0,0,0
149,-0.578689,0.818389,-1.477309,-1.349389,-0.919030,-0.589690,0,0,0,1
352,0.190942,-0.668713,1.032682,2.649049,-0.019280,-0.589690,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
252,1.005844,0.412816,-0.536062,0.767431,0.880469,0.268487,0,0,1,0
403,-1.574681,-1.141882,2.130803,1.002633,-0.019280,-0.589690,1,0,0,0
88,0.040034,-0.263140,-1.320435,-0.643782,-0.019280,1.126663,0,0,0,1
310,2.092381,1.494345,-1.320435,0.061825,-0.019280,2.843016,1,0,0,0


In [22]:
# Logistic regression classifier with the library's default hyperparameters.
reg = LogisticRegression()


In [23]:
# Fit the classifier on the training split only.
reg.fit(x_train,y_train)

LogisticRegression()

In [24]:
# Fitted intercept term of the model.
reg.intercept_

array([-1.80709888])

In [25]:
# Fitted coefficients, one per feature column of x_train.
reg.coef_

array([[ 0.71132657, -0.11634858, -0.16533634,  0.30323928,  0.1747096 ,
        -0.11279153,  2.60884252,  0.96781411,  3.07728401,  0.30162171]])

In [26]:
# Accuracy on the training data.
# NOTE(review): x_test / y_test are not scored in the cells visible here.
reg.score(x_train,y_train)

0.7892857142857143

In [38]:
# Predicted class probabilities for the training rows (one column per class).
y_target = reg.predict_proba(x_train)

In [28]:
# Keep only the probability of the positive class (column 1).
# Recomputed from the model instead of slicing `y_target` in place, so the
# cell can be re-run without failing on an array that is already 1-D.
y_target = reg.predict_proba(x_train)[:, 1]

In [29]:
# Inspect the predicted positive-class probabilities.
y_target

array([0.64900438, 0.27543681, 0.47193159, 0.09352638, 0.84697484,
       0.1745475 , 0.64900438, 0.07668686, 0.64900438, 0.40296294,
       0.43095675, 0.54687537, 0.1226195 , 0.79031034, 0.58403654,
       0.27543681, 0.44022471, 0.1226195 , 0.07742557, 0.1226195 ,
       0.45744564, 0.5935505 , 0.32225285, 0.07668686, 0.36775225,
       0.1226195 , 0.15544012, 0.12691025, 0.07668686, 0.1226195 ,
       0.07668686, 0.2990707 , 0.1226195 , 0.07668686, 0.45744564,
       0.71314838, 0.1745475 , 0.1226195 , 0.78236661, 0.21720224,
       0.36508872, 0.1226195 , 0.26352844, 0.1226195 , 0.1745475 ,
       0.08168931, 0.45744564, 0.1226195 , 0.19622338, 0.65847265,
       0.27280771, 0.27280771, 0.07742557, 0.40296294, 0.74708154,
       0.40296294, 0.12691025, 0.1226195 , 0.27543681, 0.12691025,
       0.07668686, 0.1226195 , 0.07668686, 0.58403654, 0.24816072,
       0.76830676, 0.77240257, 0.1745475 , 0.20392839, 0.08168931,
       0.5935505 , 0.84120433, 0.45486887, 0.1226195 , 0.08138

In [30]:
# Feature names of the predictor frame, as a NumPy array.
cols = input_data.columns.values

In [31]:
# Start the summary table with one row per feature name.
summary = pd.DataFrame(
    data=cols,
    columns=['Feature'],
)

In [32]:
# Add the fitted coefficient for each feature (coef_ is transposed to one row per feature).
# NOTE(review): 'Coffecient' is a misspelling of 'Coefficient'; the sort cell below
# uses the same key, so the two must be renamed together.
summary['Coffecient'] = reg.coef_.transpose()

In [33]:
# Exponentiate the coefficients to express them as odds ratios.
summary['odds'] = np.exp(reg.coef_.transpose())

In [34]:
# Show the features from the largest to the smallest coefficient
# (display only; the result is not assigned back to `summary`).
summary.sort_values('Coffecient',ascending=False)

Unnamed: 0,Feature,Coffecient,odds
8,Reason_3,3.077284,21.699387
6,Reason_1,2.608843,13.583319
7,Reason_2,0.967814,2.632184
0,Transportation Expense,0.711327,2.036691
3,Body Mass Index,0.303239,1.354238
9,Reason_4,0.301622,1.35205
4,Children,0.17471,1.1909
5,Pets,-0.112792,0.893337
1,Distance to Work,-0.116349,0.890165
2,Age,-0.165336,0.847609


In [35]:
# Serialize the fitted model and the fitted scaler to disk, one pickle file each.
artifacts = {'model.pkl': reg, 'scaler.pkl': scaler}
for path, obj in artifacts.items():
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

In [37]:
# Reload the pickled scaler from disk into `scalar`.
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open('scaler.pkl', 'rb') as scaler_file:
    scalar = pickle.load(scaler_file)

In [42]:
# Hard class predictions (0 or 1) for the training rows.
reg.predict(x_train)

array([1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,