# Creating a logistic regression to predict productivity

## Import the relevant libraries

In [1]:
# import the relevant libraries
import pandas as pd
import numpy as np

## Load the data

In [2]:
# load the preprocessed CSV data
data_preprocessed = pd.read_csv('preprocessed_data')

In [3]:
# eyeball the data
data_preprocessed.head()

Unnamed: 0,quarter,day,month,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,...,team 4,team 5,team 6,team 7,team 8,team 9,team 10,team 11,team 12,productivity
0,1,3,1,26.16,1108.0,7080,98,0.0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,1,3,1,3.94,1039.0,960,0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,3,1,11.41,968.0,3660,50,0.0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,1,3,1,11.41,968.0,3660,50,0.0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,1,3,1,25.9,1170.0,1920,50,0.0,0,0,...,0,0,1,0,0,0,0,0,0,1


## A comment on the targets

In [4]:
# check if dataset is balanced (what % of targets are 1s)
data_preprocessed['productivity'].sum() / data_preprocessed['productivity'].shape[0]

0.7278368794326241

## BALANCE THE DATAset

## Select the inputs for the regression

In [5]:
data_preprocessed.shape

(1128, 26)

In [6]:
# Selects all rows and all columns but the last one (basically the same operation)
data_preprocessed.iloc[:,:-1]

Unnamed: 0,quarter,day,month,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,...,team 3,team 4,team 5,team 6,team 7,team 8,team 9,team 10,team 11,team 12
0,1,3,1,26.16,1108.0,7080,98,0.0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,3,1,3.94,1039.0,960,0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,3,1,11.41,968.0,3660,50,0.0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,3,1,11.41,968.0,3660,50,0.0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,3,1,25.90,1170.0,1920,50,0.0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,2,2,3,2.90,1039.0,960,0,0.0,0,0,...,0,0,0,0,0,0,0,1,0,0
1124,2,2,3,3.90,1039.0,960,0,0.0,0,0,...,0,0,0,0,0,1,0,0,0,0
1125,2,2,3,3.90,1039.0,960,0,0.0,0,0,...,0,0,0,0,1,0,0,0,0,0
1126,2,2,3,2.90,1039.0,1800,0,0.0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [7]:
# Create a variable that will contain the inputs (everything without the targets)
unscaled_inputs = data_preprocessed.iloc[:,:-1]

#checkpoint before backward elimination
check_point_inputs = unscaled_inputs.copy()

## Standardize the data

In [8]:
# standardize the inputs
from sklearn.preprocessing import StandardScaler

# define scaler as an object
scaler = StandardScaler()

In [9]:
#create a df with only non dummy features
df_no_dummies = unscaled_inputs.copy()
df_no_dummies = df_no_dummies.drop(['dept_finishing', 'dept_sweing', 'team 1', 'team 2','team 3','team 4', 'team 5', 'team 6', 'team 7', 'team 8', 'team 9', 'team 10', 'team 11',
       'team 12'], axis=1)
df_no_dummies.head()

Unnamed: 0,quarter,day,month,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers
0,1,3,1,26.16,1108.0,7080,98,0.0,0,0,59.0
1,1,3,1,3.94,1039.0,960,0,0.0,0,0,8.0
2,1,3,1,11.41,968.0,3660,50,0.0,0,0,30.5
3,1,3,1,11.41,968.0,3660,50,0.0,0,0,30.5
4,1,3,1,25.9,1170.0,1920,50,0.0,0,0,56.0


In [10]:
#scale the non dummy variables
scaler.fit(df_no_dummies)
inputs_scaled = scaler.transform(df_no_dummies)

In [11]:
inputs_scaled

array([[-1.14734431,  0.06739032, -0.99617478, ..., -0.11645512,
        -0.3606847 ,  1.13491211],
       [-1.14734431,  0.06739032, -0.99617478, ..., -0.11645512,
        -0.3606847 , -1.16534063],
       [-1.14734431,  0.06739032, -0.99617478, ..., -0.11645512,
        -0.3606847 , -0.15052325],
       ...,
       [-0.33081158, -0.40771142,  1.69851383, ..., -0.11645512,
        -0.3606847 , -1.16534063],
       [-0.33081158, -0.40771142,  1.69851383, ..., -0.11645512,
        -0.3606847 , -0.84961967],
       [-0.33081158, -0.40771142,  1.69851383, ..., -0.11645512,
        -0.3606847 , -1.25554662]])

In [12]:
inputs_scaled.shape

(1128, 11)

SUBSTITUTE SCALED VALUES IN THE INPUTS DATA FRAME

In [13]:
#map one by one the input columns that have been scaled

for i in range(7):
    for j in range (len(unscaled_inputs['wip'])):
        unscaled_inputs.iloc[j,i] = inputs_scaled[j][i]

unscaled_inputs.head()

Unnamed: 0,quarter,day,month,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,...,team 3,team 4,team 5,team 6,team 7,team 8,team 9,team 10,team 11,team 12
0,-1.147344,0.06739,-0.996175,1.121377,0.344216,0.880115,2.493115,0.0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,-1.147344,0.06739,-0.996175,-1.014291,0.088237,-1.091309,-0.826314,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1.147344,0.06739,-0.996175,-0.296314,-0.175162,-0.221563,0.867272,0.0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,-1.147344,0.06739,-0.996175,-0.296314,-0.175162,-0.221563,0.867272,0.0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,-1.147344,0.06739,-0.996175,1.096388,0.574226,-0.782066,0.867272,0.0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [14]:
scaled_data = pd.concat([unscaled_inputs, data_preprocessed['productivity']], axis=1)
scaled_data.head(2)

Unnamed: 0,quarter,day,month,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,...,team 4,team 5,team 6,team 7,team 8,team 9,team 10,team 11,team 12,productivity
0,-1.147344,0.06739,-0.996175,1.121377,0.344216,0.880115,2.493115,0.0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,-1.147344,0.06739,-0.996175,-1.014291,0.088237,-1.091309,-0.826314,0.0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [15]:
#saved the scaled inputs in a file to use the newral network
scaled_data.to_csv('scaled_data', index=False)

## Split the data into train & test and shuffle

### Import the relevant module

In [16]:
# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split

### Split

In [17]:
#sklearn works with ndarray, transform inputs dataframe into nd array
scaled_inputs = unscaled_inputs.to_numpy()
scaled_inputs

array([[-1.14734431,  0.06739032, -0.99617478, ...,  0.        ,
         0.        ,  0.        ],
       [-1.14734431,  0.06739032, -0.99617478, ...,  0.        ,
         0.        ,  0.        ],
       [-1.14734431,  0.06739032, -0.99617478, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.33081158, -0.40771142,  1.69851383, ...,  0.        ,
         0.        ,  0.        ],
       [-0.33081158, -0.40771142,  1.69851383, ...,  0.        ,
         0.        ,  0.        ],
       [-0.33081158, -0.40771142,  1.69851383, ...,  0.        ,
         0.        ,  0.        ]])

In [18]:
#same with target
targets = data_preprocessed['productivity'].to_numpy()
targets

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [19]:
# declare 4 variables for the split
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, 
                                                                            test_size = 0.2, random_state = 46)

In [20]:
# check the shape of the train inputs and targets
print (x_train.shape, y_train.shape)

(902, 25) (902,)


In [21]:
# check the shape of the test inputs and targets
print (x_test.shape, y_test.shape)

(226, 25) (226,)


## Logistic regression with sklearn

In [22]:
# import the LogReg model from sklearn
from sklearn.linear_model import LogisticRegression

# import the 'metrics' module, which includes important metrics we may want to use
from sklearn import metrics

### Training the model

In [23]:
# create a logistic regression object
reg = LogisticRegression()

In [24]:
# fit our train inputs
# that is basically the whole training part of the machine learning
reg.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
# assess the train accuracy of the model
reg.score(x_train,y_train)

0.7838137472283814

### Manually check the accuracy

In [26]:
# find the model outputs according to our model
model_outputs = reg.predict(x_train)
model_outputs

array([1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,

In [27]:
# compare them with the targets
y_train

array([1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,

In [28]:
# ACTUALLY compare the two variables
model_outputs == y_train

array([ True,  True, False,  True,  True,  True,  True,  True, False,
        True,  True, False,  True,  True,  True,  True, False, False,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True, False,  True, False,  True,  True, False,  True,  True,
        True, False,  True,  True,  True,  True, False,  True,  True,
       False,  True, False,  True, False,  True,  True, False, False,
       False, False, False,  True, False,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True, False,

In [29]:
# find out in how many instances we predicted correctly
np.sum((model_outputs==y_train))

707

In [30]:
# get the total number of instances
model_outputs.shape[0]

902

In [31]:
# calculate the accuracy of the model
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.7838137472283814

### Finding the intercept and coefficients

In [32]:
# get the intercept (bias) of our model
reg.intercept_

array([0.17865257])

In [33]:
# get the coefficients (weights) of our model
reg.coef_

array([[-0.05285748,  0.01680005, -0.09916899, -0.8089748 ,  0.20284331,
        -0.25102596,  1.86116251,  0.01175332, -0.12634667,  0.24726778,
         0.03976356,  0.29349827, -0.15552262,  0.49416098,  0.02665916,
         0.75058444,  0.73653209,  0.39050821, -0.37950539, -0.22824585,
        -0.58563389, -0.46678185, -0.11971396, -0.61471207,  0.13412377]])

In [34]:
# check what were the names of our columns
unscaled_inputs.columns.values

array(['quarter', 'day', 'month', 'smv', 'wip', 'over_time', 'incentive',
       'idle_time', 'idle_men', 'no_of_style_change', 'no_of_workers',
       'dept_finishing', 'dept_sweing', 'team 1', 'team 2', 'team 3',
       'team 4', 'team 5', 'team 6', 'team 7', 'team 8', 'team 9',
       'team 10', 'team 11', 'team 12'], dtype=object)

In [35]:
# save the names of the columns in an ad-hoc variable
feature_name = unscaled_inputs.columns.values

In [36]:
# use the coefficients from this table (they will be exported later and will be used in Tableau)
# transpose the model coefficients (model.coef_) and throws them into a df (a vertical organization, so that they can be
# multiplied by certain matrices later) 
summary_table = pd.DataFrame (columns=['Feature name'], data = feature_name)

# add the coefficient values to the summary table
summary_table['Coefficient'] = np.transpose(reg.coef_)

# display the summary table
summary_table

Unnamed: 0,Feature name,Coefficient
0,quarter,-0.052857
1,day,0.0168
2,month,-0.099169
3,smv,-0.808975
4,wip,0.202843
5,over_time,-0.251026
6,incentive,1.861163
7,idle_time,0.011753
8,idle_men,-0.126347
9,no_of_style_change,0.247268


In [37]:
# do a little Python trick to move the intercept to the top of the summary table
# move all indices by 1
summary_table.index = summary_table.index + 1

# add the intercept at index 0
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

# sort the df by index
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,0.178653
1,quarter,-0.052857
2,day,0.0168
3,month,-0.099169
4,smv,-0.808975
5,wip,0.202843
6,over_time,-0.251026
7,incentive,1.861163
8,idle_time,0.011753
9,idle_men,-0.126347


## Interpreting the coefficients

In [38]:
# create a new Series called: 'Odds ratio' which will show the.. odds ratio of each feature
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [39]:
# display the df
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,0.178653,1.195605
1,quarter,-0.052857,0.948515
2,day,0.0168,1.016942
3,month,-0.099169,0.90559
4,smv,-0.808975,0.445314
5,wip,0.202843,1.224881
6,over_time,-0.251026,0.778002
7,incentive,1.861163,6.431209
8,idle_time,0.011753,1.011823
9,idle_men,-0.126347,0.881309


In [40]:
# sort the table according to odds ratio
# note that by default, the sort_values method sorts values by 'ascending'
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
7,incentive,1.861163,6.431209
16,team 3,0.750584,2.118238
17,team 4,0.736532,2.08868
14,team 1,0.494161,1.639122
18,team 5,0.390508,1.477732
12,dept_finishing,0.293498,1.341111
10,no_of_style_change,0.247268,1.280522
5,wip,0.202843,1.224881
0,Intercept,0.178653,1.195605
25,team 12,0.134124,1.143534


# TEST the Model

In [42]:
#test the accuracy of the model on the test sets
reg.score(x_test, y_test)

0.7610619469026548

# SAVE THE MODEL

Saving the model = saving the reg object

Pickle is the standard Python tool for serialization and deserialization. In simple words, pickling means: converting a Python object (no matter what) into a string of characters. Logically, unpickling is about converting a string of characters (that has been pickled) into a Python object.

The second step of the deployment is about creating a mechanism to load the saved model and make predictions