<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Select-the-inputs-for-the-regression" data-toc-modified-id="Select-the-inputs-for-the-regression-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Select the inputs for the regression</a></span></li><li><span><a href="#Standardize-the-data" data-toc-modified-id="Standardize-the-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Standardize the data</a></span></li><li><span><a href="#Split-data" data-toc-modified-id="Split-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Split data</a></span></li><li><span><a href="#Logistic-regression" data-toc-modified-id="Logistic-regression-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Logistic regression</a></span></li><li><span><a href="#Train" data-toc-modified-id="Train-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Train</a></span></li><li><span><a href="#Manually-check-accuracy" data-toc-modified-id="Manually-check-accuracy-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Manually check accuracy</a></span></li><li><span><a href="#Coefficients" data-toc-modified-id="Coefficients-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Coefficients</a></span></li><li><span><a href="#Testing-the-model" data-toc-modified-id="Testing-the-model-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Testing the model</a></span></li><li><span><a href="#Save-the-model" data-toc-modified-id="Save-the-model-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Save the model</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
import scipy
import statsmodels.api as sm
import sklearn

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply seaborn's default theme once, with matplotlib's shorthand color codes remapped to the seaborn palette.
sns.set(color_codes=True)

# seaborn warnings ignore
import warnings
warnings.filterwarnings('ignore')

# pandas display settings for this notebook
pd.options.display.float_format = '{:,.2f}'.format  # thousands separator, two decimals
pd.options.display.max_rows = 100  # set to None to show every row
pd.options.display.max_columns = 100  # set to None to show every column
pd.options.display.max_colwidth = 100  # maximum characters rendered per cell


# ipython
from IPython.display import Image

In [2]:
# Read the preprocessed absenteeism data; the first CSV column becomes the index.
df = pd.read_csv(
    '../data/outputs/Absenteeism_preprocessed.csv',
    index_col=0,
)
df.head()

FileNotFoundError: File b'../data/csv/Absenteeism_preprocessed.csv' does not exist

In [None]:
df = df.drop('ID', axis=1)

In [None]:
df.columns.values

In [None]:
df['Absenteeism Time in Hours'].median()

In [None]:
targets = np.where(df['Absenteeism Time in Hours'] > df['Absenteeism Time in Hours'].median(), 1, 0)
targets[0:4]

In [None]:
df['Absenteeism Excessive'] = targets
df.head()

In [None]:
targets.sum()/ targets.shape[0]

In [None]:
data_with_targets = df.drop(['Absenteeism Time in Hours'], axis=1)
data_with_targets.head()

In [None]:
# After running the whole notebook once, we found that 3 features have little impact on the prediction:
# their coefficients are near 0, i.e. their odds ratios are near 1.0.

In [None]:
data_with_targets.columns

In [None]:
data_with_targets = data_with_targets.drop(['Day of Week','Daily Work Load Average','Distance to Work'],axis=1)
data_with_targets.head()

# Select the inputs for the regression

In [None]:
data_with_targets.shape

In [None]:
unscaled_inputs = data_with_targets.iloc[:,:-1]
unscaled_inputs.head()

# Standardize the data

In [None]:
from sklearn.preprocessing import StandardScaler

absenteesim_scaler = StandardScaler()

In [None]:
absenteesim_scaler.fit(unscaled_inputs)

In [None]:
scaled_inputs = absenteesim_scaler.transform(unscaled_inputs)
scaled_inputs[0:2]

In [None]:
scaled_inputs.shape

# Split data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_test_split(scaled_inputs, targets);

In [None]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state=100)
print(x_train.shape, y_train.shape)

In [None]:
print(x_test.shape, y_test.shape)

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


# Train

In [None]:
reg = LogisticRegression()

In [None]:
reg.fit(x_train, y_train)

In [None]:
reg.score(x_train,y_train)

# Manually check accuracy

In [None]:
model_outputs = reg.predict(x_train)

In [None]:
np.sum(model_outputs == y_train)

In [None]:
np.sum(model_outputs == y_train) / model_outputs.shape[0]

# Coefficients

In [None]:
# Fitted intercept (bias) term of the model.
reg.intercept_

In [None]:
# Fitted weights of the model.
reg.coef_

In [None]:
unscaled_inputs.columns.values

In [None]:
# Input column names, used below to label the coefficients.
feature_name = unscaled_inputs.columns.values

In [None]:
# One row per input feature, with its fitted coefficient alongside.
summary_table = pd.DataFrame({'Feature name': feature_name})
summary_table['Coefficient'] = reg.coef_.T
summary_table

In [None]:
# Build the table with the intercept in row 0 and the features in rows 1..n.
# Rebuilding it from `feature_name` and `reg` (instead of shifting the index of
# the existing table and inserting at position 0) keeps this cell idempotent:
# re-running it cannot stack duplicate intercept rows.
summary_table = pd.DataFrame(
    {
        'Feature name': ['Intercept'] + list(feature_name),
        'Coefficient': np.concatenate(([reg.intercept_[0]], reg.coef_[0])),
    },
    columns=['Feature name', 'Coefficient'],
)
summary_table

In [None]:
# exp(coefficient) converts each log-odds weight into an odds ratio.
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])
summary_table = summary_table.sort_values(by='Odds_ratio', ascending=False)
summary_table

In [None]:
# If a coefficient is approximately 0 (equivalently, its odds ratio is approximately 1), the feature has little importance.

# Testing the model

In [None]:
reg.score(x_test,y_test)

In [None]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba[0:5]

In [None]:
predicted_proba.shape

# Save the model

In [None]:
import pickle

In [None]:
with open('../data/outputs/model','wb') as fo:
    pickle.dump(reg,fo)

In [None]:
with open('../data/outputs/scaler', 'wb') as fo:
    pickle.dump(absenteesim_scaler,fo)