### This notebook contains a Logistic Regression model to predict absenteeism from work using certain features.
*  **Data source**: Absenteeism_data.csv (Available in github repo) <br>
*  **Data preprocessing**: Dropping unwanted featues, Extracting month&day values from Date <br>
*  **Model building**: Creating dummies for Categorical features, Feature scaling, Train_Test split, Model training <br>
*  **Model Evaluation**: Loading new dataset for prediction

In [1]:
import pandas as pd

#### Dropping unwanted featues

In [2]:
raw_data = pd.read_csv('E:\\Udemy\\Data science\\Python\\Regression Analysis\\Logitistic Regression\\Sklearn\\Absentee Case study\\Absenteeism_data.csv')
raw_data.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [3]:
raw_data.describe()

Unnamed: 0,ID,Reason for Absence,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,17.951429,19.411429,222.347143,29.892857,36.417143,271.801774,26.737143,1.282857,1.021429,0.687143,6.761429
std,11.028144,8.356292,66.31296,14.804446,6.379083,40.021804,4.254701,0.66809,1.112215,1.166095,12.670082
min,1.0,0.0,118.0,5.0,27.0,205.917,19.0,1.0,0.0,0.0,0.0
25%,9.0,13.0,179.0,16.0,31.0,241.476,24.0,1.0,0.0,0.0,2.0
50%,18.0,23.0,225.0,26.0,37.0,264.249,25.0,1.0,1.0,0.0,3.0
75%,28.0,27.0,260.0,50.0,40.0,294.217,31.0,1.0,2.0,1.0,8.0
max,36.0,28.0,388.0,52.0,58.0,378.884,38.0,4.0,4.0,8.0,120.0


In [4]:
df = raw_data.copy()
df = df.drop('ID', axis =1)
df.head()

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [5]:
reasons = df['Reason for Absence'].unique()
sorted(reasons)

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28]

#### Dummies for categorical features

In [6]:
reasons_df = pd.get_dummies(df['Reason for Absence'], drop_first=True)
reasons_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,18,19,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
696,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
697,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [7]:
reason1 = reasons_df.iloc[:,:15].max(axis=1)
type(reason1)

pandas.core.series.Series

In [8]:
reason2 = reasons_df.iloc[:,14:18].max(axis=1)
reason3 = reasons_df.iloc[:,17:22].max(axis=1)
reason4 = reasons_df.iloc[:,21:].max(axis=1)

In [9]:
df = pd.concat([df,reason1,reason2,reason3,reason4], axis=1)
df

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,1,1
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,10,23/05/2018,179,22,40,237.656,22,2,2,0,8,1,0,0,0
696,6,23/05/2018,225,26,28,237.656,24,1,1,2,3,1,0,0,0
697,10,24/05/2018,330,16,28,237.656,25,2,0,0,8,1,0,0,0
698,23,24/05/2018,235,16,32,237.656,25,3,0,0,2,0,0,1,1


In [10]:
df = df.drop('Reason for Absence',axis=1)
columns = df.columns.values
columns

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
      dtype=object)

In [11]:
columns_reorder = [ 0, 1, 2, 3,'Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']
df = df[columns_reorder]
df

Unnamed: 0,0,1,2,3,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,1,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,1,1,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,1,0,0,0,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,1,0,0,0,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,0,0,1,1,24/05/2018,235,16,32,237.656,25,3,0,0,2


In [12]:
df = df.rename({0:'Reason1',1:'Reason2',2:'Reason3',3:'Reason4'},axis=1)
df

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,1,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,1,1,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,1,0,0,0,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,1,0,0,0,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,0,0,1,1,24/05/2018,235,16,32,237.656,25,3,0,0,2


In [13]:
df_reasons = df.copy()

#### Converting Date feature to timestamp data type

In [14]:
df_reasons['Date'] = pd.to_datetime(df_reasons['Date'], format='%d/%m/%Y')
df_reasons['Date']

0     2015-07-07
1     2015-07-14
2     2015-07-15
3     2015-07-16
4     2015-07-23
         ...    
695   2018-05-23
696   2018-05-23
697   2018-05-24
698   2018-05-24
699   2018-05-31
Name: Date, Length: 700, dtype: datetime64[ns]

#### Extracting month and day values from date timestamp

In [15]:
def month_val(date):
    return date.month
df_reasons['Month'] = df_reasons['Date'].apply(month_val)
df_reasons['Month']

0      7
1      7
2      7
3      7
4      7
      ..
695    5
696    5
697    5
698    5
699    5
Name: Month, Length: 700, dtype: int64

In [16]:
def day_value(day):
    return day.weekday()
df_reasons['Day'] = df_reasons['Date'].apply(day_value)
df_reasons['Day']

0      1
1      1
2      2
3      3
4      3
      ..
695    2
696    2
697    3
698    3
699    3
Name: Day, Length: 700, dtype: int64

In [17]:
df_reasons = df_reasons.drop('Date',axis=1)
df_reasons.columns.values

array(['Reason1', 'Reason2', 'Reason3', 'Reason4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Month', 'Day'],
      dtype=object)

In [18]:
re_col = ['Reason1', 'Reason2', 'Reason3', 'Reason4','Month', 'Day',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']
df_reasons = df_reasons[re_col]
df_reasons

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,1,1,0,0
2,0,0,1,1,7,2,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,1,2,0,4
4,0,0,1,1,7,3,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,2,2,0,8
696,1,0,0,0,5,2,225,26,28,237.656,24,1,1,2,3
697,1,0,0,0,5,3,330,16,28,237.656,25,2,0,0,8
698,0,0,1,1,5,3,235,16,32,237.656,25,3,0,0,2


In [19]:
df_r_t = df_reasons.copy()

In [20]:
df_r_t['Education'].unique()

array([1, 3, 2, 4], dtype=int64)

In [21]:
df_r_t['Education'] = df_r_t['Education'].map({1:0,2:1,3:1,4:1})
df_r_t['Education'].unique()

array([0, 1], dtype=int64)

In [22]:
df_r_t.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,1,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,1,1,7,3,289,36,33,239.554,30,0,2,1,2


In [23]:
df_clean = df_r_t.copy()

In [24]:
df_clean.to_csv('E:\\Udemy\\Data science\\Python\\Regression Analysis\\Logitistic Regression\\Sklearn\\Absentee Case study\\df_clean.csv', index=False)

## Logistic Regression

In [25]:
import numpy as np

In [26]:
df_preprocessed = pd.read_csv('E:\\Udemy\\Data science\\Python\\Regression Analysis\\Logitistic Regression\\Sklearn\\Absentee Case study\\df_clean.csv')
df_preprocessed.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,1,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,1,1,7,3,289,36,33,239.554,30,0,2,1,2


#### Converting targets into two classes 0(Absent from work for <=3 hrs) and 1(Absent from work for >3 hrs)

In [27]:
df_preprocessed['Excess Absenteeism'] = np.where(df_preprocessed['Absenteeism Time in Hours'] <= df_preprocessed['Absenteeism Time in Hours']\
                                                .median(), 0, 1)
df_preprocessed['Excess Absenteeism'] 

0      1
1      0
2      0
3      1
4      0
      ..
695    1
696    0
697    1
698    0
699    0
Name: Excess Absenteeism, Length: 700, dtype: int32

In [28]:
#To check the balance of our targets
np.sum(df_preprocessed['Excess Absenteeism']) / df_preprocessed['Excess Absenteeism'].shape[0]

0.45571428571428574

In [29]:
df_with_targets = df_preprocessed.drop('Absenteeism Time in Hours',axis=1)
df_with_targets.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excess Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,1,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,1,1,7,3,289,36,33,239.554,30,0,2,1,0


#### Feature Scaling

In [30]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
#Removing dummy variables and targets from scaling
inputs = df_with_targets.drop(['Reason1','Reason2','Reason3','Reason4','Education','Excess Absenteeism'],axis=1)
inputs.head()

Unnamed: 0,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets
0,7,1,289,36,33,239.554,30,2,1
1,7,1,118,13,50,239.554,31,1,0
2,7,2,179,51,38,239.554,31,0,0
3,7,3,279,5,39,239.554,24,2,0
4,7,3,289,36,33,239.554,30,2,1


In [31]:
scaler.fit(inputs)
inputs_scaled = scaler.transform(inputs)
inputs_scaled_df = pd.DataFrame(inputs_scaled, columns=inputs.columns.values)
inputs_scaled_df

Unnamed: 0,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets
0,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487
1,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.019280,-0.589690
2,0.182726,-0.007725,-0.654143,1.426749,0.248310,-0.806331,1.002633,-0.919030,-0.589690
3,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0.880469,-0.589690
4,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...
695,-0.388293,-0.007725,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,0.880469,-0.589690
696,-0.388293,-0.007725,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,-0.019280,1.126663
697,-0.388293,0.668253,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,-0.919030,-0.589690
698,-0.388293,0.668253,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,-0.919030,-0.589690


In [32]:
inputs_notscaled = df_with_targets[['Reason1','Reason2','Reason3','Reason4','Education']]
total_inputs = pd.concat([inputs_notscaled,inputs_scaled_df],axis=1)
total_inputs.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Education,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets
0,0,0,0,1,0,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487
1,0,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.01928,-0.58969
2,0,0,1,1,0,0.182726,-0.007725,-0.654143,1.426749,0.24831,-0.806331,1.002633,-0.91903,-0.58969
3,1,0,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0.880469,-0.58969
4,0,0,1,1,0,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0.880469,0.268487


#### Splitting dataset into train & test

In [36]:
targets = df_with_targets.iloc[:,-1]

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(total_inputs, targets, test_size=0.2, random_state=42)
print(x_train.shape,x_test.shape)
print(y_train.shape,y_test.shape)

(560, 14) (140, 14)
(560,) (140,)


In [37]:
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()
reg.fit(x_train, y_train)

LogisticRegression()

In [38]:
reg.score(x_train, y_train)

0.7482142857142857

In [39]:
reg.intercept_

array([-0.3035076])

In [40]:
reg.coef_

array([[ 1.56430499,  0.97225185,  0.78172699, -0.9050252 , -0.29275784,
         0.00994985, -0.16762726,  0.52939871,  0.0227663 , -0.28649981,
        -0.05638131,  0.22260655,  0.43614259, -0.40049895]])

#### Summary table to interpret the results

In [41]:
df_summary = pd.DataFrame(total_inputs.columns.values, columns = ['Features'])
df_summary

Unnamed: 0,Features
0,Reason1
1,Reason2
2,Reason3
3,Reason4
4,Education
5,Month
6,Day
7,Transportation Expense
8,Distance to Work
9,Age


In [42]:
df_summary['Coefficients'] = np.transpose(reg.coef_)
df_summary

Unnamed: 0,Features,Coefficients
0,Reason1,1.564305
1,Reason2,0.972252
2,Reason3,0.781727
3,Reason4,-0.905025
4,Education,-0.292758
5,Month,0.00995
6,Day,-0.167627
7,Transportation Expense,0.529399
8,Distance to Work,0.022766
9,Age,-0.2865


In [43]:
df_summary.index = df_summary.index +1
df_summary.loc[0] = ['Intecept', reg.intercept_[0]]
df_summary = df_summary.sort_index()
df_summary

Unnamed: 0,Features,Coefficients
0,Intecept,-0.303508
1,Reason1,1.564305
2,Reason2,0.972252
3,Reason3,0.781727
4,Reason4,-0.905025
5,Education,-0.292758
6,Month,0.00995
7,Day,-0.167627
8,Transportation Expense,0.529399
9,Distance to Work,0.022766


In [44]:
df_summary['Odds Ratio'] = np.exp(df_summary['Coefficients'])
df_summary

Unnamed: 0,Features,Coefficients,Odds Ratio
0,Intecept,-0.303508,0.738224
1,Reason1,1.564305,4.779352
2,Reason2,0.972252,2.643891
3,Reason3,0.781727,2.185243
4,Reason4,-0.905025,0.404532
5,Education,-0.292758,0.746203
6,Month,0.00995,1.01
7,Day,-0.167627,0.845669
8,Transportation Expense,0.529399,1.697911
9,Distance to Work,0.022766,1.023027


In [45]:
df_summary = df_summary.sort_values('Odds Ratio', ascending=False)
df_summary

Unnamed: 0,Features,Coefficients,Odds Ratio
1,Reason1,1.564305,4.779352
2,Reason2,0.972252,2.643891
3,Reason3,0.781727,2.185243
8,Transportation Expense,0.529399,1.697911
13,Children,0.436143,1.546729
12,Body Mass Index,0.222607,1.249329
9,Distance to Work,0.022766,1.023027
6,Month,0.00995,1.01
11,Daily Work Load Average,-0.056381,0.945179
7,Day,-0.167627,0.845669


In [46]:
reg.score(x_test,y_test)

0.7357142857142858

## Saving the model

In [44]:
#import pickle

In [45]:
#with open('Logistic_model','wb') as file:
    #pickle.dump(reg,file)

## Loading a new dataset


In [47]:
new_data = pd.read_csv('E:\\Udemy\\Data science\\Python\\Regression Analysis\\Logitistic Regression\\Sklearn\\Absentee Case study\\Absenteeism_new_data.csv')
new_data.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,22,27,01/06/2018,179,26,30,237.656,19,3,0,0
1,10,7,04/06/2018,361,52,28,237.656,27,1,1,4
2,14,23,06/06/2018,155,12,34,237.656,25,1,2,0
3,17,25,08/06/2018,179,22,40,237.656,22,2,2,0
4,14,10,08/06/2018,155,12,34,237.656,25,1,2,0


#### Preprocessing to fit the Logit model 

In [50]:
sorted(new_data['Reason for Absence'].unique())

[0, 6, 7, 8, 10, 11, 13, 14, 19, 22, 23, 25, 26, 27, 28]

In [51]:
df = new_data.copy()

In [52]:

reasons_new = pd.get_dummies(df['Reason for Absence'], drop_first=True)
reason_new1 = reasons_new.loc[:,6:14].max(axis=1)
reason_new2 = reasons_new.loc[:,15:17].max(axis=1)
reason_new3 = reasons_new.loc[:,18:21].max(axis=1)
reason_new4 = reasons_new.loc[:,22:].max(axis=1)

In [53]:
df_new_reasons = pd.concat([df,reason_new1,reason_new2,reason_new3,reason_new4],axis=1)
df_new_reasons.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,0,1,2,3
0,22,27,01/06/2018,179,26,30,237.656,19,3,0,0,0,,0,1
1,10,7,04/06/2018,361,52,28,237.656,27,1,1,4,1,,0,0
2,14,23,06/06/2018,155,12,34,237.656,25,1,2,0,0,,0,1
3,17,25,08/06/2018,179,22,40,237.656,22,2,2,0,0,,0,1
4,14,10,08/06/2018,155,12,34,237.656,25,1,2,0,1,,0,0


In [54]:

df_new_reasons = df_new_reasons.drop(['ID','Reason for Absence'],axis=1)
df_new_reasons.rename(columns={0: 'Reason1', 1: 'Reason2', 2: 'Reason3', 3: 'Reason4'}, inplace=True)
df_new_reasons.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Reason1,Reason2,Reason3,Reason4
0,01/06/2018,179,26,30,237.656,19,3,0,0,0,,0,1
1,04/06/2018,361,52,28,237.656,27,1,1,4,1,,0,0
2,06/06/2018,155,12,34,237.656,25,1,2,0,0,,0,1
3,08/06/2018,179,22,40,237.656,22,2,2,0,0,,0,1
4,08/06/2018,155,12,34,237.656,25,1,2,0,1,,0,0


In [55]:
df_new_reasons['Reason2'] = df_new_reasons['Reason2'].fillna(0)
df_new_reasons['Reason2'].isna().sum()

0

In [56]:
df_temp = df_new_reasons.copy()

In [57]:
df_temp['Date'] = pd.to_datetime(df_temp['Date'], format = '%d/%m/%Y')
def mon(data):
    return data.month
df_temp['Month'] = df_temp['Date'].apply(mon)
def day(data):
    return data.weekday()
df_temp['Day'] = df_temp['Date'].apply(day)
df_r_d = df_temp.drop('Date',axis=1)
colu = df_r_d.columns.values
colu

array(['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Reason1', 'Reason2', 'Reason3', 'Reason4',
       'Month', 'Day'], dtype=object)

In [58]:
df_r_d['Education'].unique()
df_r_d['Education'] = df_r_d['Education'].map({1:0,2:1,3:1})
df_r_d['Education'].unique()

array([1, 0], dtype=int64)

In [59]:
re_colu = ['Month','Day','Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index','Children', 'Pets', 'Reason1', 'Reason2', 'Reason3','Reason4','Education' ]
df_r_d = df_r_d[re_colu]
df_r_d.head()

Unnamed: 0,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets,Reason1,Reason2,Reason3,Reason4,Education
0,6,4,179,26,30,237.656,19,0,0,0,0.0,0,1,1
1,6,0,361,52,28,237.656,27,1,4,1,0.0,0,0,0
2,6,2,155,12,34,237.656,25,2,0,0,0.0,0,1,0
3,6,4,179,22,40,237.656,22,2,0,0,0.0,0,1,1
4,6,4,155,12,34,237.656,25,2,0,1,0.0,0,0,0


In [60]:
df_arranged = df_r_d.copy()

In [61]:
from sklearn.preprocessing import StandardScaler
new_scaler = StandardScaler()
inputs_to_scale = df_arranged.iloc[:,0:-5]
new_scaler.fit(inputs_to_scale)
scaled_new_inputs = new_scaler.transform(inputs_to_scale)
scaled_new_inputs_df = pd.DataFrame(data = scaled_new_inputs, columns=inputs_to_scale\
                                   .columns.values)
scaled_new_inputs_df.head()

Unnamed: 0,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets
0,-1.051315,1.346874,-0.326675,0.064835,-0.877958,-2.227361,-1.419428,-1.187282,-0.660387
1,-1.051315,-1.488651,2.09758,1.839269,-1.12791,-2.227361,0.294598,0.030443,0.827808
2,-1.051315,-0.070888,-0.646357,-0.890629,-0.378053,-2.227361,-0.133908,1.248168,-0.660387
3,-1.051315,1.346874,-0.326675,-0.208155,0.371804,-2.227361,-0.776668,1.248168,-0.660387
4,-1.051315,1.346874,-0.646357,-0.890629,-0.378053,-2.227361,-0.133908,1.248168,-0.660387


In [62]:
not_scaled = df_arranged.iloc[:,-5:]
tot_inputs = pd.concat([scaled_new_inputs_df,not_scaled],axis=1)
tot_inputs.head()

Unnamed: 0,Month,Day,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Children,Pets,Reason1,Reason2,Reason3,Reason4,Education
0,-1.051315,1.346874,-0.326675,0.064835,-0.877958,-2.227361,-1.419428,-1.187282,-0.660387,0,0.0,0,1,1
1,-1.051315,-1.488651,2.09758,1.839269,-1.12791,-2.227361,0.294598,0.030443,0.827808,1,0.0,0,0,0
2,-1.051315,-0.070888,-0.646357,-0.890629,-0.378053,-2.227361,-0.133908,1.248168,-0.660387,0,0.0,0,1,0
3,-1.051315,1.346874,-0.326675,-0.208155,0.371804,-2.227361,-0.776668,1.248168,-0.660387,0,0.0,0,1,1
4,-1.051315,1.346874,-0.646357,-0.890629,-0.378053,-2.227361,-0.133908,1.248168,-0.660387,1,0.0,0,0,0


In [63]:
total_inputs.columns.values

array(['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Education', 'Month',
       'Day', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets'],
      dtype=object)

In [64]:
new_col_ord = ['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Education', 'Month',
       'Day', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']
tot_inputs = tot_inputs[new_col_ord]
tot_inputs.columns.values

array(['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Education', 'Month',
       'Day', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets'],
      dtype=object)

In [65]:
reg.predict(tot_inputs)

array([0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0])

In [66]:
reg.predict_proba(tot_inputs)

array([[0.89101849, 0.10898151],
       [0.0547717 , 0.9452283 ],
       [0.63566989, 0.36433011],
       [0.77905578, 0.22094422],
       [0.15775131, 0.84224869],
       [0.11880859, 0.88119141],
       [0.25708661, 0.74291339],
       [0.88382475, 0.11617525],
       [0.90469005, 0.09530995],
       [0.3878099 , 0.6121901 ],
       [0.51736313, 0.48263687],
       [0.55761944, 0.44238056],
       [0.68174164, 0.31825836],
       [0.90609109, 0.09390891],
       [0.59504831, 0.40495169],
       [0.61299621, 0.38700379],
       [0.55761944, 0.44238056],
       [0.42034838, 0.57965162],
       [0.67862765, 0.32137235],
       [0.33231094, 0.66768906],
       [0.85261028, 0.14738972],
       [0.90968436, 0.09031564],
       [0.44058293, 0.55941707],
       [0.38309023, 0.61690977],
       [0.8994347 , 0.1005653 ],
       [0.57754355, 0.42245645],
       [0.66718271, 0.33281729],
       [0.19505234, 0.80494766],
       [0.84668447, 0.15331553],
       [0.214387  , 0.785613  ],
       [0.

In [67]:
df['Probability'] = np.hsplit(reg.predict_proba(tot_inputs), 2)[1]
df.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Probability
0,22,27,01/06/2018,179,26,30,237.656,19,3,0,0,0.108982
1,10,7,04/06/2018,361,52,28,237.656,27,1,1,4,0.945228
2,14,23,06/06/2018,155,12,34,237.656,25,1,2,0,0.36433
3,17,25,08/06/2018,179,22,40,237.656,22,2,2,0,0.220944
4,14,10,08/06/2018,155,12,34,237.656,25,1,2,0,0.842249


In [68]:
df['Prediction'] = reg.predict(tot_inputs)
df.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Probability,Prediction
0,22,27,01/06/2018,179,26,30,237.656,19,3,0,0,0.108982,0
1,10,7,04/06/2018,361,52,28,237.656,27,1,1,4,0.945228,1
2,14,23,06/06/2018,155,12,34,237.656,25,1,2,0,0.36433,0
3,17,25,08/06/2018,179,22,40,237.656,22,2,2,0,0.220944,0
4,14,10,08/06/2018,155,12,34,237.656,25,1,2,0,0.842249,1


In [68]:
df.to_csv('E:\\Udemy\\Data science\\Python\\Regression Analysis\\Logitistic Regression\\Sklearn\\Absentee Case study\\Absentee_final.csv', index=False)