In [51]:
import pandas as pd
import plotly.express as px
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [52]:
df=pd.read_csv('Absenteeism_at_work.csv',';')

In [53]:
mycolor=['#aeccdb', '#efa6a5', '#ebbd81']
df=df[df['Month of absence']!=0]
df.drop(columns=['ID'],inplace=True)
df.head(10)

Unnamed: 0,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,Hit target,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
0,26,7,3,1,289,36,13,33,239.554,97,0,1,2,1,0,1,90,172,30,4
1,0,7,3,1,118,13,18,50,239.554,97,1,1,1,1,0,0,98,178,31,0
2,23,7,4,1,179,51,18,38,239.554,97,0,1,0,1,0,0,89,170,31,2
3,7,7,5,1,279,5,14,39,239.554,97,0,1,2,1,1,0,68,168,24,4
4,23,7,5,1,289,36,13,33,239.554,97,0,1,2,1,0,1,90,172,30,2
5,23,7,6,1,179,51,18,38,239.554,97,0,1,0,1,0,0,89,170,31,2
6,22,7,6,1,361,52,3,28,239.554,97,0,1,1,1,0,4,80,172,27,8
7,23,7,6,1,260,50,11,36,239.554,97,0,1,4,1,0,0,65,168,23,4
8,19,7,2,1,155,12,14,34,239.554,97,0,1,2,1,0,0,95,196,25,40
9,22,7,2,1,235,11,14,37,239.554,97,0,3,1,0,0,1,88,172,29,8


In [54]:
fig = px.imshow(df.corr(), color_continuous_scale='PuRd',width=500,height=500)
fig.show()

In [55]:
# A->0 B->1-16 C->17-56 D-> >=18 
def grouping(val):
    
    if val==0:
        return 'A'
    elif ((val>=1) & (val<=16)):
        return 'B'
    elif ((val>=17) & (val<=56)):
        return 'C'
    else :
        return 'D'
    
df['Ab_new']=df['Absenteeism time in hours'].apply(lambda x: grouping(x))

In [56]:
df['Ab_new'].value_counts()
df['Absenteeism time in hours']=df['Ab_new'].copy()
df.drop(columns=['Ab_new'],inplace=True)

In [57]:
from collections import Counter
y.value_counts()
counter=Counter(y)
print(counter)

Counter({'B': 652, 'A': 41, 'C': 32, 'D': 12})


In [58]:
from sklearn.feature_selection import mutual_info_classif

importances=mutual_info_classif(X,y)
feat_importances=pd.Series(importances,X.columns[0:len(X.columns)])
feat_df=feat_importances.to_frame().reset_index().sort_values(by=0,ascending=False)
feat_df.rename(columns={'index':'Features',0:'Importance'}, inplace=True)
feat_df['Importance']=round(feat_df['Importance'],2)
fig = px.bar(feat_df, x='Features', y='Importance',width=800,height=600,color_discrete_sequence=mycolor,color='Features',
             text=[f'{i*100}%' for i in feat_df['Importance']])
fig.show()

In [60]:
y=df['Absenteeism time in hours']
# X=df.drop(columns=['Pet','Son','Absenteeism time in hours'])
X.columns

Index(['Reason for absence', 'Month of absence', 'Day of the week', 'Seasons',
       'Transportation expense', 'Distance from Residence to Work',
       'Service time', 'Age', 'Work load Average/day ', 'Hit target',
       'Disciplinary failure', 'Education', 'Social drinker', 'Social smoker',
       'Weight', 'Height', 'Body mass index'],
      dtype='object')

In [61]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [62]:
from imblearn.over_sampling import SMOTE
smt=SMOTE()
print(y_train.value_counts())
X_train,y_train=smt.fit_resample(X_train,y_train)
print(y_train.value_counts())


B    459
A     26
C     21
D      9
Name: Absenteeism time in hours, dtype: int64
D    459
C    459
A    459
B    459
Name: Absenteeism time in hours, dtype: int64


In [63]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [64]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [65]:
accuracy =[]
model_names =[]

gnb = GaussianNB()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier(n_estimators= 400,min_samples_split= 5,min_samples_leaf= 1,max_features= 'auto', max_depth= 90,bootstrap= False)
knn = KNeighborsClassifier(n_neighbors=4,metric='minkowski')

models = [gnb,dtree,rf,knn]

for model in models: 
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy.append(round(accuracy_score(y_test, y_pred),4))

model_names = ['Naive Bias','DecisionTree','RandomForest','KNeighbors']
result_df1 = pd.DataFrame({'Accuracy':accuracy}, index=model_names)
result_df1.reset_index(inplace=True)
result_df1.rename(columns={'index':'Model'}, inplace=True)

fig = px.bar(result_df1, x='Model', y='Accuracy',width=500,height=300,color_discrete_sequence=mycolor,color='Model',
             text=[f'{round(i*100,2)}%' for i in result_df1['Accuracy']])
fig.show()

In [66]:
# Hyperparaeter tuning for Random Forest
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [67]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)
rf_random.best_params_ 

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': False}

In [68]:
model=RandomForestClassifier(n_estimators= 400,min_samples_split= 5,min_samples_leaf= 1,max_features= 'auto', max_depth= 90,bootstrap= False)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(round(accuracy_score(y_test, y_pred),4)*100,'%')



90.09 %
