# Import Python libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import calendar
plt.style.use('fivethirtyeight')

In [None]:
# Load the absenteeism records from the Kaggle input directory into a DataFrame.
raw_data = pd.read_csv('/kaggle/input/absentismo/Absentismo.csv')
# Show the loaded frame as this cell's output.
raw_data

### Make a copy of the original data frame

In [None]:
df = raw_data.copy()
df

In [None]:
df.shape

In [None]:
df.info()

### Finding the missing value

In [None]:
df.isnull().sum()

## Verifying the dataset has valid values

### changing the date object to datetime

In [None]:
df['Date'].head()

In [None]:
type(df['Date'])

In [None]:
type(df['Date'][0])

Since the dates were read in as strings, let's convert them to timestamps using `pd.to_datetime()`.

In [None]:
# Parse the 'Date' strings as day/month/four-digit-year. With an explicit
# format, values that do not match raise an error instead of being guessed.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df['Date']

In [None]:
df['Date'][1]

In [None]:
df['Date'][0].month

In [None]:
# Month number (1-12) of every row, taken with the vectorized .dt accessor
# instead of a Python-level loop. Unlike label lookups such as df['Date'][i],
# this does not depend on the index being exactly 0..n-1.
# The result is kept as a plain list because it is assigned as a column later.
list_months = df['Date'].dt.month.tolist()

In [None]:
df['Month Value'] = list_months
df.head()

Similarly, we can extract the day of the week in the same way.

In [None]:
# Weekday of every row as an integer (Monday=0 ... Sunday=6), computed with the
# vectorized .dt accessor rather than row-by-row label lookups, which only work
# when the index is exactly 0..n-1.
list_days = df['Date'].dt.weekday.tolist()

In [None]:
df['Day of the Week'] = list_days
df.head()

## Dropping the Date column and rearranging the Month value and Day of the Week

In [None]:
df = df.drop(['Date'], axis = 1)

In [None]:
df.columns.values

In [None]:
# Desired column order: the identifier first, then the two date-derived
# columns, then the remaining features, with the absenteeism hours last.
df_reordered_monthandday = [
    'ID',
    'Month Value',
    'Day of the Week',
    'Reason for Absence',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
    'Absenteeism Time in Hours',
]

In [None]:
df = df[df_reordered_monthandday]
df.head()

### checking the outliers

In [None]:
# Sanity check: list any rows whose month value is 0. 'Month Value' was taken
# from the .month of parsed dates, so this selection is expected to be empty.
df[df['Month Value'] == 0]

# Exploratory Data Analysis

It is better to add additional columns because the dataset contains numeric values only.

### mapping Reason for Absence

In [None]:
reason_mapping = {
    0: 'Unknown',
    1: 'Certain infectious and parasitic diseases',
    2: 'Neoplasms',
    3: 'Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism',
    4: 'Endocrine, nutritional and metabolic diseases',
    5: 'Mental and behavioural disorders',
    6: 'Diseases of the nervous system',
    7: 'Diseases of the eye and adnexa',
    8: 'Diseases of the ear and mastoid process',
    9: 'Diseases of the circulatory system',
    10: 'Diseases of the respiratory system',
    11: 'Diseases of the digestive system',
    12: 'Diseases of the skin and subcutaneous tissue',
    13: 'Diseases of the musculoskeletal system and connective tissue',
    14: 'Diseases of the genitourinary system',
    15: 'Pregnancy, childbirth and the puerperium',
    16: 'Certain conditions originating in the perinatal period',
    17: 'Congenital malformations, deformations and chromosomal abnormalities',
    18: 'Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified',
    19: 'Injury, poisoning and certain other consequences of external causes',
    20: 'External causes of morbidity and mortality',
    21: 'Factors influencing health status and contact with health services',
    22: 'Patient follow-up',
    23: 'Medical consultation',
    24: 'Blood donation',
    25: 'Laboratory examination',
    26: 'Unjustified absence',
    27: 'Physiotherapy',
    28: 'Dental consultation'
}
# Attach a human-readable label for each numeric reason code; codes that are
# not keys of reason_mapping become NaN.
df['reason_text'] = df['Reason for Absence'].map(reason_mapping)

In [None]:
df.head()

## Absenteeism by Reason

In [None]:
reason_abs = df.groupby('reason_text')[['Absenteeism Time in Hours']].sum()
reason_abs

In [None]:
# Show the reason with the largest total together with its hours. A bare
# .max() returned only the number and hid which reason it belongs to.
reason_abs.nlargest(1, 'Absenteeism Time in Hours')

#### Employees suffering from diseases of the musculoskeletal system and connective tissue have the highest absenteeism hours

## Sum of Absenteeism Hours by Month

### In which month were the absenteeism hours the highest? To plot this in a graph, the month name corresponding to each month value must be inserted as a column in the dataframe.

In [None]:
df['Month Name'] =  df['Month Value'].apply(lambda x: calendar.month_abbr[x])

In [None]:
df.head()

In [None]:
# Total absenteeism hours per month. groupby sorts its keys, so grouping on the
# month abbreviation orders the bars alphabetically (Apr, Aug, Dec, ...).
# Reindex to calendar order, keeping only the months that occur in the data.
month_abs = df.groupby('Month Name')[['Absenteeism Time in Hours']].sum()
# calendar.month_abbr has an empty string at position 0, hence the [1:].
calendar_order = [m for m in list(calendar.month_abbr)[1:] if m in month_abs.index]
month_abs = month_abs.reindex(calendar_order)
axe = month_abs.plot(kind="bar", figsize=(8,6), legend=False)
axe.set_xlabel('Months')
axe.set_ylabel('Sum of Absenteeism Hours')
axe.set_title('Sum of Absenteeism Hours by Month')
plt.show()

#### From the bar graph above, we can see that the total absenteeism hours were highest in the month of March.

## Sum of Absenteeism hours by Days

In [None]:
df['Day Name'] =  df['Day of the Week'].apply(lambda x: calendar.day_abbr[x])

In [None]:
df.head()

In [None]:
# Total absenteeism hours per weekday. groupby sorts its keys, so grouping on
# the day abbreviation orders the bars alphabetically (Fri, Mon, Thu, ...).
# Reindex to Monday-first order, keeping only the days that occur in the data.
days_abs = df.groupby('Day Name')[['Absenteeism Time in Hours']].sum()
weekday_order = [d for d in list(calendar.day_abbr) if d in days_abs.index]
days_abs = days_abs.reindex(weekday_order)
axe = days_abs.plot(kind="bar", figsize=(8,6), legend=False)
axe.set_xlabel('Days')
axe.set_ylabel('Sum of Absenteeism hours')
axe.set_title('Sum of Absenteeism hours by Days')
plt.show()

### From the bar graph above, it is clear that the most absenteeism hours fall on Mondays

## Average Transportation Expense by Distance to Work

In [None]:
# Mean transportation expense for each distinct distance to work, as a bar chart.
transportation_distance = df.groupby('Distance to Work')[['Transportation Expense']].mean()
axe = transportation_distance.plot(kind='bar', figsize=(8,6), legend=False)
# Fixed the misspelled axis label ('Distance to Wrok').
axe.set_xlabel('Distance to Work')
axe.set_ylabel('Transportation Expense')
axe.set_title('Average Transportation Expense by Distance to Work')
plt.show()

#### Generally, when the distance increases the transportation expense should increase too, but the graph above shows that the data fluctuates, and we don't know how the employees travelled or which mode of transportation they used. So this data is not very helpful to us.

## Agewise Count of Employees

In [None]:
# Number of distinct employee IDs in each age group, drawn as a bar chart.
age_employee_count = df.groupby('Age')[['ID']].nunique()
axe = age_employee_count.plot.bar(figsize=(8,4), legend=False)
axe.set(
    xlabel='Age',
    ylabel='Number of Employees',
    title='Agewise Count of Employees',
)
plt.show()

#### Based on the bar chart there is a variation in the count of employees across different age groups, indicating a diverse workforce in terms of age

## Average absenteeism hours by Age

In [None]:
# Mean absenteeism hours for each age, drawn as a bar chart.
age_abs = df.groupby('Age').agg({'Absenteeism Time in Hours': 'mean'})
axe = age_abs.plot.bar(figsize=(8,6), legend=False)
axe.set(
    xlabel='Age',
    ylabel='Average Absenteeism Hours',
    title='Average Absenteeism Hours by Age',
)
plt.show()

## Daily workload average by Age

In [None]:
# Mean daily work load for each age, drawn as a bar chart.
workload_by_age = df.groupby('Age')
daily_workload_avg = workload_by_age[['Daily Work Load Average']].mean()
axe = daily_workload_avg.plot.bar(figsize=(8,6), legend=False)
axe.set(
    xlabel='Age',
    ylabel='Daily Work Load Average',
    title='Daily Work Load Average by Age',
)
plt.show()

#### The workload seems to be the same irrespective of the age

## Average Absenteeism Hours by Distance to Work

In [None]:
# Mean absenteeism hours for each distinct distance to work, as a bar chart.
dist_abs = df.groupby('Distance to Work').agg({'Absenteeism Time in Hours': 'mean'})
axe = dist_abs.plot.bar(figsize=(8,6), legend=False)
axe.set(
    xlabel='Distance to Work, KM',
    ylabel='Average Absenteeism Hours',
    title='Average Absenteeism Hours by Distance to Work',
)
plt.show()

#### There isn't a clear linear correlation between absenteeism and distance. Other factors, such as transportation modes, commute times, or the presence of remote work policies, considered alongside distance might provide a more comprehensive understanding of absenteeism dynamics.

## Educationwise count of Employees

In [None]:
# Number of distinct employee IDs at each education level, as a bar chart.
edu_employee_count = df.groupby('Education')[['ID']].nunique()
axe = edu_employee_count.plot.bar(figsize=(8,6), legend=False)
axe.set(
    xlabel='Education',
    ylabel='Number of Employees',
    title='Educationwise Employee Count',
)
plt.show()

#### Education is categorized as
1: 'High School',
2: 'Graduate',
3: 'Post Graduate',
4: 'Master & Doctor'
    
Based on the bar graph, employees with a high school education form the largest group.

## Children and Pets by Age

In [None]:
# Employee IDs repeat across rows (the employee counts above rely on nunique of
# 'ID'), so summing 'Children'/'Pets' over every row would count an employee's
# children and pets once per absence record. Keep one row per employee and age
# before summing so each employee contributes exactly once.
employees = df.drop_duplicates(subset=['ID', 'Age'])
children_pets_age = employees.groupby('Age')[['Children', 'Pets']].sum()
axe = children_pets_age.plot(figsize=(8,6))
axe.set_ylabel('Count')
axe.set_title('Count of Children and Pets by Age')
plt.show()

#### It is interesting to see that employees who have children mostly have pets as well.

# Feature Engineering

## One Hot Encoding

One-hot encoding is a technique used in data processing and feature engineering, primarily in machine learning and data analysis tasks. It's a method to represent categorical variables as binary vectors, which helps in handling categorical data in a numerical format. In simpler terms, one-hot encoding converts categorical variables into a format that can be provided to machine learning algorithms to improve model performance. It creates binary columns for each category in the categorical variable, where each column corresponds to a unique category and is represented as 0 or 1. The column associated with the category for a particular observation is set to 1 (indicating presence), and all other columns for other categories are set to 0 (indicating absence).

### Examining the Reason for Absence column

In [None]:
reason_columns = pd.get_dummies(df['Reason for Absence'], dtype=int)
reason_columns.head()

We will drop the dummy variable for reason 0. We do this to avoid potential multicollinearity issues in our analysis. Two variables are considered perfectly collinear if their correlation coefficient is +/- 1.0. Multicollinearity among independent variables will result in less reliable statistical inferences.

In [None]:
reason_columns = pd.get_dummies(df['Reason for Absence'], drop_first=True, dtype=int)
reason_columns.head()

If we added all of these dummy variables to the data frame, we would end up adding close to 30 columns. So in a situation where we have 700 columns and similar dummy variables, we should always consider the possibility of grouping these variables. Grouping is also a form of classification. Here we will group the reasons on the basis of similar characteristics: reasons 1-14 are related to various diseases, reasons 15-17 are related to pregnancy, reasons 18-21 are all about poisoning or signs not elsewhere categorized, and reasons 22-28 cover consultations, examinations and other non-disease absences.

In [None]:
# Collapse the individual reason dummies into four group indicators. Columns are
# selected by label (the reason codes), and .loc label slices include both ends.
# The row-wise max is 1 when any dummy in the group is 1, otherwise 0.
reason_type_1 = reason_columns.loc[:, 1:14].max(axis=1)
reason_type_2 = reason_columns.loc[:, 15:17].max(axis=1)
reason_type_3 = reason_columns.loc[:, 18:21].max(axis=1)
reason_type_4 = reason_columns.loc[:, 22:].max(axis=1)  # every column labelled 22 or higher

#### Concatenate the reasons to the data frame

In [None]:
# Name each group indicator before concatenating. Unnamed Series would be added
# under the integer labels 0-3, which say nothing about their content and are
# easy to mix up when the columns are renamed by position afterwards.
df = pd.concat(
    [
        df,
        reason_type_1.rename('Reason_1'),
        reason_type_2.rename('Reason_2'),
        reason_type_3.rename('Reason_3'),
        reason_type_4.rename('Reason_4'),
    ],
    axis=1,
)
df.head()

#### Renaming the column names  and rearranging the column names

In [None]:
df.columns.values

In [None]:
column_names  = ['ID', 'Month Value', 'Day of the Week', 'Reason for Absence',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'reason_text',
       'Month Name', 'Day Name', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']


In [None]:
df.columns = column_names
df

In [None]:
column_name_reordered = ['ID', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value', 'Day of the Week',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason for Absence', 'reason_text',
       'Month Name', 'Day Name']

In [None]:
df = df[column_name_reordered]

In [None]:
df.head()

### Examining the Education Column

In [None]:
df['Education'].value_counts()

Education column contains only the values 1 3 2 and 4.
1: high school
2: graduate
3: postgraduate
4: master or a doctor

The `value_counts()` method shows that nearly 600 people have only a high school education, while just above one hundred have a higher degree than that. Separating graduate, postgraduate and master/doctor degrees therefore becomes less relevant for this study, and it makes sense to combine them into a single category. Technically, this is done by overwriting the content of the Education column with the result of `.map()`, passing it a dictionary whose keys are the existing codes and whose values are the new codes.

In [None]:
# Collapse education to a binary flag: code 1 -> 0, and codes 2, 3, 4 -> 1.
# NOTE: this overwrites the column in place, so the cell is not re-runnable:
# on a second run 1 maps to 0 and 0 (not a key of the dict) maps to NaN.
df['Education'] = df['Education'].map({1:0, 2:1, 3:1, 4:1})
df['Education'].unique()

In [None]:
df['Education'].value_counts()

In [None]:
df.head()

## Removing unrelated features

In [None]:
df = df.drop(['ID', 'Reason for Absence', 'reason_text', 'Month Name', 'Day Name'], axis=1)
df.head()

## Final Checkpoint

In [None]:
# Checkpoint: take an independent copy so that later edits to df_preprocessed
# (for example adding the target column) do not silently modify df as well.
# A plain assignment would only create a second name for the same object.
df_preprocessed = df.copy()
df_preprocessed.head()

In [None]:
df_preprocessed.shape

## Creating and classifying the target
Here the target feature is Absenteeism time in hours.

In [None]:
df_preprocessed['Absenteeism Time in Hours'].median()

In [None]:
# Binary target: 1 when a row's absenteeism hours are strictly above the
# median of the column, 0 otherwise.
absence_hours = df_preprocessed['Absenteeism Time in Hours']
median_hours = absence_hours.median()
targets = np.where(absence_hours > median_hours, 1, 0)
targets

Using the median as the cutoff line is numerically stable and rigid. By using the median we have implicitly balanced the dataset: roughly half of the targets are zeros while the other half are ones. This prevents our model from learning to output only one of the two classes while still appearing to do very well. To verify this, let's divide the number of targets that are ones by the total number of targets. The number of ones can be found by summing up all values of `targets`, while the total number of targets is simply its shape along axis 0.

In [None]:
targets.sum()/targets.shape[0]

The result is around 0.46, so around 46 percent of the targets are ones and around 54 percent are zeros. That is balanced enough for this exercise, so let's proceed, noting that our two groups are distributed roughly equally. Finally, let's drop the 'Absenteeism Time in Hours' column from the data frame, since we won't be needing it.

In [None]:
## Adding the targets to the data frame
df_preprocessed['Excessive Absenteeism'] = targets

In [None]:
df_preprocessed.shape

In [None]:
df_preprocessed = df_preprocessed.drop(['Absenteeism Time in Hours'], axis = 1)

In [None]:
df_preprocessed.head()

In [None]:
df_preprocessed.shape

## Slicing of Input Variables

In [None]:
unscaled_inputs = df_preprocessed.iloc[:, :-1]
unscaled_inputs.head()

# Feature Scaling or Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
absenteeism_scaler = StandardScaler()

In [None]:
absenteeism_scaler.fit(unscaled_inputs)

In [None]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [None]:
scaled_inputs

In [None]:
scaled_inputs.shape

## Splitting the data into training and test set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. random_state=20 fixes the shuffle so
# the same split is produced on every run.
# Split the *unscaled* inputs and fit the scaler on the training rows only:
# fitting it on all rows lets the test set's mean/variance leak into the
# features the model is trained on.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    unscaled_inputs, targets, test_size=0.2, random_state=20
)
absenteeism_scaler.fit(X_train_unscaled)
X_train = absenteeism_scaler.transform(X_train_unscaled)
X_test = absenteeism_scaler.transform(X_test_unscaled)

In [None]:
print(X_train.shape, y_train.shape)

In [None]:
print(X_test.shape, y_test.shape)

# Creating ML Models

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import  metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

### Model Training

In [None]:
#we must declare a new variable which will be a logistic regression object
reg = LogisticRegression()

#We must fit the regression.
reg.fit(X_train, y_train)

### Logistic Regression Training Model Score

In [None]:
reg.score(X_train, y_train)

## Testing the Model

In [None]:
### Use the trained model to predict the target labels for the test data

In [None]:
reg_predict = reg.predict(X_test)

In [None]:
reg.score(X_test, y_test)

In [None]:
model_accuracy = accuracy_score(y_test, reg_predict)
print('Accuracy Score is: ', model_accuracy)

In [None]:
model_precision = precision_score(y_test, reg_predict)
print('Precision Score is: ', model_precision)

In [None]:
model_recall = recall_score(y_test, reg_predict)
print('Recall Score is: ', model_recall)

In [None]:
model_f1 = f1_score(y_test, reg_predict)
print('F1 Score is: ', model_f1)

In [None]:
print(classification_report(y_test, reg_predict))

## Random Forest with scikit-learn

In [None]:
from sklearn.ensemble import RandomForestClassifier

## Model Training

In [None]:
# Seed the forest so that its bootstrap samples and feature sub-sampling, and
# therefore every score computed from it, are reproducible across runs.
# 20 is the same seed value the train/test split uses.
rforest = RandomForestClassifier(random_state=20)
rforest.fit(X_train, y_train)

## Random Forest Training Model Score

In [None]:
rforest.score(X_train, y_train)

## Testing the model

In [None]:
rforest_predict = rforest.predict(X_test)

In [None]:
rforest_model = accuracy_score(y_test, rforest_predict)
print('RF model accuracy score is: ', rforest_model)

In [None]:
rforest_precision = precision_score(y_test, rforest_predict)
print('RF model precision score is: ', rforest_precision)

In [None]:
rforest_recall = recall_score(y_test, rforest_predict)
print('RF model recall score is: ', rforest_recall)

In [None]:
rforest_f1 = f1_score(y_test, rforest_predict)
print('RF model f1 score is: ', rforest_f1)

In [None]:
print(classification_report(y_test, rforest_predict))

Test accuracy is usually lower than training accuracy. If the test accuracy is lower by more than roughly 10%-20%, the model is probably overfitting. In our case there is only a small percentage difference.