# Project: Absenteeism Analysis
### Goal: predict how many working hours a person with certain characteristics is expected to be absent from work at a given point in time, if at all.

#### Importing pandas library

In [None]:
import pandas as pd

#### Reading the data
#### Since our data is stored in a .csv file, we'll use the following format:
#### Format: dataframe = pd.read_csv("data_file_name.csv")
##### Note: 'dataframe' is a name we chose. We can name it anything. It is used to store the data extracted from the main data file which can be later used for data related operations. 

In [None]:
data = pd.read_csv("Absenteeism_data.csv")
data

#### Let's say if we want to see the first five or first ten rows, then we can use the 'head' function.
#### Format: dataframe.head(Number of rows to display)

In [None]:
data.head(5)

#### Similarly, let's say if we want to see the last five or last ten rows, then we can use the 'tail' function.
#### Format: dataframe.tail(Number of rows to display)

In [None]:
data.tail(5)

#### When the data is very long, Jupyter will not show all of it, only a part, as seen above. This applies to both rows and columns. If you want to see all the data, you can set the pandas display options 'max_columns' and 'max_rows'.
#### Format for columns: pd.options.display.max_columns = # #
#### Format for rows: pd.options.display.max_rows = # # 
##### # is the number of rows or columns that you want to be displayed. It can be any number. In case you want to see all of them, use None.

In [None]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None
display(data)

#### When the dataset is very large, as it is here, we can check for missing values by generating a summary of the dataframe. This is done by applying the 'info' method directly to the dataframe.
#### Format: dataframe.info()

In [None]:
data.info()

#### Drop 'ID'
#### Format: dataframe.drop(['List of column names we want to get rid of'], axis = 1)

In [None]:
data = data.drop(['ID'],axis = 1)
data

#### Analysing 'Reason for Absence' column.
#### Format: dataframe['Column_name']

In [None]:
data['Reason for Absence']

In [None]:
data['Reason for Absence'].min()

In [None]:
data['Reason for Absence'].max()

In [None]:
data['Reason for Absence'].unique()

In [None]:
len(data['Reason for Absence'].unique())

In [None]:
sorted(data['Reason for Absence'].unique())

#### Creating dummies for 'Reason for Absence' column.
#### Format: reason_column = pd.get_dummies(dataframe['column_name'])

In [None]:
reason_columns = pd.get_dummies(data['Reason for Absence'], dtype = int)
reason_columns

#### To check if there are any missing values in the rows, we'll create a new column with the sum of the values in each row. If the sum of a row is 1, then the value is present. If the sum is 0, then the value is absent. Note: axis = 1

In [None]:
reason_columns['check'] =reason_columns.sum(axis = 1)
reason_columns

#### To check if there are any missing values in the check column, we'll do a vertical sum. Note: axis = 0

In [None]:
reason_columns['check'].sum(axis = 0)

In [None]:
reason_columns['check'].unique()

#### Since we have checked all the values are present, and the content is unique, we can drop the check column.

In [None]:
reason_columns = reason_columns.drop(['check'], axis = 1)

In [None]:
reason_columns

In [None]:
# Rebuild the dummy columns from scratch, overwriting the earlier reason_columns.
# drop_first = True leaves out the dummy column of the first category, so the
# result has one column fewer than the number of distinct reasons.
reason_columns = pd.get_dummies(data['Reason for Absence'], drop_first = True, dtype = int)
# Show the result as the cell output.
reason_columns

#### Group the Reasons for Absence

In [None]:
data.columns.values

In [None]:
reason_columns.columns.values

In [None]:
data = data.drop(['Reason for Absence'], axis = 1)
data

In [None]:
reason_columns.loc[:, 1:14]

In [None]:
reason_columns.loc[:,15:17]

In [None]:
reason_columns.loc[:,18:21]

In [None]:
reason_columns.loc[:,22:28]

In [None]:
# Collapse the dummy columns into four group indicators. For each label range
# the row-wise maximum is taken, so a group is 1 when any dummy in that range
# is 1 for the row and 0 otherwise. .loc slices by column label and includes
# both ends; a stop of None means "through the last column".
reason_type_1, reason_type_2, reason_type_3, reason_type_4 = (
    reason_columns.loc[:, first_label:last_label].max(axis = 1)
    for first_label, last_label in [(1, 14), (15, 17), (18, 21), (22, None)]
)

In [None]:
data

#### Concatenate column values
#### Format: dataframe = pd.concat([dataframe, table_1, table_2, ...], axis = 1)

In [None]:
data = pd.concat([data, reason_type_1, reason_type_2, reason_type_3, reason_type_4], axis = 1)
data

In [None]:
data.columns.values

In [None]:
column_names = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

In [None]:
data.columns = column_names
data

In [None]:
column_names_reordered = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']

In [None]:
data = data[column_names_reordered]
data

#### Creating Checkpoints

In [None]:
data_reason_mod = data.copy()
data_reason_mod

#### Working on date column

In [None]:
type(data_reason_mod['Date'][0])

In [None]:
# Convert the 'Date' column to pandas datetimes. The explicit format tells
# pandas to read each value as day/month/four-digit-year.
data_reason_mod['Date'] = pd.to_datetime(data_reason_mod['Date'], format = '%d/%m/%Y' )
# Show the converted column as the cell output.
data_reason_mod['Date']

#### Extracting Month Value:

In [None]:
data_reason_mod['Date'][0]

In [None]:
data_reason_mod['Date'][0].month

In [None]:
list_months = []

In [None]:
# Extract the month number of every date in one vectorised step.
# Rebinding list_months (instead of appending to it) keeps this cell safe to
# re-run: the list always has exactly one entry per row. The .dt accessor
# walks the column in row order, so it does not depend on the index labels
# being 0..n-1. It requires 'Date' to have been converted to datetimes first.
list_months = data_reason_mod['Date'].dt.month.tolist()

In [None]:
list_months

In [None]:
len(list_months)

In [None]:
data_reason_mod['Month Value'] = list_months

In [None]:
data_reason_mod.head(10)

#### Extract Day of the Week:

In [None]:
data_reason_mod['Date'][699].weekday()

In [None]:
data_reason_mod['Date'][699]

In [None]:
def date_to_weekday(date_value):
    """Return the weekday number of ``date_value``.

    The work is delegated to ``date_value.weekday()``, so any object exposing
    that method is accepted. For ``datetime.date``, ``datetime.datetime`` and
    ``pandas.Timestamp`` the result runs from 0 (Monday) to 6 (Sunday).
    """
    weekday_number = date_value.weekday()
    return weekday_number

In [None]:
data_reason_mod['Day of the week'] = data_reason_mod['Date'].apply(date_to_weekday)

In [None]:
data_reason_mod.head(10)

#### Drop Date Column

In [None]:
data_reason_mod = data_reason_mod.drop(['Date'], axis = 1)
data_reason_mod

In [None]:
data_reason_mod['Education'].unique()

In [None]:
data_reason_mod['Education'].value_counts()

In [None]:
# Recode 'Education' as a binary flag: 1 becomes 0, while 2, 3 and 4 become 1.
# Series.map yields NaN for any value that is not a key of the dictionary.
data_reason_mod['Education'] = data_reason_mod['Education'].map({1:0, 2:1, 3:1, 4:1}) 

In [None]:
data_reason_mod['Education'].unique()

In [None]:
data_reason_mod['Education'].value_counts()

In [None]:
type(data_reason_mod['Transportation Expense'][0])

In [None]:
type(data_reason_mod['Distance to Work'][0])

In [None]:
type(data_reason_mod['Age'][0])

In [None]:
type(data_reason_mod['Daily Work Load Average'][0])

In [None]:
type(data_reason_mod['Body Mass Index'][0])

#### Final Checkpoint

In [None]:
data_preprocessed = data_reason_mod.copy()
data_preprocessed

#### Exporting data to a .csv file

In [None]:
data_preprocessed.to_csv('Absenteeism_preprocessed.csv', index = False)