Importing the Libraries

In [35]:
import numpy as np 
import pandas as pd 

In [36]:
#importing the csv file
raw_data = pd.read_csv('Absenteeism_data.csv')
raw_data.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [37]:
#Checking the data
raw_data.describe()

Unnamed: 0,ID,Reason for Absence,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,17.951429,19.411429,222.347143,29.892857,36.417143,271.801774,26.737143,1.282857,1.021429,0.687143,6.761429
std,11.028144,8.356292,66.31296,14.804446,6.379083,40.021804,4.254701,0.66809,1.112215,1.166095,12.670082
min,1.0,0.0,118.0,5.0,27.0,205.917,19.0,1.0,0.0,0.0,0.0
25%,9.0,13.0,179.0,16.0,31.0,241.476,24.0,1.0,0.0,0.0,2.0
50%,18.0,23.0,225.0,26.0,37.0,264.249,25.0,1.0,1.0,0.0,3.0
75%,28.0,27.0,260.0,50.0,40.0,294.217,31.0,1.0,2.0,1.0,8.0
max,36.0,28.0,388.0,52.0,58.0,378.884,38.0,4.0,4.0,8.0,120.0


In [38]:
#Checking if there are missing values
raw_data.isna().sum()

ID                           0
Reason for Absence           0
Date                         0
Transportation Expense       0
Distance to Work             0
Age                          0
Daily Work Load Average      0
Body Mass Index              0
Education                    0
Children                     0
Pets                         0
Absenteeism Time in Hours    0
dtype: int64

In [39]:
#Checking the data types of the data 
raw_data.dtypes

ID                             int64
Reason for Absence             int64
Date                          object
Transportation Expense         int64
Distance to Work               int64
Age                            int64
Daily Work Load Average      float64
Body Mass Index                int64
Education                      int64
Children                       int64
Pets                           int64
Absenteeism Time in Hours      int64
dtype: object

Cleaning the Data


In [40]:
#copying the raw data 
df = raw_data.copy()
df.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [41]:
#Converting the Date to numpy date object 

df['Date'] = pd.to_datetime(df['Date'],format = "%d/%m/%Y")
df.dtypes

ID                                    int64
Reason for Absence                    int64
Date                         datetime64[ns]
Transportation Expense                int64
Distance to Work                      int64
Age                                   int64
Daily Work Load Average             float64
Body Mass Index                       int64
Education                             int64
Children                              int64
Pets                                  int64
Absenteeism Time in Hours             int64
dtype: object

In [42]:
#Dropping the ID column 

df = df.drop('ID',axis=1)
df.head()

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,26,2015-07-07,289,36,33,239.554,30,1,2,1,4
1,0,2015-07-14,118,13,50,239.554,31,1,1,0,0
2,23,2015-07-15,179,51,38,239.554,31,1,0,0,2
3,7,2015-07-16,279,5,39,239.554,24,1,2,0,4
4,23,2015-07-23,289,36,33,239.554,30,1,2,1,2


In [43]:
#Generating Dummies for the Reason for absence 
reason_dummy = pd.get_dummies(df['Reason for Absence'])
reason_dummy


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,21,22,23,24,25,26,27,28
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
696,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
697,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
698,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [44]:
#Grouping the dummies in the categories Reason1, Reason2, Reason3, Reason4
reason_1 = reason_dummy.iloc[:,1:14].max(axis=1)
reason_2 = reason_dummy.iloc[:,15:17].max(axis=1)
reason_3 = reason_dummy.iloc[:,18:21].max(axis=1)
reason_4 = reason_dummy.iloc[:,22:-1].max(axis=1)


In [45]:
#Concanating the df and all reason

df = pd.concat([df,reason_1,reason_2,reason_3,reason_4],axis=1)
df.head()

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,26,2015-07-07,289,36,33,239.554,30,1,2,1,4,False,False,False,True
1,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,False,False,False,False
2,23,2015-07-15,179,51,38,239.554,31,1,0,0,2,False,False,False,True
3,7,2015-07-16,279,5,39,239.554,24,1,2,0,4,True,False,False,False
4,23,2015-07-23,289,36,33,239.554,30,1,2,1,2,False,False,False,True


In [46]:
#dropping the reason for absence column
df = df.drop('Reason for Absence',axis=1)
df.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,2015-07-07,289,36,33,239.554,30,1,2,1,4,False,False,False,True
1,2015-07-14,118,13,50,239.554,31,1,1,0,0,False,False,False,False
2,2015-07-15,179,51,38,239.554,31,1,0,0,2,False,False,False,True
3,2015-07-16,279,5,39,239.554,24,1,2,0,4,True,False,False,False
4,2015-07-23,289,36,33,239.554,30,1,2,1,2,False,False,False,True


In [47]:
#Getting the columns names for the reason 
df.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
      dtype=object)

In [48]:
#Changing the column names for the reason 
df.columns= ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason 1', 'Reason 2', 'Reason 3', 'Reason 4']
df.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason 1,Reason 2,Reason 3,Reason 4
0,2015-07-07,289,36,33,239.554,30,1,2,1,4,False,False,False,True
1,2015-07-14,118,13,50,239.554,31,1,1,0,0,False,False,False,False
2,2015-07-15,179,51,38,239.554,31,1,0,0,2,False,False,False,True
3,2015-07-16,279,5,39,239.554,24,1,2,0,4,True,False,False,False
4,2015-07-23,289,36,33,239.554,30,1,2,1,2,False,False,False,True


In [50]:
#Bring the Reason Column to the front 
df=df[['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4','Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']]
df.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,2015-07-07,289,36,33,239.554,30,1,2,1,4
1,False,False,False,False,2015-07-14,118,13,50,239.554,31,1,1,0,0
2,False,False,False,True,2015-07-15,179,51,38,239.554,31,1,0,0,2
3,True,False,False,False,2015-07-16,279,5,39,239.554,24,1,2,0,4
4,False,False,False,True,2015-07-23,289,36,33,239.554,30,1,2,1,2


In [53]:
#Getting the month values from the date column 
month_values =[]

for i in range(df.shape[0]):
    month_values.append(df['Date'][i].month)

#month_values


In [54]:
#Getting the Day of the week 

def get_day(date):
    return date.weekday()

weekday = df['Date'].apply(get_day)
weekday

0      1
1      1
2      2
3      3
4      3
      ..
695    2
696    2
697    3
698    3
699    3
Name: Date, Length: 700, dtype: int64

In [55]:
df['month'] = month_values
df['weekday'] = weekday
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month'] = month_values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['weekday'] = weekday


Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,month,weekday
0,False,False,False,True,2015-07-07,289,36,33,239.554,30,1,2,1,4,7,1
1,False,False,False,False,2015-07-14,118,13,50,239.554,31,1,1,0,0,7,1
2,False,False,False,True,2015-07-15,179,51,38,239.554,31,1,0,0,2,7,2
3,True,False,False,False,2015-07-16,279,5,39,239.554,24,1,2,0,4,7,3
4,False,False,False,True,2015-07-23,289,36,33,239.554,30,1,2,1,2,7,3


In [56]:
#Changing the date column
df=df[['Reason 1', 'Reason 2', 'Reason 3', 'Reason 4','month','weekday','Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']]
df.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,month,weekday,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,7,1,2015-07-07,289,36,33,239.554,30,1,2,1,4
1,False,False,False,False,7,1,2015-07-14,118,13,50,239.554,31,1,1,0,0
2,False,False,False,True,7,2,2015-07-15,179,51,38,239.554,31,1,0,0,2
3,True,False,False,False,7,3,2015-07-16,279,5,39,239.554,24,1,2,0,4
4,False,False,False,True,7,3,2015-07-23,289,36,33,239.554,30,1,2,1,2


In [57]:
#dropping date column
df = df.drop('Date',axis = 1)
df.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,month,weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,7,1,289,36,33,239.554,30,1,2,1,4
1,False,False,False,False,7,1,118,13,50,239.554,31,1,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,1,0,0,2
3,True,False,False,False,7,3,279,5,39,239.554,24,1,2,0,4
4,False,False,False,True,7,3,289,36,33,239.554,30,1,2,1,2


In [60]:
#Cleaning the Education
df['Education'].unique()
df['Education'] = df['Education'].map({1:0,2:1,3:1,4:1})
df.head()

Unnamed: 0,Reason 1,Reason 2,Reason 3,Reason 4,month,weekday,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,False,False,False,True,7,1,289,36,33,239.554,30,0,2,1,4
1,False,False,False,False,7,1,118,13,50,239.554,31,0,1,0,0
2,False,False,False,True,7,2,179,51,38,239.554,31,0,0,0,2
3,True,False,False,False,7,3,279,5,39,239.554,24,0,2,0,4
4,False,False,False,True,7,3,289,36,33,239.554,30,0,2,1,2


In [None]:
#Saving the cleaned file
df.to_csv('absenteesim_cleaned.csv',index=False)

NameError: name 'df' is not defined