In [4]:
import pandas as pd
import numpy as np

# Load the raw absenteeism records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Absenteeism-data.csv')

In [5]:
# Remove the 'ID' column; rebinding `df` means this cell raises KeyError
# if it is executed a second time without reloading the data.
df = df.drop(columns='ID')
df

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...
695,10,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,6,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,10,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,23,24/05/2018,235,16,32,237.656,25,3,0,0,2


In [6]:
# Build one indicator column per reason code that occurs in the data;
# the prefix yields column names of the form 'Reason_<code>'.
reason_dummies = pd.get_dummies(df['Reason for Absence'], prefix='Reason')

# Align to exactly Reason_1..Reason_28, in that order: codes absent from the
# data get an all-zero column, and any column outside this list (such as
# Reason_0) is dropped by the reindex.
all_reason_columns = ['Reason_' + str(code) for code in range(1, 29)]
reason_dummies = reason_dummies.reindex(columns=all_reason_columns, fill_value=0)

# Collapse the 28 indicators into four group flags. Each row carries a single
# reason code, so the row-wise sum is 1 when that code falls in the group's
# range and 0 otherwise.
group1 = reason_dummies[all_reason_columns[:14]].sum(axis=1)         # Reason_1..Reason_14
group2 = reason_dummies.loc[:, 'Reason_15':'Reason_17'].sum(axis=1)  # Reason_15..Reason_17
group3 = reason_dummies.loc[:, 'Reason_18':'Reason_21'].sum(axis=1)  # Reason_18..Reason_21
group4 = reason_dummies[all_reason_columns[21:]].sum(axis=1)         # Reason_22..Reason_28

# Append the group flags to the working DataFrame, in order.
for column_name, flags in (
    ('Reason_Group1', group1),
    ('Reason_Group2', group2),
    ('Reason_Group3', group3),
    ('Reason_Group4', group4),
):
    df[column_name] = flags

# The raw reason code is now represented by the group flags, so drop it.
df = df.drop(columns='Reason for Absence')
df

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_Group1,Reason_Group2,Reason_Group3,Reason_Group4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,23/05/2018,179,22,40,237.656,22,2,2,0,8,1,0,0,0
696,23/05/2018,225,26,28,237.656,24,1,1,2,3,1,0,0,0
697,24/05/2018,330,16,28,237.656,25,2,0,0,8,1,0,0,0
698,24/05/2018,235,16,32,237.656,25,3,0,0,2,0,0,0,1


In [7]:
# Parse the day/month/year strings (e.g. '07/07/2015') into datetime values.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Derive calendar features from the parsed dates:
#   Month       -> 1..12
#   Day_of_Week -> 0 (Monday) .. 6 (Sunday)
date_parts = df['Date'].dt
df['Month'] = date_parts.month
df['Day_of_Week'] = date_parts.weekday

# The full date is no longer needed once the features are extracted.
df = df.drop(columns=['Date'])
df

Unnamed: 0,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_Group1,Reason_Group2,Reason_Group3,Reason_Group4,Month,Day_of_Week
0,289,36,33,239.554,30,1,2,1,4,0,0,0,1,7,1
1,118,13,50,239.554,31,1,1,0,0,0,0,0,0,7,1
2,179,51,38,239.554,31,1,0,0,2,0,0,0,1,7,2
3,279,5,39,239.554,24,1,2,0,4,1,0,0,0,7,3
4,289,36,33,239.554,30,1,2,1,2,0,0,0,1,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,179,22,40,237.656,22,2,2,0,8,1,0,0,0,5,2
696,225,26,28,237.656,24,1,1,2,3,1,0,0,0,5,2
697,330,16,28,237.656,25,2,0,0,8,1,0,0,0,5,3
698,235,16,32,237.656,25,3,0,0,2,0,0,0,1,5,3


In [8]:
# Recode 'Education' as a binary flag: 0 -> 1, any other value -> 0.
# A direct comparison is used instead of Series.map with a dict because
# map() turns every value missing from the dict (anything outside 0-4)
# into NaN and silently makes the column float, which contradicts the
# "any other value -> 0" rule. With the comparison, unexpected codes and
# missing values all become 0 and the column stays integer.
# NOTE(review): the rows displayed earlier in this notebook only show
# Education codes 1-3, so this flag appears to be 0 for every row —
# confirm that 0 (and not, e.g., 1) is the intended reference category.
df['Education'] = (df['Education'] == 0).astype(int)

In [9]:
# Print the first rows as a quick sanity check of the cleaned frame.
preview = df.head()
print(preview)

# Optional: persist the cleaned DataFrame (without the index column).
# NOTE(review): the original comment says the purpose is to compare against
# 'df-cleaned.csv', yet this writes to that same filename — confirm it does
# not overwrite a reference file.
df.to_csv(path_or_buf='df-cleaned.csv', index=False)

   Transportation Expense  Distance to Work  Age  Daily Work Load Average  \
0                     289                36   33                  239.554   
1                     118                13   50                  239.554   
2                     179                51   38                  239.554   
3                     279                 5   39                  239.554   
4                     289                36   33                  239.554   

   Body Mass Index  Education  Children  Pets  Absenteeism Time in Hours  \
0               30          0         2     1                          4   
1               31          0         1     0                          0   
2               31          0         0     0                          2   
3               24          0         2     0                          4   
4               30          0         2     1                          2   

   Reason_Group1  Reason_Group2 Reason_Group3  Reason_Group4  Month  \
0        