In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Source data set and destination of the preprocessed report.
IN_PATH = Path("Absenteeism-data.csv")
OUT_PATH = Path("df-final-report.csv")

df = pd.read_csv(IN_PATH)

# Remove every column whose name is "id" once surrounding whitespace and
# letter case are ignored.
id_cols = [name for name in df.columns if name.strip().lower() == "id"]
df = df.drop(columns=id_cols) if id_cols else df

# The reason column is the first one (in column order) whose name contains
# "reason", case-insensitively.
reason_col = next((name for name in df.columns if "reason" in name.lower()), None)
if reason_col is None:
    raise ValueError("No se encontró la columna 'Reason for Absence'.")

# One indicator column per reason code 1..28 (Reason_1 .. Reason_28).
# Values that are not among the declared categories become missing in the
# categorical, and with dummy_na=False those rows get no indicator set.
categories = [*range(1, 29)]
reason_cat = pd.Categorical(df[reason_col], categories=categories, ordered=False)
reason_dummies = pd.get_dummies(
    reason_cat,
    prefix="Reason",
    prefix_sep="_",
    dummy_na=False,
)

def cols_for_range(start, end):
    """Return the indicator column names for reason codes start..end.

    Both bounds are inclusive. The names follow the ``Reason_<code>``
    pattern; an empty list is returned when ``start`` is greater than
    ``end``.
    """
    names = []
    for code in range(start, end + 1):
        names.append(f"Reason_{code}")
    return names

# Collapse the per-code reason indicators into four grouped flags. Each flag
# is 1 when any reason code inside its inclusive range is set for the row.
reason_groups = {
    "Reason_1": (1, 14),
    "Reason_2": (15, 17),
    "Reason_3": (18, 21),
    "Reason_4": (22, 28),
}
group_flags = {
    group_name: reason_dummies[cols_for_range(first, last)].max(axis=1).astype(int)
    for group_name, (first, last) in reason_groups.items()
}
df = df.assign(**group_flags)

# The grouped flags replace the raw reason column.
df = df.drop(columns=[reason_col])

# The date column is the first one (in column order) whose name contains
# "date", case-insensitively.
date_col = next((name for name in df.columns if "date" in name.lower()), None)
if date_col is None:
    raise ValueError("No se encontró la columna 'Date'.")

# Day-first parsing; entries that cannot be parsed are coerced to NaT, which
# is why the derived columns use the nullable Int64 dtype.
parsed_dates = pd.to_datetime(df[date_col], errors="coerce", dayfirst=True)
df["Month"] = parsed_dates.dt.month.astype("Int64")
df["Day of the Week"] = parsed_dates.dt.weekday.astype("Int64")  # Monday = 0
df = df.drop(columns=[date_col])

# The education column is the first one whose name contains "education",
# case-insensitively.
edu_col = next((name for name in df.columns if "education" in name.lower()), None)
if edu_col is None:
    raise ValueError("No se encontró la columna 'Education'.")

# Distinct non-missing education codes as plain Python ints.
edu_codes = pd.Series(df[edu_col].unique()).dropna()
edu_unique = edu_codes.astype(float).astype(int).tolist()

def map_education(val):
    """Recode a single education value as a binary flag.

    Parameters
    ----------
    val : scalar
        One entry of the education column: anything ``int()`` accepts, or a
        missing value.

    Returns
    -------
    int or float
        ``0`` when ``int(val) == 1``, ``1`` for every other code, and
        ``np.nan`` when ``val`` is missing.

    Notes
    -----
    The mapping sends 1 -> 0 and 0 -> 1, so applying it to a column that has
    already been recoded inverts that column.
    """
    if pd.isna(val):
        return np.nan
    # No column-wide "is it already binary?" check is needed: for values
    # restricted to {0, 1}, inverting the bit and the rule "1 -> 0, anything
    # else -> 1" give the same result, so a single per-value rule suffices
    # and the function does not depend on any module-level state.
    return 0 if int(val) == 1 else 1

# Recode the education column value by value; the nullable Int64 dtype keeps
# missing entries as <NA> instead of forcing the column to float.
education_flags = df[edu_col].map(map_education)
df[edu_col] = education_flags.astype("Int64")

# Persist the preprocessed frame without the index column.
df.to_csv(OUT_PATH, index=False)

print("Completado", f"Archivo generado: {OUT_PATH}", sep="\n")

Completado
Archivo generado: df-final-report.csv


In [3]:
import pandas as pd

# Reload the exported report from disk and preview its first rows.
report_path = "df-final-report.csv"
df = pd.read_csv(report_path)

preview = df.head()
print(preview)

   Transportation Expense  Distance to Work  Age  Daily Work Load Average  \
0                     289                36   33                  239.554   
1                     118                13   50                  239.554   
2                     179                51   38                  239.554   
3                     279                 5   39                  239.554   
4                     289                36   33                  239.554   

   Body Mass Index  Education  Children  Pets  Absenteeism Time in Hours  \
0               30          0         2     1                          4   
1               31          0         1     0                          0   
2               31          0         0     0                          2   
3               24          0         2     0                          4   
4               30          0         2     1                          2   

   Reason_1  Reason_2  Reason_3  Reason_4  Month  Day of the Week  
0         0 

In [4]:
# Row count per value of the recoded Education column.
education_counts = df["Education"].value_counts()
print(education_counts)

Education
0    583
1    117
Name: count, dtype: int64


In [5]:
# Number of rows flagged in each grouped reason column.
reason_columns = ["Reason_1", "Reason_2", "Reason_3", "Reason_4"]
reason_totals = df[reason_columns].sum()
print(reason_totals)

Reason_1    175
Reason_2      6
Reason_3     63
Reason_4    418
dtype: int64


In [6]:
# Distinct (Month, Day of the Week) combinations present in the report,
# ordered by month and then by weekday.
calendar_columns = ["Month", "Day of the Week"]
unique_pairs = df[calendar_columns].drop_duplicates()
print(unique_pairs.sort_values(calendar_columns))

     Month  Day of the Week
114      1                0
115      1                1
113      1                2
118      1                3
120      1                4
..     ...              ...
339     12                0
98      12                1
100     12                2
102     12                3
103     12                4

[63 rows x 2 columns]
