In [2]:
# IMPORTACIÓN DE LIBRERÍAS NECESARIAS:

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

# Warning handling
# -----------------------------------------------------------------------
import warnings
warnings.simplefilter("ignore")  # blanket filter: warnings of every category are silenced

# Display configuration
# -----------------------------------------------------------------------
pd.options.display.max_columns = None  # None lifts the column cap so wide DataFrames render in full

In [3]:
# CREACIÓN DE UNA FUNCIÓN PARA LEER Y EXPLORAR CADA DATASET.

def leer_y_explorar_df(ruta_fichero):
    """Read a CSV file into a DataFrame and print an exploratory summary.

    The summary shows, in order: the first 5 rows, the shape, the output of
    ``DataFrame.info()``, the null count per column, the number of duplicated
    rows, and descriptive statistics for numeric and for object-typed columns.

    Parameters
    ----------
    ruta_fichero : str or path-like
        Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded DataFrame, or ``None`` when the file could not be read
        (an error message is printed in that case).

    Notes
    -----
    Uses ``display``, which is available without import inside an
    IPython/Jupyter session.
    """
    # Only the read itself is guarded, so that a failure while rendering the
    # preview is not misreported as a file-reading error.
    try:
        df = pd.read_csv(ruta_fichero)
    except FileNotFoundError:
        # `df` is never bound when read_csv fails, so the message may only
        # reference the path that was requested.
        print(f" ERROR: No se encontró el fichero en la ruta '{ruta_fichero}'")
        return None
    except Exception as e:
        # Any other read problem (parsing, permissions, empty file, ...) is
        # reported and signalled to the caller with None.
        print(f" ERROR inesperado al leer el fichero: {e}")
        return None

    print(f" Fichero {ruta_fichero} cargado correctamente:\n")
    print("\nPRIMERAS 5 FILAS DEL DATAFRAME")
    display(df.head(5))

    # Dataset dimensions
    print(f" DIMENSIONES: {df.shape[0]} filas y {df.shape[1]} columnas\n")

    # General info (df.info() prints directly and returns None)
    print(" INFORMACIÓN GENERAL")
    df.info()

    # Null count per column
    print("\n VALORES NULOS POR COLUMNA")
    display(df.isnull().sum())

    # Number of fully duplicated rows
    print("\n FILAS DUPLICADAS")
    print(f"Número de filas duplicadas: {df.duplicated().sum()}")

    # Descriptive statistics; each section is skipped when the frame has no
    # column of that kind, because describe() would otherwise fail or mislead.
    if df.select_dtypes(include='number').shape[1]:
        print("\nESTADÍSTICAS NUMÉRICAS:")
        display(df.describe().T)
    else:
        print("\n ESTADÍSTICAS NUMÉRICAS: No existen variables numéricas.")

    if df.select_dtypes(include='object').shape[1]:
        print("\nESTADÍSTICAS CATEGÓRICAS:")
        display(df.describe(include='object').T)
    else:
        print("\n ESTADÍSTICAS CATEGÓRICAS: No existen variables categóricas.")

    return df


In [4]:
# Read hr.csv through the exploration helper; the call prints the summary
# shown in this cell's output and its return value is bound to df_emp.
df_emp = leer_y_explorar_df("hr.csv")

 Fichero hr.csv cargado correctamente:


PRIMERAS 5 FILAS DEL DATAFRAME


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41.0,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,sALES eXECUTIVE,4.0,Single,5993.0,19479,8,Y,Yes,11,3,1,80.0,0,8,0.0,1,6,4,0,5.0
1,49.0,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,rESEARCH sCIENTIST,2.0,Married,5130.0,24907,1,Y,No,23,4,4,,1,10,3.0,3,10,7,1,7.0
2,37.0,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,lABORATORY tECHNICIAN,3.0,Single,2090.0,2396,6,Y,Yes,15,3,2,,0,7,3.0,3,0,0,0,0.0
3,33.0,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,rESEARCH sCIENTIST,3.0,Married,2909.0,23159,1,Y,Yes,11,3,3,80.0,0,8,3.0,3,8,7,3,0.0
4,27.0,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,lABORATORY tECHNICIAN,2.0,Married,3468.0,16632,9,Y,No,12,3,4,80.0,1,6,3.0,3,2,2,2,2.0


 DIMENSIONES: 1474 filas y 35 columnas

 INFORMACIÓN GENERAL
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1474 entries, 0 to 1473
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1401 non-null   float64
 1   Attrition                 1474 non-null   object 
 2   BusinessTravel            1357 non-null   object 
 3   DailyRate                 1474 non-null   int64  
 4   Department                1445 non-null   object 
 5   DistanceFromHome          1474 non-null   int64  
 6   Education                 1474 non-null   int64  
 7   EducationField            1416 non-null   object 
 8   EmployeeCount             1474 non-null   int64  
 9   EmployeeNumber            1474 non-null   int64  
 10  EnvironmentSatisfaction   1474 non-null   int64  
 11  Gender                    1474 non-null   object 
 12  HourlyRate                1474 non-null   int64  
 13  Jo

Age                          73
Attrition                     0
BusinessTravel              117
DailyRate                     0
Department                   29
DistanceFromHome              0
Education                     0
EducationField               58
EmployeeCount                 0
EmployeeNumber                0
EnvironmentSatisfaction       0
Gender                        0
HourlyRate                    0
JobInvolvement                0
JobLevel                      0
JobRole                       0
JobSatisfaction              29
MaritalStatus               132
MonthlyIncome                14
MonthlyRate                   0
NumCompaniesWorked            0
Over18                        0
OverTime                     44
PercentSalaryHike             0
PerformanceRating             0
RelationshipSatisfaction      0
StandardHours               164
StockOptionLevel              0
TotalWorkingYears             0
TrainingTimesLastYear        88
WorkLifeBalance               0
YearsAtC


 FILAS DUPLICADAS
Número de filas duplicadas: 4

ESTADÍSTICAS NUMÉRICAS:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1401.0,36.944325,9.105009,18.0,30.0,36.0,43.0,60.0
DailyRate,1474.0,802.702171,403.53953,102.0,465.0,803.0,1157.0,1499.0
DistanceFromHome,1474.0,9.199457,8.104266,1.0,2.0,7.0,14.0,29.0
Education,1474.0,2.911126,1.024267,1.0,2.0,3.0,4.0,5.0
EmployeeCount,1474.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
EmployeeNumber,1474.0,1024.471506,602.421193,1.0,488.75,1020.5,1555.75,2068.0
EnvironmentSatisfaction,1474.0,2.723881,1.09328,1.0,2.0,3.0,4.0,4.0
HourlyRate,1474.0,65.887381,20.310444,30.0,48.0,66.0,83.75,100.0
JobInvolvement,1474.0,2.729986,0.712861,1.0,2.0,3.0,3.0,4.0
JobLevel,1474.0,2.063772,1.106055,1.0,1.0,2.0,3.0,5.0



ESTADÍSTICAS CATEGÓRICAS:


Unnamed: 0,count,unique,top,freq
Attrition,1474,2,No,1236
BusinessTravel,1357,3,Travel_Rarely,955
Department,1445,3,Research & Development,941
EducationField,1416,6,Life Sciences,582
Gender,1474,2,Male,885
JobRole,1474,9,sALES eXECUTIVE,327
MaritalStatus,1342,4,Married,604
Over18,1474,1,Y,1474
OverTime,1430,2,No,1025
