In [9]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import arff  # de liac-arff

# Plot defaults: seaborn "whitegrid" style and a (10, 5) default figure size.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 5)})

# Locations of the raw dataset files, all resolved relative to DATA_DIR.
DATA_DIR = "../data/raw"
ADULT_PATH, HAR_PATH, GAS_PATH = (
    os.path.join(DATA_DIR, archivo)
    for archivo in (
        "dataset",          # Adult Income
        "php0gEU7D.arff",   # HAR
        "phpN4gaxw.arff",   # Gas Sensor
    )
)

# Carga de datasets
def cargar_arff(path, encoding="utf-8"):
    """Load an ARFF file into a pandas DataFrame.

    Args:
        path: Path of the ARFF file to read.
        encoding: Text encoding used to open the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A DataFrame built from the parsed ``data`` records, with columns
        named after the entries of the parsed ``attributes`` list.
    """
    # An explicit encoding keeps decoding consistent across operating systems.
    with open(path, encoding=encoding) as f:
        data = arff.load(f)
    # The first element of each attribute entry is used as the column name.
    columnas = [attr[0] for attr in data["attributes"]]
    return pd.DataFrame(data["data"], columns=columnas)

def _mostrar_resumen(titulo, tabla):
    """Print a title line followed by the shape and column names of *tabla*."""
    print(titulo)
    print(f"Dimensiones: {tabla.shape}")
    print(f"Columnas: {tabla.columns.tolist()}")


# Load each dataset and report its size and columns right after loading.
adult_df = cargar_arff(ADULT_PATH)
_mostrar_resumen("✔️ Adult dataset cargado", adult_df)

har_df = cargar_arff(HAR_PATH)
_mostrar_resumen("\n✔️ HAR dataset cargado", har_df)

gas_df = cargar_arff(GAS_PATH)
_mostrar_resumen("\n✔️ Gas Sensor dataset cargado", gas_df)

# Basic type check: show the dtype of every column, one dataset at a time.
print("\n📊 Tipos de datos por dataset:")
for encabezado, tabla in (
    ("\nAdult:", adult_df),
    ("\nHAR:", har_df),
    ("\nGas Sensor:", gas_df),
):
    print(encabezado)
    print(tabla.dtypes)

# Count cells holding the literal text '?' and list only the columns that
# contain at least one.
# NOTE(review): this only matches the string '?'; if the ARFF parser decodes
# missing values as None they will not be counted here -- confirm.
print("\n🔍 Conteo de valores '?' por columna:")

for name, df in zip(('Adult', 'HAR', 'Gas'), (adult_df, har_df, gas_df)):
    conteo = df.eq('?').sum()
    print(f"\n{name}:\n{conteo[conteo > 0]}")

✔️ Adult dataset cargado
Dimensiones: (32561, 15)
Columnas: ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country', 'income']

✔️ HAR dataset cargado
Dimensiones: (180, 68)
Columnas: ['Person', 'Activity', 'tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z', 'tGravityAcc-mean()-X', 'tGravityAcc-mean()-Y', 'tGravityAcc-mean()-Z', 'tGravityAcc-std()-X', 'tGravityAcc-std()-Y', 'tGravityAcc-std()-Z', 'tBodyAccJerk-mean()-X', 'tBodyAccJerk-mean()-Y', 'tBodyAccJerk-mean()-Z', 'tBodyAccJerk-std()-X', 'tBodyAccJerk-std()-Y', 'tBodyAccJerk-std()-Z', 'tBodyGyro-mean()-X', 'tBodyGyro-mean()-Y', 'tBodyGyro-mean()-Z', 'tBodyGyro-std()-X', 'tBodyGyro-std()-Y', 'tBodyGyro-std()-Z', 'tBodyGyroJerk-mean()-X', 'tBodyGyroJerk-mean()-Y', 'tBodyGyroJerk-mean()-Z', 'tBodyGyroJerk-std()-X', 'tBodyGyroJerk-