In [1]:
import os
from pathlib import Path

# Work out the project root: when the kernel runs from inside a "notebooks"
# folder the project lives one level up; otherwise the current working
# directory is used as the root.
if Path.cwd().name == "notebooks":
    ROOT = Path.cwd().resolve().parent
else:
    ROOT = Path.cwd()

# Folder for raw downloads; missing parents are created and an already
# existing folder is left as it is.
DATA_RAW = ROOT.joinpath("data", "raw")
DATA_RAW.mkdir(parents=True, exist_ok=True)

print("Proyecto en:", ROOT)
print("Creando carpeta:", DATA_RAW)

Proyecto en: C:\Users\Usuario\OneDrive\Desktop\Penguin\analisis-de-datos
Creando carpeta: C:\Users\Usuario\OneDrive\Desktop\Penguin\analisis-de-datos\data\raw


In [1]:
import os
from pathlib import Path
from dotenv import load_dotenv

# Find the project root (the parent of a "notebooks" working directory,
# otherwise the working directory itself) and load variables from its .env.
if Path.cwd().name == "notebooks":
    ROOT = Path.cwd().resolve().parent
else:
    ROOT = Path.cwd()
load_dotenv(ROOT / ".env")

# The Kaggle credentials are read from the process environment.
KAGGLE_USERNAME = os.environ.get("KAGGLE_USERNAME")
KAGGLE_KEY = os.environ.get("KAGGLE_KEY")

# Stop right here when either credential is missing or empty.
assert KAGGLE_USERNAME and KAGGLE_KEY, (
    "Faltan KAGGLE_USERNAME o KAGGLE_KEY en .env o variables de entorno."
)
print("Kaggle user:", KAGGLE_USERNAME)

Kaggle user: alejandroarriola


In [3]:
from kaggle.api.kaggle_api_extended import KaggleApi

# Smoke test: build the Kaggle API client and authenticate it.
# NOTE(review): presumably this picks up the KAGGLE_USERNAME / KAGGLE_KEY
# values loaded in the earlier cell — confirm against the kaggle package docs.
api = KaggleApi()
api.authenticate()
print("Kaggle API OK")

Kaggle API OK


In [4]:
from pathlib import Path
from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile

# Base paths: the project root is the parent of a "notebooks" working
# directory, otherwise the working directory itself.
ROOT = Path.cwd().resolve().parent if Path.cwd().name == "notebooks" else Path.cwd()
RAW = ROOT / "data" / "raw" / "california_housing"
RAW.mkdir(parents=True, exist_ok=True)
print("Destino:", RAW)

# Kaggle authentication
api = KaggleApi()
api.authenticate()

# Download the dataset archive into the RAW folder.
# NOTE(review): force=True presumably re-downloads even when a copy is already
# present — confirm against the kaggle client documentation.
dataset_slug = "camnugent/california-housing-prices"
print("Descargando:", dataset_slug)
api.dataset_download_files(dataset_slug, path=RAW, force=True, quiet=False)

# Extract the most recently modified ZIP found in RAW, if there is one
# (per the original note, Kaggle delivers a .zip when there are several files).
zips = sorted(RAW.glob("*.zip"), key=lambda p: p.stat().st_mtime, reverse=True)
if zips:
    zip_path = zips[0]
    print("Descomprimiendo:", zip_path.name)
    with ZipFile(zip_path, "r") as zf:
        zf.extractall(RAW)
    # Deleting the archive stays best-effort: a failure must not abort the
    # cell, but it is reported instead of being silently swallowed so a
    # leftover ZIP in the data folder does not go unnoticed.
    try:
        zip_path.unlink()
    except OSError as exc:
        print("Aviso: no se pudo borrar", zip_path.name, "->", exc)

print("Descarga completada")


Destino: C:\Users\Usuario\OneDrive\Desktop\Penguin\analisis-de-datos\data\raw\california_housing
Descargando: camnugent/california-housing-prices
Dataset URL: https://www.kaggle.com/datasets/camnugent/california-housing-prices
Downloading california-housing-prices.zip to C:\Users\Usuario\OneDrive\Desktop\Penguin\analisis-de-datos\data\raw\california_housing


100%|██████████| 400k/400k [00:00<00:00, 98.2MB/s]


Descomprimiendo: california-housing-prices.zip
Descarga completada





In [5]:
import pandas as pd

def listar_archivos(carpeta: Path, exts=(".csv", ".parquet", ".xls", ".xlsx")):
    """List the data files found anywhere under ``carpeta``.

    Parameters
    ----------
    carpeta : Path
        Directory that is searched recursively.
    exts : collection of str
        File suffixes to keep, including the leading dot. Each file's suffix
        is lower-cased before the comparison, so the entries given here must
        be lower-case.

    Returns
    -------
    pandas.DataFrame
        One row per matching file with the columns ``archivo`` (path relative
        to ``carpeta``, as a string) and ``tam_mb`` (file size divided by
        1,000,000 and rounded to 3 decimals), sorted by ``archivo``. When no
        file matches, an empty frame with those same two columns is returned.
    """
    filas = [
        {
            "archivo": str(p.relative_to(carpeta)),
            "tam_mb": round(p.stat().st_size / 1_000_000, 3),
        }
        for p in carpeta.rglob("*")
        if p.is_file() and p.suffix.lower() in exts
    ]
    # The columns are passed explicitly so the frame keeps its schema when
    # `filas` is empty; a column-less frame would make sort_values("archivo")
    # fail with KeyError.
    return pd.DataFrame(filas, columns=["archivo", "tam_mb"]).sort_values("archivo")

# Show the inventory of data files found under RAW as a table in the notebook.
display(listar_archivos(RAW))


Unnamed: 0,archivo,tam_mb
0,housing.csv,1.424


In [6]:
import pandas as pd

# Take the alphabetically first CSV located directly inside RAW.
csvs = sorted(RAW.glob("*.csv"))
assert csvs, "No se encontraron CSVs en la carpeta de California."
csv_path = csvs[0]
print("Leyendo:", csv_path.name)

# Read with pandas' default encoding first; if the bytes cannot be decoded,
# read the same file again as latin-1.
try:
    df_ca = pd.read_csv(csv_path, low_memory=False)
except UnicodeDecodeError:
    df_ca = pd.read_csv(csv_path, low_memory=False, encoding="latin-1")

# First look at the data: leading rows, dimensions, column names and dtypes.
display(df_ca.head())
print("Shape:", df_ca.shape)
print("Columnas:", df_ca.columns.tolist())
display(df_ca.dtypes.to_frame(name="dtype"))


Leyendo: housing.csv


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


Shape: (20640, 10)
Columnas: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity']


Unnamed: 0,dtype
longitude,float64
latitude,float64
housing_median_age,float64
total_rooms,float64
total_bedrooms,float64
population,float64
households,float64
median_income,float64
median_house_value,float64
ocean_proximity,object
