In [1]:
# | -------------------------------- |
# | PREPROCESSING                    |
# | -------------------------------- |

# Standard data-analysis libraries
import pandas as pd
pd.set_option('display.max_columns', None)  # show every column when a wide DataFrame is displayed
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sys
import os
# Add ../utils (resolved against the current working directory) to the import path.
# This has to run before the project imports below, and it means the notebook
# must be launched from its own directory for the relative path to resolve.
sys.path.append(os.path.join(os.getcwd(), '../utils'))

# Project helper modules
# NOTE(review): star imports hide which names each module contributes
# (describe_df, used further down, presumably comes from one of them) —
# consider importing the needed names explicitly.
from toolbox_ML import *
from bootcampviztools import *

# Silence every warning for the rest of the session
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings.
import warnings
warnings.simplefilter("ignore")

# Plot defaults: seaborn "whitegrid" style and a 12x6 default figure size
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = [12, 6]

In [2]:
# Read the scraped listings; the file uses ";" as its field separator
df = pd.read_csv(
    "../data_sample/scrap_data_asturias.csv",
    sep=";",
)

In [3]:
# Clean and convert numeric columns

# Columns that should be numeric but may have been read as text
# (presumably because the values use a decimal comma — confirm against the CSV)
numeric_columns_to_clean = [
    'Metros cuadrados construidos',
    'Latitud',
    'Longitud'
]

# Swap the decimal comma for a dot and convert to a numeric dtype.
# Values that still cannot be parsed become NaN (errors='coerce').
for col in numeric_columns_to_clean:
    valores = df[col]
    # Only non-numeric columns need the comma fix. A column that is already
    # numeric (parsed as such by read_csv, or this cell being re-run) has no
    # .str accessor and would raise AttributeError.
    if not pd.api.types.is_numeric_dtype(valores):
        # astype(str) keeps stray non-string entries in an object column from
        # being turned into NaN by the .str accessor; regex=False makes the
        # literal replacement explicit across pandas versions.
        valores = valores.astype(str).str.replace(',', '.', regex=False)
    df[col] = pd.to_numeric(valores, errors='coerce')

In [4]:
# Encode the binary "Sí"/"No" columns as integers (1 / 0)

# Columns holding "Sí"/"No" answers
binary_columns = [
    'Ascensor (Sí/No)',
    'Obra nueva (Sí/No)',
    'Piscina (Sí/No)',
    'Terraza (Sí/No)',
    'Parking (Sí/No)',
    'Parking incluído en el precio (Sí/No)',
    'Aire acondicionado (Sí/No)',
    'Trastero (Sí/No)',
    'Jardín (Sí/No)'
]

# "Sí" -> 1, "No" -> 0; any other value maps to NaN
binary_mapping = {'Sí': 1, 'No': 0}
for col in binary_columns:
    # Skip columns that are already numeric: mapping the encoded 1/0 values
    # through the "Sí"/"No" dictionary again (e.g. when this cell is re-run)
    # would silently turn the whole column into NaN.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    df[col] = df[col].map(binary_mapping)

In [5]:
# Count how many rows carry the "-" placeholder in each location-related column
# (0 when the placeholder never appears)
conteo_barrio, conteo_distrito, conteo_planta = (
    df[columna].value_counts().get("-", 0)
    for columna in ("Barrio", "Distrito", "Planta")
)
print(conteo_barrio, conteo_distrito, conteo_planta, sep="\n")

1869
938
37


In [6]:
# "Barrio" and "Distrito" use "-" as a placeholder for a missing value. The rows are
# kept and the placeholder is relabelled "Desconocido", so the listings stay available
# for price modelling.
# "Planta" is considered less relevant, so rows whose floor is "-" are dropped instead.
columnas_ubicacion = ["Barrio", "Distrito"]
df[columnas_ubicacion] = df[columnas_ubicacion].replace("-", "Desconocido")
df = df.loc[df["Planta"].ne("-")]

In [7]:
# Keep only the first occurrence of each listing id; later repeats are discarded
ids_repetidos = df.duplicated(subset="Id del anuncio", keep="first")
df = df[~ids_repetidos]

In [8]:
# -----------------------------------------
# FEATURE ENGINEERING
# -----------------------------------------
# Derived price ratios: price per bedroom and price per bathroom.
# A denominator of 0 is masked to NaN first; otherwise the division yields
# inf (or NaN for 0/0), which ends up in the saved dataset and is not
# reported as missing by null-based summaries.
# NOTE(review): both features are computed from 'Precio'; if 'Precio' is the
# prediction target they leak it — confirm before using them as model inputs.
df['Precio_por_habitacion'] = df['Precio'] / df['Habitaciones'].mask(df['Habitaciones'] == 0)
df['Precio_por_bano'] = df['Precio'] / df['Baños'].mask(df['Baños'] == 0)

# Preview the new features
display(df[['Precio_por_habitacion', 'Precio_por_bano']].head())

Unnamed: 0,Precio_por_habitacion,Precio_por_bano
0,77000.0,77000.0
1,26666.666667,80000.0
2,35000.0,70000.0
3,110000.0,110000.0
4,160000.0,160000.0


In [9]:
# Per-column summary of the cleaned frame. describe_df is not defined in this
# notebook — presumably it is provided by one of the star imports in the first cell.
# The rendered output lists, for each column, its data type, % of missing values,
# number of unique values and % cardinality.
describe_df(df)

Unnamed: 0,Id del anuncio,Tipo de inmueble,Tipo de vendedor,Barrio,Municipio,Distrito,Precio,Euros/m2,Metros cuadrados construidos,Habitaciones,Baños,Planta,Latitud,Longitud,Ascensor (Sí/No),Obra nueva (Sí/No),Piscina (Sí/No),Terraza (Sí/No),Parking (Sí/No),Parking incluído en el precio (Sí/No),Aire acondicionado (Sí/No),Trastero (Sí/No),Jardín (Sí/No),Precio_por_habitacion,Precio_por_bano
DATA_TYPE,int64,object,object,object,object,object,float64,float64,float64,float64,float64,object,float64,float64,int64,int64,int64,int64,int64,int64,int64,int64,int64,float64,float64
MISSINGS(%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UNIQUE_VALUES,3548,4,2,59,60,53,714,1885,240,11,8,19,3254,3251,2,2,2,2,2,2,2,2,2,944,782
CARDIN(%),100.0,0.113,0.056,1.663,1.691,1.494,20.124,53.129,6.764,0.31,0.225,0.536,91.714,91.629,0.056,0.056,0.056,0.056,0.056,0.056,0.056,0.056,0.056,26.607,22.041


In [10]:
# Persist the cleaned frame as CSV (pandas' default "," separator, no index column)
ruta_procesado = "../data_sample/cleaned_dataset.csv"
df.to_csv(path_or_buf=ruta_procesado, index=False)
print(f"✅ Dataset limpio guardado en: {ruta_procesado}")

✅ Dataset limpio guardado en: ../data_sample/cleaned_dataset.csv
