In [None]:
import pandas as pd
import re

# Load 'casas_idealista.csv': semicolon-separated, UTF-16 encoded.
# NOTE(review): no bad-line handling option is passed to read_csv, so malformed
# lines are NOT skipped here; pandas' default behaviour applies.
df = pd.read_csv('casas_idealista.csv', encoding='utf-16', sep=';')

# Drop the 'Titulo' column; none of the later cells visible here read it.
df = df.drop(columns=['Titulo'])


In [None]:
# Build a frame without the rows that have a NaN in at least one column.
columnas_a_verificar = df.columns  # every column, not a hand-picked subset

# NOTE(review): the filtered result is stored in `df_limpio`, but the later cells
# visible here keep working on `df`, so this frame appears unused — confirm intent.
df_limpio = df.dropna(subset=columnas_a_verificar)

In [None]:
# Extract the built and usable floor areas (in m²) from a description string
def extraer_m2(texto):
    """Return ``(m2_construidos, m2_utiles)`` parsed from ``texto``.

    Searches for a whole number followed by ``m² construidos`` and another
    followed by ``m² útiles``. Numbers written with a dot as thousands
    separator (``1.200 m² construidos``) are read as a whole (1200) instead of
    only their last digit group.

    Args:
        texto: Description string to search.

    Returns:
        A 2-tuple; each item is an ``int``, or ``None`` when that figure does
        not appear in ``texto``.
    """
    # Either dot-grouped thousands ("1.200") or a plain run of digits ("85")
    numero = r'(\d{1,3}(?:\.\d{3})+|\d+)'

    def _buscar(etiqueta):
        hallado = re.search(numero + r'\s*m² ' + etiqueta, texto)
        if hallado is None:
            return None
        # Remove the grouping dots before converting to int
        return int(hallado.group(1).replace('.', ''))

    return _buscar('construidos'), _buscar('útiles')

# Run extraer_m2 on every description. str(x) turns a missing value into the text
# 'nan', which matches neither pattern, so both areas come back as None for it.
# Wrapping the tuple in pd.Series makes apply() expand it into two columns.
df[['m2_construidos', 'm2_utiles']] = df['Caracteristicas_basicas'].apply(lambda x: pd.Series(extraer_m2(str(x))))

# Cast to pandas' nullable 'Int64' dtype so rows without a match can stay missing
# while the remaining values are stored as integers.
df['m2_construidos'] = df['m2_construidos'].astype('Int64')
df['m2_utiles'] = df['m2_utiles'].astype('Int64')

# Show the result (the whole selection is printed here, not just .head())
print(df[['Caracteristicas_basicas', 'm2_construidos', 'm2_utiles']])


In [None]:
# Pull the bedroom and bathroom counts out of a description (singular or plural wording)
def extraer_habitaciones_banos(texto):
    """Return ``(habitaciones, banos)`` found in ``texto``.

    Each item is the integer that precedes the first "habitación/habitaciones"
    or "baño/baños" mention, or ``None`` when there is no such mention.
    """
    patrones = (
        r'(\d+)\s*habitaci[oó]n(?:es)?',
        r'(\d+)\s*bañ(?:o|os)',
    )
    conteos = []
    for patron in patrones:
        hallazgo = re.search(patron, texto)
        conteos.append(None if hallazgo is None else int(hallazgo.group(1)))
    return tuple(conteos)

# Run extraer_habitaciones_banos on every description. str(x) turns a missing value
# into the text 'nan', which matches neither pattern, so both counts are None for it.
# Wrapping the tuple in pd.Series makes apply() expand it into two columns.
df[['habitaciones', 'banos']] = df['Caracteristicas_basicas'].apply(lambda x: pd.Series(extraer_habitaciones_banos(str(x))))

# Cast to pandas' nullable 'Int64' dtype so rows without a match can stay missing
# while the remaining values are stored as integers.
df['habitaciones'] = df['habitaciones'].astype('Int64')
df['banos'] = df['banos'].astype('Int64')

# Show the result (the whole selection is printed here, not just .head())
print(df[['Caracteristicas_basicas', 'habitaciones', 'banos']])


In [None]:
# Map a description to the property-condition label it mentions, if any
def extraer_estado_vivienda(texto):
    """Return the condition label found in ``texto``, or ``None``.

    The comparison is case-insensitive. Phrases are checked in the order
    listed below and the label of the first one present is returned.
    """
    en_minusculas = texto.lower()

    # (phrase searched in the lowercased text, label returned)
    etiquetas = (
        ("segunda mano/buen estado", "Segunda mano/Buen estado"),
        ("segunda mano/para reformar", "Segunda mano/para reformar"),
        ("promoción de obra nueva", "Promoción de obra nueva"),
    )
    for fragmento, etiqueta in etiquetas:
        if fragmento in en_minusculas:
            return etiqueta

    # None of the known phrases is present
    return None

# Label every listing with its condition; each cell is passed through str() first,
# so a missing description is seen as the text 'nan' and gets no label.
df['estado_vivienda'] = [
    extraer_estado_vivienda(str(descripcion))
    for descripcion in df['Caracteristicas_basicas']
]

# Display the source text next to the derived label
print(df[['Caracteristicas_basicas', 'estado_vivienda']])


In [None]:
# 'trastero' flag: 1 when the literal, case-sensitive text "Trastero" occurs in the
# description, else 0. Cells are stringified first, so missing values yield 0.
df['trastero'] = [
    int('Trastero' in str(descripcion))
    for descripcion in df['Caracteristicas_basicas']
]

# Preview the first rows to check the flag
print(df[['Caracteristicas_basicas', 'trastero']].head())


In [None]:
# 'terraza' flag: 1 when the literal, case-sensitive text "Terraza" occurs, else 0
df['terraza'] = [
    int('Terraza' in str(descripcion))
    for descripcion in df['Caracteristicas_basicas']
]

# 'balcon' flag: 1 when the literal, case-sensitive text "Balcón" occurs, else 0
df['balcon'] = [
    int('Balcón' in str(descripcion))
    for descripcion in df['Caracteristicas_basicas']
]

# Preview the first rows to check both flags
print(df[['Caracteristicas_basicas', 'terraza', 'balcon']].head())


In [None]:
# Detect which compass orientations a description mentions, as a dict of 0/1 flags
def detectar_orientaciones(texto):
    """Return 0/1 flags for east, west, north and south mentions in ``texto``.

    Args:
        texto: Description string, or a missing value (NaN/None).

    Returns:
        Dict with the keys ``orientacion_este``, ``orientacion_oeste``,
        ``orientacion_norte`` and ``orientacion_sur``. All flags are 0 when
        ``texto`` is missing.
    """
    orientaciones = {'orientacion_este': 0, 'orientacion_oeste': 0, 'orientacion_norte': 0, 'orientacion_sur': 0}
    if pd.isna(texto):
        return orientaciones
    texto = texto.lower()  # Case-insensitive matching
    # "oeste" contains "este", so a plain substring test would flag every west-facing
    # listing as east-facing too; only count "este" when it is not preceded by "o".
    if re.search(r'(?<!o)este', texto):
        orientaciones['orientacion_este'] = 1
    if 'oeste' in texto:
        orientaciones['orientacion_oeste'] = 1
    if 'norte' in texto:
        orientaciones['orientacion_norte'] = 1
    # NOTE(review): the remaining checks are plain substring tests over the whole
    # text, so they also fire on any other word containing these letters.
    if 'sur' in texto:
        orientaciones['orientacion_sur'] = 1
    return orientaciones

# Turn the per-row flag dicts into one 0/1 column per orientation. Building the frame
# from the list of dicts avoids constructing a separate pd.Series for every row.
orientaciones_df = pd.DataFrame(
    df['Caracteristicas_basicas'].apply(detectar_orientaciones).tolist(),
    index=df.index,
)

# Append the orientation columns to the DataFrame. Any orientation columns left by a
# previous run of this cell are dropped first, so re-running the cell replaces them
# instead of appending a duplicate set.
df = pd.concat(
    [df.drop(columns=orientaciones_df.columns, errors='ignore'), orientaciones_df],
    axis=1,
)


In [None]:
# 'armarios_empotrados' flag: 1 when the lowercased description contains
# "armarios empotrados" or "armario empotrado", else 0 (missing descriptions get 0)
df['armarios_empotrados'] = (
    df['Caracteristicas_basicas']
    .str.lower()
    .str.contains(r'armarios empotrados|armario empotrado', na=False)
    .astype('int64')
)

# Preview the first rows to check the flag
print(df[['Caracteristicas_basicas', 'armarios_empotrados']].head())


In [None]:
# Create a new column for 'Con ascensor' (with elevator), handling NaN values
df['ascensor'] = df['Caracteristicas_basicas'].apply(
    lambda x: 1 if isinstance(x, str) and 'con ascensor' in x.lower() else 0
)

# Show the first rows to verify
print(df[['Caracteristicas_basicas', 'ascensor']].head())


In [None]:
# Capture the text that follows "Calefacción " up to (not including) the next
# semicolon; rows without a match are labelled 'sin calefacción'. The labels are
# stored as a pandas 'category' (the original note says this is for a Random Forest).
df['Calefacción'] = (
    df['Caracteristicas_basicas']
    .str.extract(r'Calefacción ([^;]+)', expand=False)
    .fillna('sin calefacción')
    .astype('category')
)

# Preview the first rows to check the result
print(df[['Caracteristicas_basicas', 'Calefacción']].head())



In [None]:
# Take the digits that follow "Planta " and store them as a nullable integer;
# descriptions without that pattern (or missing) end up as <NA>.
df['planta_numero'] = pd.to_numeric(
    df['Caracteristicas_basicas'].str.extract(r'Planta (\d+)', expand=False),
    errors='coerce',
).astype('Int64')

# Preview the first rows to check the result
print(df[['Caracteristicas_basicas', 'planta_numero']].head())


In [None]:
# Build the columns 'planta_numero', 'Exterior' and 'Interior'.
# The floor number is the run of digits after "Planta ", kept as a nullable integer
# (this repeats the extraction done in the preceding cell).
df['planta_numero'] = pd.to_numeric(
    df['Caracteristicas_basicas'].str.extract(r'Planta (\d+)', expand=False),
    errors='coerce',
).astype('Int64')

# 0/1 flags for the words "exterior" / "interior" anywhere in the lowercased
# description; missing descriptions get 0.
df['Exterior'] = (
    df['Caracteristicas_basicas']
    .str.lower()
    .str.contains('exterior', regex=False, na=False)
    .astype('int64')
)
df['Interior'] = (
    df['Caracteristicas_basicas']
    .str.lower()
    .str.contains('interior', regex=False, na=False)
    .astype('int64')
)

# Preview the first rows to check the result
print(df[['Caracteristicas_basicas', 'planta_numero', 'Exterior', 'Interior']].head())


In [None]:
# Three 0/1 flags, each a case-insensitive substring test on the description
# (missing descriptions get 0 in all of them).

# 'plaza_garaje': the description mentions "garaje"
df['plaza_garaje'] = (
    df['Caracteristicas_basicas']
    .str.lower()
    .str.contains('garaje', regex=False, na=False)
    .astype('int64')
)

# 'garaje_incluido': the description contains "incluida"
df['garaje_incluido'] = (
    df['Caracteristicas_basicas']
    .str.lower()
    .str.contains('incluida', regex=False, na=False)
    .astype('int64')
)

# 'garaje_adicional': the description contains "adicionales"
df['garaje_adicional'] = (
    df['Caracteristicas_basicas']
    .str.lower()
    .str.contains('adicionales', regex=False, na=False)
    .astype('int64')
)

# Preview the first rows to check the result
print(df[['Caracteristicas_basicas', 'plaza_garaje', 'garaje_incluido', 'garaje_adicional']].head())


In [None]:
# 'adaptado_movilidad_reducida' flag: 1 when the lowercased description contains
# "movilidad reducida", else 0 (missing descriptions get 0)
df['adaptado_movilidad_reducida'] = (
    df['Caracteristicas_basicas']
    .str.lower()
    .str.contains('movilidad reducida', regex=False, na=False)
    .astype('int64')
)

# Preview the first rows to check the result
print(df[['Caracteristicas_basicas', 'adaptado_movilidad_reducida']].head())


In [None]:
# Read the construction year out of a description
def extraer_ano_construccion(texto):
    """Return the integer that follows "construido en" in ``texto``, or ``None``.

    The phrase is matched case-insensitively by lowercasing ``texto`` first.
    """
    hallazgo = re.search(r'construido en\s*(\d+)', texto.lower())
    if hallazgo is None:
        return None
    return int(hallazgo.group(1))

# Extract the construction year of every listing and store it directly as a nullable
# 'Int64' column, so listings without a year hold <NA> and the rest stay integers.
df['ano_construccion'] = pd.array(
    [
        extraer_ano_construccion(str(descripcion))
        for descripcion in df['Caracteristicas_basicas']
    ],
    dtype='Int64',
)

# Preview the first rows to check the result
print(df[['Caracteristicas_basicas', 'ano_construccion']].head())


In [None]:
# Detect the extra features mentioned in a description
def extraer_caracteristicas_extras(texto):
    """Return a 4-item ``pd.Series`` of 0/1 flags for the extras in ``texto``.

    The flags are, in order: garden ("jardín"), pool ("piscina"), air
    conditioning ("aire acondicionado") and green areas ("zonas verdes").
    Matching is a case-insensitive substring test.
    """
    en_minusculas = texto.lower()
    palabras_clave = ('jardín', 'piscina', 'aire acondicionado', 'zonas verdes')
    return pd.Series([int(clave in en_minusculas) for clave in palabras_clave])

# Run extraer_caracteristicas_extras on every 'Caracteristicas_extra' cell. str(x) turns
# a missing value into the text 'nan', which contains none of the keywords, so all four
# flags are 0 for it. The function returns a pd.Series, which apply() expands into the
# four new columns.
df[['jardin', 'piscina', 'aire_acondicionado', 'zonas_verdes']] = df['Caracteristicas_extra'].apply(lambda x: extraer_caracteristicas_extras(str(x)))

# Preview the first rows to check the result
print(df[['Caracteristicas_extra', 'jardin', 'piscina', 'aire_acondicionado', 'zonas_verdes']].head())


In [None]:
# Drop the two raw text columns the derived feature columns were parsed from
df = df.drop(columns=['Caracteristicas_basicas', 'Caracteristicas_extra'])

# Write the processed frame as a semicolon-separated CSV without the index column.
# The output is UTF-8, unlike the UTF-16 input file read at the top of the notebook.
df.to_csv('casas_idealista_procesado_extra.csv', index=False, sep=';', encoding='utf-8')
