# Data cleaning

**2025-10-07**

In [1]:
import pandas as pd
import numpy as np
import re, unicodedata
from pathlib import Path

In [2]:
# Visualizamos todas las columnas en el output
pd.set_option("display.max_columns", None)

In [3]:
# Importamos el CSV
input_path = Path("data") / "staySpain_raw.csv"
df = pd.read_csv(input_path)

In [4]:
# Hacemos una copia de df para no modificar el df original
df_clean = df.copy()

### Data Overview

In [5]:
# Info de df_clean
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   apartment_id                 10000 non-null  int64  
 1   name                         9997 non-null   object 
 2   description                  9862 non-null   object 
 3   host_id                      10000 non-null  int64  
 4   neighbourhood_name           10000 non-null  object 
 5   neighbourhood_district       6079 non-null   object 
 6   room_type                    10000 non-null  object 
 7   accommodates                 10000 non-null  int64  
 8   bathrooms                    9926 non-null   float64
 9   bedrooms                     9930 non-null   float64
 10  beds                         9955 non-null   float64
 11  amenities_list               9983 non-null   object 
 12  price                        9746 non-null   float64
 13  minimum_nights   

In [6]:
# Comprobamos si hay filas enteras duplicadas
df_clean.duplicated().any()

np.False_

In [7]:
# Comprobamos si hay duplicados en "apartment_id"
df_clean.duplicated(subset="apartment_id").sum()

np.int64(350)

In [8]:
# Buscamos duplicados en "apartment_id"
duplicados = df_clean[df_clean.duplicated("apartment_id", keep=False)]

### Delete duplicated apartments - keep newer

In [9]:
# Ordenamos los registros para eliminar los duplicados con "insert_date" más antigua
df_clean = df_clean.sort_values("insert_date", ascending=False)\
                     .drop_duplicates("apartment_id", keep='first')\
                     .sort_index()

### Change Datatypes

In [10]:
# df_clean.dtypes

#### Datatype: dates

In [11]:
# Conversión de datos a date en las columnas que muestran fechas
df_clean["first_review_date"] = pd.to_datetime(df_clean['first_review_date'], errors='coerce', format='%d/%m/%Y')
df_clean["last_review_date"] = pd.to_datetime(df_clean['last_review_date'], errors='coerce', format='%d/%m/%Y')
df_clean["insert_date"] = pd.to_datetime(df_clean['insert_date'], errors='coerce', format='%d/%m/%Y')

#### Datatype: numbers

In [12]:
# Conversión de float a int
df_clean["bathrooms"] = df_clean['bathrooms'].astype("Int64")
df_clean["bedrooms"] = df_clean['bedrooms'].astype("Int64")
df_clean["beds"] = df_clean['beds'].astype("Int64")

#### Datatype: booleans

In [13]:
# Creamos una columna nueva para convertir "is_instant_bookable" en booleano
df_clean["is_instant_bookable_bool"] = df_clean["is_instant_bookable"].replace({"VERDADERO": True, "FALSO": False}).astype(bool)

# Eliminamos la columna original
df_clean = df_clean.drop(columns="is_instant_bookable")

# Renombramos la nueva columna
df_clean = df_clean.rename({"is_instant_bookable_bool":"is_instant_bookable"}, axis=1)

  df_clean["is_instant_bookable_bool"] = df_clean["is_instant_bookable"].replace({"VERDADERO": True, "FALSO": False}).astype(bool)


In [14]:
# # Repetimos proceso con has_availability pero con map() para mantener los NaN
mapping={"VERDADERO":True, "FALSO":False}
df_clean["has_availability"] = df_clean["has_availability"].map(mapping)


In [15]:
df_clean["has_availability"].isna().sum()

np.int64(534)

### Names String Imputation

In [16]:
# Imputación de nombre para registros con NaN en "name"
df_clean['name'] = df_clean['name'].fillna(df_clean['room_type'] + ' ' + df_clean['neighbourhood_name'])

In [17]:
# Imputación de descripción para registros con NaN en "description"
df_clean['description'] = df_clean['description'].fillna(df_clean['name'])

### Clean strings - remove characters not possible to encode

In [18]:
def clean_string(input_string):
    
    # Check for NaN values
    if pd.isna(input_string):
        return input_string  # Return NaN as is
        
    s = "" if input_string is None else str(input_string)

    # Normaliza Unicode (arregla acentos rotos y el carácter �)
    s = unicodedata.normalize("NFKC", s).replace("�", "")

    # Conserva letras latinas (con tildes), números, espacios, apóstrofes y guiones
    s = re.sub(r"[^0-9A-Za-zÀ-ÖØ-öø-ÿ' _-]", " ", s)  # updated to accept underscores - week 3 update

    # Colapsa espacios y recorta extremos
    s = re.sub(r"\s+", " ", s).strip()
    
    return s.title()



def clean_string_with_capitalize(input_string):

    # Check for NaN values
    if pd.isna(input_string):
        return input_string  # Return NaN as is
        
    s = "" if input_string is None else str(input_string)




    # Normaliza Unicode (arregla acentos rotos y el carácter �)
    s = unicodedata.normalize("NFKC", s).replace("�", "")

    # Conserva letras latinas (con tildes), números, espacios, apóstrofes y guiones
    s = re.sub(r"[^0-9A-Za-zÀ-ÖØ-öø-ÿ' _-]", " ", s)  # updated to accept underscores - week 3 update

    # Colapsa espacios y recorta extremos
    s = re.sub(r"\s+", " ", s).strip()
    
    # Pone mayúscula la primera letra de CADA palabra (mejor para nombres propios)
    s = s.capitalize()
    
    return s.title()


# Aplicamos la función de limpieza 
cols_to_clean = ["room_type", "description"] # apply to more string columns, week 3 update
for col in cols_to_clean:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].apply(clean_string)

# Cleaning AND capitalize
cols_to_clean = ["neighbourhood_name", "neighbourhood_district", "country", "name", "city"] 
for col in cols_to_clean:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].apply(clean_string_with_capitalize)

### Standardize score scales - subgroup scores scales x/10 - global scores scales X/100

In [19]:
"""Formatear las variables de review"""
    
df_clean['review_scores_rating'] = df_clean['review_scores_rating'] / 10
df_clean['review_scores_accuracy'] = df_clean['review_scores_accuracy'] / 10
df_clean['review_scores_cleanliness'] = df_clean['review_scores_cleanliness'] / 10
df_clean['review_scores_checkin'] = df_clean['review_scores_checkin'] / 10
df_clean['review_scores_communication'] = df_clean['review_scores_communication'] / 10
df_clean['review_scores_location'] = df_clean['review_scores_location'] / 10
df_clean['review_scores_value'] = df_clean['review_scores_value'] / 10


### Price Imputation

In [20]:
def input_price(df_clean, price, room_type, accommodates, neighbourhood_name):
    """Imputación de precios"""
    df = df_clean.copy()  # Copia del DataFrame original
    avgs = df.groupby([room_type, accommodates, neighbourhood_name])[price].transform('mean')
    df[price] = df[price].fillna(avgs)
    return df

# Aplicamos la función y guardamos el resultado en df_clean para conservar las transformaciones anteriores
df_clean = input_price(df_clean, 'price', 'room_type', 'accommodates', 'neighbourhood_name')


In [21]:
# Ver las filas donde 'price' es nulo luego de aplicar la formula de inputacion
cols = [c for c in ['id', 'room_type', 'accommodates', 'neighbourhood_name', 'price'] if c in df_clean.columns]
filas_nulas = df_clean[df_clean['price'].isna()][cols]
print(f"Filas con 'price' nulo: {len(filas_nulas)}")
#filas_nulas


Filas con 'price' nulo: 66


In [22]:
# Segunda imputación para lo que no pudo completarse con la media grupal
df_clean['price'] = df_clean['price'].fillna(df_clean['price'].mean())

### Review Scores Imputation - fill specific reviews with general rating if available

In [23]:
# Identify the review score columns (all columns except apartment_id and number_of_reviews)
review_columns = ['review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 
                  'review_scores_communication', 'review_scores_location', 'review_scores_value']

# For rows with positive number_of_reviews, fill NaN in review columns with review_scores_rating / 10
mask = df_clean['number_of_reviews'] > 0

for col in review_columns:
    df_clean.loc[mask & df_clean[col].isna(), col] = df_clean.loc[mask & df_clean[col].isna(), 'review_scores_rating'] / 10

### Clean Data Overview

In [24]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9650 entries, 0 to 9999
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   apartment_id                 9650 non-null   int64         
 1   name                         9650 non-null   object        
 2   description                  9650 non-null   object        
 3   host_id                      9650 non-null   int64         
 4   neighbourhood_name           9650 non-null   object        
 5   neighbourhood_district       5860 non-null   object        
 6   room_type                    9650 non-null   object        
 7   accommodates                 9650 non-null   int64         
 8   bathrooms                    9576 non-null   Int64         
 9   bedrooms                     9581 non-null   Int64         
 10  beds                         9605 non-null   Int64         
 11  amenities_list               9634 non-null   obj

### Output clean data to CSV and PICKLE

In [25]:
# 1) Carpeta de salida (crea /data si no existe)
out_dir = Path("data")
out_dir.mkdir(parents=True, exist_ok=True)

# 2) Rutas de salida
pkl_path = out_dir / "staySpain_cleaned.pkl"
csv_path = out_dir / "staySpain_cleaned.csv"

# 3) Guardar el df LIMPIO
df_clean.to_pickle(pkl_path)
df_clean.to_csv(csv_path, index=False)

# 4) Verificación inmediata: recargar y comparar
df_check = pd.read_pickle(pkl_path)

print("Guardado en:")
print(" - PKL:", pkl_path.resolve())
print(" - CSV:", csv_path.resolve())
print("\nVerificación:")
print(" - Shapes iguales:", df_check.shape == df_clean.shape)
print(" - Columnas iguales:", list(df_check.columns) == list(df_clean.columns))
print(" - Tipos iguales:", (df_check.dtypes == df_clean.dtypes).all())



Guardado en:
 - PKL: C:\Users\Cristina\ProjecteData\Equip_17\week_03_2025-10-06\data\staySpain_cleaned.pkl
 - CSV: C:\Users\Cristina\ProjecteData\Equip_17\week_03_2025-10-06\data\staySpain_cleaned.csv

Verificación:
 - Shapes iguales: True
 - Columnas iguales: True
 - Tipos iguales: True
