## DATA TRANSFORMATION

In [1]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path

Importamos el dataset limpio (archivo pickle generado en la fase de limpieza)

In [2]:
# Load the cleaned listings dataset.
# pd.read_pickle can execute arbitrary code, so only use it on trusted files.
input_path = Path("data", "staySpain_cleaned.pkl")
df = pd.read_pickle(input_path)

Creamos una copia del DF para hacer los cambios

In [3]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
df_transf = df.copy(deep=True)

Creación columna precio por persona

In [4]:
# Price per guest: the listing price divided by the number of guests it accommodates.
# NOTE(review): a listing with accommodates == 0 would produce inf here - confirm
# the cleaning step rules that out.
posicion = 13
df_transf.insert(
    loc=posicion,
    column='pricexperson',
    value=df_transf['price'].div(df_transf['accommodates']),
)

In [5]:
# Inspect the schema (column order, non-null counts, dtypes) after adding pricexperson.
df_transf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7693 entries, 0 to 7999
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   apartment_id                 7693 non-null   int64         
 1   name                         7693 non-null   object        
 2   description                  7693 non-null   object        
 3   host_id                      7693 non-null   int64         
 4   neighbourhood_name           7693 non-null   object        
 5   neighbourhood_district       4669 non-null   object        
 6   room_type                    7693 non-null   object        
 7   accommodates                 7693 non-null   int64         
 8   bathrooms                    7650 non-null   Int64         
 9   bedrooms                     7655 non-null   Int64         
 10  beds                         7685 non-null   Int64         
 11  amenities_list               7677 non-null   obj

Transformación de las columnas availability a días de ocupación (días no disponibles) + porcentaje

In [6]:
# Days NOT available within the 30-day window, used as the occupancy count.
posicion = 18
df_transf.insert(
    loc=posicion,
    column='ocupation30',
    value=30 - df_transf['availability_30'],
)

In [7]:
# 30-day occupancy expressed as a percentage of the window length.
posicion = 19
df_transf.insert(
    loc=posicion,
    column='ocup%30',
    value=(df_transf['ocupation30'] / 30) * 100,
)

Vamos a trabajar con la columna de disponibilidad mensual, ya que el KPI se evalúa mensualmente. Por ello, aunque se crean las columnas de ocupación a 60, 90 y 365 días, no calculamos su porcentaje de ocupación ni las vamos a utilizar.

In [8]:
# Days NOT available within the 60-day window, used as the occupancy count.
posicion = 21
df_transf.insert(
    loc=posicion,
    column='ocupation60',
    value=60 - df_transf['availability_60'],
)

In [9]:
# Disabled on purpose: the KPI is monthly, so only the 30-day occupancy percentage is computed.
#df['ocup60%'] = (df['ocupation60'] / 60) * 100

In [10]:
# Days NOT available within the 90-day window, used as the occupancy count.
posicion = 23  # change as needed
df_transf.insert(
    loc=posicion,
    column='ocupation90',
    value=90 - df_transf['availability_90'],
)

In [11]:
# Disabled on purpose: the KPI is monthly, so only the 30-day occupancy percentage is computed.
#df['ocup90%'] = (df['ocupation90'] / 90) * 100

In [12]:
# Days NOT available within the 365-day window, used as the occupancy count.
posicion = 25  # change as needed
df_transf.insert(
    loc=posicion,
    column='ocupation365',
    value=365 - df_transf['availability_365'],
)

In [13]:
# Disabled on purpose: the KPI is monthly, so only the 30-day occupancy percentage is computed.
#df['ocup365%'] = (df['ocupation365'] / 365) * 100

In [14]:
# Inspect the schema after adding the occupancy columns.
df_transf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7693 entries, 0 to 7999
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   apartment_id                 7693 non-null   int64         
 1   name                         7693 non-null   object        
 2   description                  7693 non-null   object        
 3   host_id                      7693 non-null   int64         
 4   neighbourhood_name           7693 non-null   object        
 5   neighbourhood_district       4669 non-null   object        
 6   room_type                    7693 non-null   object        
 7   accommodates                 7693 non-null   int64         
 8   bathrooms                    7650 non-null   Int64         
 9   bedrooms                     7655 non-null   Int64         
 10  beds                         7685 non-null   Int64         
 11  amenities_list               7677 non-null   obj

Creación de columna índice de satisfacción general

In [15]:
# Review aspects that feed the satisfaction index
# (review_scores_value is not one of them).
review_aspects = ('accuracy', 'cleanliness', 'checkin', 'communication', 'location')

# Dataset-wide average of each review sub-score (one scalar per aspect).
general_satisf = {
    aspect: df_transf[f'review_scores_{aspect}'].mean()
    for aspect in review_aspects
}

# Aspects ranked from highest to lowest average score.
sorted_avg_scores = sorted(
    general_satisf.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Per-listing satisfaction index: row-wise mean of the five sub-scores.
review_score_columns = [f'review_scores_{aspect}' for aspect in review_aspects]
df_transf['general_satisf'] = df_transf[review_score_columns].mean(axis=1)

# The assignment above appends the column at the end (or overwrites it on a
# re-run); pop it and re-insert it at its final position.
posicion = 36
col_general_satisf = df_transf.pop('general_satisf')
df_transf.insert(loc=posicion, column='general_satisf', value=col_general_satisf)

Creación de la columna amenities score

In [16]:
# Keyword groups used to score a listing's amenities.
# Each group gathers the keywords for one aspect of an accommodation that
# customers find most important, based on sector research.
amenities_ac = ['heating', 'heated floors', 'air conditioning']
amenities_outdoor = [
    'balcony', 'terrace', 'backyard', 'patio',
    'outdoor dining area', 'outdoor seating',
]
amenities_parking = ['parking']
amenities_view = ['beach view', 'beachfront', 'waterfront', 'lake', 'mountain view']
amenities_internet = ['ethernet', 'wifi', 'internet']
amenities_work = ['workspace', 'mbps', 'office']
amenities_kitchen = [
    'dishwasher', 'refrigerator', 'freezer', 'washer', 'kitchen', 'stove',
    'oven', 'toaster', 'full kitchen', 'microwave', 'kettle',
]
amenities_bathroom = ['essentials', 'tub', 'shower', 'bathtub', 'hair dryer']
amenities_sport = ['gym', 'pool']
amenities_disabled = [
    'wheelchair', 'hoist', 'single level', 'wide hallways', 'wide doorway',
    'grab bars', 'shower chair', 'wide clearance', 'elevator', 'accessible',
    'bath chair',
]


# Some groups are made of many small items while others consist of a few big
# ones, i.e. it takes lots of appliances to make a kitchen, but not so many to
# make a parking. The number paired with each group is its score per matched
# keyword, chosen according to the amount and importance of the group's items.
amenities_scores = dict(
    amenities_ac=(amenities_ac, 5.0),
    amenities_outdoor=(amenities_outdoor, 5.0),
    amenities_parking=(amenities_parking, 10.0),
    amenities_view=(amenities_view, 10.0),
    amenities_internet=(amenities_internet, 10.0),
    amenities_work=(amenities_work, 10.0),
    amenities_kitchen=(amenities_kitchen, 2.0),
    amenities_bathroom=(amenities_bathroom, 3.0),
    amenities_sport=(amenities_sport, 10.0),
    amenities_disabled=(amenities_disabled, 2.0),
)


def count_amenities(amenities_string, amenities, score):
    """Score one listing's amenities text against a group of keywords.

    The text is lower-cased and every keyword found in it adds ``score``
    points. Matching is by substring, so a keyword also matches inside a
    longer word (e.g. 'tub' is found in 'bathtub').

    Args:
        amenities_string: Raw amenities text of a listing; any non-string
            value (such as NaN) is treated as missing.
        amenities: Iterable of lower-case keywords belonging to the group.
        score: Points awarded for each keyword that is found.

    Returns:
        The accumulated points capped at 10, or ``np.nan`` when
        ``amenities_string`` is not a string.
    """
    # Missing / non-text cells cannot be scored.
    if not isinstance(amenities_string, str):
        return np.nan

    text = amenities_string.lower()
    hits = [score for keyword in amenities if keyword in text]
    return min(sum(hits), 10)  # cap the group score at 10


# Score every listing on each amenity group. The per-group scores are kept in
# a temporary frame, so df_transf only ever receives the final total and no
# intermediate columns have to be dropped afterwards.
amenities_group_scores = pd.DataFrame({
    f'{key}_score': df_transf['amenities_list'].apply(count_amenities, args=(amenities, score))
    for key, (amenities, score) in amenities_scores.items()
})

# Names of the per-group score columns, derived from amenities_scores so the
# list cannot drift out of sync with the group definitions.
amenities_score_columns = list(amenities_group_scores.columns)

# Total amenities score per listing.
# min_count=1 keeps the total as NaN when every group score is NaN (i.e. the
# listing has no amenities_list text). A plain sum() skips NaN and would turn
# those rows into 0, indistinguishable from "no matching amenities".
df_transf['amenities_score'] = amenities_group_scores.sum(axis=1, min_count=1)

In [17]:
# Inspect the schema after adding general_satisf and amenities_score.
df_transf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7693 entries, 0 to 7999
Data columns (total 43 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   apartment_id                 7693 non-null   int64         
 1   name                         7693 non-null   object        
 2   description                  7693 non-null   object        
 3   host_id                      7693 non-null   int64         
 4   neighbourhood_name           7693 non-null   object        
 5   neighbourhood_district       4669 non-null   object        
 6   room_type                    7693 non-null   object        
 7   accommodates                 7693 non-null   int64         
 8   bathrooms                    7650 non-null   Int64         
 9   bedrooms                     7655 non-null   Int64         
 10  beds                         7685 non-null   Int64         
 11  amenities_list               7677 non-null   obj

In [18]:
# Preview the first 50 rows of the transformed frame.
df_transf.head(50)

Unnamed: 0,apartment_id,name,description,host_id,neighbourhood_name,neighbourhood_district,room_type,accommodates,bathrooms,bedrooms,...,review_scores_communication,review_scores_location,review_scores_value,general_satisf,reviews_per_month,country,city,insert_date,is_instant_bookable,amenities_score
0,11964,A ROOM WITH A VIEW,Private bedroom in our attic apartment. Right ...,45553,Centro,,Private Room,2,2.0,1,...,10.0,10.0,10.0,10.0,75.0,spain,malaga,2018-07-31,False,40.0
1,21853,Bright and airy room,We have a quiet and sunny room with a good vie...,83531,Crmenes,Latina,Private Room,1,1.0,1,...,10.0,8.0,9.0,9.2,52.0,spain,madrid,2020-01-10,False,58.0
2,32347,Explore Cultural Sights from a Family-Friendly...,Open French doors and step onto a plant-filled...,139939,San Vicente,Casco Antiguo,Entire Home Apt,4,1.0,2,...,10.0,10.0,10.0,10.0,142.0,spain,sevilla,2019-07-29,True,48.0
3,35379,Double 02 CasanovaRooms Barcelona,Room at a my apartment. Kitchen and 2 bathroom...,152232,L'Antiga Esquerra De L'Eixample,Eixample,Private Room,2,2.0,1,...,10.0,10.0,9.0,9.8,306.0,spain,barcelona,2020-01-10,True,56.0
4,35801,Can Torras Farmhouse Studio Suite,Lay in bed & watch sunlight change the mood of...,153805,Quart,,Private Room,5,1.0,2,...,10.0,10.0,10.0,10.0,39.0,spain,girona,2019-02-19,False,66.0
5,48764,18th C Stone House near Costa Brava,Casa Fluvia is a charming stone village house ...,220145,Torroella De Fluvi,,Entire Home Apt,8,2.0,4,...,10.0,9.0,10.0,9.8,27.0,spain,girona,2019-02-19,False,45.0
6,58512,Stylish & cozy 3BR near Sagrada Familia,Welcome to my home!<br /><br />My lovely 3 bed...,280070,El Camp De L'Arpa Del Clot,Sant Mart�,Entire Home Apt,6,2.0,3,...,9.0,9.0,9.0,9.0,329.0,spain,barcelona,2020-10-12,True,60.0
7,71603,PENTHOUSE1 BEST PRICE 15/21.07 PROMO LAST MINUTE!,The apartment you are about to book has everyt...,366654,La Dreta De L'Eixample,Eixample,Entire Home Apt,3,2.0,1,...,9.0,10.0,9.0,9.8,42.0,spain,barcelona,2017-07-06,False,39.0
8,72150,Sunny attic duplex flat with terrace next to Sol,"The apartment is a quiet, secluded idyll in th...",364585,Embajadores,Centro,Entire Home Apt,5,2.0,3,...,10.0,10.0,9.0,9.6,91.0,spain,madrid,2020-11-06,False,69.0
9,73683,Sagrada Familia area for 12 people,"An ideal location for a big group, two apartme...",135703,El Camp D'En Grassot I Grcia Nova,Gr�cia,Entire Home Apt,12,2.0,4,...,10.0,9.0,9.0,9.4,14.0,spain,barcelona,2018-06-09,True,52.0


In [19]:
# Persist the transformed dataset in two formats: pickle (preserves pandas
# dtypes) and CSV (plain-text export; the index is written as the first column).
output_path = r"data/staySpain_transformed.pkl"
output_path2 = r"data/staySpain_transformed.csv"

df_transf.to_pickle(output_path)
df_transf.to_csv(output_path2)

# The message used to say "Datos limpios" (clean data), presumably carried
# over from the cleaning step; what is saved here is the transformed data.
for saved_path in (output_path, output_path2):
    print(f"Datos transformados guardados en: {saved_path}")

Datos limpios guardados en: data/staySpain_transformed.pkl
Datos limpios guardados en: data/staySpain_transformed.csv
