# 3.1 - Proceso completo

In [1]:
# librerias

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

from sklearn.preprocessing import StandardScaler

from catboost import CatBoostRegressor as CTR

from sklearn.model_selection import train_test_split as tts 

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

from sklearn.metrics import mean_squared_error as mse 
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2

import pickle

In [2]:
# columnas iniciales escogidas

COLS_INICIAL=['amenities', 'accommodates', 'availability_30', 'availability_365', 'availability_60', 'availability_90', 
              'bathrooms', 'bedrooms', 'beds', 'calculated_host_listings_count', 
              'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 
              'calculated_host_listings_count_shared_rooms','cleaning_fee', 'extra_people', 'guests_included', 
              'latitude', 'longitude', 'maximum_nights', 'minimum_nights', 'number_of_reviews',
              'number_of_reviews_ltm', 'security_deposit', 'price']

In [3]:
# columnas a normalizar

COLS_NORMAL=['accommodates', 'bathrooms', 'bedrooms', 'beds',
              'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
              'minimum_nights', 'maximum_nights', 'availability_30',
              'availability_60', 'availability_90', 'availability_365',
              'number_of_reviews', 'number_of_reviews_ltm',
              'calculated_host_listings_count',
              'calculated_host_listings_count_entire_homes',
              'calculated_host_listings_count_private_rooms',
              'calculated_host_listings_count_shared_rooms']

In [4]:
# columnas finales escogidas

COLS_FINAL=['accommodates', 'air_conditioning', 'availability_30', 'availability_365', 'availability_60', 'availability_90',
            'bathrooms', 'bedrooms', 'beds', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes',
            'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 'cleaning_fee',
            'dishwasher', 'extra_people', 'guests_included','latitude', 'longitude', 'maximum_nights', 'minimum_nights', 
            'number_of_reviews', 'number_of_reviews_ltm', 'room_type_private_room', 'room_type_shared_room', 'security_deposit',
            'price']

In [5]:
AMENITIES=[e for e in COLS_FINAL if e not in COLS_INICIAL]

AMENITIES

['air_conditioning',
 'dishwasher',
 'room_type_private_room',
 'room_type_shared_room']

In [6]:
# pipeline

class Airbnb:
    
    def __init__(self, path=None):
        
        self.df=pd.DataFrame()
        
        self.X_train=pd.DataFrame()
        self.X_test=pd.DataFrame()
        self.y_train=pd.Series(dtype=float)
        self.y_test=pd.Series(dtype=float)
        
        if not path:
            self.modelo=CTR(verbose=0)
        else:
            self.modelo=pickle.load(open(path, 'rb'))
            
    
    
    @staticmethod
    def _check_nan(df: pd.DataFrame) -> bool:
        """
        Este métod comprueba los nulos de un dataframe.
        
        param df: pandas DataFrame a ser checkeado.

        return: True si el df tiene nulos False en Caso de estar limpio.
        """
        nan=df.isna().mean()

        nan_cols=nan[nan>0]
        
        return len(nan_cols)!=0
    
    
    @staticmethod
    def _price_clean(x: [str, float]) -> float:
    
        """
        Esta función limpia una string, quitando $ y la coma y devuelve un float

        param x: string a limpiar

        return: float
        """
        x=str(x)
        x=x.replace('$', '').replace(',', '')

        return float(x)
    
    
    @staticmethod
    def _amenities(df: pd.DataFrame) -> list:
        return [[1 if e in fila else 0 for e in AMENITIES] for fila in df.amenities]
    
                
        
    def extract(self, path: str, ret: bool=False):
        
        """
        Método para procesar el dataframe en crudo (listings.csv.gz), para transformalo 
        completamente y alimentar el modelo.
        
        param path: ruta al archivo listings.csv.gz
        
        return: dataframe transformado y limpio
        
        """
        
        # carga archivo
        df=pd.read_csv(path, compression='gzip', low_memory=False, usecols=COLS_INICIAL)
        
        # cambio tipo de dato
        for c in df.select_dtypes(include='int'):
            df[c]=pd.to_numeric(df[c], downcast='integer')
        for c in df.select_dtypes(include='float'):
            df[c]=pd.to_numeric(df[c], downcast='float')
        

        # rellena depósito y limpieza con cero
        df[['security_deposit', 'cleaning_fee']]=df[['security_deposit', 'cleaning_fee']].fillna('0')
        
        # borra nulos que puedan quedar
        if Airbnb()._check_nan(df):
            df.dropna(inplace=True)
        else:
            pass
            
        # las columnas bathrooms, bedrooms y beds son enteros
        df.bathrooms=df.bathrooms.apply(lambda x: int(x))
        df.bedrooms=df.bedrooms.apply(lambda x: int(x))
        df.beds=df.beds.apply(lambda x: int(x))
        
        # las columnas price, security_deposit, cleaning_fee, guests_included y extra_people se pasan a float
        df.price=df.price.apply(Airbnb()._price_clean)
        df.security_deposit=df.security_deposit.apply(Airbnb()._price_clean)
        df.cleaning_fee=df.cleaning_fee.apply(Airbnb()._price_clean)
        df.guests_included=df.guests_included.apply(Airbnb()._price_clean)
        df.extra_people=df.extra_people.apply(Airbnb()._price_clean)
        
        # transformación amenities
        df[AMENITIES]=Airbnb()._amenities(df)
        df.drop('amenities', axis=1, inplace=True)
        
        # normalización
        df[COLS_NORMAL]=StandardScaler().fit_transform(df[COLS_NORMAL])
        
        # eliminacion de outliers
        df=df[(df.price>=10) & (df.price<=196)]  
        
        # seleccion final
        df=df[COLS_FINAL]
        
        self.df=df
        
        if ret:
            return df
       
    
    
    def fit(self) -> None:
        
        X=self.df.drop('price', axis=1)
        y=self.df.price

        self.X_train, self.X_test, self.y_train, self.y_test = tts(X, y,train_size=0.8, 
                                                                        test_size=0.2, 
                                                                        random_state=42)
        
        self.modelo.fit(self.X_train, self.y_train)
        pickle.dump(self.modelo, open('models/catboost_airbnb.pk', 'wb'))
        
        
        
    def predict(self) -> np.array:
        return self.modelo.predict(self.X_test)
    
    
    def evaluate(self):
        
        self.fit()
        
        y_pred=self.predict()
        
        print(f'RMSE: {mse(self.y_test, y_pred, squared=False)}')
        print(f'MAE: {mae(self.y_test, y_pred)}')
        print(f'R2: {r2(self.y_test, y_pred)}')
        
        

In [7]:
airbnb=Airbnb()

In [8]:
airbnb.extract('../data/raw_data/listings.csv.gz')

In [9]:
airbnb.evaluate()

RMSE: 20.885500995247874
MAE: 14.436085977132974
R2: 0.7097575058293573
