# 3.1 - Proceso completo

In [28]:
# librerias

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

from sklearn.preprocessing import StandardScaler

from catboost import CatBoostRegressor as CTR

from sklearn.model_selection import train_test_split as tts 

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

from sklearn.metrics import mean_squared_error as mse 
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2

import pickle

In [2]:
# columnas iniciales escogidas

COLS_INICIAL=['amenities', 'accommodates', 'availability_30', 'availability_365', 'availability_60', 'availability_90', 
              'bathrooms', 'bedrooms', 'beds', 'calculated_host_listings_count', 
              'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 
              'calculated_host_listings_count_shared_rooms','cleaning_fee', 'extra_people', 'guests_included', 
              'latitude', 'longitude', 'maximum_nights', 'minimum_nights', 'number_of_reviews',
              'number_of_reviews_ltm', 'security_deposit', 'price']

In [3]:
# columnas a normalizar

COLS_NORMAL=['accommodates', 'bathrooms', 'bedrooms', 'beds',
              'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
              'minimum_nights', 'maximum_nights', 'availability_30',
              'availability_60', 'availability_90', 'availability_365',
              'number_of_reviews', 'number_of_reviews_ltm',
              'calculated_host_listings_count',
              'calculated_host_listings_count_entire_homes',
              'calculated_host_listings_count_private_rooms',
              'calculated_host_listings_count_shared_rooms']

In [4]:
# columnas finales escogidas

COLS_FINAL=['accommodates', 'air_conditioning', 'availability_30', 'availability_365', 'availability_60', 'availability_90',
            'bathrooms', 'bedrooms', 'beds', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes',
            'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 'cleaning_fee',
            'dishwasher', 'extra_people', 'guests_included','latitude', 'longitude', 'maximum_nights', 'minimum_nights', 
            'number_of_reviews', 'number_of_reviews_ltm', 'room_type_private_room', 'room_type_shared_room', 'security_deposit',
            'price']

In [5]:
AMENITIES=[e for e in COLS_FINAL if e not in COLS_INICIAL]

AMENITIES

['air_conditioning',
 'dishwasher',
 'room_type_private_room',
 'room_type_shared_room']

In [6]:
# pipeline

class Airbnb:
    
    def __init__(self, path=None):
        
        self.df_train=pd.DataFrame()
        self.df_test=pd.DataFrame()
        if not path:
            self.modelo=CTR(verbose=0)
        else:
            #self.modelo=open()
            pass
    
    
    @staticmethod
    def _check_nan(df: pd.DataFrame) -> bool:
        """
        Este métod comprueba los nulos de un dataframe.
        
        param df: pandas DataFrame a ser checkeado.

        return: True si el df tiene nulos False en Caso de estar limpio.
        """
        nan=df.isna().mean()

        nan_cols=nan[nan>0]
        
        return len(nan_cols)!=0
    
    
    @staticmethod
    def _price_clean(x: [str, float]) -> float:
    
        """
        Esta función limpia una string, quitando $ y la coma y devuelve un float

        param x: string a limpiar

        return: float
        """
        x=str(x)
        x=x.replace('$', '').replace(',', '')

        return float(x)
    
    
    @staticmethod
    def _amenities(df: pd.DataFrame) -> list:
        return [[1 if e in fila else 0 for e in AMENITIES] for fila in df.amenities]
                
        
        
    
    def extract(self, path: str):
        
        """
        Método para procesar el dataframe en crudo (listings.csv.gz), para transformalo 
        completamente y alimentar el modelo.
        
        param path: ruta al archivo listings.csv.gz
        
        return: dataframe transformado y limpio
        
        """
        
        # carga archivo
        df=pd.read_csv(path, compression='gzip', low_memory=False, usecols=COLS_INICIAL)
        
        # cambio tipo de dato
        for c in df.select_dtypes(include='int'):
            df[c]=pd.to_numeric(df[c], downcast='integer')
        for c in df.select_dtypes(include='float'):
            df[c]=pd.to_numeric(df[c], downcast='float')
        

        # rellena depósito y limpieza con cero
        df[['security_deposit', 'cleaning_fee']]=df[['security_deposit', 'cleaning_fee']].fillna('0')
        
        # borra nulos que puedan quedar
        if Airbnb()._check_nan(df):
            df.dropna(inplace=True)
        else:
            pass
            
        # las columnas bathrooms, bedrooms y beds son enteros
        df.bathrooms=df.bathrooms.apply(lambda x: int(x))
        df.bedrooms=df.bedrooms.apply(lambda x: int(x))
        df.beds=df.beds.apply(lambda x: int(x))
        
        # las columnas price, security_deposit, cleaning_fee, guests_included y extra_people se pasan a float
        df.price=df.price.apply(Airbnb()._price_clean)
        df.security_deposit=df.security_deposit.apply(Airbnb()._price_clean)
        df.cleaning_fee=df.cleaning_fee.apply(Airbnb()._price_clean)
        df.guests_included=df.guests_included.apply(Airbnb()._price_clean)
        df.extra_people=df.extra_people.apply(Airbnb()._price_clean)
        
        # transformación amenities
        df[AMENITIES]=Airbnb()._amenities(df)
        df.drop('amenities', axis=1, inplace=True)
        
        # normalización
        df[COLS_NORMAL]=StandardScaler().fit_transform(df[COLS_NORMAL])
        
        
        self.df_test=self.df_train.copy()
        self.df_train=df
       
    
    
    
    
    def fit(self, X_train, y_train) -> object:
        self.modelo.fit(X_train, y_train)
        self
        

In [19]:
airbnb=Airbnb()

In [24]:
airbnb.extract('../data/raw_data/listings.csv.gz')

In [26]:
X=airbnb.df_train.drop('price', axis=1)
y=airbnb.df_train.price
    
X_train, X_test, y_train, y_test = tts(X, y, train_size=0.8, test_size=0.2, random_state=42)

In [27]:
airbnb.fit(X_train, y_train)

airbnb.modelo.predict(X_test)

array([ 77.71919978,  50.21081002,  41.63488418, ..., 143.20382882,
        22.04349961, 258.12841511])

In [25]:
airbnb.df_test

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,air_conditioning,dishwasher,room_type_private_room,room_type_shared_room
0,40.456280,-3.677630,-0.644974,-0.347540,-0.411885,-1.292520,70.0,-0.459430,-0.556307,0.163768,0.396863,-0.157375,-0.007995,0.832554,1.035725,1.062036,-0.593135,0.531621,-0.040983,-0.326717,-0.301668,-0.131311,-0.103262,0,0,0,0
1,40.403412,-3.740840,-1.139239,-0.347540,-0.411885,-0.641884,17.0,-0.459430,-0.702940,-0.548341,-0.031862,-0.018461,-0.008420,-1.115878,-1.231121,-1.303559,-0.007080,-0.076978,-0.678381,-0.295739,-0.301668,0.095059,-0.103262,0,0,0,0
2,40.386951,-3.693040,1.332085,1.052925,1.913700,1.960659,50.0,0.861237,0.176860,-0.548341,0.090631,0.490888,-0.007517,-0.938748,-1.143935,-1.245861,0.366531,-0.579073,-0.678381,-0.171825,-0.112635,-0.357680,-0.103262,0,0,0,0
3,40.422020,-3.703950,-0.150709,-0.347540,-1.574677,-0.641884,80.0,0.421014,0.176860,0.163768,-0.521834,0.027843,-0.007517,1.275379,1.253691,1.206280,1.458060,-0.442138,-0.359682,-0.326717,-0.270162,-0.357680,-0.103262,0,0,0,0
4,40.419949,-3.697640,0.343555,-0.347540,0.750908,0.659387,115.0,0.421014,-0.702940,1.587987,-0.521834,-0.064766,-0.008459,1.009684,1.122911,0.917793,1.311546,0.440331,0.004545,-0.326717,-0.270162,-0.357680,-0.103262,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21490,40.422703,-3.704256,1.332085,3.853856,-0.411885,2.611294,17.0,-0.459430,-0.702940,-0.548341,0.519356,-0.111070,-0.007001,0.921119,1.079318,1.090885,0.732816,-0.579073,-0.678381,-0.264760,-0.301668,-0.357680,1.888874,0,0,0,0
21491,40.422703,-3.704256,-1.139239,3.853856,-0.411885,0.008752,15.0,-0.459430,-0.702940,-0.548341,-0.521834,-0.111070,-0.007001,1.541075,1.384470,1.292826,1.480037,-0.579073,-0.678381,-0.264760,-0.301668,-0.357680,1.888874,0,0,0,0
21492,40.414917,-3.707237,0.837820,1.052925,0.750908,0.659387,200.0,-0.459430,1.056661,0.875878,0.703096,-0.111070,-0.007001,1.541075,1.384470,1.292826,0.520371,-0.579073,-0.678381,-0.295739,-0.238657,-0.357680,-0.103262,0,0,0,0
21493,40.422241,-3.700251,-0.644974,-0.347540,-1.574677,-1.292520,40.0,-0.459430,-0.702940,-0.548341,-0.521834,-0.111070,-0.007001,-0.318792,0.294641,0.571608,-0.058359,-0.579073,-0.678381,-0.233782,-0.175646,-0.357680,-0.103262,0,0,0,0
