In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from airbnbCABATransformer import AirbnbCABATransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [9]:
#Cargamos el dataset limpio 
data = pd.read_csv('data/listings.csv')
data.sample(3)


Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
3740,20418069,https://www.airbnb.com/rooms/20418069,20220922013624,2022-09-22,city scrape,"PALERMO, BONITO Y BARATO",El departamento es cómodo y cálido. Está equip...,"Cercano a restaurantes y bares, todos los medi...",https://a0.muscache.com/pictures/c5c3386d-0261...,13505939,...,,,,,f,3,3,0,0,
15764,662170811185291656,https://www.airbnb.com/rooms/662170811185291656,20220922013624,2022-09-22,city scrape,Amplio y luminoso dpto 3 ambiente en Villa Cre...,Amplio y luminoso departamento de 3 ambientes ...,"Es un barrio lleno de opciones gastronómicas, ...",https://a0.muscache.com/pictures/431cc014-5e60...,165617905,...,,,,,f,1,1,0,0,
15685,663791921269219889,https://www.airbnb.com/rooms/663791921269219889,20220922013624,2022-09-22,city scrape,Habitación Doble con Desayuno y Est. gratis,"Hostel ViaVia Buenos Aires, is one of the 17 V...",,https://a0.muscache.com/pictures/miso/Hosting-...,62092352,...,,,,,t,4,0,4,0,


In [10]:
dataclean = AirbnbCABATransformer().transform(data)
dataclean.head()

213 outliers found out of 17947 data points, 1.1868278820972864% of the data. 45077.0 is the max
18 outliers found out of 17734 data points, 0.10149994361114245% of the data. 8.0 is the max
88 outliers found out of 17716 data points, 0.4967261232783924% of the data. 8.0 is the max
331 outliers found out of 17628 data points, 1.8776945768096212% of the data. 47.0 is the max


  return func(self, *args, **kwargs)


Unnamed: 0,host_is_superhost,host_has_profile_pic,host_identity_verified,accommodates,bathrooms,bedrooms,beds,price,minimum_nights,maximum_nights,...,property_type_Apartment,property_type_Hotel,property_type_House,property_type_Other,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,bathroomtype_private,bathroomtype_shared
0,0,1,1,2,1,1,1,9823,3,1125,...,1,0,0,0,1,0,0,0,1,0
1,1,1,1,2,1,1,1,5687,2,730,...,1,0,0,0,1,0,0,0,1,0
2,0,1,1,2,1,1,1,3728,7,1125,...,1,0,0,0,1,0,0,0,1,0
3,1,1,1,4,2,1,1,6514,14,1125,...,1,0,0,0,1,0,0,0,1,0
4,1,1,0,2,1,1,1,3584,5,730,...,1,0,0,0,1,0,0,0,1,0


In [11]:
#Genero mi variable X
X=dataclean.drop(['price'],axis=1)

#Genero la variable y
y = dataclean['price'].squeeze()

#Creamos las muestras
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 10,shuffle=True)

folds=StratifiedKFold(n_splits=5,shuffle=True, random_state=42)

In [12]:

pasos = [('preprocessor', MinMaxScaler()), 
         ('regresor', LinearRegression())]

pipe=Pipeline(pasos)



In [13]:
def make_param_grids(steps, param_grids):

    final_params=[]

    for estimator_names in itertools.product(*steps.values()):
        current_grid = {}

        for step_name, estimator_name in zip(steps.keys(), estimator_names):            
            for param, value in param_grids.get(estimator_name).items():
                if param == 'object':
                    # Set actual estimator in pipeline
                    current_grid[step_name]=[value]
                else:
                    # Set parameters corresponding to above estimator
                    current_grid[step_name+'__'+param]=value
        #Append this dictionary to final params            
        final_params.append(current_grid)

    return final_params

pipeline_steps = {'preprocessor':['ssc', 'mms','none'],
                  'regresor':['lr', 'rf','xgb']}

# fill parameters to be searched in this dict
all_param_grids = {'lr':{'object':LinearRegression()
                         }, 

                   'rf':{'object':RandomForestRegressor(),
                         'n_estimators':[50,100,500,1000],
                         'random_state': [42],
                         'n_jobs': [-1]
                        },

                   'ssc':{'object':StandardScaler()
                         },

                   'mms':{'object':MinMaxScaler()
                         },
                   'none':{'object':None
                         },
                   'xgb':{'object': xgb.XGBRegressor(),
                          'n_estimators': [500,1000,2000], 
                          'max_depth': [4,5,6], 
                          'learning_rate': [0.005,0.01]
                         }
                  }  


# Call the method on the above declared variables
param_grids_list = make_param_grids(pipeline_steps, all_param_grids)

In [14]:
grid=GridSearchCV(pipe, param_grid = param_grids_list, cv=folds)
    
grid.fit(X_train,y_train)

















GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('preprocessor', MinMaxScaler()),
                                       ('regresor', LinearRegression())]),
             param_grid=[{'preprocessor': [StandardScaler()],
                          'regresor': [LinearRegression()]},
                         {'preprocessor': [StandardScaler()],
                          'regresor': [RandomForestRegressor()],
                          'regresor__n_estimators': [50, 100,...
                                                    min_child_weight=None,
                                                    missing=nan,
                                                    monotone_constraints=None,
                                                    n_estimators=2000,
                                                    n_jobs=None,
                                                    num_parallel_tree=None,
                              

In [18]:
print(grid.best_estimator_)

Pipeline(steps=[('preprocessor', StandardScaler()),
                ('regresor',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, gamma=0, gpu_id=-1,
                              importance_type='gain',
                              interaction_constraints='', learning_rate=0.01,
                              max_delta_step=0, max_depth=6, min_child_weight=1,
                              missing=nan, monotone_constraints='()',
                              n_estimators=2000, n_jobs=16, num_parallel_tree=1,
                              random_state=0, reg_alpha=0, reg_lambda=1,
                              scale_pos_weight=1, subsample=1,
                              tree_method='exact', validate_parameters=1,
                              verbosity=None))])


In [19]:
grid.best_estimator_.score(X_test,y_test)



0.4428865327298259

In [21]:
y_train_grid_preds = grid.best_estimator_.predict(X_train)
y_test_grid_preds = grid.best_estimator_.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_grid_preds), mean_squared_error(y_test, y_test_grid_preds)))
print('Forest R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_grid_preds),r2_score(y_test, y_test_grid_preds)))
print('-----------------------------------------------------')


XGB MSE train: 9583513.600, test: 18402437.126
XGB Forest R^2 train: 0.690, test: 0.443
-----------------------------------------------------


