In [21]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import iqr
from sklearn.metrics import mean_squared_error, median_absolute_error
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
import xgboost as xgb

In [2]:
lista_c = pd.read_csv('C:/Google Drive/Provas/cognitivo.ai/Dados/listings.csv', encoding = 'UTF-8')
#lista_c.head(3)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
lista_c['price'] = pd.to_numeric(lista_c.loc[:, 'price'].str[1:].str.replace(',', ''))

In [4]:
target = lista_c.price.values
features = lista_c[["room_type", "accommodates", "bathrooms", "bedrooms", "beds", 
         "minimum_nights", "number_of_reviews", "review_scores_rating"]]

In [5]:
preprocess = make_column_transformer(
    (OneHotEncoder(), ['room_type']),
    (SimpleImputer(missing_values=np.nan, strategy='mean'), ["accommodates", "bathrooms", "bedrooms", "beds", 
         "minimum_nights", "number_of_reviews", "review_scores_rating"])
)

In [6]:
features = preprocess.fit_transform(features)
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=0)

In [10]:
sgd_param_dist = {"loss": ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
              "penalty": ['none', 'l2', 'l1', 'elasticnet'],
              "alpha":  np.arange(10**-3, 10**3, 10**1),
              "l1_ratio":  np.arange(0, 1, 10**-1)}
sgd = SGDRegressor()

sgd_random_search = RandomizedSearchCV(sgd, param_distributions=sgd_param_dist,
                                    n_iter=1, cv=5, iid=False, scoring = ['neg_mean_squared_error'], 
                                       refit = 'neg_mean_squared_error')
sgd_random_search.fit(X_train, y_train)
sgd_random_search.best_params_

sgd_preds = sgd_random_search.predict(X_test)
sgd_error = y_test - sgd_preds
print("SGD RMSE: %f" % np.sqrt(mean_squared_error(y_test, sgd_preds)))
print("SGD MAE: %f" % median_absolute_error(y_test, sgd_preds))
print("SGD IQR: %f" % iqr(sgd_error))



SGD RMSE: 1765.866179
SGD MAE: 137.335713
SGD IQR: 449.000000


In [None]:
svr_param_dist = {"C":  np.arange(10**-3, 10**3, 10),
                 "kernel": ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']}
svr = SVR(gamma = 'auto')

svr_random_search = RandomizedSearchCV(svr, param_distributions=svr_param_dist,
                                   n_iter=20, cv=5, iid=False, scoring = ['neg_mean_squared_error'], 
                                       refit = 'neg_mean_squared_error')
svr_random_search.fit(X_train, y_train)
svr_random_search.best_params_

svr_preds = svr_random_search.predict(X_test)
svr_error = y_test - svr_preds
print("SVR RMSE: %f" % np.sqrt(mean_squared_error(y_test, svr_preds)))
print("svr MAE: %f" % median_absolute_error(y_test, svr_preds))
print("svr IQR: %f" % iqr(svr_preds))

In [12]:
xgb_param_dist = {"max_depth": range(1, 5),
              "learning_rate ":  np.arange(10**-3, 10**3, 10),
              "tree_method  ":['auto', 'exact', 'approx', 'hist']}
xgb_reg = xgb.XGBRegressor()

xgb_random_search = RandomizedSearchCV(xgb_reg, param_distributions=xgb_param_dist,
                                    n_iter=1, cv=5, iid=False, scoring = ['neg_mean_squared_error'], 
                                       refit = 'neg_mean_squared_error')
xgb_random_search.fit(X_train, y_train)
xgb_random_search.cv_results_

xgbreg_preds = xgb_random_search.predict(X_test)
xgbreg_error = y_test - xgbreg_preds
print("XGboost RMSE: %f" % np.sqrt(mean_squared_error(y_test, xgbreg_preds)))
print("SGD MAE: %f" % median_absolute_error(y_test, xgbreg_preds))
print("SGD IQR: %f" % iqr(xgbreg_preds))

XGboost RMSE: 1495.608868
SGD MAE: 179.546021
SGD IQR: 485.344788


In [13]:
mlp_param_dist = {"activation": ['identity', 'logistic', 'tanh', 'relu'],
              "solver": ['lbfgs', 'sgd', 'adam'],
              "alpha":  np.arange(0, 1, 10**-1)}
mlp = MLPRegressor()

mlp_random_search = RandomizedSearchCV(mlp, param_distributions=mlp_param_dist,
                                    n_iter=1, cv=5, iid=False, scoring = ['neg_mean_squared_error'], 
                                       refit = 'neg_mean_squared_error')
mlp_random_search.fit(X_train, y_train)
mlp_random_search.cv_results_

mlp_preds = mlp_random_search.predict(X_test)
mlp_error = y_test - mlp_preds
print("MLP RMSE: %f" % np.sqrt(mean_squared_error(y_test, mlp_preds)))
print("SGD MAE: %f" % median_absolute_error(y_test, mlp_preds))
print("SGD IQR: %f" % iqr(mlp_preds))



MLP RMSE: 1608.193977
SGD MAE: 190.886337
SGD IQR: 546.582632




In [14]:
kr_param_dist = {"alpha":  np.arange(10**-3, 10**3, 10)}
kr = KernelRidge()

kr_random_search = RandomizedSearchCV(kr, param_distributions=kr_param_dist,
                                    n_iter=1, cv=5, iid=False, scoring = ['neg_mean_squared_error'], 
                                       refit = 'neg_mean_squared_error')
kr_random_search.fit(X_train, y_train)
kr_random_search.cv_results_

krreg_preds = kr_random_search.predict(X_test)
krreg_error = y_test - krreg_preds
print("KernelRidge RMSE: %f" % np.sqrt(mean_squared_error(y_test, krreg_preds)))
print("SGD MAE: %f" % median_absolute_error(y_test, krreg_preds))
print("SGD IQR: %f" % iqr(krreg_preds))

KernelRidge RMSE: 1597.962554
SGD MAE: 252.488897
SGD IQR: 688.037645
