In [45]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')

In [12]:
df = pd.read_csv('data_wrangled.csv')

In [13]:
df.head()

Unnamed: 0,bathrooms,floor,price,price_m2,rooms,size,latitude,longitude,floor_was_missing,price_cut
0,1.0,4.0,226000.0,2897.0,2.0,78.0,40.430409,-3.557889,False,"(209000.0, 230000.0]"
1,1.0,0.0,98500.0,1790.0,2.0,55.0,40.423733,-3.561187,False,"(93000.0, 110000.0]"
2,1.0,2.420332,129000.0,1842.0,3.0,70.0,40.430736,-3.635022,True,"(124000.0, 133900.0]"
3,1.0,2.420332,220000.0,3098.0,3.0,71.0,40.384267,-3.663003,True,"(209000.0, 230000.0]"
4,1.0,0.0,148000.0,2144.0,2.0,69.0,40.343037,-3.708971,False,"(143417.5, 155000.0]"


# Data preprocessing

In [14]:
#Eliminamos 'price_m2' para evitar el leakage
df_sk = df.drop(['price_m2', 'price_cut'], axis = 1)

In [15]:
before = len(df_sk)
df_sk = df_sk.drop_duplicates()
after = len(df_sk)
print(before, after)

11509 11494


Seleccionamos las columnas que representan las features y la columna que representa el target (precio):

In [70]:
X = df_sk.drop('price', axis = 1)
y = df_sk['price']

In [71]:
X['floor_was_missing'] = X['floor_was_missing'].apply(lambda x: 1 if x else 0)

In [72]:
min_max_scaler = preprocessing.MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X)
train_X, val_X, train_y, val_y = train_test_split(X_scaled, y, test_size=0.2)

# Modeling and Evaluation

In [21]:
# Definición del error cuadrático medio
def rmse_cv(model, X, y):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring = "neg_mean_squared_error", cv=5))
    return rmse

#Escogemos 10 modelos:


In [63]:
from sklearn.linear_model import LinearRegression, Lasso, SGDRegressor, Ridge
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

models = [LinearRegression(),
             Ridge(),
             Lasso(alpha=0.01, max_iter=10000),
             RandomForestRegressor(max_depth= 25, min_samples_split= 5, n_estimators= 600),
             GradientBoostingRegressor(),
             SVR(),
             SGDRegressor(max_iter=1000, tol = 1e-3),
             ExtraTreesRegressor(),
             XGBRegressor()
         ]

names = ['LR','Ridge','Lasso','RF','GBR','SVR','SGDR','XTreeR', 'XGBR']                    

In [64]:
for model,name in zip(models,names):
    score = rmse_cv(model, X_scaled, y_log)
    print("{}: {:.6f}, {:4f}".format(name,score.mean(),score.std()))

LR: 0.596227, 0.140617
Ridge: 0.522746, 0.026838
Lasso: 0.559382, 0.027774
RF: 0.268781, 0.030383
GBR: 0.289523, 0.024935
SVR: 0.455645, 0.026515
SGDR: 0.580483, 0.026295
XTreeR: 0.288868, 0.027658
XGBR: 0.290249, 0.023919


In [57]:
#Función grid para la ejecución de grid search

def grid_get(model, X, y, param_grid):
    grid_search = GridSearchCV(model, param_grid, cv = 5)
    grid_search.fit(X, y)
    print('Best score and params: ', grid_search.best_score_, grid_search.best_params_ )
    print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])
    '''
    print(grid_search.best_params_, np.sqrt(-grid_search.best_score_))
    grid_search.cv_results_['mean_test_score'] = np.sqrt(-grid_search.cv_results_['mean_test_score'])
    print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])
    '''

In [56]:
#Grid search para modelo Lasso
grid_get(Lasso(), X_scaled, y_log, 
         {
             'alpha':[0.01,0.001,0.0001,0.0002,0.0003,0.0004,0.0005,0.0006,0.0007,0.0009], 
             'max_iter':[10000]
         })

Best score and params:  0.5160902333052514 {'alpha': 0.0009, 'max_iter': 10000}
                                 params  mean_test_score  std_test_score
0    {'alpha': 0.01, 'max_iter': 10000}         0.452732        0.047244
1   {'alpha': 0.001, 'max_iter': 10000}         0.515564        0.059775
2  {'alpha': 0.0001, 'max_iter': 10000}         0.371149        0.306194
3  {'alpha': 0.0002, 'max_iter': 10000}         0.411509        0.224689
4  {'alpha': 0.0003, 'max_iter': 10000}         0.445326        0.157536
5  {'alpha': 0.0004, 'max_iter': 10000}         0.472596        0.106184
6  {'alpha': 0.0005, 'max_iter': 10000}         0.493319        0.073310
7  {'alpha': 0.0006, 'max_iter': 10000}         0.507506        0.060244
8  {'alpha': 0.0007, 'max_iter': 10000}         0.515148        0.059634
9  {'alpha': 0.0009, 'max_iter': 10000}         0.516090        0.060245


In [58]:
#Grid search Ridge
grid_get(Ridge(), X_scaled,y_log,
                       {
                           'alpha':[10,20,25,30,35,40,45,50,55,57,60,65,70,75,80,100],
                           'max_iter':[10000]
                       })

Best score and params:  0.48951783504307617 {'alpha': 10, 'max_iter': 10000}
                               params  mean_test_score  std_test_score
0    {'alpha': 10, 'max_iter': 10000}         0.489518        0.047090
1    {'alpha': 20, 'max_iter': 10000}         0.450273        0.039640
2    {'alpha': 25, 'max_iter': 10000}         0.432654        0.037068
3    {'alpha': 30, 'max_iter': 10000}         0.416275        0.034972
4    {'alpha': 35, 'max_iter': 10000}         0.401008        0.033245
5    {'alpha': 40, 'max_iter': 10000}         0.386737        0.031814
6    {'alpha': 45, 'max_iter': 10000}         0.373365        0.030624
7    {'alpha': 50, 'max_iter': 10000}         0.360809        0.029633
8    {'alpha': 55, 'max_iter': 10000}         0.348995        0.028808
9    {'alpha': 57, 'max_iter': 10000}         0.344463        0.028517
10   {'alpha': 60, 'max_iter': 10000}         0.337861        0.028119
11   {'alpha': 65, 'max_iter': 10000}         0.327350        0.027544


In [41]:
grid_get(SVR(), X_scaled,y_log, {
    'C':[11,13,15], 
    'kernel':["rbf"], 
    'gamma':[0.0003,0.0005],
    'epsilon':[0.006,0.009]
})

{'C': 15, 'epsilon': 0.008, 'gamma': 0.0004, 'kernel': 'rbf'} 0.644659288782284
                                               params  mean_test_score  \
0   {'C': 11, 'epsilon': 0.008, 'gamma': 0.0003, '...         0.691794   
1   {'C': 11, 'epsilon': 0.008, 'gamma': 0.0004, '...         0.669699   
2   {'C': 11, 'epsilon': 0.009, 'gamma': 0.0003, '...         0.691751   
3   {'C': 11, 'epsilon': 0.009, 'gamma': 0.0004, '...         0.669706   
4   {'C': 13, 'epsilon': 0.008, 'gamma': 0.0003, '...         0.679264   
5   {'C': 13, 'epsilon': 0.008, 'gamma': 0.0004, '...         0.656462   
6   {'C': 13, 'epsilon': 0.009, 'gamma': 0.0003, '...         0.679231   
7   {'C': 13, 'epsilon': 0.009, 'gamma': 0.0004, '...         0.656453   
8   {'C': 15, 'epsilon': 0.008, 'gamma': 0.0003, '...         0.668000   
9   {'C': 15, 'epsilon': 0.008, 'gamma': 0.0004, '...         0.644659   
10  {'C': 15, 'epsilon': 0.009, 'gamma': 0.0003, '...         0.667919   
11  {'C': 15, 'epsilon': 0.009, 

In [60]:
grid_get(RandomForestRegressor(), X_scaled, y_log, {
    'n_estimators': [300, 600, 1000], 
    'max_depth':[2, 5, 10, 25], 
    'min_samples_split': [2, 5]
})

Best score and params:  0.8711922557316767 {'max_depth': 25, 'min_samples_split': 5, 'n_estimators': 600}
                                               params  mean_test_score  \
0   {'max_depth': 2, 'min_samples_split': 2, 'n_es...         0.546764   
1   {'max_depth': 2, 'min_samples_split': 2, 'n_es...         0.546983   
2   {'max_depth': 2, 'min_samples_split': 2, 'n_es...         0.548417   
3   {'max_depth': 2, 'min_samples_split': 5, 'n_es...         0.547535   
4   {'max_depth': 2, 'min_samples_split': 5, 'n_es...         0.547203   
5   {'max_depth': 2, 'min_samples_split': 5, 'n_es...         0.546717   
6   {'max_depth': 5, 'min_samples_split': 2, 'n_es...         0.771629   
7   {'max_depth': 5, 'min_samples_split': 2, 'n_es...         0.771873   
8   {'max_depth': 5, 'min_samples_split': 2, 'n_es...         0.771780   
9   {'max_depth': 5, 'min_samples_split': 5, 'n_es...         0.771985   
10  {'max_depth': 5, 'min_samples_split': 5, 'n_es...         0.771897   
11  {'

# MODELO FINAL: RANDOM FOREST

In [73]:
modelo_final = RandomForestRegressor(max_depth= 25, min_samples_split= 5, n_estimators= 600)
modelo_final_fit = model.fit(train_X, train_y)



In [92]:
#imput = np.array([[1, 1, 3, 72, 40.3845857, -3.7820604, 0], [2, 1, 3, 84, 40.4421587, -3.5646838, 0]])
#https://www.pisos.com/comprar/piso-las_aguilas28024-95018871320_100500/
#209.000
size = 72
rooms = 3
baths = 1
floor = 1

#https://www.pisos.com/comprar/piso-el_canaveral-9389221662_109700/
#240000
size = 84
rooms = 3
baths = 2
floor = 1
#40.4421587,-3.5646838

input1 = np.array([[1, 1, 3, 72, 40.3845857, -3.7820604, 0], [2, 1, 3, 84, 40.4421587, -3.5646838, 0]])
print(type(input1))


<class 'numpy.ndarray'>


In [93]:
min_max_scaler = preprocessing.MinMaxScaler()
input_scaled = min_max_scaler.fit(X).transform(input1)

In [94]:
prices_estimated = modelo_final_fit.predict(input_scaled)
print(prices_estimated)

[133172.7  219072.64]
