In [20]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import numpy as np
import keras
import tensorflow
from sklearn.ensemble import RandomForestRegressor
import joblib


In [21]:
# Load the raw listings; `dataframe` keeps the data as read from disk and
# `df` is the working copy that the following cells modify.
dataframe = pd.read_csv(filepath_or_buffer="data/raw/houses_rent_info_nh.csv")
df = dataframe.copy(deep=True)

#### Redondeamos los precios a la centena más cercana: el modelo funciona un poco mejor así, aunque quitar este paso no debería suponer un problema

In [119]:
# Round every price to the nearest hundred (ndigits=-2), element by element.
df["Price"] = df["Price"].apply(round, ndigits=-2)

#### Renombramos las columnas para que encajen con otra parte del proyecto

In [23]:
# Translate the raw column names to the Spanish names used by the rest of the
# project; columns not listed here keep their original names.
df.rename(
    {
        "Place": "Tipo",
        "Location": "Distrito",
        "NeighborHood": "Barrio",
        "Rooms": "Habitaciones",
        "Toilets": "Banos",
    },
    axis="columns",
    inplace=True,
)

#### Dividimos en X e y, y en train y test

In [24]:
# Feature matrix and target, followed by a seeded 70/30 train/test split.
X = df.loc[:, ["Tipo", "Distrito", "Barrio", "Habitaciones", "Banos", "Area", "Furnished"]]
y = df.loc[:, "Price"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=999,
)

### Concatenamos X_train e y_train para modificar solo el conjunto de train

In [25]:
# Put the training features and target side by side so that rows can be
# removed from both at once.
x_train = pd.concat(objs=[X_train, y_train], axis="columns")

In [26]:
# Remove the training rows whose Area equals 1400, then print the same
# selection again to confirm that none are left.
x_train.drop(index=x_train.index[x_train["Area"] == 1400], inplace=True)
print(x_train.loc[x_train["Area"] == 1400])

Empty DataFrame
Columns: [Tipo, Distrito, Barrio, Habitaciones, Banos, Area, Furnished, Price]
Index: []


In [27]:
#x_train.drop(x_train[x_train['Price'] >= 15000].index, inplace = True)
# Al quitar este, o quitar más registros bajando el precio funciona peor.

### Volvemos a dividir X e Y

In [28]:
# Split the filtered training frame back into features and target.
X_train = x_train.loc[:, ["Tipo", "Distrito", "Barrio", "Habitaciones", "Banos", "Area", "Furnished"]]
y_train = x_train.loc[:, "Price"]

#### Seleccionamos variables categóricas y numéricas

In [29]:
# Categorical columns, one-hot encoded by the ColumnTransformers below.
one_hot_columns = ["Tipo", "Distrito", "Barrio"]
# Numeric columns, standardised in the XGBoost pipeline. "Furnished" is in
# neither list and reaches the models through remainder='passthrough'.
numerical_columns = ["Area", "Habitaciones", "Banos"]

## XgBoost

In [124]:
# Preprocessing: standardise the numeric columns, one-hot encode the
# categorical ones (categories unseen during fit are ignored) and pass every
# other column through unchanged.
ct = ColumnTransformer(
    transformers=[
        ("standard", StandardScaler(), numerical_columns),
        ("onehot", OneHotEncoder(handle_unknown="ignore"), one_hot_columns),
    ],
    remainder="passthrough",
)

pipeline = Pipeline(steps=[("column_transformer", ct), ("boost", xgb.XGBRegressor())])

# Single-value lists pin a hyper-parameter; only min_child_weight is searched.
param_grid = {
    "boost__learning_rate": [0.1],  # other values noted by the author: [0.01, 0.1, 0.2]
    "boost__n_estimators": [500],  # other value noted by the author: 400
    "boost__max_depth": [6],
    "boost__min_child_weight": [1, 2, 3],
}

xgBoost_pipe = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1, refit=True)
xgBoost_pipe.fit(X_train, y_train)

# Predictions are rounded to the nearest hundred, like the target column.
y_predict = np.round(xgBoost_pipe.predict(X_test), decimals=-2)

print(r2_score(y_test, y_predict))
print(xgBoost_pipe.best_params_)
print(np.sqrt(mean_squared_error(y_test, y_predict)))
print(y_test)
print(y_predict)
# joblib.dump(xgBoost_pipe, 'xgboost_regressor.pkl')

0.8130559777122198
{'boost__learning_rate': 0.1, 'boost__max_depth': 6, 'boost__min_child_weight': 3, 'boost__n_estimators': 500}
661.9916536579923
2016     800
5772    2500
7250    3500
1475    1000
2521    2000
        ... 
318     1400
202     2300
1904    1000
1561    1000
7108    1900
Name: Price, Length: 2258, dtype: int64
[ 800. 2700. 4100. ... 1000. 1000. 1500.]


### Función para registrar los valores (get_features) y función para la predicción (prediction)

In [119]:
def _ask_yes_no(prompt):
    """Ask a yes/no question; return 1 for an affirmative answer, 0 otherwise."""
    # Normalise whitespace and case so "Si", "SI", " si " and "Sí" all count as yes.
    answer = input(prompt).strip().casefold()
    return 1 if answer in {"si", "sí", "s"} else 0


def _ask_number(prompt, cast):
    """Prompt until the answer converts with `cast`; a decimal comma is accepted."""
    while True:
        raw = input(prompt).strip().replace(",", ".")
        try:
            return cast(raw)
        except ValueError:
            print("Valor no válido, introduce un número.")


def get_features():
    """Interactively collect the description of one property.

    The user is asked, in order, for the location, the neighbourhood, the
    area, the number of toilets and ten yes/no amenities.

    Returns
    -------
    list of dict
        A one-element list holding a dict keyed by "Location",
        "NeighborHood", "Area" (float), "Toilets" (int) and one 0/1 flag per
        amenity, ready to be passed to ``pd.DataFrame``.

    Notes
    -----
    NOTE(review): the keys are the raw CSV column names and the record has no
    property type or number of rooms; confirm this matches the columns the
    model that consumes it was fitted on.
    """
    yes_no_questions = (
        ("Air Conditioning", "Tiene aire acondicionado: (Si / No)"),
        ("Built-in Wardrobes", "Tiene armarios empotrados? (Si / No)"),
        ("Elevator", "Tiene ascensor? (Si / No)"),
        ("Heating", "Tiene calefacción? (Si / No)"),
        ("Garage", "Tiene garaje? (Si / No)"),
        ("Terrace", "Tiene terraza? (Si / No)"),
        ("Furnished", "Está amueblado? (Si / No)"),
        ("Balcony", "Tiene balcón? (Si / No)"),
        ("Garden", "Tiene jardín? (Si / No)"),
        ("Pool", "Tiene piscina? (Si / No)"),
    )

    # Dict entries are evaluated top to bottom, so the prompts keep their order.
    features = {
        # Strip stray whitespace so the text can match the dataset's categories.
        "Location": input("Introduce localización:   ").strip(),
        "NeighborHood": input("Introduce la zona/barrio: ").strip(),
        # Numeric answers are parsed here instead of being returned as text.
        "Area": _ask_number("Introduce los metros cuadrados del inmueble: ", float),
        "Toilets": _ask_number("Introduce los baños del inmueble:  ", int),
    }
    for key, prompt in yes_no_questions:
        features[key] = _ask_yes_no(prompt)

    return [features]
    
def prediction(ls, model=None):
    """Print the price predicted for a property, rounded to the nearest hundred.

    Parameters
    ----------
    ls : list of dict
        Records such as the list returned by ``get_features``. Only the
        prediction for the first record is printed.
    model : object with a ``predict`` method, optional
        Fitted estimator to use. Defaults to the notebook's ``xgBoost_pipe``.

    Raises
    ------
    ValueError
        If ``ls`` is empty.

    Notes
    -----
    The training frame is renamed before fitting, so raw column names are
    translated here the same way. NOTE(review): the XGBoost pipeline in this
    notebook is also fitted on "Tipo" and "Habitaciones"; the records must
    provide them (as "Place"/"Rooms" or already renamed) — confirm the caller
    does.
    """
    if len(ls) == 0:
        raise ValueError("prediction() needs at least one record")
    if model is None:
        model = xgBoost_pipe

    # Same mapping as the rename applied to the training data.
    raw_to_trained = {
        "Place": "Tipo",
        "Location": "Distrito",
        "NeighborHood": "Barrio",
        "Rooms": "Habitaciones",
        "Toilets": "Banos",
    }
    data = pd.DataFrame(ls).rename(columns=raw_to_trained)

    predicted = model.predict(data)
    house_price_prediction = np.around(predicted, -2)
    print("Predicción del precio del inmueble: {} euros".format(int(house_price_prediction[0])))

In [194]:
# Number of listings per location. `df` is renamed in place earlier in the
# notebook ("Location" -> "Distrito"), so use whichever name the frame
# currently has instead of assuming the raw one.
location_column = "Distrito" if "Distrito" in df.columns else "Location"
for i in df[location_column].unique():
    print("Localizacion: {} --> {}".format(i, len(df[df[location_column] == i])))


Localizacion: Arganzuela --> 1026
Localizacion: Barajas --> 22
Localizacion: Carabanchel --> 1187
Localizacion: Centro --> 1050
Localizacion: Chamartín --> 479
Localizacion: Chamberí --> 632
Localizacion: Ciudad Lineal --> 137
Localizacion: Fuencarral --> 127
Localizacion: Hortaleza --> 220
Localizacion: Latina --> 62
Localizacion: Moncloa --> 212
Localizacion: Moratalaz --> 16
Localizacion: Puente de Vallecas --> 63
Localizacion: Retiro --> 315
Localizacion: Salamanca --> 1185
Localizacion: San Blas --> 76
Localizacion: Tetuán --> 493
Localizacion: Usera --> 57
Localizacion: Vicálvaro --> 38
Localizacion: Villa de Vallecas --> 40
Localizacion: Villaverde --> 87


#### Función de fiabilidad según el número de registros del distrito en el dataset

In [151]:
def fiability(location, data=None):
    """Print a reliability label based on how many listings share `location`.

    Parameters
    ----------
    location : str
        Location value to look up.
    data : pandas.DataFrame, optional
        Frame to count in. Defaults to the notebook's ``df``. The location
        column may be named "Distrito" (after the rename) or "Location" (raw).

    Prints "Fiabilidad Alta" for 1000 or more listings, "Fiabilidad Media"
    for 400 to 999 and "Fiabilidad Baja" for fewer than 400.
    """
    if data is None:
        data = df

    # `df` is renamed in place earlier in the notebook, so accept either name.
    column = "Distrito" if "Distrito" in data.columns else "Location"
    number = int((data[column] == location).sum())

    if number >= 1000:
        print("Fiabilidad Alta")
    elif number >= 400:
        # Includes exactly 400, which previously matched no branch.
        print("Fiabilidad Media")
    else:
        print("Fiabilidad Baja")


#### Ejecutar la siguiente celda para meter datos de inmueble y comprobar fiabilidad del modelo

In [196]:
# Collect a property interactively, print its predicted price and then the
# reliability label for its location.
house = get_features()
prediction(ls=house)
fiability(location=house[0]["Location"])


Predicción del precio del inmueble: 1500 euros
Fiabilidad Baja


#### Primera prueba fuera del Test

In [264]:
# Hand-written single property used as a first check outside the test split.
# NOTE(review): these keys are the raw CSV column names and there is no
# property type or number of rooms, while the XGBoost pipeline in this
# notebook is fitted on the renamed columns ("Tipo", "Distrito", "Barrio",
# "Habitaciones", "Banos", "Area", "Furnished") — confirm this cell still
# runs after Restart Kernel -> Run All.
dicc = {"Location": ["Puente de Vallecas"],
        "NeighborHood": ["Numancia"],
        "Area": [30],
        "Toilets": [1],
        "Air Conditioning": [1],
        "Built-in Wardrobes": [1],
        "Elevator": [0],
        "Heating": [1],
        "Garage": [0],
        "Terrace": [0],
        "Furnished": [1],
        "Balcony": [0],
        "Garden": [0],
        "Pool": [0]
        }
# `l` is built here but not used again within this cell.
l = []
l.append(dicc)
# Each value is a one-element list, so this yields a one-row frame.
d = pd.DataFrame.from_dict(dicc)
y_predict = xgBoost_pipe.predict(d)
# Round to the nearest hundred, as done for the test-set predictions.
y_predict = np.around(y_predict, -2)
print(int(y_predict[0]))


600


## RandomForestRegressor

In [190]:
# Random forest: one-hot encode the categorical columns (categories unseen
# during fit are ignored) and pass every other column through unchanged.
ct = ColumnTransformer(
    [("onehot", OneHotEncoder(handle_unknown="ignore"), one_hot_columns)],
    remainder="passthrough",
)

pipeline = Pipeline([
    ("column_transformer", ct),
    # A fixed seed makes the bootstrap samples and feature sub-sampling, and
    # therefore the reported scores, reproducible across runs. 999 matches the
    # seed used for the train/test split.
    ("forest", RandomForestRegressor(random_state=999)),
])

# Every list holds a single value, so the grid search only cross-validates
# this one configuration.
param_grid = {
    "forest__n_estimators": [600],
    "forest__max_depth": [30],
    "forest__min_samples_split": [2],
    "forest__min_samples_leaf": [1],
    "forest__max_features": ["sqrt"],
}

grid_pipeline = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, refit=True)
grid_pipeline.fit(X_train, y_train)

# Predictions are rounded to the nearest hundred, like the target column.
y_predict = np.around(grid_pipeline.predict(X_test), -2)

# Optional feature-importance report (left disabled, as in the original).
# The names come from the fitted ColumnTransformer so they follow the real
# output column order; building them with `X_train.columns.difference(...)`
# sorts the passthrough columns alphabetically and mislabels the importances.
#
# best_pipeline = grid_pipeline.best_estimator_
# importance_df = pd.DataFrame({
#     "Feature": best_pipeline.named_steps["column_transformer"].get_feature_names_out(),
#     "Importance": best_pipeline.named_steps["forest"].feature_importances_,
# }).sort_values(by="Importance", ascending=False)
# print(importance_df)

print(r2_score(y_test, y_predict))
print(grid_pipeline.best_params_)
print(np.sqrt(mean_squared_error(y_test, y_predict)))
print(y_test)
print(y_predict)

0.8151038925825113
{'forest__max_depth': 30, 'forest__max_features': 'sqrt', 'forest__min_samples_leaf': 1, 'forest__min_samples_split': 2, 'forest__n_estimators': 600}
658.3557100239726
2016     800
5772    2500
7250    3500
1475    1000
2521    2000
        ... 
318     1400
202     2300
1904    1000
1561    1000
7108    1900
Name: Price, Length: 2258, dtype: int64
[ 800. 2800. 3400. ... 1000. 1000. 1800.]
