# Price recommendation and visualization

#### In this notebook we compute the recommended price of an apartment or room based on its characteristics, and we present the result on a heat map.

In [24]:
import pandas as pd
import numpy as np
import plotly.express as px
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import dash_bootstrap_components as dbc
import pickle
from datetime import datetime
import plotly.graph_objects as go
import nltk
from nltk.corpus import stopwords
import dash
import dash_core_components as dcc
import dash_html_components as html
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime, timedelta
import pymongo
from pymongo import MongoClient

### Data pre-processing

#### We have all the information in MongoDB. To calculate the recommended price we will work just with the apartments or rooms in the same neighbourhood:

In [25]:
# Connect to the local MongoDB instance and select the project database.
client = MongoClient('localhost', 27017)
db = client.proyecto

# Let's suppose the apartment is located in Retiro:
barrio = "Retiro"

# `find` returns a lazy cursor that is exhausted after a single pass.
# Materialise it into a list so the cells that consume `alojamientos`
# can be re-run without having to re-run this query first.
alojamientos = list(db.alojamientos.find({"neighbourhood_group_cleansed" : barrio}))

#### We need to load the BoW (bag of words) created previously (the list with all the neighbourhoods in Madrid is loaded later, in the per-neighbourhood section):

In [26]:
# Load the Bag-of-Words table that was pickled beforehand.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
bow_path = r"C:\Users\usuario\Desktop\Nebulova\Curso\Proyecto\Datos\Pickles\BoW.pk"
with open(bow_path, mode="rb") as bow_file:
    BoW = pickle.load(bow_file)

#### Let's create the dataset we are going to use:

In [27]:
# Turn the Mongo documents into a DataFrame and keep every column except
# `_id`, `location`, `description` and `traduccion`.
excluded_columns = ["_id", "location", "description", "traduccion"]
aloj = pd.DataFrame(list(alojamientos))
kept_columns = aloj.columns.difference(excluded_columns)
aloj = aloj[kept_columns]

#### We now convert the categorical variables into dummy variables:

In [28]:
# One-hot encode the categorical (object/category) columns of `aloj`.
data_dum = pd.get_dummies(aloj)

#### We merge this table with the BoW created in another notebook:

In [29]:
# Left join on the listing id: every listing is kept, and listings without a
# matching BoW row end up with NaN in the BoW columns.
dataset = data_dum.merge(BoW, how='left', left_on='id', right_on='id')

In [30]:
# Preview the first rows of the merged table.
dataset.head()

Unnamed: 0,24-hour check-in,Accessible-height bed,Baby bath,Babysitter recommendations,Bathtub,Breakfast,Building staff,Buzzer/wireless intercom,Cable TV,Carbon monoxide detector,...,ponzano,diurno,sujeto,establecimiento,cerradas,encima,tale,registros,taquillas,hostel
0,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,1,0,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,0,0,0,1,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Count the missing values per column of the merged table.
dataset.isna().sum()

24-hour check-in               0
Accessible-height bed          0
Baby bath                      0
Babysitter recommendations     0
Bathtub                        0
                              ..
encima                        67
tale                          67
registros                     67
taquillas                     67
hostel                        67
Length: 1306, dtype: int64

#### There are 67 rows with NaNs because the listings with an empty description were removed when the BoW was created, so they have no match in the merge. To solve this issue, we remove those rows:

In [32]:
# Drop every row that contains at least one NaN (in any column, not only the BoW ones).
dataset = dataset.dropna()

In [33]:
# Sanity check: after dropping the incomplete rows no column should report missing values.
dataset.isna().sum()

24-hour check-in              0
Accessible-height bed         0
Baby bath                     0
Babysitter recommendations    0
Bathtub                       0
                             ..
encima                        0
tale                          0
registros                     0
taquillas                     0
hostel                        0
Length: 1306, dtype: int64

### Model training

#### First step is to split the features and the target:

In [34]:
# Features: every column except the target (`price`) and the listing `id`.
feature_columns = dataset.columns.difference(["price", "id"])
X = dataset[feature_columns].apply(pd.to_numeric, errors='ignore')

# Target: the listing price.
y = dataset["price"]

#### Now we need to split the dataset into a training dataset and a test dataset:

In [35]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the listings for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

#### We use the XGBoost algorithm to predict the price:

In [36]:
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# Fit a gradient-boosted regressor (50 trees, maximum depth 7) on the training split.
xgb = XGBRegressor(max_depth = 7, n_estimators = 50)
xgb.fit(X_train, y_train)

# Predict the held-out listings and score the fit with R^2.
y_pred = xgb.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [37]:
# Coefficient of determination (R^2) of the model on the test split.
print(r2)

0.8419227012637911


#### Since this is a regression problem, we use a simple hit rate to compare the models easily: a prediction counts as correct when it is less than 10 dollars away from the real price:

In [38]:
# Re-index the test target from 0 so that it lines up positionally with `y_pred`.
y_test.reset_index(drop = True, inplace = True)

# A prediction is a hit when its absolute error is strictly below 10.
errores = np.abs(y_pred - y_test.to_numpy())
aciertos = int((errores < 10).sum())
totales = len(y_pred)

# Share of test listings predicted within the tolerance.
acc = aciertos / totales
print(acc)

0.6371681415929203


#### With this model and for the neighbourhood of "Retiro", we obtain an accuracy of 63.72%.
#### Logically, it improves when the range is wider (around 80% when it is 20 dollars instead of 10).

### Result visualization

In [39]:
path = r"C:\Users\usuario\Desktop\Nebulova\Curso\Proyecto"
from plotly.offline import plot
# NOTE(review): `path` and `plot` are not used in this cell.

# Position (not index label) of the test listing to highlight on the map.
# The coordinates, the prediction and the real price are all read at this same
# position, so the marker and its hover text always describe the same listing
# regardless of which index labels ended up in the test split.
sample_pos = 30

# Heat map of the prices of every listing in the neighbourhood.
fig3 = px.density_mapbox(dataset, lat=dataset["latitude"], lon=dataset["longitude"], z='price', radius=10,
                    center=dict(lat=dataset["latitude"].mean(), lon=dataset["longitude"].mean()), 
                    zoom=12,
                    mapbox_style="carto-positron")
fig3.update_layout(transition_duration=500, height=650)

# Black marker on the highlighted listing, showing recommended vs. real price.
# `add_traces` returns the figure, so as the last expression it renders the map.
fig3.add_traces(go.Scattermapbox(
    lat=[X_test["latitude"].iloc[sample_pos]],
    lon=[X_test["longitude"].iloc[sample_pos]],
    mode='markers',
    marker=go.scattermapbox.Marker(
        size=20,
        color = "black"
    ),
    text="Rec.price: " + str(y_pred[sample_pos]) + " vs " + "Real price: " + str(y_test.iloc[sample_pos]),
))

#### Results for each neighbourhood

In [40]:
# Load the pickled collection with the name of each neighbourhood.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
barrios_path = r"C:\Users\usuario\Desktop\Nebulova\Curso\Proyecto\Datos\Pickles\barrios.pk"
with open(barrios_path, mode="rb") as barrios_file:
    barrios = pickle.load(barrios_file)

#### With a loop, we get the result for each neighbourhood:

In [41]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor


def accuracy_within(y_true, y_pred, tolerance=10):
    """Return the share of predictions whose absolute error is below `tolerance`.

    Parameters
    ----------
    y_true : array-like of real prices.
    y_pred : array-like of predicted prices, aligned positionally with `y_true`.
    tolerance : maximum absolute error (exclusive) for a prediction to count as a hit.

    Returns
    -------
    float
        Number of hits divided by the number of predictions.
    """
    errors = np.abs(np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float))
    return int((errors < tolerance).sum()) / len(errors)


def neighbourhood_accuracy(barrio, tolerance=10):
    """Train and evaluate the price model for a single neighbourhood.

    Reads the listings of `barrio` from the notebook-level `db` handle, joins
    them with the notebook-level `BoW` table, fits an XGBoost regressor on 80%
    of the rows and scores it on the remaining 20%.

    Everything is kept local to the function so that the notebook-level
    variables of the single-neighbourhood analysis (`dataset`, `X_test`,
    `y_test`, `y_pred`, ...) are not overwritten.

    Returns
    -------
    float or None
        Share of test listings predicted within `tolerance` of the real price,
        or None when the neighbourhood does not have enough usable listings
        to build a train/test split.
    """
    # DataFrame with just the apartments/rooms of the neighbourhood
    documents = list(db.alojamientos.find({"neighbourhood_group_cleansed" : barrio}))
    if not documents:
        return None
    listings = pd.DataFrame(documents)
    listings = listings[listings.columns.difference(["_id", "location", "description", "traduccion"])]

    # One-hot encode, attach the BoW columns and drop the incomplete rows.
    merged = pd.merge(pd.get_dummies(listings), BoW, left_on='id', right_on='id', how='left')
    merged = merged.dropna()
    if len(merged) < 2:
        # train_test_split needs at least one row on each side of the split.
        return None

    # Features / target and train / test split.
    features = merged[merged.columns.difference(["price", "id"])]
    features = features.apply(pd.to_numeric, errors='ignore')
    target = merged["price"]
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=42)

    # XGBoost model.
    model = XGBRegressor(n_estimators = 50, max_depth = 7)
    model.fit(features_train, target_train)

    return accuracy_within(target_test, model.predict(features_test), tolerance)


# Accuracy (rounded to 4 decimals) of the model for each neighbourhood.
resultados_finales = {}
for nombre_barrio in barrios:
    acc_barrio = neighbourhood_accuracy(nombre_barrio)
    if acc_barrio is not None:
        resultados_finales[nombre_barrio] = round(acc_barrio, 4)

In [42]:
# Accuracy (share of test predictions within 10 of the real price) per neighbourhood.
print(resultados_finales)

{'Chamartín': 0.7045, 'Centro': 0.7047, 'Arganzuela': 0.745, 'Fuencarral - El Pardo': 0.7708, 'Tetuán': 0.7192, 'Chamberí': 0.7333, 'Hortaleza': 0.7544, 'Moncloa - Aravaca': 0.6957, 'Carabanchel': 0.822, 'Latina': 0.8257, 'Salamanca': 0.5656, 'Retiro': 0.6372, 'Ciudad Lineal': 0.8426, 'San Blas - Canillejas': 0.7541, 'Barajas': 0.7931, 'Usera': 0.9375, 'Villa de Vallecas': 0.8125, 'Villaverde': 0.8214, 'Moratalaz': 0.8095, 'Puente de Vallecas': 0.8718, 'Vicálvaro': 0.6667}


#### We can conclude that in general, we are obtaining a good prediction using a +/- 10 dollars range.