# Petition to API Idealista

Due to selenium is not working properly, because Idealista have a anti-Scraping system that redenrize the page with javascript and in Heroku is not possible to open the browser to renderize the page, we will use our key to access the API of Idealista.

In [1]:
# Libraries needed
import pickle
import os
import base64
import urllib
import requests as rq
import json
import pandas as pd
import time
import warnings

from dotenv import load_dotenv, find_dotenv

# Warning to avoid the warning message
warnings.filterwarnings("ignore")

# get the API key from the .env file
def get_oauth_token():

    url = "https://api.idealista.com/oauth/token"

    load_dotenv(find_dotenv('../utils/creds.env')) # Load .env file
    apikey = os.environ.get("API_KEY")
    secret = os.environ.get("SECRET")
    apikey_secret = apikey + ':' + secret

    auth = str(base64.b64encode(bytes(apikey_secret, 'utf-8')))[2:][:-1] # Get base64 encoded string

    headers = {'Authorization' : 'Basic ' + auth,'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'}
    params = urllib.parse.urlencode({'grant_type':'client_credentials'}) #,'scope':'read'
    content = rq.post(url,headers = headers, params=params) # Get response
    bearer_token = json.loads(content.text)['access_token'] # Get access token

    return bearer_token

# Get the data from the API
def search_api(token, params):
    url = "https://api.idealista.com/3.5/es/search"

    headers = {'Content-Type': 'Content-Type: multipart/form-data;', 'Authorization' : 'Bearer ' + token} 
    content = rq.post(url, headers=headers, params=params) # Get response
    
    print(content)
    return content

If we search in de documentation of the API we can see that it is a parameter that its called adID, so we will try to get the properties of the house by the ID.

Example:
https://www.idealista.com/inmueble/98830505/

In [2]:
# Parameters needed for the petition
param = {
    "country" : 'es',
    "operation" : "rent",
    "propertyType" : "homes",
    "adIds": [98830505]
}

# Get the data
token = get_oauth_token()
data = search_api(token, param)

print(data.text)

<Response [400]>
{"message":"center or locationId is required","httpStatus":400}


So it will be limited to the province of Valencia...

In [3]:
# Parameters needed for the petition
param = {
    "country" : 'es',
    "operation" : "rent",
    "propertyType" : "homes",
    "locationId" : "0-EU-ES-46",
    "adIds": [98830505]
}

# Get the data
token = get_oauth_token()
data = search_api(token, param)

print(data.text)

<Response [200]>
{"elementList":[{"propertyCode":"98830505","thumbnail":"https://img3.idealista.com/blur/WEB_LISTING/0/id.pro.es.image.master/20/d1/48/1025198459.jpg","externalReference":"LD00206","numPhotos":6,"floor":"1","price":1400.0,"propertyType":"flat","operation":"rent","size":58.0,"exterior":true,"rooms":1,"bathrooms":1,"address":"Plaza Mercado","province":"València","municipality":"València","district":"Ciutat Vella","country":"es","neighborhood":"El Mercat","latitude":39.4744315,"longitude":-0.3788229,"showAddress":false,"url":"https://www.idealista.com/inmueble/98830505/","description":"SOLO ALQUILERES/GRUPO CONECTA INMOBILIARIA Alquila Gran Ocasión. Piso totalmente reformado y a estrenar. Totalmente céntrico con la comodidad de plaza de Garaje, lo cual es muy cómodo para vivir en el centro de Valencia, disfrutar teniendo un piso con todas las comodidades, luminoso y totalmente nuevo. Se alquila amueblado, con electrodomésticos, Pavimento de parquet, aire acondicionado Frío

This is what we need, let format the data to give it to the model.

In [4]:
df = pd.DataFrame(json.loads(data.text)["elementList"])
df

Unnamed: 0,propertyCode,thumbnail,externalReference,numPhotos,floor,price,propertyType,operation,size,exterior,...,parkingSpace,priceByArea,detailedType,suggestedTexts,hasPlan,has3DTour,has360,hasStaging,topNewDevelopment,superTopHighlight
0,98830505,https://img3.idealista.com/blur/WEB_LISTING/0/...,LD00206,6,1,1400.0,flat,rent,58.0,True,...,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",24.0,{'typology': 'flat'},"{'subtitle': 'El Mercat, València', 'title': '...",False,False,False,False,False,False


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   propertyCode       1 non-null      object 
 1   thumbnail          1 non-null      object 
 2   externalReference  1 non-null      object 
 3   numPhotos          1 non-null      int64  
 4   floor              1 non-null      object 
 5   price              1 non-null      float64
 6   propertyType       1 non-null      object 
 7   operation          1 non-null      object 
 8   size               1 non-null      float64
 9   exterior           1 non-null      bool   
 10  rooms              1 non-null      int64  
 11  bathrooms          1 non-null      int64  
 12  address            1 non-null      object 
 13  province           1 non-null      object 
 14  municipality       1 non-null      object 
 15  district           1 non-null      object 
 16  country            1 non-null 

In [6]:
df = df[['propertyCode', 
'price', 
'numPhotos',
'floor',
'exterior',
'hasLift',
'rooms',
'bathrooms',
'size',
'parkingSpace',
'hasPlan',
'hasVideo',
'has360',
'has3DTour',
'propertyType',
'latitude',
'longitude'
]]

df

Unnamed: 0,propertyCode,price,numPhotos,floor,exterior,hasLift,rooms,bathrooms,size,parkingSpace,hasPlan,hasVideo,has360,has3DTour,propertyType,latitude,longitude
0,98830505,1400.0,6,1,True,True,1,1,58.0,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",False,False,False,False,flat,39.474432,-0.378823


In [7]:
df.set_index('propertyCode', inplace=True)
df['parkingSpace']

propertyCode
98830505    {'hasParkingSpace': True, 'isParkingSpaceInclu...
Name: parkingSpace, dtype: object

In [8]:
def process_parkingSpace(df):
            '''
            This function process the bad formatted parkingSpace

            Parameters:
            -----------
            df: DataFrame with the raw data of parkingSpace

            Returns:
            --------
            df: DataFrame with the raw data processed.
            '''
            # replace all the ' to " in the parkingSpace column
            df['parkingSpace'] = df['parkingSpace'].str.replace('\'', '"')

            # convert the string to a dictionary
            df['parkingSpace'] = df['parkingSpace'].apply(
                lambda x: ast.literal_eval(x) 
                if type(x) == str else x
                )

            # get the hasParkingSpace of the dict of the parkingSpace column
            df['hasParkingSpace'] = df['parkingSpace'].apply(
                lambda x: x['hasParkingSpace'] if type(x) == dict else False
                )

            # get isParkingSpaceIncludedInPrice 
            df['isParkingSpaceIncludedInPrice'] = df['parkingSpace'].apply(
                lambda x: x['isParkingSpaceIncludedInPrice'] if type(x) == dict else False
                )

            # get the parkingSpacePrice 
            df['parkingSpacePrice'] = df['parkingSpace'].apply(
                lambda x: x['parkingSpacePrice'] 
                if type(x) == dict and 'parkingSpacePrice' in x else 0
                )

            # drop the parkingSpace column
            df.drop(columns=['parkingSpace'], inplace=True)
            
            print("process_parkingSpace process was successful")
            return df

df = process_parkingSpace(df)
df

process_parkingSpace process was successful


Unnamed: 0_level_0,price,numPhotos,floor,exterior,hasLift,rooms,bathrooms,size,hasPlan,hasVideo,has360,has3DTour,propertyType,latitude,longitude,hasParkingSpace,isParkingSpaceIncludedInPrice,parkingSpacePrice
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
98830505,1400.0,6,1,True,True,1,1,58.0,False,False,False,False,flat,39.474432,-0.378823,False,False,0


In [9]:
def retype_data(df):
            ''' 
            This function forcetype all columns of the dataset

            Parameters:
            -----------
            df: DataFrame with the raw data

            Returns:
            --------
            df: DataFrame with secure types.
            '''

            # int types
            df['numPhotos'] = df['numPhotos'].astype(int)
            df['floor'] = df['floor'].astype(int)
            df['rooms'] = df['rooms'].astype(int)
            df['bathrooms'] = df['bathrooms'].astype(int)

            # float types
            df['price'] = df['price'].astype(float)
            df['size'] = df['size'].astype(float)
            df['parkingSpacePrice'] = df['parkingSpacePrice'].astype(float)
            df['latitude'] = df['latitude'].astype(float)
            df['longitude'] = df['longitude'].astype(float)

            # boolean types
            df['exterior'] = df['exterior'].astype(bool)
            df['hasParkingSpace'] = df['hasParkingSpace'].astype(bool)
            df['isParkingSpaceIncludedInPrice'] = df['isParkingSpaceIncludedInPrice'].astype(bool)
            df['hasLift'] = df['hasLift'].astype(bool)
            df['hasPlan'] = df['hasPlan'].astype(bool)
            df['has360'] = df['has360'].astype(bool)
            df['has3DTour'] = df['has3DTour'].astype(bool)
            df['hasVideo'] = df['hasVideo'].astype(bool)

            # object types
            df['propertyType'] = df['propertyType'].astype(str)

            order_of_cols = ['price', 'numPhotos', 'floor', 'rooms', 'bathrooms',
            'size', 'parkingSpacePrice', 'latitude', 'longitude', 'exterior', 
            'hasParkingSpace', 'isParkingSpaceIncludedInPrice', 
            'hasLift', 'hasPlan', 'has360', 'has3DTour', 'hasVideo',
            'propertyType']
            
            df = df[order_of_cols]
            
            print("retype_data process was successful")
            return df

df = retype_data(df)
df

retype_data process was successful


Unnamed: 0_level_0,price,numPhotos,floor,rooms,bathrooms,size,parkingSpacePrice,latitude,longitude,exterior,hasParkingSpace,isParkingSpaceIncludedInPrice,hasLift,hasPlan,has360,has3DTour,hasVideo,propertyType
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
98830505,1400.0,6,1,1,1,58.0,0.0,39.474432,-0.378823,True,False,False,True,False,False,False,False,flat


In [10]:
# Clustering address
cluster = pickle.load(open('../models/kmeans_clustering.pkl', 'rb'))
df['direction'] = cluster.predict(df[['latitude', 'longitude']])
df['direction'] = df['direction'].map({0: 'central', 1: 'south', 2: 'north', 3: 'west'})

df.drop(columns=['latitude', 'longitude'], inplace=True)

df

Unnamed: 0_level_0,price,numPhotos,floor,rooms,bathrooms,size,parkingSpacePrice,exterior,hasParkingSpace,isParkingSpaceIncludedInPrice,hasLift,hasPlan,has360,has3DTour,hasVideo,propertyType,direction
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
98830505,1400.0,6,1,1,1,58.0,0.0,True,False,False,True,False,False,False,False,flat,central


A few steps and our data is ready to be predicted...

In [11]:
from scipy.special import inv_boxcox

# Function to inverse Box-Cox transform a column
def inv_box_cox_transform(column, lamda):
    column = inv_boxcox(column, lamda)
    return column

In [13]:
# Import model
model = pickle.load(open('../models/my_model_histgb.pkl', 'rb'))
preprocessor = pickle.load(open('../models/preprocessor.pkl', 'rb'))
lamda = pickle.load(open('../models/lamda_value.pkl', 'rb'))

# preprocess the data
df_predict = preprocessor.transform(df)

# predict the price
prediction = model.predict(df_predict)

# inverse the transformation
prediction = inv_box_cox_transform(prediction, lamda)

print(f'Predicted price: {prediction}')

Predicted price: [748.71066631]


Woa, what a bad prediction, but we can see that all is working!! So let's try with this solution