# Web Scraping Idealista

In this notebook we will try to extract the attributes of a house listed on Idealista, given its property code.

In [1]:
# Libraries needed for web scraping
import requests
import pickle 
import time

import pandas as pd
import regex as re

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
import scipy.stats as stats
from scipy.special import boxcox, inv_boxcox

# Function to Box-Cox transform a column
def box_cox_transform(column, lamda= None):
    """Apply a Box-Cox transform to ``column``.

    Parameters
    ----------
    column : array-like
        Values to transform; handed to SciPy unchanged.
    lamda : float, optional
        Box-Cox lambda to apply. When omitted, ``scipy.stats.boxcox`` picks
        the lambda itself.

    Returns
    -------
    The transformed values alone when ``lamda`` is supplied; otherwise a
    ``(transformed_values, lamda)`` tuple that also carries the lambda
    chosen by SciPy.
    """
    if lamda is not None:
        # Lambda already known: apply it directly
        return boxcox(column, lamda)

    # No lambda supplied: let SciPy fit one and return both results
    transformed, fitted_lamda = stats.boxcox(column)
    return transformed, fitted_lamda

# Function to inverse Box-Cox transform a column
def inv_box_cox_transform(column, lamda):
    """Undo a Box-Cox transform.

    Parameters
    ----------
    column : array-like
        Box-Cox transformed values.
    lamda : float
        The lambda that was used for the forward transform.

    Returns
    -------
    The result of ``scipy.special.inv_boxcox(column, lamda)``, i.e. the
    values on their original scale.
    """
    return inv_boxcox(column, lamda)

In [3]:
# Read the processed Valencia rental listings; the first CSV column becomes the index
processed_csv = '../data/processed/rent_Valencia.csv'
df = pd.read_csv(processed_csv, index_col=0)

# Show one randomly drawn row as a quick look at the loaded columns
df.sample()

Unnamed: 0_level_0,price,numPhotos,floor,rooms,bathrooms,size,parkingSpacePrice,exterior,hasParkingSpace,isParkingSpaceIncludedInPrice,hasLift,hasPlan,has360,has3DTour,hasVideo,propertyType,direction
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
98060513,2500.0,47,0,3,2,155.0,0.0,False,False,False,False,False,False,False,False,chalet,north


In [4]:
# Sample listing used to try out the scraping approach
url = "https://www.idealista.com/inmueble/96766481/"

# Start a Chrome session using the driver obtained through webdriver-manager
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service)

# Open the listing page
driver.get(url)

# Locate the price element through its XPath and read its text
xpath = '//*[@id="main"]/div/main/section[2]/div[3]/span/span'
price_element = driver.find_element(by=By.XPATH, value=xpath)
price = price_element.text
price

'450'

Excellent! We now need to repeat this kind of lookup for every attribute we want to collect.

In [5]:
# Try with another url
url = "https://www.idealista.com/inmueble/98276550/"
driver.get(url)

# =============================================================================
# Property Code
# The first run of digits in the url is taken as the property code
property_code = int(re.findall(r'\d+', url)[0])
print(f"property_code: {property_code}")

# =============================================================================
# Price
xpath = '//*[@id="main"]/div/main/section[2]/div[3]/span/span'
price_text = driver.find_element(by=By.XPATH, value=xpath).text

# The page writes prices in Spanish notation: '.' groups thousands ("1.500"
# means 1500) and ',' would mark decimals. Normalise the text before
# converting. Parsing "1.500" directly gives 1.5, and rescaling small values
# afterwards misreads prices of 100.000 or more (read as 100.0 and left
# alone) as well as genuine prices below 99 (wrongly multiplied by 1000).
price = float(price_text.replace('.', '').replace(',', '.'))

print(f"price: {price}")

# =============================================================================
# numPhotos
# The first button's label contains the photo count; keep its first number
xpath = '//*[@id="main"]/div/main/section[2]/div[1]/button[1]/span'
photos_label = driver.find_element(by=By.XPATH, value=xpath).text
numPhotos = int(re.findall(r'\d+', photos_label)[0])
print(f"numPhotos: {numPhotos}")

# =============================================================================
# floor, exterior, hasLift
# Keep only the first number found in the floor label
xpath = '//*[@id="main"]/div/main/section[2]/div[4]/span[3]/span'
floor_label = driver.find_element(by=By.XPATH, value=xpath).text
floor = int(re.findall(r'\d+', floor_label)[0])
print(f"floor: {floor}")

# =============================================================================
# exterior, hasLift
xpath = '//*[@id="main"]/div/main/section[2]/div[4]/span[3]'
floor_details = driver.find_element(by=By.XPATH, value=xpath).text.lower()

# Evaluate the two facts independently. Matching only the combined phrases
# "exterior con ascensor" / "exterior sin ascensor" would report no lift for
# an interior flat that has one ("interior con ascensor"), and would report
# an exterior flat as interior whenever the lift wording is missing.
exterior = 'exterior' in floor_details

# "sin ascensor" also contains "ascensor", so look for the full "con ascensor"
hasLift = 'con ascensor' in floor_details

print(f"exterior: {exterior}")
print(f"hasLift: {hasLift}")

# =============================================================================
# numRooms
xpath = '//*[@id="main"]/div/main/section[2]/div[4]/span[2]/span'
numRooms = int(driver.find_element(by=By.XPATH, value=xpath).text)
print(f"numRooms: {numRooms}")

# =============================================================================
# numBathrooms
# Keep the first number found in the third item of the details list
xpath = '//*[@id="details"]/div[3]/div[1]/div[1]/ul/li[3]'
bathrooms_label = driver.find_element(by=By.XPATH, value=xpath).text
numBathrooms = int(re.findall(r'\d+', bathrooms_label)[0])
print(f"numBathrooms: {numBathrooms}")

# =============================================================================
# size
xpath = '//*[@id="main"]/div/main/section[2]/div[4]/span[1]/span'
size_text = driver.find_element(by=By.XPATH, value=xpath).text

# Drop '.' thousands separators and turn a ',' decimal mark into '.', so a
# size such as "1.200" is read as 1200.0 rather than 1.2.
# NOTE(review): assumes sizes use the same Spanish number notation as the
# price shown on the page - confirm on a listing above 999 m2.
size = float(size_text.replace('.', '').replace(',', '.'))
print(f"size: {size}")

# =============================================================================
# parkingSpacePrice, hasParkingSpace, isParkingSpaceIncludedInPrice
# Start from "no parking" and only upgrade the flags when the page proves
# otherwise, so the two lookups below can never contradict each other.
parkingSpacePrice = 0
hasParkingSpace = False
isParkingSpaceIncludedInPrice = False

# --- Parking space offered for an extra price --------------------------------
# The lookup targets the span element itself: find_element can only return
# elements, so an XPath ending in a text() node would always raise and the
# price would silently stay at 0.
# NOTE(review): the position (div[5]/span[4]) and the wording of this label
# could not be checked here - confirm on a listing with paid optional parking.
xpath = '//*[@id="main"]/div/main/section[2]/div[5]/span[4]/span'
try:
    parking_text = driver.find_element(by=By.XPATH, value=xpath).text
    # First number in the label, written in Spanish notation ('.' thousands)
    amount_text = re.findall(r'\d[\d.,]*', parking_text)[0]
    parkingSpacePrice = float(amount_text.replace('.', '').replace(',', '.'))
    hasParkingSpace = True
except Exception:
    # Best effort: no such element, or no number in it -> no priced parking
    parkingSpacePrice = 0

# --- Parking space included in the rent --------------------------------------
xpath = '//*[@id="main"]/div/main/section[2]/div[4]/span[4]/span'
try:
    garage = driver.find_element(by=By.XPATH, value=xpath).text
except Exception:
    # Best effort: the listing simply has no fourth feature span
    garage = ''

if 'Garaje incluido' in garage:
    hasParkingSpace = True
    isParkingSpaceIncludedInPrice = True
# Otherwise keep whatever the priced-parking lookup established, instead of
# resetting hasParkingSpace and losing an optional paid space.

print(f"parkingSpacePrice: {parkingSpacePrice}")
print(f"hasParkingSpace: {hasParkingSpace}")
print(f"isParkingSpaceIncludedInPrice: {isParkingSpaceIncludedInPrice}")
# =============================================================================
# hasPlan, has360, has3DTour, hasVideo
# Read the labels of every button in the media bar once and look for each
# keyword in all of them. Checking a fixed slot per feature (plan / virtual
# tour only in button[2], 3D visit / video only in button[3]) misses a badge
# whenever it sits in a different position, e.g. a video on a listing that
# has no plan.
xpath = '//*[@id="main"]/div/main/section[2]/div[1]/button/span'
# find_elements returns an empty list when nothing matches, so listings
# without any media buttons simply yield False for every flag.
media_labels = [button.text for button in driver.find_elements(by=By.XPATH, value=xpath)]


def _has_media(keyword):
    """Return True when any media button label contains ``keyword``."""
    return any(keyword in label for label in media_labels)


hasPlan = _has_media('Plano')
print(f"hasPlan: {hasPlan}")

has360 = _has_media('Virtual Tour')
print(f"has360: {has360}")

has3DTour = _has_media('Visita 3D')
print(f"has3DTour: {has3DTour}")

hasVideo = _has_media('Vídeo')
print(f"hasVideo: {hasVideo}")
# =============================================================================
# propertyType
xpath = '//*[@id="main"]/div/main/section[2]/div[2]/h1/span'
listing_title = driver.find_element(by=By.XPATH, value=xpath).text.lower()

# Ordered (keyword, category) pairs: the first keyword found in the lowercased
# title decides the category, so the specific "casa de ..." phrases have to
# come before the generic "casa".
type_keywords = (
    ('casa de pueblo', 'countryHouse'),
    ('casa de campo', 'countryHouse'),
    ('estudio', 'studio'),
    ('dúplex', 'duplex'),
    ('ático', 'penthouse'),
    ('penthouse', 'penthouse'),
    ('chalet', 'chalet'),
    ('casa', 'chalet'),
)

# Anything that matches no keyword is treated as a flat
propertyType = next(
    (category for keyword, category in type_keywords if keyword in listing_title),
    'flat',
)

print(f"propertyType: {propertyType}")
# =============================================================================
# direction
map_element_id = "sMap"  # not named `id`, which would shadow the builtin
max_scroll_steps = 200   # 200 steps with a 0.5 s pause: give up after ~100 s

# Scroll down 100 px at a time until the map element is present. The loop is
# bounded so a listing without a map fails with a clear error instead of
# scrolling forever.
map_element = None
for _ in range(max_scroll_steps):
    # find_elements returns an empty list (no exception) while the map is absent
    matches = driver.find_elements(by=By.ID, value=map_element_id)
    if matches:
        map_element = matches[0]
        break
    driver.execute_script("window.scrollBy(0, 100)")
    time.sleep(0.5)

if map_element is None:
    raise RuntimeError(
        f"Map element '{map_element_id}' not found after {max_scroll_steps} scroll steps on {url}"
    )

url_map = map_element.get_attribute('src')

# get the center=lat,lng pair from the map url and convert it to floats
center = re.findall(r'center=(-?\d+\.\d+),(-?\d+\.\d+)', url_map)[0]
center = tuple(map(float, center))

latitude, longitude = center

print(f'latitude: {latitude}')
print(f'longitude: {longitude}')

property_code: 98276550
price: 1500.0
numPhotos: 30
floor: 11
exterior: True
hasLift: True
numRooms: 3
numBathrooms: 2
size: 80.0
parkingSpacePrice: 0
isParkingSpaceIncludedInPrice: False
hasParkingSpace: True
isParkingSpaceIncludedInPrice: True
hasPlan: False
has360: False
has3DTour: False
hasVideo: False
propertyType: penthouse
latitude: 39.1034767
longitude: -0.2236471


So this is all the information we need to make a prediction.

In [10]:
# Make dictionary with all the data
data = {
    'propertyCode': property_code,
    'price': price,
    'numPhotos': numPhotos,
    'floor': floor,
    'rooms': numRooms,
    'bathrooms': numBathrooms,
    'size': size,
    'parkingSpacePrice': parkingSpacePrice,
    'exterior': exterior,
    'hasParkingSpace': hasParkingSpace,
    'isParkingSpaceIncludedInPrice': isParkingSpaceIncludedInPrice,
    'hasLift': hasLift,
    'hasPlan': hasPlan,
    'has360': has360,
    'has3DTour': has3DTour,
    'hasVideo': hasVideo,
    'propertyType': propertyType,
    'latitude': latitude,
    'longitude': longitude
}

# One-row frame indexed by propertyCode (built without in-place mutation so
# the cell gives the same result every time it is run)
df_predict = pd.DataFrame(data, index=[0]).set_index('propertyCode')

# Clustering address
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load model artefacts that come from a trusted source.
# The `with` block closes the file handle as soon as the model is loaded.
with open('../models/kmeans_clustering.pkl', 'rb') as cluster_file:
    cluster = pickle.load(cluster_file)

# Predict the cluster id from the coordinates, then translate it to its label
df_predict['direction'] = cluster.predict(df_predict[['latitude', 'longitude']])
df_predict['direction'] = df_predict['direction'].map({0: 'central', 1: 'south', 2: 'north', 3: 'west'})

# The coordinates were only needed to derive the direction
df_predict = df_predict.drop(columns=['latitude', 'longitude'])

df_predict

Unnamed: 0_level_0,price,numPhotos,floor,rooms,bathrooms,size,parkingSpacePrice,exterior,hasParkingSpace,isParkingSpaceIncludedInPrice,hasLift,hasPlan,has360,has3DTour,hasVideo,propertyType,direction
propertyCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
98276550,1500.0,30,11,3,2,80.0,0,True,True,True,True,False,False,False,False,penthouse,south


In [11]:
# Import model
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load artefacts that come from a trusted source.
# Each `with` block closes its file handle as soon as the object is loaded.
with open('../models/new_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('../models/preprocessor.pkl', 'rb') as preprocessor_file:
    preprocessor = pickle.load(preprocessor_file)
with open('../models/lamda_value.pkl', 'rb') as lamda_file:
    lamda = pickle.load(lamda_file)

# preprocess the data
# Stored under a new name so df_predict keeps the raw row and this cell can
# be re-run without feeding already-transformed data back to the preprocessor.
features_predict = preprocessor.transform(df_predict)

# predict the price (still on the Box-Cox scale)
prediction_boxcox = model.predict(features_predict)

# inverse the transformation to get back to the original price scale
prediction = inv_box_cox_transform(prediction_boxcox, lamda)

print(f'Predicted price: {prediction}')

Predicted price: [1303.42078669]


Excellent. So let's make a script that will do this for us.