In [15]:
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import pandas as pd
import random
from itertools import cycle
from requests.exceptions import ProxyError
from lxml.html import fromstring
# Load the Google Geocoding API key; the whole content of ../.env is used as the key.
with open('../.env', 'r') as f:
    # Strip surrounding whitespace: a trailing newline (which most editors add)
    # would otherwise be sent as part of the key in every geocoding request.
    key = f.read().strip()

In [2]:
#Functions to get proxies

#Proxies are read from the file proxies.txt, into which they are pasted from https://www.proxy-list.download/es/HTTPS
#Before launching the process, it is recommended to paste the latest proxies
def get_proxies():
    """Read the proxy list from ``proxies.txt`` in the working directory.

    Returns:
        set[str]: One entry per non-blank line of the file, with surrounding
        whitespace (including the trailing newline) removed. An empty set is
        returned, after printing the error, when the file cannot be read.
    """
    proxie_file = 'proxies.txt'
    try:
        with open(proxie_file) as f:
            # Strip each line and drop blanks so callers never receive
            # entries such as '1.2.3.4:80\n' or '' as a proxy address.
            return {line.strip() for line in f if line.strip()}
    except (OSError, UnicodeError) as ex:
        # Report the problem instead of silently returning None, which would
        # make callers fail later with an unrelated TypeError.
        print('Exception in get_proxies')
        print(str(ex))
        return set()

#Proxies obtained by scraping https://free-proxy-list.net/
def get_proxies2(limit=10, timeout=10):
    """Scrape a set of proxies from https://free-proxy-list.net/.

    Args:
        limit: Number of table rows (from the top of the page) to inspect.
            Defaults to 10, the value that was previously hard-coded.
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        set[str]: ``"ip:port"`` strings for the inspected rows whose seventh
        cell contains ``"yes"`` (presumably the HTTPS column -- verify against
        the current page layout).

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including a timeout).
    """
    url = 'https://free-proxy-list.net/'
    headers = {'user-agent': ua_random()}
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    parser = fromstring(response.text)
    proxies = set()
    for row in parser.xpath('//tbody/tr')[:limit]:
        if not row.xpath('.//td[7][contains(text(),"yes")]'):
            continue
        # Grab IP and corresponding PORT; skip rows where either cell is empty
        # instead of failing with IndexError.
        ip = row.xpath('.//td[1]/text()')
        port = row.xpath('.//td[2]/text()')
        if ip and port:
            proxies.add(":".join([ip[0], port[0]]))
    return proxies

In [5]:
#Function to get a random agent for each request, from the file agents.txt
def ua_random():
    """Return a random user-agent string read from ``agents.txt``.

    Returns:
        str: One randomly chosen non-blank line of the file with surrounding
        whitespace removed, or ``''`` when the file is missing, unreadable or
        contains no non-blank lines.
    """
    ua_file = 'agents.txt'

    try:
        with open(ua_file) as f:
            # Ignore blank lines so an empty string is never picked as agent,
            # and strip stray whitespace / carriage returns that are not valid
            # inside an HTTP header value.
            agents = [line.strip() for line in f if line.strip()]
    except Exception as ex:
        print('Exception in ua_random')
        print(str(ex))
        return ''

    # The previous ``finally: return random_ua[0]`` raised IndexError on the
    # empty-string fallback; return the fallback explicitly instead.
    return random.choice(agents) if agents else ''

In [6]:
#Function to get the content of a given URL as parameter

def scrape_url(url, headers, timeout=15):
    """Fetch ``url`` through the configured proxies and return the parsed page.

    Every proxy returned by ``get_proxies()`` is tried once, in turn, until
    one request succeeds.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait on each proxy before moving to the next one.

    Returns:
        BeautifulSoup: The parsed content of the first successful response.

    Raises:
        ProxyError: If there are no proxies or none of them produced a
            successful response.
    """
    # get_proxies() may yield None when the proxy file cannot be read.
    proxies = get_proxies() or ()

    for raw_proxy in proxies:
        # Entries read from a text file may carry a trailing newline.
        proxy = raw_proxy.strip()
        if not proxy:
            continue
        # Requests expects proxy URLs to include a scheme.
        proxy_url = proxy if '://' in proxy else 'http://' + proxy
        try:
            # The proxy mapping is keyed by the scheme of the *target* URL, so
            # the proxy must also be registered for "https"; with only "http"
            # it is bypassed for https:// pages.
            r = requests.get(url, headers=headers,
                             proxies={"http": proxy_url, "https": proxy_url},
                             timeout=timeout)
            r.raise_for_status()
        except requests.exceptions.RequestException:
            # Covers HTTPError, timeouts and connection/proxy failures:
            # move on to the next proxy.
            continue
        # Name the parser explicitly (lxml is already a dependency of this
        # notebook) so results do not depend on what bs4 auto-detects.
        return BeautifulSoup(r.content, 'lxml')

    # Iterating the proxies once (instead of cycling forever) makes this
    # reachable when none of the proxies worked.
    raise ProxyError('none of the proxies worked for %s' % url)

In [7]:
#Function to parse each web page obtained in every request

def parser_page(content):
    """Extract the listed flats from one parsed results page.

    Args:
        content: Parsed page exposing ``find_all``; each ``div.information``
            element found in it is treated as one listing.

    Returns:
        list[dict]: One dictionary per listing. ``location1``, ``location2``,
        ``price`` and ``description`` are always set; ``price_m2``, ``size``,
        ``rooms``, ``bathrooms`` and ``floor`` are set only when a matching
        characteristic item is present.
    """
    def first_token(text):
        # Leading space-separated token of a string.
        return text.split(' ')[0]

    pisos = []

    for piso in content.find_all('div', 'information'):
        items = {
            'location1': piso.find('a', 'anuncioLink').string.replace('Piso en ', ''),
            'location2': str(piso.find('div', 'location').string).strip(),
            'price': first_token(piso.find('div', 'price').contents[0].strip()),
            'description': str(piso.find('div', 'description').string).strip(),
        }

        for char in piso.find('div', 'characteristics').find_all('div', 'item'):
            raw = char.contents[0]
            text = raw.strip()

            # Order matters: '€/m²' must be tested before the plain 'm²' check.
            if '€/m²' in text:
                items['price_m2'] = first_token(text).strip()
            elif 'm²' in raw:
                items['size'] = first_token(text)
            elif char.find('span', 'icoBed') is not None:
                items['rooms'] = int(first_token(text))
            elif char.find('span', 'icoBath') is not None:
                items['bathrooms'] = int(first_token(text))
            elif 'planta' in raw:
                items['floor'] = first_token(text)
            elif 'Bajo' in raw:
                items['floor'] = 0

        pisos.append(items)

    return pisos

In [10]:
#Function to launch the process, for a given URL, a given range of pages and header values 
#adapted to the web that is going to be scraped.

def kickstart(URL_general, range_to_scrape, headers):
    """Scrape and parse every requested results page.

    Args:
        URL_general: URL template with one ``%``-style placeholder that is
            filled with the page number.
        range_to_scrape: Iterable of page numbers to download.
        headers: Base HTTP headers. A random ``user-agent`` is added for each
            request on a copy, so the dictionary passed in is left untouched.

    Returns:
        list: The concatenation of the ``parser_page`` results of every page.
    """
    datos_pisos = []

    for i in range_to_scrape:
        # Work on a copy so the caller's dictionary is not modified.
        request_headers = dict(headers)
        request_headers['user-agent'] = ua_random()
        content = scrape_url(URL_general % i, request_headers)
        datos_pisos.extend(parser_page(content))
        # Random pause between pages to avoid hammering the site.
        time.sleep(random.randint(2, 5))

    return datos_pisos

In [11]:
#Function to obtain the coordinates from Google Geocode API from the address
def get_position(x):
    """Look up an address with the Google Geocoding API.

    Args:
        x: Address to geocode, as text.

    Returns:
        The ``location`` entry of the first result in the API response, or
        ``None`` when ``x`` is not a string, the request fails, or the
        response contains no result.
    """
    # Missing values (e.g. NaN coming from a DataFrame) cannot be geocoded.
    if not isinstance(x, str):
        return None

    url = 'https://maps.googleapis.com/maps/api/geocode/json'
    try:
        # Let requests build the query string so characters such as '&', '#'
        # or accented letters in the address are escaped correctly. The key is
        # stripped because it is read verbatim from a file.
        r = requests.get(url,
                         params={'address': x, 'key': key.strip()},
                         timeout=10)
        datos = r.json()
        return datos['results'][0]['geometry']['location']
    except Exception:
        # Best effort: any failure yields None, but KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        return None

In [1]:
#Function to generate the column with latitude and longitude for the dataframe provided
def get_value_from_dict(dict, x):
    """Return ``dict[x]``, or ``None`` when the lookup fails.

    Args:
        dict: Container supporting ``[]`` lookup.
        x: Key to look up.

    Returns:
        The stored value, or ``None`` if the lookup raises (missing key,
        unhashable key, ...).
    """
    try:
        return dict[x]
    except Exception:
        # Unlike a bare except, this lets KeyboardInterrupt / SystemExit
        # propagate so a long-running loop can still be interrupted.
        return None

def create_positions_field(df):
    """Add ``location`` and ``position`` columns to ``df``.

    ``location`` is the full address built from ``location1`` and
    ``location2``; ``position`` is the result of ``get_position`` for that
    address (``None`` when it could not be resolved).

    Args:
        df: DataFrame with ``location1`` and ``location2`` columns. It is
            modified in place.

    Returns:
        The same DataFrame, with the two columns added.
    """
    # The separator before the region was missing, which glued it onto
    # location2 (e.g. "...(Madrid)Comunidad de Madrid, España").
    df['location'] = df['location1'] + ', ' + df['location2'] + ', Comunidad de Madrid, España'

    # Geocode each distinct address only once instead of once per row.
    dict_pos = {location: get_position(location)
                for location in df['location'].unique()}
    df['position'] = df['location'].apply(dict_pos.get)

    return df

In [None]:
#Final execution of the process to obtain the raw dataset

# Listing URL template; %s is replaced by the page number.
URL_general = 'https://www.pisos.com/venta/piso-madrid/%s/'
range_to_scrape = range(1, 520)
# NOTE(review): 'br' is advertised in Accept-Encoding; confirm the environment
# can decode Brotli responses, otherwise the page content may not be parseable.
headers = {'Accept-Encoding': 'gzip, deflate, br',
          'Accept-Language': 'en-US,en;q=0.9,es-ES;q=0.8,es;q=0.7',
          'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
          'Cache-Control': 'max-age=0',
          'Upgrade-Insecure-Requests': '1',
          # The HTTP header is spelled 'Referer'; the previous 'Refereer' was
          # sent as an unknown header.
          'Referer': 'https://www.pisos.com/venta/piso-madrid/',
          'Host': 'www.pisos.com'}

# Scrape every page and collect one dictionary per listing.
items = kickstart(URL_general, range_to_scrape, headers)

data_set_raw = pd.DataFrame(items)
# Adds the 'location' and 'position' columns (modifies data_set_raw in place).
data_set_raw_w_loc = create_positions_field(data_set_raw)

#Dataframe exported as CSV
data_set_raw_w_loc.to_csv('house_data_total.csv', index = False)