# Renting Scraper

#### imports

In [75]:
import urllib.request
from bs4 import BeautifulSoup
import re
import pandas as pd

#### HTML data finders

In [89]:
def findTopology(result):
    """Extract the apartment typology (e.g. ``T1`` or ``T2+1``) from a listing.

    The typology is read from the ``alt`` text of the first ``<img>`` found
    inside ``result``.

    Args:
        result: Listing element exposing a BeautifulSoup-style ``find(name)``
            method.

    Returns:
        str: The first substring of the alt text matching ``T[0-2]``,
        optionally followed by ``+[0-2]``.

    Raises:
        AttributeError: If the listing has no image, the image has no ``alt``
            text, or the alt text contains no typology. A single exception
            type is used for every "not found" case so callers that skip
            unparsable listings via ``except AttributeError`` handle all of
            them the same way.
    """
    image = result.find('img')
    # A missing <img> used to fail with TypeError (subscripting None) and a
    # missing alt with KeyError; normalise both to AttributeError.
    alt_text = image.get('alt') if image is not None else None
    if alt_text is None:
        raise AttributeError('listing has no image alt text to read the topology from')

    match = re.search('T[0-2]([+][0-2])?', alt_text)
    if match is None:
        raise AttributeError('no topology found in image alt text: ' + repr(alt_text))
    return match.group()

def findLocation(result):
    """Return ``[city, freguesia]`` taken from a listing's location line.

    The text of the ``<p class="searchPropertyLocation">`` element is split on
    commas; the last piece is returned as the city and the second-to-last as
    the freguesia, both stripped of surrounding whitespace.

    Args:
        result: Listing element exposing a BeautifulSoup-style ``find`` method.

    Returns:
        list: ``[city, freguesia]`` as strings.

    Raises:
        AttributeError: If the location element is not found.
        IndexError: If the location text has fewer than two comma-separated
            pieces.
    """
    location_tag = result.find('p', {'class': 'searchPropertyLocation'})
    pieces = [piece.strip() for piece in location_tag.text.strip().split(',')]
    return [pieces[-1], pieces[-2]]

def findLink(result):
    """Build the absolute URL of a listing's detail page.

    Looks for the first ``<a>`` whose ``id`` matches ``MC_PropertyInList.*``
    and prefixes its ``href`` with the site root.

    Args:
        result: Listing element exposing a BeautifulSoup-style ``find`` method.

    Returns:
        str: ``'https://casa.sapo.pt'`` followed by the anchor's ``href``.

    Raises:
        AttributeError: If no matching anchor exists or it has no ``href``.
            The same exception type is used as when other finders miss their
            element, so callers that skip unparsable listings via
            ``except AttributeError`` also skip listings without a link.
    """
    anchor = result.find('a', { 'id': re.compile('MC_PropertyInList.*')})
    # A missing anchor used to fail with TypeError (subscripting None) and a
    # missing href with KeyError; normalise both to AttributeError.
    href = anchor.get('href') if anchor is not None else None
    if href is None:
        raise AttributeError('listing has no MC_PropertyInList link')
    return 'https://casa.sapo.pt' + href

def findPrice(result):
    """Parse a listing's displayed price into a float.

    Reads the text of the first ``<span>`` inside the
    ``<div class="searchPropertyPrice">`` element. When that text contains a
    ``/``, only the part after the first ``/`` is used (presumably the rent
    amount on sale-or-rent listings -- confirm against the site). Every
    character other than the digits 0-9 is then dropped before conversion.

    Args:
        result: Listing element exposing a BeautifulSoup-style ``find`` method.

    Returns:
        float: The remaining digits interpreted as a number.

    Raises:
        AttributeError: If the price element or its span is not found.
        ValueError: If no digits remain after cleaning.
    """
    price_box = result.find('div', {'class': 'searchPropertyPrice'})
    raw_text = price_box.find('span').text.strip()

    # partition() leaves `slash` empty when there is no '/', in which case the
    # whole text is the amount.
    _, slash, after_slash = raw_text.partition('/')
    amount_text = after_slash if slash else raw_text

    digits_only = re.sub('[^0-9]+', '', amount_text)
    return float(digits_only)


#### Testing for single result page

In [6]:
# Single search-results page (pn=1) used to try the finder functions on one page.
url = "https://casa.sapo.pt/Alugar/Apartamentos/T0-ate-T2/Lisboa/?sa=11&pn=1"

In [7]:
# Download the page bytes; the connection is closed as soon as they are read.
with urllib.request.urlopen(url) as response:
    page = response.read()

# Parse the downloaded HTML with the standard-library parser backend.
soup = BeautifulSoup(page, "html.parser")

In [37]:
# One <div class="searchResultProperty"> per listing on the page.
# find_all is the current name of the legacy BeautifulSoup 3 alias findAll.
results = soup.find_all('div', {'class': 'searchResultProperty'})

In [78]:
# Column order of the listings table.
col_names = ['link', 'city', 'freguesia', 'topology', 'price']

# Collect plain rows first and build the frame once: enlarging a DataFrame
# with .loc[len(df)] copies the data on every insertion.
rows = []
for result in results:
    link = findLink(result)
    city, freguesia = findLocation(result)  # findLocation returns [city, freguesia]
    topology = findTopology(result)
    price = findPrice(result)

    rows.append([link, city, freguesia, topology, price])

results_df = pd.DataFrame(rows, columns=col_names)

In [79]:
# Bare last expression: the notebook renders the frame as a table.
results_df

Unnamed: 0,link,city,freguesia,topology,price
0,https://casa.sapo.pt//Apartamento-T1-Alugar-Li...,Lisboa,Arroios,T1,2850.0
1,https://casa.sapo.pt//Apartamento-T1-Venda-Alu...,Lisboa,Penha de França,T1+1,700.0
2,https://casa.sapo.pt//Apartamento-T1-Alugar-Li...,Lisboa,Parque das Nações,T1,1400.0
3,https://casa.sapo.pt//Apartamento-T2-Alugar-Li...,Lisboa,Alvalade,T2,1250.0
4,https://casa.sapo.pt//Apartamento-T2-Alugar-Li...,Lisboa,Estrela,T2,645.0
5,https://casa.sapo.pt//Apartamento-T0-Venda-Alu...,Lisboa,Ajuda,T0+1,700.0
6,https://casa.sapo.pt//Apartamento-T1-Alugar-Li...,Lisboa,Arroios,T1+1,850.0
7,https://casa.sapo.pt//Apartamento-T2-Alugar-Li...,Lisboa,Avenidas Novas,T2,1100.0
8,https://casa.sapo.pt//Apartamento-T2-Venda-Alu...,Lisboa,Santo António,T2+1,2350.0
9,https://casa.sapo.pt//Apartamento-T2-Alugar-Li...,Lisboa,Alvalade,T2,2100.0


#### Getting all that data

In [103]:
# Column names of the scraped listings table.
col_names =  ['link', 'city', 'freguesia','topology', 'price']
# Empty frame with those columns, ready to hold the scraped listings.
results_df  = pd.DataFrame(columns = col_names)

In [104]:
# Number of result pages to request.
page_total = 20
# Search URL ending in "pn="; the page number is appended to it for each request.
base_url = "https://casa.sapo.pt/Alugar/Apartamentos/T0-ate-T2/Lisboa/?sa=11&pn="

In [105]:
# Crawl result pages 1..page_total and rebuild results_df from scratch.
#
# Rows are collected in a list and the frame is built once at the end, so the
# cell can be re-run without duplicating rows and without the per-row copy
# that enlarging a DataFrame through .loc incurs.

# Exceptions the finder functions can raise when a listing lacks an expected
# element, attribute or text format. Such listings are reported and skipped
# instead of aborting the whole crawl.
PARSE_ERRORS = (AttributeError, IndexError, KeyError, TypeError, ValueError)

records = []
for i in range(1, page_total + 1):
    print('on page ' + str(i))
    url = base_url + str(i)
    with urllib.request.urlopen(url) as response:
        page = response.read()

    soup = BeautifulSoup(page, "html.parser")
    # find_all is the current name of the legacy alias findAll.
    results = soup.find_all('div', {'class': 'searchResultProperty'})

    for index, result in enumerate(results):
        try:
            link = findLink(result)
            city, freguesia = findLocation(result)  # returns [city, freguesia]
            topology = findTopology(result)
            price = findPrice(result)
        except PARSE_ERRORS:
            # The space before "not parsed" was missing in the original message.
            print('result ' + str(index) + ' of page ' + str(i) + ' not parsed')
            continue

        records.append([link, city, freguesia, topology, price])

results_df = pd.DataFrame(records, columns=col_names)


on page 1
on page 2
on page 3
on page 4
on page 5
on page 6
on page 7
on page 8
on page 9
on page 10
on page 11
on page 12
on page 13
on page 14
on page 15
on page 16
on page 17
on page 18
on page 19
result 18 of page 19not parsed
result 21 of page 19not parsed
result 22 of page 19not parsed
on page 20
result 1 of page 20not parsed
result 2 of page 20not parsed
result 5 of page 20not parsed
result 8 of page 20not parsed
result 12 of page 20not parsed
result 13 of page 20not parsed
result 15 of page 20not parsed


In [108]:
# Persist the scraped listings as JSON Lines: one JSON object per row.
with open('rentingData.json', 'w') as out_file:
    json_lines = results_df.to_json(orient='records', lines=True)
    out_file.write(json_lines)