In [1]:
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
from copy import copy
import pandas as pd

In [2]:
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}

In [3]:
url_base = 'https://www.vivareal.com.br'

In [4]:
# Inputs:
business_type = 'aluguel' # ['aluguel', 'venda']
state = 'Paraná'
city = 'Curitiba'
prop_type = 'apartamento' # ['apartamento', 'casa']
n_park = 1
n_bath = 1
n_room = 2
min_price = 500
max_price = 2000
min_area = None
max_area = None
n_pages = 5

In [5]:
def get_search_terms():
    s_state = unidecode(state).lower().strip().replace(' ', '-')
    s_state = 'rj' if s_state == 'rio-de-janeiro' else s_state

    search_list = [business_type, s_state]

    if city is not None:
        s_city = unidecode(city).lower().strip().replace(' ', '-')
        search_list.append(s_city)

    s_prop = prop_type + "_residencial"
    search_list.append(s_prop)

    search_terms = "/".join(search_list)

    return search_terms

In [6]:
# Fragment filters:

def get_fragment_filters():
    dict_frag_values = {}
    dict_frag_values['area-ate']= max_area
    dict_frag_values['area-desde'] = min_area
    dict_frag_values['banheiros'] = n_bath
    dict_frag_values['preco-ate'] = max_price
    dict_frag_values['preco-desde'] = min_price
    dict_frag_values['quartos'] = n_room
    dict_frag_values['vagas'] = n_park

    fragment_filter = None
    for frag in dict_frag_values.keys():
        frag_value = dict_frag_values[frag]
        if frag_value is not None:
            if fragment_filter is None:
                fragment_filter = '#' + frag + "=" + str(frag_value)
            else:
                fragment_filter = fragment_filter + "&" + frag + "=" + str(frag_value)

                
    return fragment_filter

In [7]:
# url:

def get_url():
    url = url_base + "/" + get_search_terms() + "/" + get_fragment_filters()
    return url

In [8]:
# Extrai o grid principal:

def get_main_grid(main_page_soup):
    dict_main_grid = {'name': 'div', 'attrs': {'class': 'results-list js-results-list'}}
    main_grid = main_page_soup.find(name=dict_main_grid['name'], attrs=dict_main_grid['attrs'])
    return main_grid

In [9]:
# Extrai a lista de imoveis:

def get_properties_list(main_grid_soup):
    dict_items_list = {'name': 'article', 'attrs': {'class': 'property-card__container js-property-card'}}
    items_list = main_grid_soup.find_all(name=dict_items_list['name'], attrs=dict_items_list['attrs'])
    return items_list

In [10]:
# Extrai o link:

def get_item_url(item_soup):
    item_link = item_soup.find(name='a')
    item_url = url_base + item_link['href']
    return item_url

In [11]:
# Extrai o id:

def get_item_id(item_url):
    item_id = item_url.split('-')
    id_idx = item_id.index('id') + 1
    item_id = item_id[id_idx][:-1]
    return item_id

In [12]:
# Obtem o endereço:

def get_item_address(item_page_soup):
    dict_item_address = {'name':'p', 'attrs':{'class': 'title__address js-address'}}
    item_address = item_page_soup.find(name=dict_item_address['name'], attrs=dict_item_address['attrs']).string
    return item_address

In [13]:
# Obtem o condomínio:

def get_item_condominum_name(item_page_soup):
    dict_item_cond = {'name': 'span', 'attrs': {'class': 'title__condominium'}}
    item_condominum_name = item_page_soup.find(name=dict_item_cond['name'], attrs=dict_item_cond['attrs']).find('a').string
    return item_condominum_name

In [14]:
# Obtem o preço:

def get_item_price(item_page_soup):
    dict_item_price = {'name': 'h3', 'attrs': {'class': 'price__price-info js-price-sale'}}
    item_price = item_page_soup.find(name=dict_item_price['name'], attrs=dict_item_price['attrs'])
    item_price = float(item_price.get_text().strip().split(' ')[1].split('/')[0].replace('.', ''))
    return item_price

In [15]:
# Obtem o preço do condomínio:

def get_item_condominum_price(item_page_soup):
    dict_price_cond = {'name': 'span', 'attrs': {'class': 'price__list-value condominium js-condominium'}}
    item_condominum_price = item_page_soup.find(name=dict_price_cond['name'], attrs=dict_price_cond['attrs']).string.strip()
    item_condominum_price = float(item_condominum_price.split(' ')[1].replace('.', ''))
    return item_condominum_price

In [16]:
# Obtem o preço do IPTU:

def get_item_iptu_price(item_page_soup):
    dict_price_iptu = {'name': 'span', 'attrs': {'class': 'price__list-value iptu js-iptu'}}
    item_iptu_price = item_page.find(name=dict_price_iptu['name'], attrs=dict_price_iptu['attrs']).string.strip()
    item_iptu_price = float(item_iptu_price.split(' ')[1].replace('.', ''))
    return item_iptu_price

In [17]:
# Obtem o descritivo do anúncio:

def get_item_description(item_page_soup):

    dict_item_descr = {'name': 'p', 'attrs': {'class': 'description__text'}}

    item_description = item_page.find(name=dict_item_descr['name'], attrs=dict_item_descr['attrs'])

    for br in item_description.find_all("br"):
        br.replace_with("\n")
    
    item_description = item_description.get_text().strip()

    return item_description

In [18]:
# Obtem características principais:

def get_main_features(item_page_soup):
    dict_features = {'name': 'ul', 'attrs': {'class': 'features'}}

    item_features = item_page_soup.find(name=dict_features['name'], attrs=dict_features['attrs'])

    feat_list = item_features.find_all('li')

    dict_feat = {}

    for feat in feat_list:
        f_title = feat['title']
        try:
            f_value = float(feat.find('span').string.strip())
        except:
            f_value = None
        dict_feat[f_title] = f_value
        
    return dict_feat

In [20]:
dict_final = None

for page in range(1, n_pages+1, 1):
    params = {'pagina': page}
    
    # Extrai a URL principal:
    try:
        url = get_url()
    except Exception as e:
        print('URL inválida', e)
        pass
    
    # Extrai a página principal:
    try:
        main_page = BeautifulSoup(requests.get(url, headers=headers, params=params).content, 'html.parser')
    except Exception as e:
        print("Erro ao extrair a página principal", e)
        pass
    
    # Extrai o grid de imóveis:
    try:
        main_grid = get_main_grid(main_page)
    except Exception as e:
        print("Erro ao extrair o grid de busca", e)
        pass
        
    # Extrai a lista de imóveis:
    try:
        items_list = get_properties_list(main_grid)
    except Exception as e:
        print("Erro ao extrair a lista de imóveis", e)
        pass
    
    # Realiza o loop por todos os imóveis:
    for item in items_list:
        
        # Extrai a url do imóvel:
        try:
            item_url = get_item_url(item)
        except Exception as e:
            print("Erro ao extrair a url do imóvel", e)
            pass
        
        # Acessa a página do imóvel:
        try:
            item_page = BeautifulSoup(requests.get(item_url, headers=headers).content, 'html.parser')
        except Exception as e:
            print("Erro ao acessar a página do imóvel", e)
            pass
        
        
        # Obtem as informações do imóvel:
        dict_item = {}
        
        try:
            dict_item['id'] = [get_item_id(item_url)]
        except:
            dict_item['id'] = [None]
            
        try:
            dict_item['address'] = [get_item_address(item_page)]
        except:
            dict_item['address'] = [None]
            
        try:
            dict_item['cond_name'] = [get_item_condominum_name(item_page)]
        except:
            dict_item['cond_name'] = [None]
            
        try:
            dict_item['price'] = [get_item_price(item_page)]
        except:
            dict_item['price'] = [None]
    
        try:
            dict_item['cond_price'] = [get_item_condominum_price(item_page)]
        except:
            dict_item['cond_price'] = [None]
            
        try:
            dict_item['iptu_price'] = [get_item_iptu_price(item_page)]
        except:
            dict_item['iptu_price'] = [None]
            
        try:
            dict_features = get_main_features(item_page)
        except:
            None
        try:
            dict_item['area'] = [dict_features['Área']]
        except:
            dict_item['area'] = [None]
        try:
            dict_item['n_room'] = [dict_features['Quartos']]
        except:
            dict_item['n_room'] = [None]
        try:
            dict_item['n_bath'] = [dict_features['Banheiros']]
        except:
            dict_item['n_bath'] = [None]
        try:
            dict_item['n_park'] = [dict_features['Vagas']]
        except:
            dict_item['n_park'] = [None]
                
        try:
            dict_item['description'] = [get_item_description(item_page)]
        except:
            dict_item['description'] = [None]
                
        dict_item['url'] = [item_url]
            
        # Atualiza o dicionário final:
        if dict_final is None:
            dict_final = copy(dict_item)
        else:
            for key in dict_item.keys():
                dict_final[key] += dict_item[key]

In [21]:
df_final = pd.DataFrame.from_dict(dict_final)

In [22]:
df_final.shape

(180, 12)

In [23]:
df_final.head()

Unnamed: 0,id,address,cond_name,price,cond_price,iptu_price,area,n_room,n_bath,n_park,description,url
0,2517971680,"Rua Desembargador Isaías Bevilaqua, 800 - Merc...",,1490.0,350.0,40.0,22.0,1.0,1.0,,Lindo Apartamento Studio para Locação\nNa melh...,https://www.vivareal.com.br/imovel/apartamento...
1,2518735856,"Avenida Sete de Setembro, 3000 - Centro, Curit...",Condomínio Edificio Lifespace Sete de Setembro,1300.0,300.0,40.0,34.0,1.0,1.0,,ALUGO APARTAMENTO STUDIO\n\nEDIFÍCIO LIFESPACE...,https://www.vivareal.com.br/imovel/apartamento...
2,2510171400,"Rua Vereador Garcia Rodrigues Velho, 234 - Cab...",,3500.0,1050.0,2588.0,161.0,3.0,4.0,2.0,"Apartamento face norte, com 161m², excelente l...",https://www.vivareal.com.br/imovel/apartamento...
3,2518631679,"Rua Brigadeiro Franco, 2190 - Batel, Curitiba ...",Condomínio Edificio Brigadeiro Towers,2100.0,450.0,850.0,62.0,2.0,1.0,1.0,Detalhes do apartamento: sala para dois ambien...,https://www.vivareal.com.br/imovel/apartamento...
4,2515893810,"Rua Afonso Piotto, 108 - Cidade Industrial, Cu...",Condomínio Residencial Aruana,1650.0,360.0,841.0,80.0,3.0,1.0,1.0,"Imóvel novo, piso porcelanato, cozinha com arm...",https://www.vivareal.com.br/imovel/apartamento...


In [25]:
len(df_final.id.unique())

41