In [384]:
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
from numpy import random


In [385]:
class Apartment:
    def __init__(self, apt_driver):
        self.apt_driver = apt_driver
     
    def get_url(self):
        self.url = self.apt_driver.find_element(By.CSS_SELECTOR, '[href]').get_attribute('href')
        return self.url
        
    def get_rental_price(self):
        self.rental_price = self._get_number_from_class('js-property-card__price-small')
        return self.rental_price
    
    def get_condo_price(self):
        self.condo_price = self._get_number_from_class('js-condo-price')
        return self.condo_price
    
    def get_area(self):
        self.area = self._get_number_from_class('js-property-card-detail-area')
        return self.area
    
    def get_rooms(self):
        self.rooms = self._get_number_from_class('js-property-detail-rooms')
        return self.rooms
    
    def get_bathrooms(self):
        self.bathrooms = self._get_number_from_class('js-property-detail-bathroom')
        return self.bathrooms
    
    def get_address(self):
        self.address = self._get_text_from_class('property-card__address')
        return self.address
    
    def get_amenities(self):
        raw_amenities = self._get_text_from_class('property-card__amenities')
        if raw_amenities:
            self.amenities = ', '.join(raw_amenities.split('\n'))
            return self.amenities
    
    def _get_text_from_class(self, class_name):
        try:
            return self.apt_driver.find_element(By.CLASS_NAME, class_name).text
        except:
            # print('Not Found class', class_name)
            return None

    def _get_number_from_class(self, class_name):
        try:
            found_driver = self.apt_driver.find_element(By.CLASS_NAME, class_name)
            return int(''.join(re.findall(r'\d+', found_driver.text)))
        except:
            # print('Not Found class', class_name)
            return None

    def mount_df(self):
        return (
            self.get_rental_price(),
            self.get_condo_price(),
            self.get_area(),
            self.get_rooms(),
            self.get_bathrooms(),
            self.get_address(),
            self.get_amenities(),
            self.get_url()
            )



In [386]:
driver = webdriver.Chrome('chromedriver.exe')
link_raw = 'https://www.vivareal.com.br/aluguel/' + \
            'rj/rio-de-janeiro/zona-sul/{0}/' + \
            '#area-desde={1}&banheiros={2}&preco-ate={3}&preco-total=sim&quartos={4}'

neighborhoods = ['ipanema', 'leblon']
min_area = '100'
min_bathrooms = '2'
max_price = '9000'
min_rooms = '3'

data = []
for neighborhood in neighborhoods:
    print('Getting data from:', neighborhood)
    driver.get(link_raw.format(neighborhood, min_area, min_bathrooms, max_price, min_rooms))
    driver.maximize_window()
    try:
        driver.find_element(By.XPATH, '//*[@id="cookie-notifier-cta"]').click()
    except:
        pass
    time.sleep(5)
    i = 1
    while True:
        apartments = driver.find_elements(By.XPATH, '//*[@id="js-site-main"]/div[2]/div[1]/section/div[2]/div[1]/div')
        apt_list = [Apartment(apt) for apt in apartments]
        data.extend([apt.mount_df() for apt in apt_list])
        i += 1
        try:
            print('Trying to go to page n.', i)
            next_page = driver.find_element(By.XPATH, "//*[contains(text(), 'Próxima página')]")
            next_page.location_once_scrolled_into_view
            print('waiting to click')
            time.sleep(2)
            next_page.click()
            time.sleep(4)
        except Exception as e:
            print(e)
            print(f'Page {i} not found')
            break

driver.quit()

  driver = webdriver.Chrome('chromedriver.exe')


Getting data from: ipanema
Trying to go to page n. 2
waiting to click
Trying to go to page n. 3
waiting to click
Trying to go to page n. 4
waiting to click


StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=96.0.4664.110)
Stacktrace:
Backtrace:
	Ordinal0 [0x00BD65C3+2516419]
	Ordinal0 [0x00B6FAA1+2095777]
	Ordinal0 [0x00A72698+1058456]
	Ordinal0 [0x00A74D94+1068436]
	Ordinal0 [0x00A74C5E+1068126]
	Ordinal0 [0x00A74EC0+1068736]
	Ordinal0 [0x00A9D022+1232930]
	Ordinal0 [0x00A9D49B+1234075]
	Ordinal0 [0x00A938F1+1194225]
	Ordinal0 [0x00AB637A+1336186]
	Ordinal0 [0x00A93876+1194102]
	Ordinal0 [0x00AB642A+1336362]
	Ordinal0 [0x00AC5A1F+1399327]
	Ordinal0 [0x00AB620B+1335819]
	Ordinal0 [0x00A925E7+1189351]
	Ordinal0 [0x00A93449+1193033]
	GetHandleVerifier [0x00D63AA4+1573796]
	GetHandleVerifier [0x00E0D8E7+2269671]
	GetHandleVerifier [0x00C6491B+528923]
	GetHandleVerifier [0x00C63D69+525929]
	Ordinal0 [0x00B750A9+2117801]
	Ordinal0 [0x00B79458+2135128]
	Ordinal0 [0x00B79592+2135442]
	Ordinal0 [0x00B830F1+2175217]
	BaseThreadInitThunk [0x76EEFA29+25]
	RtlGetAppContainerNamedObjectPath [0x77947A9E+286]
	RtlGetAppContainerNamedObjectPath [0x77947A6E+238]


In [260]:
df = pd.DataFrame(data, columns = ['rental_price', 'condo_price', 'area', 'rooms', 'bathrooms', 'address', 'amenities', 'url'])
print(len(df))
df.head()

281


Unnamed: 0,rental_price,condo_price,area,rooms,bathrooms,address,amenities,url
0,5900,1950.0,158,3,2,"Ipanema, Rio de Janeiro - RJ","Elevador, Interfone, Salão de festas, Armário ...",https://www.vivareal.com.br/imovel/apartamento...
1,4100,2420.0,100,3,2,"Rua Prudente de Morais, 1133 - Ipanema, Rio de...","Elevador, Condomínio fechado, Portão eletrônic...",https://www.vivareal.com.br/imovel/apartamento...
2,5500,2200.0,117,3,3,"Rua Garcia D'Avila - Ipanema, Rio de Janeiro - RJ","Elevador, Ar-condicionado, Salão de festas, Ar...",https://www.vivareal.com.br/imovel/apartamento...
3,4300,1570.0,102,3,3,"Rua Visconde de Pirajá - Ipanema, Rio de Janei...","Portaria 24h, Portão eletrônico, Aceita animais",https://www.vivareal.com.br/imovel/apartamento...
4,6700,1750.0,158,3,3,"Avenida Epitácio Pessoa - Ipanema, Rio de Jane...",Ar-condicionado,https://www.vivareal.com.br/imovel/apartamento...


In [285]:
address_blocklist = ['Elizabeth', 'Teixeira', 'Correia de Melo', 'Gorceix', 'Timóteo', 'Canning', 'Parreiras', 'Epitácio',
'Bartolomeu', 'Farme', 'Gomes Carneiro', 'Tubira', 'Francisco Otaviano', 'Borges de Medeiros']

for block_address in address_blocklist:
    df = df[~df['address'].str.contains(block_address)]

In [286]:
df['cond_per_rent'] = df['condo_price'] / df['rental_price']
df = df.sort_values(by='cond_per_rent').reset_index(drop=True)
df = df.dropna(subset=['condo_price'])
df['total'] = df['condo_price'] + df['rental_price']
df.tail(50)

Unnamed: 0,rental_price,condo_price,area,rooms,bathrooms,address,amenities,url,cond_per_rent,num_flag,total
156,6000,2375.0,125,3,3,"Rua Prudente de Morais, 261 - Ipanema, Rio de ...",,https://www.vivareal.com.br/imovel/apartamento...,0.395833,True,8375.0
157,4800,1900.0,101,3,3,"Avenida Ataulfo de Paiva - Leblon, Rio de Jane...",Elevador,https://www.vivareal.com.br/imovel/apartamento...,0.395833,,6700.0
158,6000,2380.0,135,3,2,"Rua Barão da Torre - Ipanema, Rio de Janeiro - RJ","Elevador, Quarto de serviço, Banheiro de servi...",https://www.vivareal.com.br/imovel/apartamento...,0.396667,,8380.0
159,5000,2000.0,133,4,4,"Leblon, Rio de Janeiro - RJ","Elevador, Interfone",https://www.vivareal.com.br/imovel/apartamento...,0.4,,7000.0
160,5500,2200.0,117,3,3,"Rua Garcia D'Avila - Ipanema, Rio de Janeiro - RJ","Elevador, Interfone, Armário na cozinha, Armár...",https://www.vivareal.com.br/imovel/apartamento...,0.4,,7700.0
161,5500,2200.0,120,3,2,"Rua Garcia D'Avila - Ipanema, Rio de Janeiro - RJ","Elevador, Garagem, Cozinha, Interfone, Circuit...",https://www.vivareal.com.br/imovel/apartamento...,0.4,,7700.0
162,5500,2200.0,117,3,2,"Rua Garcia D'Avila, 57 - Ipanema, Rio de Janei...","Cozinha, Elevador, Área de serviço",https://www.vivareal.com.br/imovel/apartamento...,0.4,True,7700.0
163,5500,2200.0,117,3,3,"Rua Garcia D'Avila - Ipanema, Rio de Janeiro - RJ","Elevador, Condomínio fechado, Aceita animais, ...",https://www.vivareal.com.br/imovel/apartamento...,0.4,,7700.0
164,5500,2200.0,117,3,3,"Rua Garcia D'Avila - Ipanema, Rio de Janeiro - RJ","Elevador, Ar-condicionado, Salão de festas, Ar...",https://www.vivareal.com.br/imovel/apartamento...,0.4,,7700.0
165,5000,2000.0,110,3,3,"Rua Garcia D'Avila - Ipanema, Rio de Janeiro - RJ","Mobiliado, Elevador, Condomínio fechado, Ar-co...",https://www.vivareal.com.br/imovel/apartamento...,0.4,,7000.0


In [287]:
df.to_excel('base_20211226.xlsx', index=False)

In [274]:
def contains_number_flag(row):
    for char in row:
        if char.isdigit():
            return True


In [275]:
df['num_flag'] = df['address'].apply(lambda x: contains_number_flag(x))

In [288]:
driver = webdriver.Chrome('chromedriver.exe')

  driver = webdriver.Chrome('chromedriver.exe')


In [350]:
def get_nearest_subway(driver, link):
    driver.get(link)
    time.sleep(random.randint(4, 10) + random.random())
    subways = driver.find_elements(By.CLASS_NAME, 'poi-nearby__item')
    min_distance = 100000
    for subway in subways:
        subway_name = subway.text
        subway_distance = subway.find_element(By.CLASS_NAME, 'poi-nearby__item--distance').text
        if 'k' in subway_distance:
            subway_distance = int(float(subway_distance.split('km')[0])*1000)
        elif 'm' in subway_distance:
            subway_distance = int(subway_distance.split('m')[0])
        
        if subway_distance < min_distance:
            min_distance = subway_distance
            min_name = subway_name
    return (min_name.split(str(min_distance))[0], min_distance)
        



In [352]:
import undetected_chromedriver as uc
from selenium import webdriver
import time

options = webdriver.ChromeOptions() 
options.add_argument("start-maximized")
driver = uc.Chrome(options=options)

In [353]:
nearest_subway = df['url'].apply(lambda x: get_nearest_subway(driver, x))

In [354]:
df['nearest_subway'] = nearest_subway
df.to_excel('base_20211226_locations.xlsx', index=False)

In [355]:
nearest_subway

0      (Metrô Nossa Senhora Da Paz , 216)
1         (Metrô Antero De Quental , 344)
2      (Metrô Nossa Senhora Da Paz , 176)
3      (Metrô Nossa Senhora Da Paz , 176)
4      (Metrô Nossa Senhora Da Paz , 176)
                      ...                
201          (Metrô General Osório , 701)
202       (Metrô Antero De Quental , 159)
203    (Metrô Nossa Senhora Da Paz , 176)
204          (Metrô General Osório , 701)
205          (Metrô General Osório , 376)
Name: url, Length: 206, dtype: object

In [369]:
df['nearest_subway'] = nearest_subway

# Tem que pegar todas as distancias e tirar se tiver algum menor que < 800m de osorio 

In [372]:
df['nearest_subway_name'] = df['nearest_subway'].apply(lambda x : x[0])
df['nearest_subway_distance'] = df['nearest_subway'].apply(lambda x : x[1])

In [383]:
df[~(df['nearest_subway_name'].str.contains('Osório')) & (df['nearest_subway_distance'] < 800)]

Unnamed: 0,rental_price,condo_price,area,rooms,bathrooms,address,amenities,url,cond_per_rent,num_flag,total,nearest_subway,nearest_subway_name,nearest_subway_distance
0,7900,690.0,120,3,3,"Rua Nascimento Silva - Ipanema, Rio de Janeiro...","Elevador, Condomínio fechado, Varanda, Aceita ...",https://www.vivareal.com.br/imovel/apartamento...,0.087342,,8590.0,"(Metrô Nossa Senhora Da Paz , 216)",Metrô Nossa Senhora Da Paz,216
1,7470,1003.0,100,3,2,"Rua Dias Ferreira - Leblon, Rio de Janeiro - RJ","Mobiliado, Elevador, Ar-condicionado, Área de ...",https://www.vivareal.com.br/imovel/apartamento...,0.134270,,8473.0,"(Metrô Antero De Quental , 344)",Metrô Antero De Quental,344
2,7500,1133.0,100,3,3,"Rua Prudente de Morais - Ipanema, Rio de Janei...","Ar-condicionado, Cozinha, Elevador, Interfone,...",https://www.vivareal.com.br/imovel/apartamento...,0.151067,,8633.0,"(Metrô Nossa Senhora Da Paz , 176)",Metrô Nossa Senhora Da Paz,176
3,7500,1133.0,100,3,3,"Rua Prudente de Morais - Ipanema, Rio de Janei...","Elevador, Academia, Ar-condicionado, Varanda, ...",https://www.vivareal.com.br/imovel/apartamento...,0.151067,,8633.0,"(Metrô Nossa Senhora Da Paz , 176)",Metrô Nossa Senhora Da Paz,176
4,7200,1100.0,110,3,3,"Rua Prudente de Morais - Ipanema, Rio de Janei...","Elevador, Condomínio fechado, Varanda, Academi...",https://www.vivareal.com.br/imovel/apartamento...,0.152778,,8300.0,"(Metrô Nossa Senhora Da Paz , 176)",Metrô Nossa Senhora Da Paz,176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,5000,3285.0,113,3,2,"Rua Vinícius de Moraes, 197 - Ipanema, Rio de ...","Playground, Portaria 24h, Ar-condicionado",https://www.vivareal.com.br/imovel/apartamento...,0.657000,True,8285.0,"(Metrô Nossa Senhora Da Paz , 332)",Metrô Nossa Senhora Da Paz,332
199,5000,3300.0,191,4,2,"Rua Aníbal de Mendonça - Ipanema, Rio de Janei...","Andar inteiro, Armário na cozinha, Armário emb...",https://www.vivareal.com.br/imovel/apartamento...,0.660000,,8300.0,"(Metrô Nossa Senhora Da Paz , 544)",Metrô Nossa Senhora Da Paz,544
200,2550,1750.0,105,3,3,"Rua Visconde de Pirajá - Ipanema, Rio de Janei...",Elevador,https://www.vivareal.com.br/imovel/apartamento...,0.686275,,4300.0,"(Metrô Nossa Senhora Da Paz , 120)",Metrô Nossa Senhora Da Paz,120
202,2000,1700.0,100,3,2,"Rua João de Barros - Leblon, Rio de Janeiro - RJ","Ar-condicionado, Jardim, Área de serviço, Armá...",https://www.vivareal.com.br/imovel/apartamento...,0.850000,,3700.0,"(Metrô Antero De Quental , 159)",Metrô Antero De Quental,159


In [368]:
df[df['nearest_subway_name'].str.contains('Osório')]

Unnamed: 0,rental_price,condo_price,area,rooms,bathrooms,address,amenities,url,cond_per_rent,num_flag,total,nearest_subway
41,5800,1373.0,150,3,3,"Ipanema, Rio de Janeiro - RJ","Mobiliado, Elevador, Ar-condicionado, Jardim, ...",https://www.vivareal.com.br/imovel/apartamento...,0.236724,,7173.0,"('Metrô General Osório ', 304)"
53,4200,1098.0,130,3,4,"Ipanema, Rio de Janeiro - RJ",,https://www.vivareal.com.br/imovel/apartamento...,0.261429,,5298.0,"('Metrô General Osório ', 304)"
57,4500,1200.0,140,3,3,"Ipanema, Rio de Janeiro - RJ",Elevador,https://www.vivareal.com.br/imovel/apartamento...,0.266667,,5700.0,"('Metrô General Osório ', 309)"
93,4200,1300.0,118,3,2,"Ipanema, Rio de Janeiro - RJ","Ar-condicionado, Armário embutido, Armário na ...",https://www.vivareal.com.br/imovel/apartamento...,0.309524,,5500.0,"('Metrô General Osório ', 304)"
95,6700,2106.0,140,3,2,"Ipanema, Rio de Janeiro - RJ","Elevador, Interfone",https://www.vivareal.com.br/imovel/apartamento...,0.314328,,8806.0,"('Metrô General Osório ', 304)"
117,6000,2008.0,170,3,2,"Rua Barão da Torre, 489 - Ipanema, Rio de Jane...","Cozinha, Elevador, Interfone, Área de serviço",https://www.vivareal.com.br/imovel/apartamento...,0.334667,True,8008.0,"('Metrô General Osório ', 921)"
123,6500,2200.0,140,3,2,"Ipanema, Rio de Janeiro - RJ","Cozinha, Elevador, Interfone, TV a cabo, Área ...",https://www.vivareal.com.br/imovel/apartamento...,0.338462,,8700.0,"('Metrô General Osório ', 863)"
147,4500,1700.0,135,3,2,"Rua Joaquim Nabuco - Ipanema, Rio de Janeiro - RJ","Elevador, Armário na cozinha, Armário embutido...",https://www.vivareal.com.br/imovel/apartamento...,0.377778,,6200.0,"('Metrô General Osório ', 508)"
151,6500,2500.0,140,3,3,"Avenida Vieira Souto - Ipanema, Rio de Janeiro...","Elevador, Área de serviço",https://www.vivareal.com.br/imovel/apartamento...,0.384615,,9000.0,"('Metrô General Osório ', 966)"
158,6000,2380.0,135,3,2,"Rua Barão da Torre - Ipanema, Rio de Janeiro - RJ","Elevador, Quarto de serviço, Banheiro de servi...",https://www.vivareal.com.br/imovel/apartamento...,0.396667,,8380.0,"('Metrô General Osório ', 921)"
