In [18]:
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
from numpy import random
from datetime import datetime
# UTC date stamp (YYYYMMDD); interpolated into the names of the Excel files written further down.
today = datetime.utcnow().strftime('%Y%m%d')
import undetected_chromedriver as uc
from selenium import webdriver
import time


In [19]:
class Apartment:
    """Field extractor for a single listing card.

    Wraps one element (as returned by ``driver.find_elements``) and exposes a
    getter per field. Every getter also caches its result on the instance
    under the matching attribute name (``rental_price``, ``area``, ...).
    All getters except ``get_url`` return ``None`` when the field is missing.
    """

    def __init__(self, apt_driver):
        """Store the card element that all lookups are scoped to."""
        self.apt_driver = apt_driver

    def get_url(self):
        """Return the ``href`` of the first descendant carrying that attribute.

        This lookup is intentionally left unguarded: a card without any link
        raises whatever ``find_element`` raises.
        """
        self.url = self.apt_driver.find_element(By.CSS_SELECTOR, '[href]').get_attribute('href')
        return self.url

    def get_rental_price(self):
        """Return the rental price as an int, or ``None`` if unavailable."""
        self.rental_price = self._get_number_from_class('js-property-card__price-small')
        return self.rental_price

    def get_condo_price(self):
        """Return the condo fee as an int, or ``None`` if unavailable."""
        self.condo_price = self._get_number_from_class('js-condo-price')
        return self.condo_price

    def get_area(self):
        """Return the area as an int, or ``None`` if unavailable."""
        self.area = self._get_number_from_class('js-property-card-detail-area')
        return self.area

    def get_rooms(self):
        """Return the number of rooms as an int, or ``None`` if unavailable."""
        self.rooms = self._get_number_from_class('js-property-detail-rooms')
        return self.rooms

    def get_bathrooms(self):
        """Return the number of bathrooms as an int, or ``None`` if unavailable."""
        self.bathrooms = self._get_number_from_class('js-property-detail-bathroom')
        return self.bathrooms

    def get_address(self):
        """Return the address text, or ``None`` if unavailable."""
        self.address = self._get_text_from_class('property-card__address')
        return self.address

    def get_amenities(self):
        """Return the amenities as one comma-separated string, or ``None``.

        The element text lists one amenity per line; the lines are joined
        with ``', '``. ``self.amenities`` is always assigned, so it can be read
        safely even when the card has no amenities.
        """
        raw_amenities = self._get_text_from_class('property-card__amenities')
        self.amenities = ', '.join(raw_amenities.split('\n')) if raw_amenities else None
        return self.amenities

    def _get_text_from_class(self, class_name):
        """Return the text of the first descendant with ``class_name``, else ``None``."""
        try:
            return self.apt_driver.find_element(By.CLASS_NAME, class_name).text
        except Exception:
            # Best effort: a missing element just means the field is absent.
            # ``Exception`` (not a bare except) lets Ctrl+C / SystemExit through.
            return None

    def _get_number_from_class(self, class_name):
        """Return the digits found in the element's text as a single int, else ``None``.

        All digit groups are concatenated before conversion, so ``'R$ 3.500'``
        becomes ``3500``. Text without any digit yields ``None``.
        """
        try:
            found_driver = self.apt_driver.find_element(By.CLASS_NAME, class_name)
            return int(''.join(re.findall(r'\d+', found_driver.text)))
        except Exception:
            # Covers both a missing element and text with no digits (int('') fails).
            return None

    def mount_df(self):
        """Return every field as one tuple, in the column order used for the DataFrame.

        Order: rental_price, condo_price, area, rooms, bathrooms, address,
        amenities, url.
        """
        return (
            self.get_rental_price(),
            self.get_condo_price(),
            self.get_area(),
            self.get_rooms(),
            self.get_bathrooms(),
            self.get_address(),
            self.get_amenities(),
            self.get_url()
            )



In [20]:
def contains_number_flag(row):
    """Tell whether ``row`` contains at least one digit character.

    Args:
        row: Usually an address string. Missing values (``None``, NaN) and
            other non-iterables are accepted and count as "no digit".

    Returns:
        bool: ``True`` if any character satisfies ``str.isdigit``; otherwise
        ``False`` (always a real boolean, never ``None``).
    """
    try:
        return any(char.isdigit() for char in row)
    except TypeError:
        # Non-iterable input such as None or a float NaN from pandas.
        return False


def _parse_distance_in_meters(distance_text):
    """Convert a label such as ``'350 m'`` or ``'1.2 km'`` to whole meters, else ``None``."""
    label = distance_text.strip().replace(',', '.')
    try:
        if 'k' in label:
            return int(round(float(label.split('km')[0]) * 1000))
        if 'm' in label:
            return int(label.split('m')[0])
    except ValueError:
        return None
    return None


def _strip_distance(item_text, distance_text, distance):
    """Return the part of ``item_text`` that precedes its distance label."""
    # Prefer the raw label (works for km values too); fall back to the number in meters.
    for marker in (distance_text, str(distance)):
        if marker and marker in item_text:
            return item_text.split(marker)[0]
    return item_text


def get_nearest_subway(driver, link):
    """Open ``link`` and return the closest entry among the page's nearby items.

    Every element with class ``poi-nearby__item`` is inspected; its distance is
    read from the nested ``poi-nearby__item--distance`` element. Items whose
    distance cannot be parsed are skipped.

    Args:
        driver: Selenium driver used to load the page.
        link: URL of the listing page.

    Returns:
        tuple: ``(name, distance_in_meters)`` of the nearest item, where
        ``name`` is the item text up to its distance label. ``(None, None)``
        when the page has no usable item.
    """
    driver.get(link)
    # Randomised pause between page loads (4 to just under 10 seconds).
    time.sleep(random.randint(4, 10) + random.random())

    nearest_name, nearest_distance = None, None
    for subway in driver.find_elements(By.CLASS_NAME, 'poi-nearby__item'):
        item_text = subway.text
        distance_text = subway.find_element(By.CLASS_NAME, 'poi-nearby__item--distance').text
        distance = _parse_distance_in_meters(distance_text)
        if distance is None:
            continue
        if nearest_distance is None or distance < nearest_distance:
            nearest_distance = distance
            nearest_name = _strip_distance(item_text, distance_text, distance)

    # Return only after every item was compared, so the minimum really is the minimum.
    return (nearest_name, nearest_distance)

In [21]:
# Browser session used for the listing scrape.
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
driver = uc.Chrome(options=options)
# driver = webdriver.Chrome('chromedriver.exe')

# Search URL template: {0}=neighborhood, {1}=min area, {2}=min bathrooms,
# {3}=max price, {4}=min rooms.
link_raw = 'https://www.vivareal.com.br/aluguel/' + \
            'rj/rio-de-janeiro/zona-sul/{0}/' + \
            '#area-desde={1}&banheiros={2}&preco-ate={3}&preco-total=sim&quartos={4}'

# Search filters (kept as strings because they are only formatted into the URL).
neighborhoods = ['ipanema', 'leblon']
min_area = '70'
min_bathrooms = '2'
max_price = '6000'
min_rooms = '2'

data = []
try:
    for neighborhood in neighborhoods:
        print('Getting data from:', neighborhood)
        driver.get(link_raw.format(neighborhood, min_area, min_bathrooms, max_price, min_rooms))
        driver.maximize_window()
        try:
            driver.find_element(By.XPATH, '//*[@id="cookie-notifier-cta"]').click()
        except Exception:
            # Best effort: the cookie banner is only there on some loads.
            pass
        time.sleep(5)
        i = 1  # number of the results page currently being read
        while True:
            apartments = driver.find_elements(By.XPATH, '//*[@id="js-site-main"]/div[2]/div[1]/section/div[2]/div[1]/div')
            apt_list = [Apartment(apt) for apt in apartments]
            data.extend([apt.mount_df() for apt in apt_list])
            i += 1
            try:
                print('Trying to go to page n.', i)
                next_page = driver.find_element(By.XPATH, "//*[contains(text(), 'Próxima página')]")
                # Reading this property is what scrolls the link into view.
                next_page.location_once_scrolled_into_view
                print('waiting to click')
                time.sleep(2)
                next_page.click()
                time.sleep(4)
            except Exception as e:
                # Pagination ends here: when the "next page" link is missing or
                # cannot be clicked, move on to the next neighborhood.
                print(e)
                print(f'Page {i} not found')
                break
finally:
    # Always close the browser, even when the scrape aborts midway.
    driver.quit()

df = pd.DataFrame(data, columns = ['rental_price', 'condo_price', 'area', 'rooms', 'bathrooms', 'address', 'amenities', 'url'])
df.to_excel(f'base_raw_{min_rooms}_rooms_{today}.xlsx', index=False)


Getting data from: ipanema
Trying to go to page n. 2
waiting to click
Trying to go to page n. 3
waiting to click
Trying to go to page n. 4
waiting to click
Message: element click intercepted: Element <a href="#pagina=" class="js-change-page" title="Próxima página" data-page="" data-disabled="">...</a> is not clickable at point (958, 16). Other element would receive the click: <li class="pagination__item">...</li>
  (Session info: chrome=97.0.4692.71)
Stacktrace:
Backtrace:
	Ordinal0 [0x008EFDC3+2555331]
	Ordinal0 [0x008877F1+2127857]
	Ordinal0 [0x00782E08+1060360]
	Ordinal0 [0x007B48CF+1263823]
	Ordinal0 [0x007B2B10+1256208]
	Ordinal0 [0x007B076B+1247083]
	Ordinal0 [0x007AF559+1242457]
	Ordinal0 [0x007A4FB3+1200051]
	Ordinal0 [0x007C7B0C+1342220]
	Ordinal0 [0x007A4984+1198468]
	Ordinal0 [0x007C7C14+1342484]
	Ordinal0 [0x007D75FA+1406458]
	Ordinal0 [0x007C7976+1341814]
	Ordinal0 [0x007A36B6+1193654]
	Ordinal0 [0x007A4546+1197382]
	GetHandleVerifier [0x00A89622+1619522]
	GetHandleVerifie

In [22]:
# Street-name fragments; any listing whose address contains one of them is discarded.
address_blocklist = ['Elizabeth', 'Teixeira', 'Correia de Melo', 'Gorceix', 'Timóteo', 'Canning', 'Parreiras', 'Epitácio',
'Bartolomeu', 'Farme', 'Gomes Carneiro', 'Tubira', 'Francisco Otaviano', 'Borges de Medeiros', 'Almirante Saddock']

# One escaped alternation matches every fragment literally in a single pass.
# na=False marks listings without an address as "not blocked"; without it the
# mask contains NaN and `~` raises.
blocked_pattern = '|'.join(re.escape(block_address) for block_address in address_blocklist)
is_blocked = df['address'].str.contains(blocked_pattern, na=False)
# .copy() so the column assignments below act on an independent frame.
df = df[~is_blocked].copy()

df['cond_per_rent'] = df['condo_price'] / df['rental_price']
df = df.sort_values(by='cond_per_rent')
# Reset the index after the last row removal so it stays contiguous.
df = df.dropna(subset=['condo_price']).reset_index(drop=True)
df['total'] = df['condo_price'] + df['rental_price']
# fillna('') keeps the digit check from failing on a missing address.
df['num_flag'] = df['address'].fillna('').apply(contains_number_flag)
df.to_excel(f'base_locations_{min_rooms}_rooms_{today}.xlsx', index=False)

In [30]:
# Visit every listing URL with a plain Selenium Chrome session (uc.Chrome was
# the alternative tried before). The `with` block closes the browser on exit.
with webdriver.Chrome() as wd:
    listing_urls = df['url']
    nearest_subway = listing_urls.apply(lambda listing_url: get_nearest_subway(wd, listing_url))

In [31]:
df['nearest_subway'] = nearest_subway
# A lookup can come back empty for a listing; keep those rows with missing
# values instead of failing on the subscript.
df['nearest_subway_name'] = df['nearest_subway'].apply(lambda x : x[0] if x else None)
df['nearest_subway_distance'] = pd.to_numeric(
    df['nearest_subway'].apply(lambda x : x[1] if x else None), errors='coerce'
)

# na=False: a listing without a station name is not an 'Osório' match.
# A missing distance compares as False, so such rows are left out of the filter.
is_osorio = df['nearest_subway_name'].str.contains('Osório', na=False)
df_filter = df[~is_osorio & (df['nearest_subway_distance'] < 800)]

df.to_excel(f'base_subway_{min_rooms}_rooms_{today}.xlsx', index=False)
df_filter.to_excel(f'base_filter_{min_rooms}_{today}.xlsx', index=False)