In [1]:
import json
import os
import time

import pandas as pd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait


In [115]:
# Chrome launch configuration passed to every webdriver.Chrome(options=options) call.
options = webdriver.ChromeOptions()
options.add_extension('adblock.crx')  # packed extension, path relative to the working directory

# Command-line switches, applied in the order listed.
# NOTE(review): '--disable-web-security' and the certificate/SSL switches relax
# browser safety checks — confirm they are really required for this scrape.
chrome_flags = (
    '--start-maximized',
    '--blink-settings=imagesEnabled=false',  # block images
    '--ignore-certificate-errors',
    '--ignore-ssl-errors',
    '--ignore-gpu-blacklist',
    '--use-gl',
    "--disable-cookies",
    '--disable-web-security',
)
for chrome_flag in chrome_flags:
    options.add_argument(chrome_flag)

options.add_experimental_option("excludeSwitches", ['enable-logging'])

In [51]:
# Region lookup table: each key is the short region code that gets appended to
# BASE_URL when building a listing URL; each value is the region's Ukrainian
# display name. The code in this notebook iterates over the keys only.
regions_codes = {
    'if': 'Івано-Франківська область',
    'vin': 'Вінницька область',
    'vol': 'Волинська область',
    'dnp': 'Дніпропетровська область',
    'don': 'Донецька область',
    'zht': 'Житомирська область',
    'zak': 'Закарпатська область',
    'zap': 'Запорізька область',
    'kir': 'Кіровоградська область',
    'ko': 'Київська область',
    # 'cri': 'Крим (АРК)',  -- Crimea entry, currently disabled
    'lug': 'Луганська область',
    'lv': 'Львівська область',
    'nik': 'Миколаївська область',
    'od': 'Одеська область',
    'pol': 'Полтавська область',
    'rov': 'Рівненська область',
    'sum': 'Сумська область',
    'ter': 'Тернопільська область',
    'kha': 'Харківська область',
    'khe': 'Херсонська область',
    'khm': 'Хмельницька область',
    'chk': 'Черкаська область',
    'chv': 'Чернівецька область',
    'chn': 'Чернігівська область'
}

In [114]:
def _first_element_or_none(driver, class_name):
    """Return the first element with CSS class ``class_name`` on the current page, or None."""
    try:
        return driver.find_element(By.CLASS_NAME, class_name)
    except NoSuchElementException:
        return None


def grab_data(driver, links_to_check):
    """Visit every ad link and collect the fields shown on its page.

    Args:
        driver: a Selenium WebDriver used to open each link.
        links_to_check: iterable of ad URLs (list, set, generator, ...).

    Returns:
        A list with one dict per link, keyed by "URL_ads", "place", "price",
        "total_area", "floor", "total_floors" and "description". A field is
        None when the matching element is not found on the page.
    """
    links = list(links_to_check)  # materialise once so any iterable has a known length
    total = len(links)
    data = []
    start_time = time.time()

    for count, link in enumerate(links, start=1):
        driver.get(link)

        price_elem = _first_element_or_none(driver, 'css-12vqlj3')
        description_elem = _first_element_or_none(driver, 'css-1juynto')
        place_elem = _first_element_or_none(driver, 'css-149mw5z')

        price = price_elem.text if price_elem is not None else None
        description = description_elem.text if description_elem is not None else None
        # The place is read from the element's "alt" attribute, not from its text.
        place = place_elem.get_attribute("alt") if place_elem is not None else None

        floor_current = None
        total_flooring = None
        total_area = None
        # find_elements returns an empty list when nothing matches (it does not
        # raise NoSuchElementException), so no try/except is needed here.
        for elem in driver.find_elements(By.CLASS_NAME, "css-1r0si1e"):
            text_elem = elem.text
            if "Поверх:" in text_elem:
                floor_current = text_elem
            elif "Поверховість:" in text_elem:
                total_flooring = text_elem
            elif "Загальна площа: " in text_elem:
                total_area = text_elem

        data.append({
            "URL_ads": link,
            "place": place,
            "price": price,
            "total_area": total_area,
            "floor": floor_current,
            "total_floors": total_flooring,
            "description": description,
        })

        # Report progress once per 50 links, not once per link.
        if count % 50 == 0:
            execution_time = time.time() - start_time
            print(f"Batch done in: {execution_time:.2f}")
            print(f"More 50 processed from {total} left {total - count}")
            start_time = time.time()

    return data

In [59]:
# Root listing URL; a region code from regions_codes plus a query string is appended to it.
BASE_URL = "https://www.olx.ua/uk/nedvizhimost/kvartiry/"

In [None]:
# Preview the first-page listing URL built for each region code.
query_suffix = '/?currency=UAH&search%5Border%5D=created_at:desc'
for region_code in regions_codes:
    print(f"{BASE_URL}{region_code}{query_suffix}")

In [53]:
def _wait_until_loaded(driver, timeout=30):
    """Wait until ``document.readyState`` is "complete".

    Uses WebDriverWait, which polls at intervals and raises selenium's
    TimeoutException after ``timeout`` seconds, instead of a busy loop that
    spins the CPU and never gives up.
    """
    WebDriverWait(driver, timeout).until(
        lambda drv: drv.execute_script("return document.readyState") == "complete")


def _links_on_current_page(driver):
    """Return the href of every element with CSS class 'css-z3gu2d' on the current page."""
    return [anchor.get_attribute('href')
            for anchor in driver.find_elements(By.CLASS_NAME, 'css-z3gu2d')]


def init_regions(BASE_URL, regions_codes):
    """Collect listing links from every paginated results page of every region.

    A fresh Chrome session (built from the module-level ``options``) is opened
    for each region code and is always closed again, even when scraping fails.

    Args:
        BASE_URL: root listing URL to which each region code is appended.
        regions_codes: mapping whose keys are the region codes to scrape.

    Returns:
        A list of href values gathered across all regions (may contain duplicates).

    Raises:
        selenium TimeoutException: when a page does not finish loading or the
            cookie-consent button does not become clickable in time.
    """
    all_links = []
    for code in regions_codes:
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(BASE_URL + code + '/?currency=UAH&search%5Border%5D=created_at:desc')
            _wait_until_loaded(driver)
            cookie_ok_button = WebDriverWait(driver, 5).until(
                ec.element_to_be_clickable((By.XPATH, '//*[@id="hydrate-root"]/div[4]/div[3]/button')))
            cookie_ok_button.click()
            print(driver.title)

            # The page just loaded is page 1 of the results.
            all_links.extend(_links_on_current_page(driver))

            # The last paginator item holds the total page count; a region
            # without a paginator has a single page of results.
            paginator_items = driver.find_elements(By.CLASS_NAME, 'css-1mi714g')
            total_pages = int(paginator_items[-1].text) if paginator_items else 1

            # Page 1 is already collected, so continue from page 2 and include
            # the final page.
            for page in range(2, total_pages + 1):
                print(f"Scrapping page # {page}")
                next_page = f'{BASE_URL}{code}/?currency=UAH&page={page}&search%5Border%5D=created_at%3Adesc'
                driver.get(next_page)
                _wait_until_loaded(driver)
                all_links.extend(_links_on_current_page(driver))
        finally:
            # Release the browser even if a lookup or wait raised.
            driver.quit()
    return all_links

In [None]:
# Collect listing links for every region code; init_regions opens one Chrome
# session per region and returns a flat list of hrefs (duplicates possible).
test = init_regions(BASE_URL, regions_codes)

In [None]:
# De-duplicate the collected links and persist them, one URL per line.
print(len(test))  # raw count; a bare `len(test)` in mid-cell would be discarded
all_links = set(test)
print(len(all_links))  # count after de-duplication
with open('All_links_regions.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{link}\n" for link in all_links)
# The message names the file that was actually written (.txt, not .json).
print("Дані збережено у файл 'All_links_regions.txt'")

In [None]:
# Scrape every unique link collected earlier and report the total run time.
start_time = time.time()

driver = webdriver.Chrome(options=options)
try:
    driver.get(BASE_URL)
    # Poll with a timeout instead of spinning in a busy loop.
    WebDriverWait(driver, 30).until(
        lambda drv: drv.execute_script("return document.readyState") == "complete")
    cookie_ok_button = WebDriverWait(driver, 5).until(
        ec.element_to_be_clickable((By.XPATH, '//*[@id="hydrate-root"]/div[4]/div[3]/button')))
    cookie_ok_button.click()

    all_links_to_grab = test
    all_links = set(all_links_to_grab)  # drop duplicate URLs before scraping
    print(len(all_links))
    data = grab_data(driver, all_links)
    end_time = time.time()
finally:
    # Close the browser even when scraping fails part-way through.
    driver.quit()

execution_time = end_time - start_time
print(f"Total time: {execution_time / 60:.2f} minutes ")

In [117]:
# Serialise the scraped records as indented JSON, keeping non-ASCII text
# unescaped, then write the result to disk as UTF-8.
serialized = json.dumps(data, ensure_ascii=False, indent=4)
with open('data.json', 'w', encoding='utf-8') as file:
    file.write(serialized)

print("Saved local links to 'data.json'")

Дані збережено у файл 'data.json'


In [118]:
# Repeats the driver.quit() already issued at the end of the scraping cell above.
# NOTE(review): this cell looks redundant — confirm and consider deleting it.
driver.quit()


In [None]:
!pip install gspread
!pip install oauth2client
!pip install openpyxl pydrive


In [None]:
# Export the scraped records to an Excel workbook and upload it to Google Drive.
DRIVE_FOLDER_ID = '1oeNCttwXUU-8XOclNSyTBAkrBeNdtU4K'  # id of the target Drive folder
file_path = 'real_estate_data.xlsx'

gauth = GoogleAuth()
gauth.LocalWebserverAuth()  # interactive local-webserver auth flow
drive = GoogleDrive(gauth)
df = pd.DataFrame(data)

# Always (re)write the workbook from the current data. Skipping the write when
# a file with this name already exists would upload a stale leftover instead.
df.to_excel(file_path, index=False)
print(f"{file_path} created!")

file_drive = drive.CreateFile(
    {'title': 'Real Estate Data', 'parents': [{'id': DRIVE_FOLDER_ID}]})
file_drive.SetContentFile(file_path)
file_drive.Upload()

# The local copy is only a staging file; remove it after a successful upload.
os.remove(file_path)
print(f"File {file_drive['title']} uploaded to Google Drive!")
