# Scraping

In [2]:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select

from bs4 import BeautifulSoup
import requests
import time
import datetime as dt 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas

import pickle
import os
import sys
import warnings

# Plot style and pandas display width used by the rest of the notebook.
plt.style.use('ggplot')
# Every warning is silenced globally; comment this line out while debugging.
warnings.filterwarnings('ignore')
pd.options.display.max_columns=200


# Project root; the `urls` folder with the pickled listing URLs lives below it.
# Set the MYHOME_PROJECT_DIR environment variable to run the notebook on
# another machine; without it the original location is used.
path=os.environ.get('MYHOME_PROJECT_DIR', r"C:\Users\berid\python\myhome project")


# Browser-like request headers.
# NOTE(review): presumably meant for plain `requests` calls; not referenced by
# the cells shown here.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}


from selenium.webdriver.edge.options import Options

# Headless Edge session configured to keep page loads light.
edge_options = Options()
edge_options.add_argument('--headless')
edge_options.add_argument("--blink-settings=imagesEnabled=false")  # Do not load images
# Block media autoplay: `user-gesture-required` only lets media start after a
# user interaction. (`no-user-gesture-required` would do the opposite and
# allow autoplay everywhere.)
edge_options.add_argument("--autoplay-policy=user-gesture-required")
edge_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")



# Single shared browser instance used by every scraping cell below.
driver = webdriver.Edge(options=edge_options)
driver.maximize_window()
# NOTE(review): this styles the document that is open right after start-up;
# the inline style is presumably lost as soon as `driver.get(...)` loads a new
# page - confirm whether the zoom is still needed.
driver.execute_script("document.body.style.zoom='25%'")


In [2]:
import datetime

# Compact day-month-year stamp (e.g. '201024') that tags the files written for this run.
today = format(datetime.datetime.today(), '%d%m%y')
today

'201024'

### Scraping URLs

In [None]:
all_urls=[]

# Folder and file that receive today's batch of listing URLs.
urls_dir=os.path.join(path,'urls')
urls_file=os.path.join(urls_dir,f'urls_{today}.pickle')
# A fresh checkout has no `urls` folder yet; create it instead of crashing in listdir.
os.makedirs(urls_dir,exist_ok=True)

# Scrape only when no file carrying today's stamp exists yet.
if not any(today in file_name for file_name in os.listdir(urls_dir)):
    try:
        for page in range(1,21):
            try:
                page_url=f"https://www.myhome.ge/s/iyideba-bina-Tbilisshi/?deal_types=1&real_estate_types=1&cities=1&currency_id=1&CardView=2&page={page}&slug=iyideba-bina-Tbilisshi"
                driver.get(page_url)

                # Wait (up to 10 s) until the listing links are visible.
                links=WebDriverWait(driver,10).until(expected_conditions.visibility_of_all_elements_located((By.CSS_SELECTOR,'div[class="relative flex w-full space-x-5"] a')))
                for link in links:
                    try:
                        href=link.get_attribute('href')
                    except Exception:
                        # Best effort: skip links that cannot be read.
                        continue
                    # Anchors without an href yield None; keep real URLs only.
                    if href:
                        all_urls.append(href)

                print(f'Page : {page}, Len : {len(all_urls)}', end='\r')

            except Exception as exc:
                # A failing page must not abort the run; report it and move on.
                # `Exception` (not a bare except) keeps Ctrl-C able to stop the loop.
                print(f'Page : {page} failed: {exc!r}')
    finally:
        # Persist whatever was collected, even if the loop was interrupted.
        with open(urls_file,'wb') as fh:
            pickle.dump(all_urls,fh)

Page : 6, Len : 100

In [3]:
# Rebuild the full URL list from every pickle saved under `<path>/urls`.
all_urls=[]
urls_dir=os.path.join(path,'urls')
for file in os.listdir(urls_dir):
    if file.endswith('pickle'):
        file_path=os.path.join(urls_dir,file)
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this notebook wrote itself.
        with open(file_path,'rb') as fh:
            data=pickle.load(fh)
        all_urls.extend(data)

# Drop duplicates while keeping first-seen order, so the list is the same on
# every run (a plain set would reorder the URLs between kernel sessions).
all_urls=list(dict.fromkeys(all_urls))
len(all_urls)

7284

### Scraping Listing Data and Saving It to a SQLite Database

In [5]:
# Selector shared by the detail cards (additional parameters, furniture, ...).
_DETAIL_CARD_CSS = 'div[class="py-5 pl-5 pr-8 mt-4 bg-white border rounded-2xl md:py-6 md:pl-6 border-gray-20 md:mt-5"]'


def _text_or_none(css):
    """Return the text of the first element matching `css`, or None if the lookup fails."""
    try:
        return driver.find_element(By.CSS_SELECTOR, css).text
    except Exception:
        return None


def _card_containing(label):
    """Return (card, card_text) for the first detail card whose text contains `label`, else (None, None)."""
    try:
        for card in driver.find_elements(By.CSS_SELECTOR, _DETAIL_CARD_CSS):
            card_text = card.text
            if label in card_text:
                return card, card_text
    except Exception:
        # Best effort: a failed lookup is reported as "card not found".
        return None, None
    return None, None


def _read_parameters(card):
    """Turn the two-line (name, value) rows of a parameters card into a dict, skipping malformed rows."""
    parameters = {}
    for row in card.find_elements(By.CSS_SELECTOR, 'div[class="flex text-sm "]'):
        parts = row.text.split('\n')
        # A row without a value line is skipped instead of discarding every parameter.
        if len(parts) >= 2:
            parameters[parts[0]] = parts[1]
    return parameters


def return_dict(url,scrape_data):
    """Open a listing page in the shared `driver` and collect its raw text fields.

    Every field is read on a best-effort basis: when an element cannot be
    found or read, the corresponding value is None and scraping continues.

    Parameters
    ----------
    url : str
        Address of the listing page to load.
    scrape_data : str
        Value stored unchanged under the 'Scrape Date' key.

    Returns
    -------
    dict
        Keys 'URL', 'Address', 'Street', 'Details', 'Price', 'Parameters',
        'Furniture' and 'Scrape Date'. 'Parameters' is a dict of
        name -> value (or None when the card is missing); the other scraped
        fields are the elements' text or None.
    """
    driver.get(url)
    # Fixed pause to give the page time to render before elements are read.
    time.sleep(2)

    address = _text_or_none('div[class="flex flex-wrap items-center justify-between lg:flex-nowrap"]')
    street = _text_or_none('div[class="px-0 pt-0 pb-4 mt-5 bg-white md:border md:border-gray-20 rounded-2xl md:px-6 md:pt-5 md:pb-6"] div[class="flex flex-col items-start"]')
    details = _text_or_none('div[class="items-center flex-wrap border border-gray-20 rounded-xl p-5 md:p-6 lg:p-8 mt-0 md:mt-4 justify-between grid grid-cols-2 md:grid-cols-4 gap-3"]')
    price = _text_or_none('div[class="col-span-3 hidden lg:block sticky"] div[class="flex items-center justify-start md:justify-between"]')

    # Click the "see more" button through JavaScript, if the page has one,
    # before the detail cards are read.
    try:
        see_more_button = driver.find_element(By.CSS_SELECTOR, 'button[class="text-primary-100 text-sm flex items-center gap-2 mt-4 hidden lg:flex"]')
        driver.execute_script('arguments[0].click();', see_more_button)
    except Exception:
        pass  # no button on this page - nothing to expand

    parameters_card, _ = _card_containing('დამატებითი პარამეტრები')
    try:
        parameters = _read_parameters(parameters_card) if parameters_card is not None else None
    except Exception:
        parameters = None

    _, furniture = _card_containing("ავეჯი")

    record = {'URL':url,'Address':address,'Street':street,'Details':details,'Price':price,'Parameters':parameters,'Furniture':furniture,'Scrape Date':scrape_data}
    return record


In [5]:
import sqlite3

# Open the SQLite database file (created on first use) and keep a cursor
# for the cells that follow.
conn = sqlite3.connect("myhome.db")
cursor = conn.cursor()

# Schema of the `homes` table: one TEXT column per scraped field.
create_homes_sql = """
CREATE TABLE IF NOT EXISTS homes (
    URL TEXT,
    ADDRESS TEXT,
    STREET TEXT,
    DETAILS TEXT,
    PRICE TEXT,
    PARAMETERS TEXT,
    FURNITURE TEXT,
    SCRAPE_DATE TEXT
)
"""

# Create the table when it is missing; leaving the `with` block commits.
with conn:
    cursor.execute(create_homes_sql)

In [6]:
# URLs that already have a row in the database. No `ADDRESS != "None"` filter
# is applied, so rows whose fields were stored as the string "None" also count
# as scraped.
screped_urls=pd.read_sql_query('SELECT DISTINCT URL FROM homes ',conn)['URL'].tolist()

# A set makes each membership test constant-time; checking against the list
# would rescan thousands of stored URLs for every candidate.
scraped_lookup=set(screped_urls)
urls_to_scrape=[url for url in all_urls if url not in scraped_lookup]

print(f'Scraped : {len(screped_urls)}, Left : {len(urls_to_scrape)}')

Scraped : 7333, Left : 8


In [8]:
# Scrape date written to the SCRAPE_DATE column.
# NOTE(review): this rebinds `today` to a dashed format, unlike the 'ddmmyy'
# stamp used for the URL pickle file names; re-running the URL cell after this
# one will no longer recognise today's file.
today=datetime.datetime.today().strftime('%d-%m-%y')

# Set copy of the stored URLs for constant-time membership tests.
already_scraped=set(screped_urls)

for i,url in enumerate(urls_to_scrape,start=1):
    if url in already_scraped:
        continue

    try:
        # `record` rather than `dict`: assigning to `dict` at cell level would
        # replace the builtin for the rest of the kernel session.
        record=return_dict(url,today)
        # Every value is stored as text; None becomes the string 'None'.
        values=[str(value) for value in record.values()]
        cursor.execute("INSERT INTO homes (URL, ADDRESS, STREET, DETAILS, PRICE, PARAMETERS, FURNITURE, SCRAPE_DATE) VALUES (?, ?, ?, ?, ?, ?, ?, ?)", values)
        # Commit per listing so an interrupted run keeps what it already stored.
        conn.commit()
    except Exception as e:
        print(f'{e}')
        # Nothing was stored for this URL, so there is no record to report.
        continue

    print(f'{i} / {len(urls_to_scrape)}, : {record}',end='\r')


63 / 63, : {'URL': 'https://www.myhome.ge/pr/19499623/iyideba-5-otaxiani-bina-vakeshi/', 'Address': 'იყიდება 5 ოთახიანი ბინა ვაკეში\nგუშინ 21:53-ზე\n51\nID: 19499623', 'Street': 'ყიფშიძე ნ. ქ. 10', 'Details': 'საერთო ფართი\n167 მ²\nოთახი\n5\nსაძინებელი\n3\nსართული\n12 / 13', 'Price': '638,190\n₾', 'Parameters': {'სტატუსი': 'მშენებარე', 'სვ.წერტილები': '2', 'მდგომარეობა': 'მწვანე კარკასი', 'პროექტის ტიპი': 'არასტანდარტული', 'ჭერის სიმაღლე': '4 მ'}, 'Furniture': None, 'Scrape Date': '20-10-24'}: '2.85 მ', 'გათბობა': 'ცენტრალური გათბობა', 'პარკირება': 'ეზოს პარკინგი', 'ცხელი წყალი': 'ცენტრალური გათბობა', 'სამშენებლო მასალა': 'კომბინირებული', 'აივანი': '1/3.8 მ²', 'მისაღები': 'გამოყოფილი/31 მ²', 'სათავსო': 'საკუჭნაო', 'ინტერნეტი': 'კი', 'ტელევიზია': 'კი', 'ბუნ. აირი': 'კი', 'ლიფტი': 'კი', 'წყალი': 'კი', 'კანალიზაცია': 'კი', 'ელ.ენერგია': 'კი', 'ტელეფონი': 'კი', 'სამზარეულო + ტექნიკა': 'კი'}, 'Furniture': 'ავეჯი\nავეჯი\nმაგიდა\nსკამები\nქურა (გაზის/ელექტრო)\nღუმელი\nსარეცხი მანქანა', 'Scrap