In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
from tqdm import tqdm

def _text_or_na(soup, selector):
    """Return the stripped text of the first element matching ``selector``, or "N/A"."""
    element = soup.select_one(selector)
    return element.get_text(strip=True) if element is not None else "N/A"


def _parse_area(raw_area):
    """Extract the first number from an area string, keeping its decimal part.

    Whitespace between digits is removed first and a decimal comma is
    normalised to a dot. Returns "" when the string contains no digits.
    """
    compact = re.sub(r'(?<=\d)\s+(?=\d)', '', raw_area)
    match = re.search(r'\d+(?:[.,]\d+)?', compact)
    return match.group(0).replace(',', '.') if match else ""


def _split_floor(raw_floor):
    """Split a floor / total-floors string into a ``(floor, total)`` pair.

    Tokens are separated on whitespace and "/"; the first token is the floor
    and the last one the total. Returns ("N/A", "N/A") for a missing value.
    """
    if not raw_floor or raw_floor == "N/A":
        return "N/A", "N/A"
    tokens = [token for token in re.split(r'[\s/]+', raw_floor) if token]
    if not tokens:
        return "N/A", "N/A"
    return tokens[0], tokens[-1]


def _normalize_publication_date(raw_date):
    """Return the first DD.MM.YY or DD.MM.YYYY date in ``raw_date`` as DD.MM.YYYY.

    Two-digit years are prefixed with "20"; four-digit years are kept as-is.
    Returns "N/A" when no date is found.
    """
    match = re.search(r'(\d{2}\.\d{2}\.)(\d{4}|\d{2})(?!\d)', raw_date)
    if not match:
        return "N/A"
    day_month, year = match.groups()
    return day_month + (year if len(year) == 4 else "20" + year)


def scrape_object_details(object_url, timeout=30):
    """Fetch one listing page and extract its attributes.

    Args:
        object_url: Path of the listing, appended to "https://tashkent.etagi.com".
        timeout: Seconds passed to ``requests.get`` as the request timeout.

    Returns:
        A dict of listing attributes keyed by the Russian column names below.
        Fields that cannot be found are "N/A" (price becomes "" and area "0").
        An empty dict is returned when the request fails or the response
        status is not 200.
    """
    url = "https://tashkent.etagi.com" + object_url
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
    except requests.RequestException:
        # Treat network failures like a non-200 response so one bad listing
        # does not abort the whole crawl.
        return {}
    if response.status_code != 200:
        return {}

    soup = BeautifulSoup(response.content, 'html.parser')

    title = _text_or_na(soup, '[displayname="objectTitle"]')
    rooms_match = re.search(r'(\d+)-комн', title)
    rooms = rooms_match.group(1) if rooms_match else "N/A"
    title_lower = title.lower()
    if "квартира" in title_lower:
        object_type = "квартира"
    elif "студия" in title_lower:
        object_type = "студия"
    else:
        object_type = "N/A"

    price_text = _text_or_na(soup, '.eypL8')
    currency = "USD" if "у.е." in price_text.lower() else "UZS"
    price = re.sub(r'[^\d]', '', price_text)

    # Label -> value pairs from the parameter list; items lacking either span are skipped.
    additional_details = {}
    ul_element = soup.select_one('ul.PpfZ1')
    if ul_element:
        for li in ul_element.find_all('li'):
            label_span = li.find('span', class_='Y65Dj')
            value_span = li.find('span', class_='XVztD')
            if label_span is None or value_span is None:
                continue
            additional_details[label_span.get_text(strip=True)] = value_span.get_text(strip=True)

    floor, total_floors = _split_floor(additional_details.get('Этаж/Этажность'))
    publication_date = _normalize_publication_date(_text_or_na(soup, '.o8Cyp span'))

    return {
        # NOTE(review): the scraped title is not stored; the column is the constant 'NA' — confirm this is intended.
        "Название": 'NA',
        "Тип": object_type,
        "Санузел": _text_or_na(soup, '.object-params__value--bathrooms'),
        "Тип постройки": _text_or_na(soup, '.object-params__value--build-type'),
        # NOTE(review): this takes the first '.XVztD' value span on the page — verify it is the material.
        "Материал": _text_or_na(soup, '.XVztD'),
        "Адрес": _text_or_na(soup, '.mfrBs'),
        "Ремонт": additional_details.get('Ремонт', "N/A"),
        "Площадь": _parse_area(additional_details.get('Общая площадь', "0 м²")),
        "Этаж": floor,
        "Этажность": total_floors,
        "Количество комнат": rooms,
        "Дата публикации": publication_date,
        "Валюта": currency,
        "Цена": price,
        "Описание": _text_or_na(soup, '.tv2WS'),
    }

def get_object_hrefs(page_url, timeout=30):
    """Return the listing links found on one catalogue page.

    Args:
        page_url: Full URL of the catalogue page to fetch.
        timeout: Seconds passed to ``requests.get`` as the request timeout.

    Returns:
        A list with the ``href`` of every ``a.templates-object-card__slider``
        element, or an empty list when the request fails or the response
        status is not 200.
    """
    try:
        response = requests.get(page_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
    except requests.RequestException:
        # Same outcome as a non-200 response: the page contributes no links.
        return []
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    return [card['href'] for card in soup.select('a.templates-object-card__slider[href]')]

def scrape_all_pages(base_url, max_pages):
    """Crawl catalogue pages 1..max_pages and collect every listing into a DataFrame.

    Each page URL is built as ``{base_url}?page={n}``. Every link returned by
    ``get_object_hrefs`` is passed to ``scrape_object_details`` and the
    resulting dicts become the rows of the returned DataFrame. The loop
    sleeps two seconds after each page.
    """
    records = []
    page_numbers = range(1, max_pages + 1)

    for page_num in tqdm(page_numbers, desc="Pages", unit="page"):
        listing_links = get_object_hrefs(f"{base_url}?page={page_num}")

        # Inner progress bar is transient (leave=False) so only the page bar remains.
        object_bar = tqdm(listing_links, desc="Objects", unit="object", leave=False)
        records.extend(scrape_object_details(link) for link in object_bar)

        time.sleep(2)

    return pd.DataFrame(records)

# Example usage: crawl up to 300 catalogue pages under base_url into `df`.
base_url = "https://tashkent.etagi.com/realty/"
df = scrape_all_pages(base_url=base_url, max_pages=300)

Pages:  78%|███████▊  | 233/300 [48:41<13:20, 11.95s/page]  

In [6]:
import pandas as pd
import time
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize WebDriver (open it once and reuse it)
def init_driver():
    """Start a Chrome WebDriver, open Google Maps and return the driver.

    If loading the start page fails, the browser is closed before the
    exception is re-raised so no orphaned browser process is left behind.
    """
    driver = webdriver.Chrome()
    try:
        driver.get("https://www.google.com/maps")
    except Exception:
        driver.quit()
        raise
    return driver

# Function to calculate cosine similarity
def calculate_cosine_similarity(input_address, suggestion_texts):
    """Score each suggestion against the input address with TF-IDF cosine similarity.

    The TF-IDF model is fitted on the input address together with the
    suggestions, so scores depend on the whole set of texts passed in.

    Args:
        input_address: The address string that was searched for.
        suggestion_texts: Iterable of suggestion strings to compare against.

    Returns:
        A 1-D numpy array with one similarity score per suggestion, in the
        same order as ``suggestion_texts``; an empty array when there are no
        suggestions.
    """
    suggestion_texts = list(suggestion_texts)
    if not suggestion_texts:
        # Nothing to compare against; avoid handing scikit-learn an empty matrix.
        return np.array([])

    # Row 0 is the input address, rows 1..n are the suggestions.
    tfidf_matrix = TfidfVectorizer().fit_transform([input_address, *suggestion_texts])

    # cosine_similarity accepts the sparse rows directly and returns a dense array.
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1:]).flatten()

# Function to find the most similar search suggestion
def select_most_similar_suggestion(driver, input_address, min_similarity=0.95):
    """Click the search suggestion that best matches ``input_address``.

    Args:
        driver: WebDriver whose current page holds the suggestion elements.
        input_address: The address string that was typed into the search box.
        min_similarity: Minimum cosine similarity required to accept the best
            suggestion (default 0.95).

    Returns:
        True if a suggestion was clicked; False when there are no suggestions,
        none reaches ``min_similarity``, or any error occurs (the error is
        printed, not raised).
    """
    try:
        suggestions_elements = driver.find_elements(By.CSS_SELECTOR, 'div[jsaction="suggestion.select"]')
        # Collapse newlines and repeated spaces so the logged match stays on one line.
        suggestions_texts = [" ".join(element.text.split()) for element in suggestions_elements]

        if not suggestions_texts:
            return False  # No suggestions available

        similarity_scores = calculate_cosine_similarity(input_address, suggestions_texts)

        best_match_index = int(np.argmax(similarity_scores))
        best_similarity_score = similarity_scores[best_match_index]

        print(f"Best match: {suggestions_texts[best_match_index]} (Cosine Similarity: {best_similarity_score:.2f})")

        if best_similarity_score >= min_similarity:
            suggestions_elements[best_match_index].click()
            return True

        return False  # No suitable suggestion found
    except Exception as e:
        # Best effort: a failure here means "no match", not a crashed run.
        print(f"Error selecting the suggestion: {e}")
        return False

def _parse_lat_long_from_url(url):
    """Return the two comma-separated values after the first "@" in ``url``.

    Both values must parse as floats; otherwise ("N/A", "N/A") is returned.
    The values are returned as the original strings.
    """
    _, separator, tail = url.partition("@")
    if not separator:
        return "N/A", "N/A"

    parts = tail.split(",")
    if len(parts) < 2:
        return "N/A", "N/A"

    latitude, longitude = parts[0], parts[1]
    try:
        float(latitude)
        float(longitude)
    except ValueError:
        return "N/A", "N/A"
    return latitude, longitude


def get_lat_long(driver, address, wait_seconds=5):
    """Search ``address`` in the open Maps page and read coordinates from the URL.

    Args:
        driver: WebDriver already showing Google Maps.
        address: Address string to search for. Values that are not non-empty
            strings, or that equal the "N/A" placeholder, are skipped without
            touching the browser.
        wait_seconds: Seconds to sleep after submitting the search and again
            after selecting a suggestion (default 5).

    Returns:
        A ``(latitude, longitude)`` tuple of strings, or ("N/A", "N/A") when
        the address is unusable, no suggestion matches, the URL holds no
        coordinates, or any error occurs (the error is printed, not raised).
    """
    # Placeholder / missing addresses cannot be geocoded; skip the slow browser round-trip.
    if not isinstance(address, str) or not address.strip() or address.strip() == "N/A":
        print(f"No usable address ({address!r}). Skipping.")
        return "N/A", "N/A"

    try:
        # Find the search box and input the address
        search_box = driver.find_element(By.NAME, "q")
        search_box.clear()  # Clear previous input before entering new address
        search_box.send_keys(address)
        search_box.submit()
        time.sleep(wait_seconds)  # Wait for the search results to load

        # Select the most similar suggestion based on cosine similarity
        if not select_most_similar_suggestion(driver, address):
            print(f"No matching suggestion for {address}. Skipping.")
            return "N/A", "N/A"

        time.sleep(wait_seconds)  # Wait for the page to load with the selected location

        # The coordinates are read from the "@<lat>,<lng>,..." part of the current URL.
        # NOTE(review): this may be the map viewport centre rather than the exact place — confirm.
        return _parse_lat_long_from_url(driver.current_url)
    except Exception as e:
        print(f"Error retrieving coordinates for {address}: {e}")
        return "N/A", "N/A"

# Main function to apply the extraction to the DataFrame
def extract_lat_long_from_df(df, driver):
    """Geocode every value in ``df['Адрес']`` and store the results on ``df``.

    Calls ``get_lat_long`` once per row (sleeping one second between rows),
    then writes the results into the 'Широта' and 'Долгота' columns of the
    frame that was passed in and returns that same frame.
    """
    coordinates = []

    for address in df['Адрес']:
        print(f"Getting coordinates for: {address}")
        coordinates.append(get_lat_long(driver, address))
        time.sleep(1)  # Optional: delay between requests to avoid being blocked

    df['Широта'] = [lat for lat, _ in coordinates]
    df['Долгота'] = [lon for _, lon in coordinates]
    return df

# Initialize the driver once
driver = init_driver()

# Extract latitude and longitude; always close the browser, even if extraction fails
try:
    df = extract_lat_long_from_df(df, driver)
finally:
    driver.quit()

# Last expression of the cell, so the notebook displays the resulting frame
df

Getting coordinates for: Ташкент, улица Ниёзов 5
Best match: 
5-Maktab улица Абдулла Кадыри, Ташкент (Cosine Similarity: 0.20)
No matching suggestion for Ташкент, улица Ниёзов 5. Skipping.
Getting coordinates for: Ташкент, проспект Мустакиллик 10
Best match: 
проспект Мустакиллик 10 Ташкент (Cosine Similarity: 1.00)
                              Адрес      Широта     Долгота
0           Ташкент, улица Ниёзов 5         N/A         N/A
1  Ташкент, проспект Мустакиллик 10  41.3193568  69.2958078
