In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from tqdm import tqdm # pip install tqdm
import time
import os

# Listing index URL; get_url() requests it as f'{MAIN_URL}?page=N' for each page N.
MAIN_URL = 'https://www.house.kg/kupit-kvartiru'

In [29]:
def get_max_pages(url:str, timeout:float=30)->int:
    """Return the page number stored in the last pagination item of a listing page.

    The page at ``url`` is downloaded and parsed; inside the
    ``div.listings-wrapper`` block the last ``<li>`` of the ``<nav>`` element is
    located and the ``data-page`` attribute of its link is returned as an int.

    Args:
        url: Address of the listing page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The integer value of the ``data-page`` attribute.

    Raises:
        requests.RequestException: On network failure, timeout or HTTP error status.
        ValueError: If the expected pagination markup is not present, or the
            attribute value is not an integer.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    wrapper = soup.find("div", class_="listings-wrapper")
    if wrapper is None:
        raise ValueError(f'No "listings-wrapper" block found at {url}')

    nav = wrapper.find('nav')
    items = nav.find_all('li') if nav is not None else []
    if not items:
        raise ValueError(f'No pagination items found at {url}')

    link = items[-1].find('a')
    page = link.get('data-page') if link is not None else None
    if page is None:
        raise ValueError(f'Last pagination item has no "data-page" link at {url}')
    return int(page)

In [28]:
def get_url()->None:
  """Collect the links of individual listings and save them to ``urls.csv``.

  Asks on stdin how many listing pages to crawl (blank means "all", resolved
  through ``get_max_pages(MAIN_URL)``), downloads ``MAIN_URL?page=N`` for each
  page, extracts the ``href`` of the link inside every ``div.left-image`` found
  in ``div.listings-wrapper``, converts it to an absolute URL and writes the
  result as a single-column (``URLS``) CSV file.

  Pages that cannot be downloaded or do not contain the expected markup are
  skipped and reported, so one bad page does not discard the whole crawl.

  Raises:
    ValueError: If the entered page count is not an integer or is below 1.
  """
  # Local import keeps the cell self-contained without touching the import cell.
  from urllib.parse import urljoin

  print('___ LOAD URLS ___')

  pages = input('How many pages do you want? (For all pages leave blank) ').strip()

  if pages == '':
    pages = get_max_pages(MAIN_URL)
  else:
    pages = int(pages)
    if pages < 1:
      raise ValueError('Number of pages must be at least 1')

  sub_url = []
  failed_pages = []
  for page in tqdm(range(1, pages+1)):
    url = f'{MAIN_URL}?page={page}'
    try:
      response = requests.get(url, timeout=30)
      response.raise_for_status()
    except requests.RequestException as err:
      print(f'Page {page} skipped: {err!r}')
      failed_pages.append(page)
      continue

    soup = BeautifulSoup(response.text, "html.parser")
    objects = soup.find("div", class_="listings-wrapper")
    if objects is None:
      # Unexpected markup (error page, changed layout): nothing to extract here.
      failed_pages.append(page)
      continue

    for card in objects.find_all('div', class_="left-image"):
      link = card.find('a')
      if link is not None and link.get('href'):
        sub_url.append(link['href'])

  print(f'\n ___ Received {len(sub_url)} URLS ___')
  if failed_pages:
    print(f'Pages without results: {failed_pages}')

  # urljoin resolves site-relative hrefs against the site root and leaves
  # already-absolute hrefs untouched.
  urls = [urljoin(MAIN_URL, href) for href in sub_url]

  pd.DataFrame(urls, columns=['URLS']).to_csv('urls.csv', index=False)
  print('\nFILE "urls.csv" SAVED')

In [30]:
# Run the crawl: prompts for the page count, then writes the collected links to urls.csv.
get_url()

___ LOAD URLS ___


100%|██████████| 642/642 [18:20<00:00,  1.71s/it]


 ___ Received 6420 URLS ___

FILE "urls.csv" SAVED





In [24]:
import datetime
# Date taken once when this cell runs; get_data() stores it in every scraped record.
current_date = datetime.date.today()

In [42]:
def get_data(ur:str, saled=False)->dict:
  """Download one listing page and return its fields as a flat dict.

  The result contains one entry per ``div.info-row`` on the page (label text
  as key, info text as value) followed by the fixed keys ``adress``,
  ``n_rooms``, ``describe``, ``added``, ``upped``, ``rating``, ``view``,
  ``USD_price``, ``KGS_price `` and ``current_date `` (both with a trailing
  space, kept as-is) and ``hearts``. Optional elements that are missing from
  the page are stored as ``np.nan``.

  Args:
    ur: URL of the listing page.
    saled: Currently unused; kept so existing calls keep working.

  Returns:
    Dict of scraped values; all scraped values are plain strings (or NaN),
    ``current_date `` holds the module-level ``current_date`` value.

  Raises:
    requests.RequestException: On network failure, timeout or HTTP error status.
    AttributeError: If the price blocks or the ``div.left`` block are missing.
  """
  response = requests.get(ur, timeout=30)
  response.raise_for_status()
  soup = BeautifulSoup(response.text, "html.parser")

  def _text(tag):
    # Stripped text of an optional tag; NaN marks a missing element.
    # Storing text instead of the Tag itself avoids keeping the whole parsed
    # page alive inside the result.
    return tag.get_text(strip=True) if tag is not None else np.nan

  def _price(css_class, unit):
    # Price text without the unit and without any whitespace; str.split() also
    # removes non-breaking spaces, which replace(' ', '') would leave behind.
    raw = soup.find("div", class_=css_class).text
    return ''.join(raw.split()).replace(unit, '')

  # These lookups are mandatory: a page without them raises and is reported
  # by the caller as a failed URL.
  USD_price = _price("price-dollar", '$')
  kgs_price = _price("price-som", 'сом')
  n_rooms = soup.find("div", class_="left").text.strip().split('\n')[0]

  atr_dict = {}
  for row in soup.find_all("div", class_="info-row"):
    label = row.find('div', class_="label")
    info = row.find('div', class_="info")
    if label is None or info is None:
      # Incomplete row: skip it instead of failing the whole listing.
      continue
    key = label.text.strip().replace('\n', ' ')
    atr_dict[key] = info.text.strip().replace('\n', ' ')

  # The address is the text node that directly follows the map-marker icon.
  marker = soup.find("i", class_="fas fa-map-marker-alt")
  sibling = marker.next_sibling if marker is not None else None
  atr_dict['adress'] = sibling.strip() if isinstance(sibling, str) else np.nan
  # Stored unconditionally; it does not depend on the address being present.
  atr_dict['n_rooms'] = n_rooms

  description = soup.find("div", class_="description")
  paragraph = description.find('p') if description is not None else None
  if paragraph is not None:
    atr_dict['describe'] = paragraph.text.strip().replace('\n', ' ')
  else:
    atr_dict['describe'] = np.nan

  atr_dict['added'] = _text(soup.find('span', class_ ='added-span'))
  atr_dict['upped'] = _text(soup.find('span', class_ ='upped-span'))
  atr_dict['rating'] = _text(soup.find('div', class_ ='rating score'))
  atr_dict['view'] = _text(soup.find('span', class_ ='view-count'))
  atr_dict['USD_price'] = USD_price
  # NOTE(review): the two keys below end with a space; kept unchanged because
  # downstream column names may rely on them.
  atr_dict['KGS_price '] = kgs_price
  atr_dict['current_date '] = current_date

  hearts_tag = soup.find('span', class_ = 'favourite-count table-comments')
  try:
    # The value of interest is the part after the first ';' in the tag text.
    atr_dict['hearts'] = hearts_tag.text.split(';')[1]
  except (AttributeError, IndexError):
    # Tag missing or text without ';'.
    atr_dict['hearts'] = np.nan

  return atr_dict

In [41]:
# Scratch cell: fetch a single listing page to try selectors on interactively.
response = requests.get('https://www.house.kg/details/150283366c856a61bfab6-78961157', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# atributs = soup.find_all("div", class_="info-row")
# USD_price = soup.find("div", class_="price-dollar").text.strip().replace(' ', '').replace('$', '')
# kgs_price  = soup.find("div", class_="price-som").text.strip().replace(' ', '').replace('сом', '')
# adress = soup.find("i", class_="fas fa-map-marker-alt")
# n_rooms = soup.find("div", class_="left").text.strip().split('\n')[0]

# describe = soup.find("div", class_="description")
# added = soup.find('span', class_ ='added-span')
# upped = soup.find('span', class_ ='upped-span')
# rating = soup.find('div', class_ ='rating score')
# view = soup.find('span', class_ ='view-count')



'\xa03620'

In [43]:
# Scrape every listing saved in urls.csv. Successful records go to `data`;
# URLs whose page could not be scraped are collected in `error_urls`.
urls = pd.read_csv('urls.csv').URLS.to_list()
data = []
error_urls = []

# enumerate() provides the position directly: urls.index(url) rescans the list
# on every failure and returns the first match only, which is wrong for
# duplicated URLs.
for url_index, url in enumerate(tqdm(urls)):
    try:
        data.append(get_data(url))
    except Exception as err:
        print('!!!___ERRROR___!!!')
        print(f'With url index: {url_index}')
        print(f'With url: {url}')
        # Show what went wrong so failures can be diagnosed afterwards.
        print(f'Reason: {err!r}')
        error_urls.append(url)

  0%|          | 0/6420 [00:00<?, ?it/s]

 32%|███▏      | 2041/6420 [42:15<1:30:40,  1.24s/it]


KeyboardInterrupt: 

In [46]:
# Turn the scraped records into a DataFrame; note that this rebinds `data`
# from the list filled by the scraping loop to the DataFrame.
data = pd.DataFrame(data)

In [54]:
# Column overview: non-null counts and dtypes of the scraped table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2041 entries, 0 to 2040
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Тип предложения                 2026 non-null   object 
 1   Серия                           2026 non-null   object 
 2   Дом                             2026 non-null   object 
 3   Этаж                            2020 non-null   object 
 4   Площадь                         2026 non-null   object 
 5   Отопление                       1406 non-null   object 
 6   Состояние                       1525 non-null   object 
 7   Санузел                         887 non-null    object 
 8   Газ                             801 non-null    object 
 9   Балкон                          741 non-null    object 
 10  Мебель                          782 non-null    object 
 11  Пол                             538 non-null    object 
 12  Разное                          71

In [47]:
# List all column names collected across the scraped listings.
data.columns

Index(['Тип предложения', 'Серия', 'Дом', 'Этаж', 'Площадь', 'Отопление',
       'Состояние', 'Санузел', 'Газ', 'Балкон', 'Мебель', 'Пол', 'Разное',
       'Правоустанавливающие документы', 'Возможность рассрочки',
       'Возможность ипотеки', 'Возможность обмена', 'adress', 'n_rooms',
       'describe', 'added', 'upped', 'rating', 'view', 'USD_price',
       'KGS_price ', 'current_date ', 'hearts', 'Телефон', 'Входная дверь',
       'Парковка', 'Высота потолков', 'Интернет', 'Безопасность',
       'Канализация', 'Питьевая вода', 'Электричество', 'Площадь участка'],
      dtype='object')

In [52]:
# Show the description text of the first scraped listing.
data.loc[0, 'describe']

'Продается уютная квартира. \r ЦО Каприз, Байтон 8\\3\r 4-я линия от пляжа\r Возможен обмен на авто с доплатой.'