In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from tqdm import tqdm # pip install tqdm
import time
import os

MAIN_URL = 'https://www.house.kg/kupit-kvartiru'

In [29]:
def get_max_pages(url:str)->int:
    "Getting the number of the last page"
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    obj = soup.find("div", class_="listings-wrapper")
    return int(obj.find('nav').find_all('li')[-1].find('a')['data-page'])

In [28]:
def get_url()->None:
  "Loading URLS of each House and saving as file"
  print('___ LOAD URLS ___')

  sub_url = []
  pages = input('How many pages do you want? (For all pages leave blank) ')
  
  if pages == '':
    pages = get_max_pages(MAIN_URL)
  else:
    pages = int(pages)
    
  for page in tqdm(range(1, pages+1)):
    url = f'{MAIN_URL}?page={page}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    objects = soup.find("div", class_="listings-wrapper")
    lincs = objects.find_all('div', class_="left-image")
    for ur in lincs:
      sub_url.append(ur.find('a')['href'])

  print(f'\n ___ Received {len(sub_url)} URLS ___')

  urls = []
  for i in sub_url:
    urls.append("https://www.house.kg" + i)

  pd.DataFrame(urls, columns=['URLS']).to_csv('urls.csv', index=False)
  print('\nFILE "urls.csv" SAVED')

In [82]:
get_url()

___ LOAD URLS ___


100%|██████████| 648/648 [19:21<00:00,  1.79s/it]


 ___ Received 6476 URLS ___

FILE "urls.csv" SAVED





In [24]:
import datetime
current_date = datetime.date.today()

In [83]:
def get_data(ur:str)->dict:
  
  response = requests.get(ur)
  soup = BeautifulSoup(response.text, "html.parser")
  atributs = soup.find_all("div", class_="info-row")
  USD_price = soup.find("div", class_="price-dollar").text.strip().replace(' ', '').replace('$', '')
  kgs_price  = soup.find("div", class_="price-som").text.strip().replace(' ', '').replace('сом', '')
  adress = soup.find("i", class_="fas fa-map-marker-alt")
  map_attr = soup.find('div', id='map2gis')
  latitude = map_attr['data-lat']
  longitude = map_attr['data-lon']
  n_rooms = soup.find("div", class_="left").text.strip().split('\n')[0]
  
  describe = soup.find("div", class_="description")
  added = soup.find('span', class_ ='added-span')
  upped = soup.find('span', class_ ='upped-span')
  rating = soup.find('div', class_ ='rating score')
  view = soup.find('span', class_ ='view-count')
  atr_dict = {}
  
  for atr in atributs:
    key = atr.find('div', class_="label").text.strip().replace('\n', ' ')
    val = atr.find('div', class_="info").text.strip().replace('\n', ' ')
    atr_dict[key] = val
  try:
    atr_dict['adress'] = adress.next_sibling.strip()
    atr_dict['n_rooms'] = n_rooms
  except Exception:
    atr_dict['adress'] = np.nan
  try:
    atr_dict['describe'] = describe.find('p').text.strip().replace('\n', ' ')
  except Exception:
    atr_dict['describe'] = np.nan
  try:
    atr_dict['added'] = added
    atr_dict['upped'] = upped
  except Exception:
    atr_dict['added'] = np.nan
    atr_dict['upped'] = np.nan
  try:
    atr_dict['latitude'] = float(latitude)
    atr_dict['longitude'] = float(longitude)
  except Exception:
    atr_dict['latitude'] = np.nan
    atr_dict['longitude'] = np.nan
    
  atr_dict['rating'] = rating
  atr_dict['view'] = int(view.text)
  atr_dict['USD_price'] = USD_price
  atr_dict['KGS_price '] = kgs_price
  atr_dict['current_date '] = current_date

  try: 
    hearts = soup.find('span', class_ = 'favourite-count table-comments').text
    atr_dict['hearts'] = hearts.split(';')[1]
  except Exception:
    atr_dict['hearts'] = np.nan

  return atr_dict

In [88]:
data = []
error_urls = []

In [89]:
urls = pd.read_csv('urls.csv').URLS.to_list()

for url in tqdm(urls):
    try:
        data.append(get_data(url))
    except Exception:
        url_index = urls.index(url)
        print('!!!___ERRROR___!!!')
        print(f'With url index: {url_index}')
        print(f'With url: {url}')
        error_urls.append(url)

  1%|          | 43/6476 [01:00<2:25:22,  1.36s/it]

!!!___ERRROR___!!!
With url index: 42
With url: https://www.house.kg/details/728676766ebf0a9b33b32-88503514


  1%|          | 44/6476 [01:02<2:17:49,  1.29s/it]

!!!___ERRROR___!!!
With url index: 43
With url: https://www.house.kg/details/332924866ebf090d3d332-26535346


  1%|          | 47/6476 [01:06<2:25:48,  1.36s/it]

!!!___ERRROR___!!!
With url index: 46
With url: https://www.house.kg/details/808322866ebefc4efdc31-25012237


  1%|          | 49/6476 [01:09<2:29:59,  1.40s/it]

!!!___ERRROR___!!!
With url index: 48
With url: https://www.house.kg/details/94390666ebef7b451b01-27638521


  8%|▊         | 529/6476 [15:05<3:13:35,  1.95s/it] 

!!!___ERRROR___!!!
With url index: 528
With url: https://www.house.kg/details/11793766712122baa432-41669720


  8%|▊         | 530/6476 [15:07<3:01:22,  1.83s/it]

!!!___ERRROR___!!!
With url index: 529
With url: https://www.house.kg/details/33825776666d92c911966-84995485


 20%|██        | 1322/6476 [39:27<2:35:50,  1.81s/it]

!!!___ERRROR___!!!
With url index: 1321
With url: https://www.house.kg/details/84181066d179613bb6a6-35904182


 24%|██▎       | 1530/6476 [45:49<2:28:06,  1.80s/it] 


KeyboardInterrupt: 

In [102]:
len(urls)

6476

In [103]:
urls = pd.read_csv('urls.csv').URLS.to_list()

for url in tqdm(range(5511, len(urls) + 1)):
    try:
        data.append(get_data(urls[url]))
    except Exception:
        # url_index = urls.index(url)
        print('!!!___ERRROR___!!!')
        print(f'With url index: {url}')
        print(f'With url: {urls[url]}')
        error_urls.append(urls[url])

  6%|▌         | 59/966 [01:03<16:28,  1.09s/it]

!!!___ERRROR___!!!
With url index: 5569
With url: https://www.house.kg/details/680183466d85fff3a71a8-44539857


 17%|█▋        | 168/966 [03:02<17:39,  1.33s/it]

!!!___ERRROR___!!!
With url index: 5678
With url: https://www.house.kg/details/1239472664ce9372a63d1-99297086


 20%|██        | 194/966 [03:28<12:45,  1.01it/s]

!!!___ERRROR___!!!
With url index: 5704
With url: https://www.house.kg/details/50909466ce37255c7e75-97743998


 20%|██        | 196/966 [03:30<15:22,  1.20s/it]

!!!___ERRROR___!!!
With url index: 5706
With url: https://www.house.kg/details/629391966b364ee1da490-46033580


 37%|███▋      | 359/966 [06:55<08:48,  1.15it/s]

!!!___ERRROR___!!!
With url index: 5869
With url: https://www.house.kg/details/381389065f0b56a6fe814-70415649


 48%|████▊     | 466/966 [09:05<08:16,  1.01it/s]

!!!___ERRROR___!!!
With url index: 5976
With url: https://www.house.kg/details/260698466c5a42de797b7-90478381


 65%|██████▌   | 629/966 [11:49<05:06,  1.10it/s]

!!!___ERRROR___!!!
With url index: 6139
With url: https://www.house.kg


 65%|██████▌   | 630/966 [11:50<04:50,  1.16it/s]

!!!___ERRROR___!!!
With url index: 6140
With url: https://www.house.kg


 79%|███████▉  | 766/966 [14:09<04:24,  1.32s/it]

!!!___ERRROR___!!!
With url index: 6276
With url: https://www.house.kg/details/88505266b2f60d48e3a1-38960698


 96%|█████████▋| 931/966 [21:12<06:13, 10.67s/it]

!!!___ERRROR___!!!
With url index: 6441
With url: https://www.house.kg/details/195967566a4935d7af6c7-66812990


100%|█████████▉| 965/966 [22:15<00:01,  1.38s/it]

!!!___ERRROR___!!!
With url index: 6476





IndexError: list index out of range

In [105]:
pd.DataFrame(data).to_csv('data.csv', index=False)

In [108]:
pd.DataFrame(error_urls).to_csv('error_urls.csv', index=False)


In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2041 entries, 0 to 2040
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Тип предложения                 2026 non-null   object 
 1   Серия                           2026 non-null   object 
 2   Дом                             2026 non-null   object 
 3   Этаж                            2020 non-null   object 
 4   Площадь                         2026 non-null   object 
 5   Отопление                       1406 non-null   object 
 6   Состояние                       1525 non-null   object 
 7   Санузел                         887 non-null    object 
 8   Газ                             801 non-null    object 
 9   Балкон                          741 non-null    object 
 10  Мебель                          782 non-null    object 
 11  Пол                             538 non-null    object 
 12  Разное                          71

In [47]:
data.columns

Index(['Тип предложения', 'Серия', 'Дом', 'Этаж', 'Площадь', 'Отопление',
       'Состояние', 'Санузел', 'Газ', 'Балкон', 'Мебель', 'Пол', 'Разное',
       'Правоустанавливающие документы', 'Возможность рассрочки',
       'Возможность ипотеки', 'Возможность обмена', 'adress', 'n_rooms',
       'describe', 'added', 'upped', 'rating', 'view', 'USD_price',
       'KGS_price ', 'current_date ', 'hearts', 'Телефон', 'Входная дверь',
       'Парковка', 'Высота потолков', 'Интернет', 'Безопасность',
       'Канализация', 'Питьевая вода', 'Электричество', 'Площадь участка'],
      dtype='object')

In [52]:
data['describe'][0]

'Продается уютная квартира. \r ЦО Каприз, Байтон 8\\3\r 4-я линия от пляжа\r Возможен обмен на авто с доплатой.'