In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from tqdm import tqdm # pip install tqdm
import time
import os

MAIN_URL = 'https://www.house.kg/kupit-kvartiru'

In [29]:
def get_max_pages(url:str)->int:
    "Getting the number of the last page"
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    obj = soup.find("div", class_="listings-wrapper")
    return int(obj.find('nav').find_all('li')[-1].find('a')['data-page'])

In [126]:
def get_url()->None:
  "Loading URLS of each House and saving as file"
  print('___ LOAD URLS ___')

  sub_url = []
  pages = input('How many pages do you want? (For all pages leave blank) ')
  
  if pages == '':
    pages = get_max_pages(MAIN_URL)
  else:
    pages = int(pages)
    
  for page in tqdm(range(1, pages+1)):
    url = f'{MAIN_URL}?page={page}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    objects = soup.find("div", class_="listings-wrapper")
    lincs = objects.find_all('div', class_="left-image")
    for ur in lincs:
      sub_url.append(ur.find('a')['href'])

  print(f'\n ___ Received {len(sub_url)} URLS ___')

  urls = []
  for i in sub_url:
    urls.append(f"https://www.house.kg{i}")

  pd.DataFrame(urls, columns=['URLS']).to_csv('urls.csv', index=False)
  print('\nFILE "urls.csv" SAVED')

In [116]:
get_url()

___ LOAD URLS ___


100%|██████████| 649/649 [21:39<00:00,  2.00s/it]  


 ___ Received 6481 URLS ___

FILE "urls.csv" SAVED





In [24]:
import datetime
current_date = datetime.date.today()

In [176]:
def get_data(ur:str)->dict:
  
  response = requests.get(ur)
  soup = BeautifulSoup(response.text, "html.parser")
  main_content = soup.find('div', class_='main-content')
  header = main_content.find('div', class_='details-header')
  user_info = main_content.find('div', class_='phone-fixable-block')
  main_details = main_content.find('div', class_='details-main')
  atributs = main_details.find_all("div", class_="info-row")
  
  USD_price = header.find("div", class_="price-dollar").text.strip().replace(' ', '').replace('$', '')
  kgs_price  = header.find("div", class_="price-som").text.strip().replace(' ', '').replace('сом', '')
  
  address = header.find("i", class_="fas fa-map-marker-alt")
  number = user_info.find('div', class_='number')
  map_attr = main_details.find('div', id='map2gis')
  latitude = map_attr['data-lat']
  longitude = map_attr['data-lon']
  n_rooms = header.find("div", class_="left").text.strip().split('\n')[0]
  
  describe = main_details.find("div", class_="description")
  added = header.find('span', class_ ='added-span')
  upped = header.find('span', class_ ='upped-span')
  rating = user_info.find('div', class_ ='rating score')
  views = header.find('span', class_ ='view-count')
  atr_dict = {
    'url': ur,
  }
  
  for atr in atributs:
    key = atr.find('div', class_="label").text.strip().replace('\n', ' ')
    val = atr.find('div', class_="info").text.strip().replace('\n', ' ')
    atr_dict[key] = val
    
  try:
    atr_dict['address'] = address.next_sibling.strip()
    atr_dict['n_rooms'] = n_rooms
  except Exception:
    atr_dict['adress'] = np.nan
  try:
    atr_dict['describe'] = describe.find('p').text.strip().replace('\n', ' ')
  except Exception:
    atr_dict['describe'] = np.nan
  try:
    atr_dict['added'] = added.text
    atr_dict['upped'] = upped.text
  except Exception:
    atr_dict['added'] = np.nan
    atr_dict['upped'] = np.nan
  try:
    atr_dict['latitude'] = float(latitude)
    atr_dict['longitude'] = float(longitude)
  except Exception:
    atr_dict['latitude'] = np.nan
    atr_dict['longitude'] = np.nan
  try:
    atr_dict['rating'] = rating.text.strip()
  except Exception:
    atr_dict['rating'] = np.nan
    
  atr_dict['views'] = int(views.text)
  atr_dict['phone_number'] = number.text
  atr_dict['USD_price'] = USD_price
  atr_dict['KGS_price '] = kgs_price
  atr_dict['current_date '] = current_date

  try: 
    hearts = header.find('span', class_ ='favourite-count table-comments').text
    atr_dict['hearts'] = hearts.split(';')[1]
  except Exception:
    atr_dict['hearts'] = np.nan

  return atr_dict

In [175]:
response = requests.get('https://www.house.kg/details/521743066ec024c57fe40-61109241')
soup = BeautifulSoup(response.text, "html.parser")

# main_details = soup.find('div', class_='details-main').find('div', class_='right')
# rating = main_details.find('div', class_='fotorama').find_all('a')

# img_list = []
# for a in rating:
#     img_list.append(a['href'])
    
# img_list

('Добавлено 9 часов назад', ' Поднято 8 часов назад')

In [165]:
data = []
error_urls = []

In [179]:
urls = pd.read_csv('urls.csv').URLS.to_list()

for url in tqdm(urls):
    try:
        data.append(get_data(url))
    except Exception:
        url_index = urls.index(url)
        print('!!!___ERRROR___!!!')
        print(f'With url index: {url_index}')
        print(f'With url: {url}')
        error_urls.append(url)
        
pd.DataFrame(data).to_csv('data_last.csv', index=False)
pd.DataFrame(error_urls).to_csv('error_urls_last.csv', index=False)

  0%|          | 0/6556 [00:00<?, ?it/s]

  1%|          | 43/6556 [00:47<2:26:59,  1.35s/it]

!!!___ERRROR___!!!
With url index: 42
With url: https://www.house.kg/details/728676766ebf0a9b33b32-88503514


  1%|          | 44/6556 [00:49<2:28:26,  1.37s/it]

!!!___ERRROR___!!!
With url index: 43
With url: https://www.house.kg/details/332924866ebf090d3d332-26535346


  1%|          | 47/6556 [00:52<2:24:13,  1.33s/it]

!!!___ERRROR___!!!
With url index: 46
With url: https://www.house.kg/details/808322866ebefc4efdc31-25012237


  1%|          | 49/6556 [00:55<2:18:06,  1.27s/it]

!!!___ERRROR___!!!
With url index: 48
With url: https://www.house.kg/details/94390666ebef7b451b01-27638521


  8%|▊         | 529/6556 [11:14<1:32:26,  1.09it/s]

!!!___ERRROR___!!!
With url index: 528
With url: https://www.house.kg/details/11793766712122baa432-41669720


  8%|▊         | 530/6556 [11:15<1:28:51,  1.13it/s]

!!!___ERRROR___!!!
With url index: 529
With url: https://www.house.kg/details/33825776666d92c911966-84995485


 12%|█▏        | 756/6556 [16:28<2:27:27,  1.53s/it]

!!!___ERRROR___!!!
With url index: 755
With url: https://www.house.kg/details/7371691667d35d8d65300-74710401


 16%|█▌        | 1065/6556 [23:28<1:56:57,  1.28s/it]

!!!___ERRROR___!!!
With url index: 1064
With url: https://www.house.kg/details/228853666d14e9408da33-34060723


 16%|█▋        | 1068/6556 [23:32<2:05:39,  1.37s/it]

!!!___ERRROR___!!!
With url index: 1067
With url: https://www.house.kg/details/795115466b9da433331a4-11232614


 16%|█▋        | 1073/6556 [23:39<2:14:31,  1.47s/it]

!!!___ERRROR___!!!
With url index: 1072
With url: https://www.house.kg/details/20455366e29bc31c23f8-43079706


 16%|█▋        | 1075/6556 [23:43<2:41:57,  1.77s/it]

!!!___ERRROR___!!!
With url index: 1074
With url: https://www.house.kg/details/768338866e56c46d68201-22799425


 20%|██        | 1322/6556 [28:56<1:55:32,  1.32s/it]

!!!___ERRROR___!!!
With url index: 1321
With url: https://www.house.kg/details/84181066d179613bb6a6-35904182


 34%|███▍      | 2238/6556 [49:19<1:43:13,  1.43s/it]

!!!___ERRROR___!!!
With url index: 2237
With url: https://www.house.kg/details/120677065e1f54d3ffce1-05367546


 41%|████▏     | 2715/6556 [1:03:53<1:27:56,  1.37s/it]

!!!___ERRROR___!!!
With url index: 2714
With url: https://www.house.kg/details/333788066cacc4581bcf5-21878537


 43%|████▎     | 2800/6556 [1:05:51<1:21:51,  1.31s/it]

!!!___ERRROR___!!!
With url index: 2799
With url: https://www.house.kg/details/190423266cc5df0557280-73522850


 44%|████▎     | 2863/6556 [1:07:15<1:25:57,  1.40s/it]

!!!___ERRROR___!!!
With url index: 2862
With url: https://www.house.kg/details/374138166dab9a8baf611-55528603


 58%|█████▊    | 3827/6556 [1:30:17<1:07:44,  1.49s/it] 

!!!___ERRROR___!!!
With url index: 3826
With url: https://www.house.kg/details/858240566263393bb0e14-28951104


 59%|█████▉    | 3859/6556 [1:30:50<40:53,  1.10it/s]  

!!!___ERRROR___!!!
With url index: 3858
With url: https://www.house.kg/details/206847669f37c3212919-94226138


 61%|██████▏   | 4027/6556 [1:34:37<58:04,  1.38s/it]  

!!!___ERRROR___!!!
With url index: 4026
With url: https://www.house.kg/details/684261566eac5ce501e26-50333778


 63%|██████▎   | 4148/6556 [1:37:26<58:07,  1.45s/it]  

!!!___ERRROR___!!!
With url index: 4147
With url: https://www.house.kg/details/661483766db02c469a716-00536687


 65%|██████▍   | 4233/6556 [1:39:17<33:26,  1.16it/s]  

!!!___ERRROR___!!!
With url index: 4232
With url: https://www.house.kg/details/781304866cc67ad1676c2-86016910


 72%|███████▏  | 4704/6556 [1:49:05<33:57,  1.10s/it]  

!!!___ERRROR___!!!
With url index: 4703
With url: https://www.house.kg/details/618560566ea8a817f0e43-28198126


 72%|███████▏  | 4732/6556 [1:49:37<29:57,  1.01it/s]

!!!___ERRROR___!!!
With url index: 4731
With url: https://www.house.kg/details/378124667b7dddbf5b41-27142435


 75%|███████▌  | 4917/6556 [1:52:52<26:46,  1.02it/s]

!!!___ERRROR___!!!
With url index: 4916
With url: https://www.house.kg/details/602323066ab50041af2e7-58375847


 77%|███████▋  | 5073/6556 [1:55:41<22:12,  1.11it/s]

!!!___ERRROR___!!!
With url index: 5072
With url: https://www.house.kg/details/782445667f90905fc827-97766744


 77%|███████▋  | 5074/6556 [1:55:42<21:39,  1.14it/s]

!!!___ERRROR___!!!
With url index: 5073
With url: https://www.house.kg/details/3305137667f8f61f25e18-79391688


 79%|███████▉  | 5195/6556 [1:57:50<24:43,  1.09s/it]

!!!___ERRROR___!!!
With url index: 5194
With url: https://www.house.kg/details/37264066e27a995115f5-16758366


 81%|████████  | 5308/6556 [1:59:54<22:47,  1.10s/it]

!!!___ERRROR___!!!
With url index: 5307
With url: https://www.house.kg/details/737094566e190a23fe0a1-55891839


 81%|████████  | 5323/6556 [2:00:12<26:51,  1.31s/it]

!!!___ERRROR___!!!
With url index: 5322
With url: https://www.house.kg/details/717308966e16565b4bb62-46877938


 82%|████████▏ | 5344/6556 [2:00:36<20:48,  1.03s/it]

!!!___ERRROR___!!!
With url index: 5343
With url: https://www.house.kg


 82%|████████▏ | 5364/6556 [2:00:56<18:33,  1.07it/s]

!!!___ERRROR___!!!
With url index: 5363
With url: https://www.house.kg/details/596610266e06696d2b982-63282275


 82%|████████▏ | 5368/6556 [2:00:59<17:04,  1.16it/s]

!!!___ERRROR___!!!
With url index: 5367
With url: https://www.house.kg/details/647885669e0e94b74545-07246914


 82%|████████▏ | 5379/6556 [2:01:09<17:44,  1.11it/s]

!!!___ERRROR___!!!
With url index: 5378
With url: https://www.house.kg/details/759897166e00499bc52c3-92531933


 82%|████████▏ | 5388/6556 [2:01:17<16:54,  1.15it/s]

!!!___ERRROR___!!!
With url index: 5387
With url: https://www.house.kg/details/984586064c9d0fbbf1788-62974933


 84%|████████▍ | 5495/6556 [2:03:17<18:19,  1.04s/it]

!!!___ERRROR___!!!
With url index: 5494
With url: https://www.house.kg/details/183637966dabd7df34b93-11197994


 85%|████████▍ | 5570/6556 [2:04:31<19:06,  1.16s/it]

!!!___ERRROR___!!!
With url index: 5569
With url: https://www.house.kg/details/680183466d85fff3a71a8-44539857


 87%|████████▋ | 5679/6556 [2:06:27<13:32,  1.08it/s]

!!!___ERRROR___!!!
With url index: 5678
With url: https://www.house.kg/details/1239472664ce9372a63d1-99297086


 87%|████████▋ | 5705/6556 [2:06:50<11:53,  1.19it/s]

!!!___ERRROR___!!!
With url index: 5704
With url: https://www.house.kg/details/50909466ce37255c7e75-97743998


 87%|████████▋ | 5707/6556 [2:06:52<12:15,  1.15it/s]

!!!___ERRROR___!!!
With url index: 5706
With url: https://www.house.kg/details/629391966b364ee1da490-46033580


 88%|████████▊ | 5752/6556 [2:07:40<16:48,  1.25s/it]

!!!___ERRROR___!!!
With url index: 5751
With url: https://www.house.kg/details/266733666d3e58e45f802-39598291


 88%|████████▊ | 5753/6556 [2:07:41<17:58,  1.34s/it]

!!!___ERRROR___!!!
With url index: 5752
With url: https://www.house.kg/details/691930466d3e3734b8930-16376554


 88%|████████▊ | 5787/6556 [2:08:17<12:22,  1.04it/s]

!!!___ERRROR___!!!
With url index: 5786
With url: https://www.house.kg/details/579719566c83462a56184-83990530


 90%|████████▉ | 5870/6556 [2:09:40<12:25,  1.09s/it]

!!!___ERRROR___!!!
With url index: 5869
With url: https://www.house.kg/details/381389065f0b56a6fe814-70415649


 90%|████████▉ | 5875/6556 [2:09:46<13:10,  1.16s/it]

!!!___ERRROR___!!!
With url index: 5874
With url: https://www.house.kg/details/7152268669794495ba3e6-20305317


 91%|█████████ | 5977/6556 [2:11:39<11:30,  1.19s/it]

!!!___ERRROR___!!!
With url index: 5976
With url: https://www.house.kg/details/260698466c5a42de797b7-90478381


 92%|█████████▏| 6012/6556 [2:12:10<07:56,  1.14it/s]

!!!___ERRROR___!!!
With url index: 6011
With url: https://www.house.kg/details/153017466c5d859c61836-08478375


 92%|█████████▏| 6043/6556 [2:12:40<07:51,  1.09it/s]

!!!___ERRROR___!!!
With url index: 6042
With url: https://www.house.kg/details/949267566c444f0351116-31952048


 94%|█████████▎| 6140/6556 [2:14:25<07:55,  1.14s/it]

!!!___ERRROR___!!!
With url index: 5343
With url: https://www.house.kg


 94%|█████████▎| 6141/6556 [2:14:26<07:28,  1.08s/it]

!!!___ERRROR___!!!
With url index: 5343
With url: https://www.house.kg


 95%|█████████▌| 6253/6556 [2:16:22<05:58,  1.18s/it]

!!!___ERRROR___!!!
With url index: 6252
With url: https://www.house.kg/details/12662466b3b2410f7847-89023704


 96%|█████████▌| 6277/6556 [2:16:50<04:28,  1.04it/s]

!!!___ERRROR___!!!
With url index: 6276
With url: https://www.house.kg/details/88505266b2f60d48e3a1-38960698


 98%|█████████▊| 6442/6556 [2:19:48<02:13,  1.17s/it]

!!!___ERRROR___!!!
With url index: 6441
With url: https://www.house.kg/details/195967566a4935d7af6c7-66812990


100%|██████████| 6556/6556 [2:25:25<00:00,  1.33s/it]


In [180]:
data = pd.DataFrame(data)

In [181]:
data

Unnamed: 0,url,Тип предложения,Серия,Дом,Этаж,Площадь,Отопление,Состояние,Телефон,Санузел,...,current_date,hearts,Интернет,Безопасность,Возможность рассрочки,Возможность обмена,Площадь участка,Канализация,Питьевая вода,Электричество
0,https://www.house.kg/details/515492365e7189104...,от собственника,индивид. планировка,"кирпичный, 2024 г.",1 этаж из 3,35 м2,электрическое,хорошее,возможно подключение,совмещенный,...,2024-09-18,,,,,,,,,
1,https://www.house.kg/details/932931666cc8a829e...,от собственника,элитка,"монолитный, 2022 г.",12 этаж из 15,64 м2,на газе,евроремонт,,2 с/у и более,...,2024-09-18,,,,,,,,,
2,https://www.house.kg/details/266187266e0264c64...,от собственника,индивид. планировка,"кирпичный, 2021 г.",4 этаж из 4,"39 м2, жилая: 13 м2, кухня: 17 м2",на газе,евроремонт,,,...,2024-09-18,,проводной,"видеонаблюдение, ...",,,,,,
3,https://www.house.kg/details/696572651eac369ce...,от собственника,элитка,"монолитный, 2014 г.",4 этаж из 6,"115 м2, кухня: 23 м2",автономное,евроремонт,есть,2 с/у и более,...,2024-09-18,,проводной,"домофон, ...",нет,обмен не предлагать,,,,
4,https://www.house.kg/details/4141411668e6aea34...,от собственника,элитка,"кирпичный, 2021 г.",9 этаж из 9,101 м2,на газе,евроремонт,возможно подключение,2 с/у и более,...,2024-09-18,,adsl,"решетки на окнах, ...",,рассмотрю варианты,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6499,https://www.house.kg/details/911020166ea8db808...,от агента,элитка,"кирпичный, 2022 г.",5 этаж из 5,"79 м2, жилая: 36.1 м2, кухня: 12 м2",центральное,хорошее,нет,совмещенный,...,2024-09-18,,оптика,"домофон, ...",нет,обмен не предлагать,,,,
6500,https://www.house.kg/details/390821266ebbff8a4...,от агента,хрущевка,"кирпичный, 1965 г.",3 этаж из 4,"30 м2, жилая: 18 м2, кухня: 5.5 м2",центральное,среднее,нет,совмещенный,...,2024-09-18,,оптика,домофон,нет,обмен не предлагать,,,,
6501,https://www.house.kg/details/280086366e90b780f...,от собственника,элитка,кирпичный,4 этаж из 12,50 м2,центральное,евроремонт,,,...,2024-09-18,,,,нет,обмен не предлагать,,,,
6502,https://www.house.kg/details/104503366e81ead64...,от собственника,105 серия улучшенная,"кирпичный, 2024 г.",7 этаж из 9,60 м2,центральное,под самоотделку (псо),нет,раздельный,...,2024-09-18,,,,,,,,,


In [182]:
data.columns

Index(['url', 'Тип предложения', 'Серия', 'Дом', 'Этаж', 'Площадь',
       'Отопление', 'Состояние', 'Телефон', 'Санузел', 'Газ', 'Балкон',
       'Входная дверь', 'Парковка', 'Мебель', 'Пол', 'Высота потолков',
       'Разное', 'Правоустанавливающие документы', 'Возможность ипотеки',
       'address', 'n_rooms', 'describe', 'added', 'upped', 'latitude',
       'longitude', 'rating', 'views', 'phone_number', 'USD_price',
       'KGS_price ', 'current_date ', 'hearts', 'Интернет', 'Безопасность',
       'Возможность рассрочки', 'Возможность обмена', 'Площадь участка',
       'Канализация', 'Питьевая вода', 'Электричество'],
      dtype='object')