In [1]:
import json
import os
import time
from datetime import datetime
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
from tqdm import tnrange, tqdm_notebook

In [2]:
def get_soup_by_url(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze the whole scrape.

    Returns:
        BeautifulSoup: the response text parsed with the 'lxml' parser.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')

In [3]:
def get_number_last_page():
    """Return the number of the last listing page.

    Fetches the first listing page and converts to ``int`` the text of the
    fifth link found inside the ``td`` element with class ``pager_pages``.
    """
    first_page_url = 'https://www.tomsk.ru09.ru/realty?type=1&otype=1&district[1]=on&district[2]=on&district[3]=on&district[4]=on&perpage=50&page=1'
    pager = get_soup_by_url(first_page_url).find('td', {'class':'pager_pages'})
    pager_links = pager.find_all('a')
    # NOTE(review): index 4 presumably points at the "last page" link of the
    # pager -- confirm against the site's current markup.
    return int(pager_links[4].text)

In [4]:
def find_district_field(keys):
    """Return the index of the first entry in ``keys`` that contains 'район'.

    Args:
        keys: sequence of strings to search.

    Returns:
        int: position of the first string containing the substring 'район'.

    Raises:
        ValueError: if ``keys`` is empty or no entry contains 'район'.
            Reporting this explicitly avoids silently treating the last
            entry as the district when nothing matched.
    """
    for index, key in enumerate(keys):
        if 'район' in key:
            return index
    raise ValueError("no entry containing 'район' found in keys")

In [5]:
def parse_apartment(url):
    """Scrape a single listing page into a dict of its fields.

    The lower-cased ``span`` texts of the ``tr.realty_detail_attr`` rows are
    used as field names and the texts of the elements with class ``nowrap``
    as field values. The name that contains 'район' is itself stored as the
    value of the 'район' field.

    Args:
        url: address of the listing page.

    Returns:
        dict: scraped fields plus 'адрес', 'цена', 'ид', 'дата добавления'
        and 'дата истечения'.
    """
    page = get_soup_by_url(url)

    attr_names = []
    for row in page.find_all('tr', {'class': 'realty_detail_attr'}):
        attr_names.append(row.find('span').text.replace('\xa0','').lower())

    district_pos = find_district_field(attr_names)
    record = {'район': attr_names[district_pos]}

    # The district entry and the entry right before it are left out of the
    # name/value pairing -- presumably neither has a matching 'nowrap' value
    # element (TODO confirm against the page markup).
    excluded = (district_pos - 1, district_pos)
    paired_names = [name for pos, name in enumerate(attr_names) if pos not in excluded]
    attr_values = [node.text.replace('\xa0', ' ') for node in page.find_all(class_='nowrap')]

    for name, value in zip(paired_names, attr_values):
        record[name] = value

    record['адрес'] = page.find(class_='table_map_link').text.replace('\xa0', ' ')

    price_text = page.find('div', {'class': 'realty_detail_price inline'}).text
    record['цена'] = int(price_text.replace('\xa0','').replace('руб.',''))

    record['ид'] = int(page.find('strong').text)
    record['дата добавления'] = page.find(class_='realty_detail_date nobr').get('title')

    date_nodes = page.find_all(class_='realty_detail_date')
    record['дата истечения'] = date_nodes[4].get('title')

    return record

In [6]:
def handle_dataframe(df):
    """Convert the scraped text columns of ``df`` to numeric/datetime types.

    The frame is modified in place and also returned.

    * 'общая площадь', 'жилая', 'кухня': string values are cut at the first
      space and converted with ``pd.to_numeric``; non-string values (for
      example NaN for a missing field) are passed through unchanged.
    * 'дата добавления' / 'дата истечения': parsed with the formats
      '%d.%m.%Y %H:%M:%S' and '%d.%m.%Y'.
    * 'этаж': integer taken from the part of 'этаж/этажность' before the
      first '/'; 0 when that part is not all digits or the value is missing.
      The 'этаж/этажность' column is then dropped.

    Args:
        df: DataFrame holding the columns listed above.

    Returns:
        pandas.DataFrame: the same object that was passed in.
    """
    area_columns = ['общая площадь', 'жилая', 'кухня']
    for column in area_columns:
        # Only strings need splitting; NaN and already-numeric values go to
        # to_numeric untouched instead of failing on ``.split``.
        leading_parts = [
            value.split(' ')[0] if isinstance(value, str) else value
            for value in df[column].values
        ]
        df[column] = pd.to_numeric(leading_parts)

    df['дата добавления'] = pd.to_datetime(df['дата добавления'], format='%d.%m.%Y %H:%M:%S')
    df['дата истечения'] = pd.to_datetime(df['дата истечения'], format='%d.%m.%Y')

    floors = []
    for raw in df['этаж/этажность'].values:
        # A missing value is a float NaN, which cannot be split or indexed.
        floor_text = raw.split('/')[0] if isinstance(raw, str) else ''
        floors.append(int(floor_text) if floor_text.isdigit() else 0)
    df['этаж'] = floors
    df.drop('этаж/этажность', axis=1, inplace=True)

    return df

In [7]:
def get_urls_pages(start_page=1, end_page=None):
    """Build the listing-page URLs for pages ``start_page``..``end_page``.

    Args:
        start_page: number of the first page to include.
        end_page: number of the last page to include; when falsy the value
            returned by ``get_number_last_page()`` is used instead.

    Returns:
        list[str]: one URL per page number, in ascending order.
    """
    listing_url = 'https://www.tomsk.ru09.ru/realty?type=1&otype=1&district[1]=on&district[2]=on&district[3]=on&district[4]=on&perpage=50&page='

    if not end_page:
        end_page = get_number_last_page()

    urls_pages = []
    for page_number in range(start_page, end_page + 1):
        urls_pages.append(listing_url + str(page_number))

    return urls_pages

In [8]:
def get_urls_apartments_by_page(url_page):
    """Collect the listing URLs linked from one listing page.

    Args:
        url_page: address of the listing page to scan.

    Returns:
        set[str]: site root joined with the ``href`` of every ``a`` element
        that has class ``visited_ads``.
    """
    site_root = 'https://www.tomsk.ru09.ru'

    listing_links = get_soup_by_url(url_page).find_all('a', {'class':'visited_ads'})

    return {site_root + link.get('href') for link in listing_links}

In [9]:
def main(start_page=1, end_page=None, filename='data.json'):
    """Scrape listings that are not yet stored and save them to ``filename``.

    Existing results are loaded from ``filename`` when that file exists.
    Each listing page is then scanned, every listing whose URL is not
    already a key of the storage is parsed, and the storage is written back
    to ``filename`` after each page so progress survives an interruption.

    Args:
        start_page: number of the first listing page to scan.
        end_page: number of the last listing page to scan; ``None`` lets
            ``get_urls_pages`` determine it.
        filename: path of the JSON storage file. It is used both for the
            existence check and for reading/writing, so the function does
            not depend on any machine-specific directory.
    """
    urls_pages = get_urls_pages(start_page, end_page)

    if isfile(filename):
        with open(filename, 'r') as fp:
            storage_dict = json.load(fp)
        print('Apartments in storage: {}\n'.format(len(storage_dict)))
    else:
        storage_dict = {}
    # Defined on both branches so the final summary also works on a first run.
    len_storage = len(storage_dict)

    for url_page in tqdm_notebook(urls_pages, desc='Pages:'):
        urls_apartments = get_urls_apartments_by_page(url_page)
        urls_apartments_to_parse = urls_apartments.difference(storage_dict)

        # Skip the inner progress bar entirely when the page has nothing new.
        if urls_apartments_to_parse:
            for url_apartment in tqdm_notebook(urls_apartments_to_parse, desc='Apartments:', leave=False):
                storage_dict[url_apartment] = parse_apartment(url_apartment)

        with open(filename, 'w') as fp:
            json.dump(storage_dict, fp)
    print('New apartments: {}'.format(len(storage_dict) - len_storage))

In [10]:
# Run the scraper with its defaults: start at page 1, let the last page be
# looked up, and store the results in 'data.json'.
main()

Apartments in storage: 9537



HBox(children=(IntProgress(value=0, description='Pages:', max=150, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Apartments:', max=12, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Apartments:', max=35, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Apartments:', max=32, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Apartments:', max=15, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Apartments:', max=18, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Apartments:', max=27, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Apartments:', max=16, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Apartments:', max=31, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Apartments:', max=30, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Apartments:', max=27, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Apartments:', max=37, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Apartments:', max=1, style=ProgressStyle(description_width='i…


New apartments: 281


In [11]:
# Load the stored listings (storage keys become the index) and convert the
# text columns to numeric/datetime types, then display the frame.
df = handle_dataframe(pd.read_json('data.json', orient='index'))
df

Unnamed: 0,адрес,балкон/лоджия,вид,год постройки,дата добавления,дата истечения,жилая,ид,количество комнат,кухня,материал,общая площадь,отделка,район,санузел,тип квартиры,цена,этажность,этаж
https://www.tomsk.ru09.ru/realty?subaction=detail&id=1011925,Гагарина 19,"лоджия, остекление",вторичное,,2020-03-03 09:38:03,2020-06-09,83.0,1011925,4,19.0,кирпич,137.0,в хорошем состоянии,советский район,раздельный,,12500000,4,4
https://www.tomsk.ru09.ru/realty?subaction=detail&id=1071671,Мусы Джалиля 31,"2 лоджии, остекление",вторичное,1995.0,2020-02-26 10:33:30,2020-04-23,179.0,1071671,4,25.0,кирпич,300.0,в отличном состоянии,кировский район,раздельный,,12000000,4,2
https://www.tomsk.ru09.ru/realty?subaction=detail&id=1270561,Первомайская 101,,вторичное,2010.0,2020-02-03 16:04:21,2020-05-03,,1270561,1,,кирпич,20.3,в хорошем состоянии,ленинский район,совмещенный,гостинка,900000,3,1
https://www.tomsk.ru09.ru/realty?subaction=detail&id=1520047,Вершинина 9,"2 балкона, остекление",вторичное,,2020-01-13 17:18:38,2020-04-12,,1520047,2,,кирпич,77.0,в отличном состоянии,советский район,совмещенный,,9850000,6,3
https://www.tomsk.ru09.ru/realty?subaction=detail&id=1745323,Большая Подгорная 46,,новостройка,,2020-01-20 11:54:14,2020-04-19,,1745323,2,9.0,монолит,48.0,черновая отделка,ленинский район,раздельный,,3200000,12,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
https://www.tomsk.ru09.ru/realty?subaction=detail&id=4414907,Льва Толстого 38б,,вторичное,2005.0,2020-03-26 18:44:42,2020-06-24,,4414907,1,,кирпич,43.3,,советский район,,,2700000,14,4
https://www.tomsk.ru09.ru/realty?subaction=detail&id=4414908,Говорова 50,,вторичное,1994.0,2020-03-26 18:45:00,2020-06-24,,4414908,2,,панель,53.7,,ленинский район,,,3500000,10,7
https://www.tomsk.ru09.ru/realty?subaction=detail&id=799374,Гоголя 37,,вторичное,2001.0,2020-03-14 08:37:53,2020-06-12,,799374,9,18.0,кирпич,306.0,в хорошем состоянии,советский район,,двухуровневая,28000000,5,1
https://www.tomsk.ru09.ru/realty?subaction=detail&id=864187,Нижний переулок 29/3,лоджия,новостройка,2017.0,2020-03-03 10:05:14,2020-06-01,17.0,864187,1,10.0,кирпич,42.8,черновая отделка,советский район,раздельный,,1800000,14,14
