In [None]:
import json
import os
import time
from datetime import datetime
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
from tqdm import tnrange, tqdm_notebook

In [None]:
def get_soup_by_url(url, timeout=30):
    """Download ``url`` and return its HTML parsed by BeautifulSoup ('lxml' parser).

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without it a stalled server would
        block the whole scrape indefinitely. Defaults to 30 seconds.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, on timeout, or (via ``raise_for_status``)
        when the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with the HTTP status instead of parsing an error page and
    # hitting an unrelated AttributeError further down the pipeline.
    response.raise_for_status()

    return BeautifulSoup(response.text, 'lxml')

In [None]:
# Get the number of the last page of search results
def get_number_last_page():
    """Return the last page number read from the pager of results page 1.

    Downloads page 1 of the search (requested with ``perpage=50``), finds
    the ``td`` element with class ``pager_pages`` and converts the text of
    its fifth ``a`` link to ``int``.
    """
    first_page_url = 'https://www.tomsk.ru09.ru/realty?type=1&otype=1&district[1]=on&district[2]=on&district[3]=on&district[4]=on&perpage=50&page=1'
    pager = get_soup_by_url(first_page_url).find('td', {'class': 'pager_pages'})

    # NOTE(review): index 4 presumes the fifth pager link is the one that
    # carries the last page number - confirm against the live markup.
    last_page_link = pager.find_all('a')[4]

    return int(last_page_link.text)

In [None]:
def find_district_field(keys):
    """Return the index of the first entry of ``keys`` that contains 'район'.

    Parameters
    ----------
    keys : iterable of str
        Attribute labels to search through.

    Returns
    -------
    int
        Position of the first label containing the substring 'район'.

    Raises
    ------
    ValueError
        If ``keys`` is empty or no label contains 'район'. Previously an
        empty input failed with UnboundLocalError and a missing district
        silently returned the index of the last label.
    """
    for position, label in enumerate(keys):
        if 'район' in label:
            return position

    raise ValueError("no label containing 'район' found among {!r}".format(list(keys)))

In [None]:
def parse_apartment(url):
    """Scrape one listing page and return its fields as a dict.

    The result contains, in insertion order: 'район' (the label that holds
    the district), one entry per remaining attribute label paired
    positionally with the text of the elements of class ``nowrap``, then
    'адрес', 'цена' (int), 'ид' (int), 'дата добавления' and
    'дата истечения' (both taken from ``title`` attributes).
    """
    page = get_soup_by_url(url)

    # Lower-cased label of every attribute row, with non-breaking spaces removed.
    labels = []
    for row in page.find_all('tr', {'class': 'realty_detail_attr'}):
        labels.append(row.find('span').text.replace('\xa0', '').lower())

    district_pos = find_district_field(labels)

    # The district row and the row right before it are left out of the
    # label/value pairing.
    # NOTE(review): presumably the preceding row is the address, which is
    # read separately below - confirm against the page markup.
    excluded = (district_pos - 1, district_pos)
    attr_labels = [label for pos, label in enumerate(labels) if pos not in excluded]
    attr_values = [cell.text.replace('\xa0', ' ') for cell in page.find_all(class_='nowrap')]

    items = {'район': labels[district_pos]}
    for label, value in zip(attr_labels, attr_values):
        items[label] = value

    address_link = page.find(class_='table_map_link')
    items['адрес'] = address_link.text.replace('\xa0', ' ')

    price_text = page.find('div', {'class': 'realty_detail_price inline'}).text
    items['цена'] = int(price_text.replace('\xa0', '').replace('руб.', ''))

    # The listing id is the text of the first <strong> element of the page.
    items['ид'] = int(page.find('strong').text)

    added = page.find(class_='realty_detail_date nobr')
    items['дата добавления'] = added.get('title')

    # NOTE(review): the fifth element of class 'realty_detail_date' is taken
    # as the expiry date - the position is assumed, confirm against the page.
    date_nodes = page.find_all(class_='realty_detail_date')
    items['дата истечения'] = date_nodes[4].get('title')

    return items

In [None]:
def handle_dataframe(df):
    """Convert the scraped text columns of ``df`` to numeric and datetime types.

    The frame is modified in place and the same object is returned.

    * 'общая площадь', 'жилая', 'кухня': the part of each string before the
      first space is converted with ``pd.to_numeric``; non-string values
      (e.g. NaN for a missing field) are passed through unchanged.
    * 'дата добавления' is parsed as ``%d.%m.%Y %H:%M:%S`` and
      'дата истечения' as ``%d.%m.%Y``.
    * 'этаж' is created from the part of 'этаж/этажность' before the first
      '/'; it is 0 when that part is not made of digits or the value is
      missing. The source column 'этаж/этажность' is then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns listed above.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after conversion.
    """
    area_columns = ['общая площадь', 'жилая', 'кухня']
    for column in area_columns:
        # Only strings are split; anything else (NaN, already numeric) goes
        # to pd.to_numeric as is instead of failing on a missing .split.
        df[column] = pd.to_numeric([
            value.split(' ')[0] if isinstance(value, str) else value
            for value in df[column].values
        ])

    df['дата добавления'] = pd.to_datetime(df['дата добавления'], format='%d.%m.%Y %H:%M:%S')
    df['дата истечения'] = pd.to_datetime(df['дата истечения'], format='%d.%m.%Y')

    # .str[0] yields NaN for rows where the floor field is missing, so a
    # missing value falls into the same 0 fallback as a non-numeric floor
    # rather than raising TypeError on subscripting a float.
    floor_tokens = df['этаж/этажность'].str.split('/').str[0]
    df['этаж'] = [
        int(token) if isinstance(token, str) and token.isdigit() else 0
        for token in floor_tokens
    ]
    df.drop('этаж/этажность', axis=1, inplace=True)

    return df

In [None]:
def get_urls_pages(start_page=1, end_page=None):
    """Build the list of search-result page URLs from ``start_page`` to ``end_page``.

    Both bounds are inclusive. When ``end_page`` is falsy (``None`` by
    default) it is obtained from ``get_number_last_page()``.
    """
    listing_url = 'https://www.tomsk.ru09.ru/realty?type=1&otype=1&district[1]=on&district[2]=on&district[3]=on&district[4]=on&perpage=50&page='

    if not end_page:
        end_page = get_number_last_page()

    return ['{}{}'.format(listing_url, page) for page in range(start_page, end_page + 1)]

In [None]:
def get_urls_apartments_by_page(url_page):
    """Return the set of absolute listing URLs linked from one results page.

    Every ``a`` element of class ``visited_ads`` contributes the site root
    followed by its ``href``; duplicates collapse because a set is returned.
    """
    site_root = 'https://www.tomsk.ru09.ru'
    listing_links = get_soup_by_url(url_page).find_all('a', {'class': 'visited_ads'})

    return {site_root + link.get('href') for link in listing_links}

In [None]:
def main(start_page=1, end_page=None, filename='data.json'):
    """Scrape listings from the requested result pages into a JSON file.

    Listings already stored in ``filename`` (keyed by listing URL) are not
    downloaded again. The storage is rewritten after every result page, so
    an interrupted run keeps the pages finished so far.

    Parameters
    ----------
    start_page : int, optional
        First result page to process (inclusive).
    end_page : int or None, optional
        Last result page to process (inclusive); forwarded to
        ``get_urls_pages``, which resolves a falsy value itself.
    filename : str, optional
        Path of the JSON storage, relative to the working directory unless
        absolute.
    """
    urls_pages = get_urls_pages(start_page, end_page)

    # Check the very path that is opened below. The earlier version listed a
    # hard-coded user directory, which could disagree with the working
    # directory or not exist at all on another machine.
    if isfile(filename):
        with open(filename, 'r') as fp:
            storage_dict = json.load(fp)
        print('Apartments in storage: {}\n'.format(len(storage_dict)))
    else:
        storage_dict = {}

    # Taken in both branches so the final summary also works on a first run,
    # when no storage file exists yet.
    len_storage = len(storage_dict)

    for url_page in tqdm_notebook(urls_pages, desc='Pages:'):
        urls_apartments = get_urls_apartments_by_page(url_page)
        urls_apartments_to_parse = urls_apartments.difference(storage_dict)

        # Skip the inner progress bar when the page brings nothing new.
        if urls_apartments_to_parse:
            for url_apartment in tqdm_notebook(urls_apartments_to_parse, desc='Apartments:', leave=False):
                storage_dict[url_apartment] = parse_apartment(url_apartment)

        # Checkpoint after each page.
        with open(filename, 'w') as fp:
            json.dump(storage_dict, fp)

    print('New apartments: {}'.format(len(storage_dict) - len_storage))

In [None]:
# Run the scraper with its default arguments (all pages, storage in 'data.json').
main()

In [None]:
# Load the stored listings (one row per top-level key of data.json) and
# convert their columns with handle_dataframe; the frame is then displayed.
df = handle_dataframe(pd.read_json('data.json', orient='index'))
df