In [2]:
from bs4 import BeautifulSoup
from collections import OrderedDict
from datetime import datetime
from fake_useragent import UserAgent
import json
import pandas as pd
import pprint
import random
import re
import requests
import sys
import hashlib

from selenium import webdriver
from selenium.common.exceptions import (NoSuchElementException,
                                        ElementClickInterceptedException)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.proxy import Proxy, ProxyType

from sqlalchemy import create_engine, Column, ForeignKey, String, TIMESTAMP, text
from sqlalchemy.dialects.mysql import INTEGER, TINYINT
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm.session import sessionmaker

In [3]:
BASE_URL = 'https://youla.ru'
CITY = 'moskva'
SECTION = 'nedvijimost'
SUBCATEGORY = 'prodaja-kvartiri'

PARSE_URL = f'{BASE_URL}/{CITY}/{SECTION}/{SUBCATEGORY}'

SQLALCHEMY_DATABASE_URI = 'mysql+pymysql://root:system32@localhost/parser'
engine = create_engine(SQLALCHEMY_DATABASE_URI, echo=False)

Base = declarative_base()
metadata = Base.metadata
factory = sessionmaker(bind=engine, autocommit=False, autoflush=False)
session = factory()

In [4]:
class Url(Base):
    __tablename__ = 'url'
    __table_args__ = {'comment': 'URL "–∫–∞—Ä—Ç–æ—á–µ–∫" –Ω–µ–¥–≤–∏–∂–∏–º–æ—Å—Ç–∏'}

    id = Column(INTEGER(11), primary_key=True)
    url = Column(String(1000))
    is_parsed = Column(TINYINT(1), server_default=text("0"))
    number_of_attempts = Column(INTEGER(11), nullable=False, server_default=text("1"))
    date_added = Column(TIMESTAMP, nullable=False, server_default=text("current_timestamp()"))
    date_parsed = Column(TIMESTAMP)


class Log(Base):
    __tablename__ = 'log'
    __table_args__ = {'comment': '–õ–æ–≥ –æ—à–∏–±–æ–∫ –¥–ª—è –ø–∞—Ä—Å–µ—Ä–∞'}

    id = Column(INTEGER(11), primary_key=True)
    id_url = Column(ForeignKey('url.id'), index=True)
    error_line = Column(INTEGER(11))
    error_type = Column(String(1000))
    error = Column(String(10000), nullable=False)
    date_add = Column(TIMESTAMP, nullable=False, server_default=text("current_timestamp()"))

    url = relationship('Url')

In [5]:
proxy_list = [
    {
        'http':     'http://185.187.197.108:8080',
        # 'https':    'https://91.225.226.39:44388'
    },
    {
        'http':     'http://185.187.197.108:8080',
        # 'https':    'https://91.225.226.39:44388'
    }
]

subcategory_dict = {
    2001: '–ü—Ä–æ–¥–∞–∂–∞ –∫–≤–∞—Ä—Ç–∏—Ä—ã',
    2002: '–ü—Ä–æ–¥–∞–∂–∞ –∫–æ–º–Ω–∞—Ç—ã',
    2003: '–ü—Ä–æ–¥–∞–∂–∞ –¥–æ–º–∞',
    2004: '–ü—Ä–æ–¥–∞–∂–∞ —É—á–∞—Å—Ç–∫–∞',
    2005: '–ê—Ä–µ–Ω–¥–∞ –∫–≤–∞—Ä—Ç–∏—Ä—ã –¥–ª–∏—Ç–µ–ª—å–Ω–æ',
    2006: '–ê—Ä–µ–Ω–¥–∞ –∫–æ–º–Ω–∞—Ç—ã –¥–ª–∏—Ç–µ–ª—å–Ω–æ',
    2007: '–ê—Ä–µ–Ω–¥–∞ –¥–æ–º–∞ –¥–ª–∏—Ç–µ–ª—å–Ω–æ',
    2008: '–ü—Ä–æ—á–∏–µ —Å—Ç—Ä–æ–µ–Ω–∏—è',
    2010: '–ê—Ä–µ–Ω–¥–∞ –∫–≤–∞—Ä—Ç–∏—Ä—ã –ø–æ—Å—É—Ç–æ—á–Ω–æ',
    2011: '–ê—Ä–µ–Ω–¥–∞ –∫–æ–º–Ω–∞—Ç—ã –ø–æ—Å—É—Ç–æ—á–Ω–æ',
    2012: '–ê—Ä–µ–Ω–¥–∞ –¥–æ–º–∞ –ø–æ—Å—É—Ç–æ—á–Ω–æ',
    2013: '–ö–æ–º–º–µ—Ä—á–µ—Å–∫–∞—è –Ω–µ–¥–≤–∏–∂–∏–º–æ—Å—Ç—å',
}

# —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–µ 'taskcode' —Å 'subcategory'
taskcode_dict = {
    2001: 1,    # –ø—Ä–æ–¥–∞–∂–∞
    2002: 1,
    2003: 1,
    2004: 1,
    2005: 2,    # –∞—Ä–µ–Ω–¥–∞
    2006: 2,
    2007: 2,

    2008: 0,
    
    2010: 2,
    2011: 2,
    2012: 2,
    2013: 0     # –Ω–µ–∏–∑–≤–µ—Å—Ç–Ω–æ
}

# —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏–µ 'typecode' —Å 'subcategory'
typecode_dict = {
    2001: 1,    # –∫–≤–∞—Ä—Ç–∏—Ä–∞
    2002: 2,    # –∫–æ–º–Ω–∞—Ç–∞
    2003: 3,    # –∑–∞–≥–æ—Ä–æ–¥–Ω–∞—è
    2004: 3,
    2005: 1,
    2006: 2,
    2007: 3,
    2008: 5,    # –≥–∞—Ä–∞–∂ | –ø–æ–¥—É–º–∞—Ç—å: subcategory ‚Äì –ü—Ä–æ—á–∏–µ —Å—Ç—Ä–æ–µ–Ω–∏—è
    2010: 1,
    2011: 2,
    2012: 3,
    2013: 4     # –∫–æ–º–º–µ—Ä—á–µ—Å–∫–∞—è
}

repair_type = {
    "–ù–µ —Ç—Ä–µ–±—É–µ—Ç—Å—è"      : 1,
    "–¢—Ä–µ–±—É–µ—Ç—Å—è —Ä–µ–º–æ–Ω—Ç"  : 2,
    "–ö–æ—Å–º–µ—Ç–∏—á–µ—Å–∫–∏–π"     : 3,
    "–ï–≤—Ä–æ—Ä–µ–º–æ–Ω—Ç"        : 4,
    "–î–∏–∑–∞–π–Ω–µ—Ä—Å–∫–∏–π"      : 5,
    "–ö–∞–ø–∏—Ç–∞–ª—å–Ω—ã–π —Ä–µ–º–æ–Ω—Ç": 6,

}

bathroom_type = {
    "–°–æ–≤–º–µ—â–µ–Ω–Ω—ã–π":          1,
    "–†–∞–∑–¥–µ–ª—å–Ω—ã–π":           2,
    "2 –∏ –±–æ–ª–µ–µ":            3,
    "–ù–µ—Å–∫–æ–ª—å–∫–æ —Å–∞–Ω—É–∑–ª–æ–≤":   3,
    "–ù–∞ —É–ª–∏—Ü–µ":             4,
    "–í –¥–æ–º–µ":               5,

}

wall_material = {
    "–ü–∞–Ω–µ–ª—å–Ω—ã–π" :           1,
    "–ö–∏—Ä–ø–∏—á–Ω—ã–π" :           2,
    "–ú–æ–Ω–æ–ª–∏—Ç" :             3,
    "–ú–æ–Ω–æ–ª–∏—Ç–Ω—ã–π" :          3,
    "–ö–∏—Ä–ø–∏—á–Ω–æ-–º–æ–Ω–æ–ª–∏—Ç–Ω—ã–π" : 4,
    "–ë–ª–æ—á–Ω—ã–π" :             5,
    "–î–µ—Ä–µ–≤—è–Ω–Ω—ã–π" :          6,
    "–©–∏—Ç–æ–≤–æ–π" :             7,
}

tenure_dict = {
    "–î–æ 3-—Ö –ª–µ—Ç":       1,
    "–û—Ç 3 –¥–æ 5 –ª–µ—Ç":    2,
    "–ë–æ–ª–µ–µ 5 –ª–µ—Ç":      3,
}

housing_dict = {
    "–í—Ç–æ—Ä–∏—á–∫–∞":     1,
    "–ù–æ–≤–æ—Å—Ç—Ä–æ–π–∫–∞":  2,
}

commission_type = {
    "–ù–µ—Ç":      1,
    "30%":      2,
    "50%":      3,
    "100%":     4,
    "–î—Ä—É–≥–∞—è":   5
}

prepay_type = {
    "–ë–µ–∑ –ø—Ä–µ–¥–æ–ø–ª–∞—Ç—ã":       1,
    "1 –º–µ—Å—è—Ü":              2,
    "2 –º–µ—Å—è—Ü–∞":             3,
    "3 –º–µ—Å—è—Ü–∞":             4,
    "4 –∏ –±–æ–ª–µ–µ –º–µ—Å—è—Ü–µ–≤":    5,
}

building_type = {
    "–î–æ–º":                                  1,
    "–¢–∞—É–Ω—Ö–∞—É—Å":                             2,
    "–ö–æ—Ç—Ç–µ–¥–∂":                              3,
    "–î–∞—á–∞":                                 4,
    "–ü–æ–º–µ—â–µ–Ω–∏–µ —Å–≤–æ–±–æ–¥–Ω–æ–≥–æ –Ω–∞–∑–Ω–∞—á–µ–Ω–∏—è":      5,
    "–¢–æ—Ä–≥–æ–≤–æ–µ –ø–æ–º–µ—â–µ–Ω–∏–µ":                   6,
    "–û—Ñ–∏—Å–Ω–æ–µ –ø–æ–º–µ—â–µ–Ω–∏–µ":                    7,
    "–ü—Ä–æ–∏–∑–≤–æ–¥—Å—Ç–≤–æ":                         8,
    "–°–∫–ª–∞–¥":                                9,
    "–î—Ä—É–≥–∞—è –∫–æ–º–º–µ—Ä—á–µ—Å–∫–∞—è –Ω–µ–¥–≤–∏–∂–∏–º–æ—Å—Ç—å":     10,
}

plot_type = {
    "–°–µ–ª—å—Ö–æ–∑ (–°–ù–¢ –∏–ª–∏ –î–ù–ü)":                1,
    "–§–µ—Ä–º–µ—Ä—Å–∫–æ–µ —Ö–æ–∑-–≤–æ":                    2,
    "–ü–æ—Å–µ–ª–µ–Ω–∏—è (–ò–ñ–°)":                      3,
    "–ó–µ–º–ª—è –ø—Ä–æ–º–Ω–∞–∑–Ω–∞—á–µ–Ω–∏—è":                 4,
    "–ò–Ω–≤–µ—Å—Ç–ø—Ä–æ–µ–∫—Ç":                         5,
}

# —Å–æ–æ—Ç–≤–µ—Ç—Å–≤—Ç–∏–µ –∏–º–µ–Ω–æ–≤–∞–Ω–∏—è –Æ–ª—ã —Å –≤—ã—Ö–æ–¥–Ω—ã–º –∏–º–µ–Ω–æ–≤–∞–Ω–∏–µ–º —Å–ª–æ–≤–∞—Ä—è (json) ad_dict
attribute_dict = {
    "name":                     "name",
    "description":              "text",
    "price":                    "cost",
    "realty_obshaya_ploshad":   "totalarea",
    "realty_ploshad_kuhni":     "kitchenarea",
    "komnat_v_kvartire":        "roomquantity",
    "realty_etaj":              "floor",
    "realty_etajnost_doma":     "floors",
    "sobstvennik_ili_agent":    "is_agent",
    "realty_building_type":     "housing",
    "realty_hidden_location":   "fullAddress",
    "posudomoechnaya_mashina":  "dishWasher",
    "holodilnik":               "refr",
    "remont":                   "repair",
    "lift":                     "cargoLift",
    "realty_god_postroyki":     "buildYear",
}

In [6]:
ua = UserAgent()
s = requests.Session()

# —Å–ª—É—á–∞–π–Ω—ã–π –≤—ã–±–æ—Ä –ø—Ä–æ–∫—Å–∏ –∏–∑ —Å–ø–∏—Å–∫–∞ –¥–æ—Å—Ç—É–ø–Ω—ã—Ö –ø—Ä–æ–∫—Å–∏,
# —á—Ç–æ–±—ã –Ω–µ –∑–∞–±–∞–Ω–∏–ª–∏ –ø–æ –ø–æ—Ä—è–¥–∫—É –ø–µ—Ä–µ–±–æ—Ä–∞ –≤—Å–µ –ø—Ä–æ–∫—Å–∏,
# ! —Å–æ–≤—Ä–µ–º–µ–Ω–Ω—ã–µ –≤–µ–±-—Å–µ—Ä–≤–µ—Ä—ã –∑–∞—â–∏—â–µ–Ω—ã –æ—Ç –ø–æ—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å–Ω–æ–≥–æ –ø–µ—Ä–µ–±–æ—Ä–∞ –ø—Ä–æ–∫—Å–∏,
# ! –ø–æ—ç—Ç–æ–º—É —Å—Ç–æ–∏—Ç –ø–æ–¥—Å—Ç—Ä–∞—Ö–æ–≤–∞—Ç—å—Å—è
s.proxies = random.choice(proxy_list)
headers = {'User-Agent': ua.random}
r_page = s.get(PARSE_URL, headers=headers)

# –ø–æ–ª—á–∞–µ–º –æ–±—ä–µ–∫—Ç —Å—Ç—Ä–∞–Ω–∏—Ü—ã —Å–æ –≤–ª–æ–∂–µ–Ω–Ω–æ–π —Å—Ç—Ä—É–∫—Ç–æ—Ä–æ–π,
# —á—Ç–æ–±—ã —É–¥–æ–±–Ω–æ –ø–æ–ª—É—á–∏—Ç—å –æ—Ç–Ω–æ—Å–∏—Ç–µ–ª—å–Ω—ã–µ –ø—É—Ç–∏ (url) –¥–ª—è –∫–∞—Ä—Ç–æ—á–µ–∫
soup = BeautifulSoup(r_page.content, 'html.parser')

In [20]:
# –∫–æ–Ω—Ñ–∏–≥–∏ selenium
prox = Proxy()
prox.proxy_type = ProxyType.MANUAL
prox.http_proxy = random.choice(proxy_list)['http'].replace('http://', '')
capabilities = webdriver.DesiredCapabilities.CHROME
prox.add_to_capabilities(capabilities)
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument(f'user-agent="{ua.random}"')
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging', 'enable-automation'])
chrome_options.add_argument('--blink-settings=imagesEnabled=false')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument("--window-size=1920,1920")


In [21]:
# –ø–æ–ª—É—á–∞–µ–º –æ—Ç–Ω–æ—Å–∏—Ç–µ–ª—å–Ω—ã–µ –ø—É—Ç–∏ –¥–ª—è "–∫–∞—Ä—Ç–æ—á–µ–∫"
card_section_list = [i.a['href'] for i in soup.find_all('li', class_='product_item')]
# –ø–æ–ª—É—á–∞–µ–º –ø–æ–ª–Ω—ã–π URL –¥–ª—è "–∫–∞—Ä—Ç–æ—á–µ–∫"
card_url_list = [BASE_URL + i for i in card_section_list]

In [22]:
import time
start_time = time.time()
i = 1
for card_url in card_url_list:
    browser = webdriver.Chrome(
        'chromedriver.exe',
        options=chrome_options,
        desired_capabilities=capabilities
        )
    browser.get(card_url)
    js = '__YOULA_STATE__.entities.products[0]'
    result = browser.execute_script(f"return {js}")

    df = pd.DataFrame()
    base_fields = {
        'id':               hashlib.md5(result.get('url').encode()).hexdigest(),
        'forumId':          284,
        'name':             result.get('name'),
        'text':             result.get('description'),
        'images':           ';'.join([el['url'] for el in result.get('images')]) if len(result.get('images')) else None,
        'cost':             int(result.get('rawValue')) // 100 if str(result.get('rawValue')).isnumeric() else 0,
        'url':              result.get('url'),
        'fullAddress':     result.get('location')['description'] if result.get('location') else None,
        'latitude':        result.get('location')['latitude'] if result.get('location') else None,
        'longitude':       result.get('location')['longitude'] if result.get('location') else None,
    }

    custom_fields = {}
    if typecode_dict[int(result.get('subcategory'))] == 1: # –∫–≤–∞—Ä—Ç–∏—Ä–∞
        custom_fields['type–°ode'] = 1
        for d in result['attributes']:
            if d.get('slug') == 'balkon':
                custom_fields['balcony'] = True if '–ë–∞–ª–∫–æ–Ω' in d.get('rawValue') or '–ù–µ—Å–∫–æ–ª—å–∫–æ –±–∞–ª–∫–æ–Ω–æ–≤' in d.get('rawValue') else False
                custom_fields['loggia'] = True if '–õ–æ–¥–∂–∏—è' in d.get('rawValue') else False
            if d.get('slug') == 'komnat_v_kvartire':
                custom_fields['roomQuantity'] = int(re.search(r'\d+', d.get('rawValue'))[0]) if any(i.isdigit() for i in d.get('rawValue')) else None
            if d.get('slug') == 'lift':
                custom_fields['passLift'] = True if '–ª–µ–≥–∫–æ–≤–æ–π' in d.get('rawValue').lower() or '–ª–∏—Ñ—Ç–æ–≤' in d.get('rawValue').lower() else False
                custom_fields['cargoLift'] = True if '–≥—Ä—É–∑–æ–≤–æ–π' in d.get('rawValue').lower() or '–ª–∏—Ñ—Ç–æ–≤' in d.get('rawValue').lower() else False
            if d.get('slug') == 'realty_etaj':
                custom_fields['floor'] = int(d.get('rawValue')) if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'realty_etajnost_doma':
                custom_fields['floors'] = int(d.get('rawValue')) if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'realty_god_postroyki':
                custom_fields['buildYear'] = int(d.get('rawValue')) if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'realty_obshaya_ploshad':
                custom_fields['totalArea'] = float(d.get('rawValue')) / 100 if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'realty_ploshad_kuhni':
                custom_fields['kitchenArea'] = float(d.get('rawValue')) / 100 if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'remont':
                custom_fields['repair'] = repair_type.get(d.get('rawValue')) if repair_type.get(d.get('rawValue')) else 1
            if d.get('slug') == 'sanuzli':
                custom_fields['bathroomType'] = bathroom_type.get(d.get('rawValue')) if bathroom_type.get(d.get('rawValue')) else 2
            if d.get('slug') == 'sobstvennik_ili_agent':
                custom_fields['isOwner'] = True if '–°–æ–±—Å—Ç–≤–µ–Ω–Ω–∏–∫' in d.get('rawValue') else False
            if d.get('slug') == 'tip_doma':
                custom_fields['wallMaterial'] = wall_material.get(d.get('rawValue'))
            if d.get('slug') == 'building_flat_living_area':
                custom_fields['livingArea'] = float(d.get('rawValue')) / 100 if str(d.get('rawValue')).isnumeric() else 0
        if taskcode_dict[int(result.get('subcategory'))] == 1: # –ø—Ä–æ–¥–∞–∂–∞
            custom_fields['task–°ode'] = 1
            for d in result['attributes']:
                if d.get('slug') == 'let_v_sobstvennosti':
                    custom_fields['tenure'] = tenure_dict.get(d.get('rawValue'))
                if d.get('slug') == 'realty_building_type':
                    custom_fields['housing'] = housing_dict.get(d.get('rawValue'))
        if taskcode_dict[int(result.get('subcategory'))] == 2: # –∞—Ä–µ–Ω–¥–∞
            custom_fields['task–°ode'] = 2
            for d in result['attributes']:
                if d.get('slug') == 'holodilnik':
                    custom_fields['fridge'] = True if '–ï—Å—Ç—å' in d.get('rawValue') else False
                if d.get('slug') == 'posudomoechnaya_mashina':
                    custom_fields['dishWasher'] = True if '–ï—Å—Ç—å' in d.get('rawValue') else False
                if d.get('slug') == 'stiralnaya_mashina':
                    custom_fields['washer'] = True if '–ï—Å—Ç—å' in d.get('rawValue') else False
                if d.get('slug') == 'komissiya':
                    custom_fields['commissionType'] = commission_type.get(d.get('rawValue')) if commission_type.get(d.get('rawValue')) else 5
                if d.get('slug') == 'kommunalnie_uslugi_vhodyat':
                    custom_fields['utilitiesInclude'] = True if '–í–∫–ª—é—á–µ–Ω—ã' in d.get('rawValue') else False
                if d.get('slug') == 'predoplata_mesechnaya':
                    custom_fields['prepayType'] = prepay_type.get(d.get('rawValue')) if prepay_type.get(d.get('rawValue')) else 1
    
    if typecode_dict[int(result.get('subcategory'))] == 2: # –∫–æ–º–Ω–∞—Ç–∞
        custom_fields['type–°ode'] = 2
        for d in result['attributes']:
            if d.get('slug') == 'balkon':
                custom_fields['balcony'] = True if '–ë–∞–ª–∫–æ–Ω' in d.get('rawValue') or '–ù–µ—Å–∫–æ–ª—å–∫–æ –±–∞–ª–∫–æ–Ω–æ–≤' in d.get('rawValue') else False
                custom_fields['loggia'] = True if '–õ–æ–¥–∂–∏—è' in d.get('rawValue') else False
            if d.get('slug') == 'komnat_v_kvartire':
                custom_fields['roomQuantity'] = int(re.search(r'\d+', d.get('rawValue'))[0]) if any(i.isdigit() for i in d.get('rawValue')) else None
            if d.get('slug') == 'lift':
                custom_fields['passLift'] = True if '–ª–µ–≥–∫–æ–≤–æ–π' in d.get('rawValue').lower() or '–ª–∏—Ñ—Ç–æ–≤' in d.get('rawValue').lower() else False
                custom_fields['cargoLift'] = True if '–≥—Ä—É–∑–æ–≤–æ–π' in d.get('rawValue').lower() or '–ª–∏—Ñ—Ç–æ–≤' in d.get('rawValue').lower() else False
            if d.get('slug') == 'realty_etaj':
                custom_fields['floor'] = int(d.get('rawValue')) if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'realty_etajnost_doma':
                custom_fields['floors'] = int(d.get('rawValue')) if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'realty_god_postroyki':
                custom_fields['buildYear'] = int(d.get('rawValue')) if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'realty_ploshad_komnati':
                custom_fields['totalArea'] = float(d.get('rawValue')) / 100 if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'remont':
                custom_fields['repair'] = repair_type.get(d.get('rawValue')) if repair_type.get(d.get('rawValue')) else 1
            if d.get('slug') == 'sanuzli':
                custom_fields['bathroomType'] = bathroom_type.get(d.get('rawValue')) if bathroom_type.get(d.get('rawValue')) else 2
            if d.get('slug') == 'sobstvennik_ili_agent':
                custom_fields['isOwner'] = True if '–°–æ–±—Å—Ç–≤–µ–Ω–Ω–∏–∫' in d.get('rawValue') else False
            if d.get('slug') == 'tip_doma':
                custom_fields['wallMaterial'] = wall_material.get(d.get('rawValue'))
        if taskcode_dict[int(result.get('subcategory'))] == 1: # –ø—Ä–æ–¥–∞–∂–∞
            custom_fields['task–°ode'] = 1
            for d in result['attributes']:
                if d.get('slug') == 'let_v_sobstvennosti':
                    custom_fields['tenure'] = tenure_dict.get(d.get('rawValue'))
                if d.get('slug') == 'realty_building_type':
                    custom_fields['housing'] = housing_dict.get(d.get('rawValue'))
        if taskcode_dict[int(result.get('subcategory'))] == 2: # –∞—Ä–µ–Ω–¥–∞
            custom_fields['task–°ode'] = 2
            for d in result['attributes']:
                if d.get('slug') == 'holodilnik':
                    custom_fields['fridge'] = True if '–ï—Å—Ç—å' in d.get('rawValue') else False
                if d.get('slug') == 'komissiya':
                    custom_fields['commissionType'] = commission_type.get(d.get('rawValue')) if commission_type.get(d.get('rawValue')) else 5
                if d.get('slug') == 'kommunalnie_uslugi_vhodyat':
                    custom_fields['utilitiesInclude'] = True if '–í–∫–ª—é—á–µ–Ω—ã' in d.get('rawValue') else False
                if d.get('slug') == 'posudomoechnaya_mashina':
                    custom_fields['dishWasher'] = True if '–ï—Å—Ç—å' in d.get('rawValue') else False
                if d.get('slug') == 'predoplata_mesechnaya':
                    custom_fields['prepayType'] = prepay_type.get(d.get('rawValue')) if prepay_type.get(d.get('rawValue')) else 1
                if d.get('slug') == 'stiralnaya_mashina':
                    custom_fields['washer'] = True if '–ï—Å—Ç—å' in d.get('rawValue') else False
    
    if typecode_dict[int(result.get('subcategory'))] == 3: # –∑–∞–≥–æ—Ä–æ–¥–Ω–∞—è
        custom_fields['type–°ode'] = 3
        for d in result['attributes']:
            if d.get('slug') == 'realty_etaj':
                custom_fields['floor'] = int(d.get('rawValue')) if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'realty_etajnost_doma':
                custom_fields['floors'] = int(d.get('rawValue')) if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'realty_god_postroyki':
                custom_fields['buildYear'] = int(d.get('rawValue')) if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'realty_ploshad_doma':
                custom_fields['totalArea'] = float(d.get('rawValue')) / 10 if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'realty_ploshad_uchastka':
                custom_fields['landArea'] = float(d.get('rawValue')) / 10 if str(d.get('rawValue')).isnumeric() else 0
            if d.get('slug') == 'sobstvennik_ili_agent':
                custom_fields['isOwner'] = True if '–°–æ–±—Å—Ç–≤–µ–Ω–Ω–∏–∫' in d.get('rawValue') else False
            if d.get('slug') == 'tip_postroyki':
                custom_fields['buildingType'] = building_type.get(d.get('rawValue')) if building_type.get(d.get('rawValue')) else 1
            if d.get('slug') == 'elektrichestvo':
                custom_fields['electricity'] = True if '–ü–æ–¥–∫–ª—é—á–µ–Ω–æ' in d.get('rawValue') else False
            if d.get('slug') == 'garaj_mashinomesto':
                custom_fields['garage'] = False if '–ù–µ—Ç' in d.get('rawValue') else True
            if d.get('slug') == 'gaz':
                custom_fields['gas'] = False if '–ù–µ—Ç' in d.get('rawValue') else True
            if d.get('slug') == 'let_v_sobstvennosti':
                custom_fields['tenure'] = tenure_dict.get(d.get('rawValue'))
            if d.get('slug') == 'material_doma':
                custom_fields['wallMaterial'] = wall_material.get(d.get('rawValue'))
            if d.get('slug') == 'otoplenie':
                custom_fields['heating'] = False if '–ù–µ—Ç' in d.get('rawValue') else True
            if d.get('slug') == 'prodaja_uchastka_elektrichestvo':
                custom_fields['electricity'] = True if '–ï—Å—Ç—å' in d.get('rawValue') else False
            if d.get('slug') == 'prodaja_uchastka_gaz':
                custom_fields['gas'] = False if '–ù–µ—Ç' in d.get('rawValue') else True
            if d.get('slug') == 'sanuzel':
                custom_fields['bathroomType'] = bathroom_type.get(d.get('rawValue')) if bathroom_type.get(d.get('rawValue')) else 4
            if d.get('slug') == 'tip_uchastka':
                custom_fields['plotType'] = plot_type.get(d.get('rawValue'))
            if d.get('slug') == 'vodosnabjenie_i_kanalizaciya':
                custom_fields['waterSupply'] = False if '–ù–µ—Ç' in d.get('rawValue') else True
            if d.get('slug') == 'realty_kolichestvo_spalen':
                custom_fields['bedrooms'] = int(d.get('rawValue')) if str(d.get('rawValue')).isnumeric() else 0
        if taskcode_dict[int(result.get('subcategory'))] == 1: # –ø—Ä–æ–¥–∞–∂–∞
             custom_fields['task–°ode'] = 1
        if taskcode_dict[int(result.get('subcategory'))] == 2: # –∞—Ä–µ–Ω–¥–∞
             custom_fields['task–°ode'] = 2
             for d in result['attributes']:
                 if d.get('slug') == 'komissiya':
                     custom_fields['commissionType'] = commission_type.get(d.get('rawValue')) if commission_type.get(d.get('rawValue')) else 5
                 if d.get('slug') == 'kommunalnie_uslugi_vhodyat':
                     custom_fields['utilitiesInclude'] = True if '–í–∫–ª—é—á–µ–Ω—ã' in d.get('rawValue') else False
                 if d.get('slug') == 'predoplata_mesechnaya':
                     custom_fields['prepayType'] = prepay_type.get(d.get('rawValue')) if prepay_type.get(d.get('rawValue')) else 1
                 if d.get('slug') == 'holodilnik':
                     custom_fields['fridge'] = True if '–ï—Å—Ç—å' in d.get('rawValue') else False
                 if d.get('slug') == 'stiralnaya_mashina':
                     custom_fields['washer'] = True if '–ï—Å—Ç—å' in d.get('rawValue') else False
                 if d.get('slug') == 'posudomoechnaya_mashina':
                     custom_fields['dishWasher'] = True if '–ï—Å—Ç—å' in d.get('rawValue') else False
    
    if typecode_dict[int(result.get('subcategory'))] == 4: # –∫–æ–º–º–µ—Ä—á–µ—Å–∫–∞—è
        custom_fields['type–°ode'] = 4
        for d in result['attributes']:
            if d.get('slug') == 'tip_sdelki':
                custom_fields['task–°ode'] = 1 if d.get('rawValue') == '–ü—Ä–æ–¥–∞–∂–∞' else 2
            if d.get('slug') == 'kommer_realty_tip_stroeniya':
                custom_fields['buildingType'] = building_type.get(d.get('rawValue'))
            if d.get('slug') == 'sobstvennik_ili_agent':
                custom_fields['isOwner'] = True if '–°–æ–±—Å—Ç–≤–µ–Ω–Ω–∏–∫' in d.get('rawValue') else False
            if d.get('slug') == 'realty_obshaya_ploshad':
                custom_fields['totalArea'] = float(d.get('rawValue')) / 100 if str(d.get('rawValue')).isnumeric() else 0

    if typecode_dict[int(result.get('subcategory'))]== 5: # –≥–∞—Ä–∞–∂ –∏ –º–∞—à–∏–Ω–æ–º–µ—Å—Ç–æ
            for d in result['attributes']:
                if d.get('slug') == 'tip_stroeniya':
                    custom_fields['type–°ode'] = 5 if d.get('rawValue') == '–ì–∞—Ä–∞–∂' or d.get('rawValue') == '–ú–∞—à–∏–Ω–æ–º–µ—Å—Ç–æ' else None
                if d.get('slug') == 'tip_sdelki':
                    custom_fields['task–°ode'] = 1 if d.get('rawValue') == '–ü—Ä–æ–¥–∞–∂–∞' else 2
                if d.get('slug') == 'sobstvennik_ili_agent':
                    custom_fields['isOwner'] = True if '–°–æ–±—Å—Ç–≤–µ–Ω–Ω–∏–∫' in d.get('rawValue') else False
            
            if custom_fields.get('type–°ode') is None:
                custom_fields = {}
    i += 1
    if result.get('isReserved'):
        custom_fields['task–°ode'] = 3
                
    if len(custom_fields):
        fields = {**base_fields, **custom_fields}
    else:
        pass
    # pprint.pprint(custom_fields)
browser.close()
print(time.time() - start_time, i)

332.00072050094604 61


In [10]:
c = []
for d in result['attributes']:
    c.append((d.get('slug'), d.get('rawValue')))
c = OrderedDict(c)
c

OrderedDict([('category', '–ù–µ–¥–≤–∏–∂–∏–º–æ—Å—Ç—å'),
             ('subcategory', '–ü—Ä–æ–¥–∞–∂–∞ –∫–≤–∞—Ä—Ç–∏—Ä—ã'),
             ('realty_infrastructure', '–®–∫–æ–ª–∞'),
             ('realty_hidden_location',
              '–†–æ—Å—Å–∏—è, –ì–æ—Ä–æ–¥ –ú–æ—Å–∫–≤–∞, –£–ª–∏—Ü–∞ –ù–æ–≤–æ–≤–∞—Ç—É—Ç–∏–Ω—Å–∫–∞—è 3-—è, 13 –∫–æ—Ä–ø—É—Å 2'),
             ('realty_etaj', '12'),
             ('rasstoyanie_ot_metro', '–ù–∞ —Ç—Ä–∞–Ω—Å–ø–æ—Ä—Ç–µ'),
             ('remont', '–ï–≤—Ä–æ—Ä–µ–º–æ–Ω—Ç'),
             ('sobstvennik_ili_agent', '–ê–≥–µ–Ω—Ç'),
             ('building_flat_living_area', '1500'),
             ('tip_doma', '–ö–∏—Ä–ø–∏—á–Ω–æ-–º–æ–Ω–æ–ª–∏—Ç–Ω—ã–π'),
             ('balkon', '–ù–µ—Ç'),
             ('let_v_sobstvennosti', '–î–æ 3-—Ö –ª–µ—Ç'),
             ('realty_god_postroyki', '2021'),
             ('komnat_v_kvartire', '–°—Ç—É–¥–∏—è'),
             ('realty_obshaya_ploshad', '2760'),
             ('lift', '–õ–µ–≥–∫–æ–≤–æ–π –∏ –≥—Ä—É–∑–æ–≤–æ–π'),
             ('sanuzli', '–°–æ–≤

In [11]:
base_fields

{'id': '3e1875d91dd682e28e0fda40cc3ecb47',
 'forumId': 284,
 'name': '–ö–≤–∞—Ä—Ç–∏—Ä–∞, —Å—Ç—É–¥–∏—è, 27.6 –º¬≤',
 'text': 'üì¢ –í–ù–ò–ú–ê–ù–ò–ï! –ë–†–û–ù–ò–†–£–ô–¢–ï –ö–í–ê–†–¢–ò–†–£ —É–¥–∞–ª–µ–Ω–Ω–æ ON-LINE –ø–æ —Ç–µ–ª–µ—Ñ–æ–Ω—É. ‚òé –ó–í–û–ù–ò–¢–ï –∏ –∑–∞—Ñ–∏–∫—Å–∏—Ä—É–π—Ç–µ —Ü–µ–Ω—É –Ω–∞ —Å–≤–æ—é –∫–≤–∞—Ä—Ç–∏—Ä—É —Å–µ–≥–æ–¥–Ω—è!\nüì¢ –°–ï–ì–û–î–ù–Ø - –ö–í–ê–†–¢–ò–†–ê –í –ú–û–°–ö–í–ï –° –û–¢–î–ï–õ–ö–û–ô –ø–æ–¥ –∫–ª—é—á —Å–æ –°–ö–ò–î–ö–û–ô!\n‚úî –ù–æ–≤—ã–µ –í–∞—Ç—É—Ç–∏–Ω–∫–∏ –¶–µ–Ω—Ç—Ä–∞–ª—å–Ω—ã–π —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω –≤ 14 –∫–º –æ—Ç –ú–ö–ê–î –ø–æ –ö–∞–ª—É–∂—Å–∫–æ–º—É —à–æ—Å—Å–µ –≤ —Ü–µ–Ω—Ç—Ä–µ –ù–æ–≤–æ–π –ú–æ—Å–∫–≤—ã. - –ú–æ—Å–∫–æ–≤—Å–∫–∞—è –ü—Ä–æ–ø–∏—Å–∫–∞.\n\n‚úî –ù–∞ –ª–∏—á–Ω–æ–º —Ç—Ä–∞–Ω—Å–ø–æ—Ä—Ç–µ: –¥–æ –ú–ö–ê–î –í—ã –¥–æ–µ–¥–µ—Ç–µ –∑–∞ 10 –º–∏–Ω—É—Ç. –î–æ —Å—Ç–∞–Ω—Ü–∏–π –º–µ—Ç—Ä–æ —Ç—Ä–µ—Ö —Ä–∞–∑–Ω—ã—Ö –≤–µ—Ç–æ–∫ –≤—ã —Å–º–æ–∂–µ—Ç–µ –¥–æ–µ—Ö–∞—Ç—å –∑–∞ 10-20 –º–∏–Ω—É—Ç.\n\n‚úî –ö–≤–∞—Ä—Ç–∏—Ä—ã –≤ –ù–æ–≤—ã—Ö –í–∞—Ç—É—Ç–∏–Ω–∫–∞—Ö —É–∂–µ —Å –≥–æ—Ç–æ–≤–æ–π –æ—Ç–¥–µ–ª–∫–æ–π –∏ –

In [12]:
s = ' –∏ –±–æ–ª–µ–µ'
if any(i.isdigit() for i in s):
    print("True")

In [13]:
str('312s34').isnumeric()

False

In [14]:
print(repair_type.get(None))

None


In [1]:
if None:
    print(1)
else: