In [None]:
import pandas as pd
import numpy as np
import math
import re
import json
import requests
import time
from bs4 import BeautifulSoup
from datetime import datetime

In [None]:
# Display setup: show full cell contents and every column when rendering frames
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None

In [None]:
def get_data(url, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup object.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scraper forever.

    Returns:
        A BeautifulSoup object built with 'html.parser' when the server
        answers with status 200, otherwise None (the failure is printed).
    """
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
    except requests.RequestException as error:
        # Network failures are reported the same way as bad status codes
        # (print + None) so callers have a single failure signal to check.
        print(error)
        return None
    # Force UTF-8 decoding of the body regardless of what the server declares
    response.encoding = 'utf-8'
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    print(response.text)
    return None

In [None]:
def clean_num(text):
    """Extract the first run of digits and dots from a string.

    Non-breaking spaces (U+00A0) are removed first, so a number written with
    them as thousands separators is matched as a single run.

    Args:
        text: Source string.

    Returns:
        The matched substring (still a str, not converted to a number),
        or None when the text contains no digits or dots.
    """
    # Raw string avoids the invalid-escape warning; '|' is deliberately absent
    # from the class because inside [...] it would match a literal pipe.
    match = re.search(r'[\d.]+', text.replace('\xa0', ''))
    return match[0] if match else None

In [None]:
def get_maxpage(soup):
    """Return the number of listing pages available for the current brand.

    Args:
        soup: Parsed listing page.

    Returns:
        The page number read from the pagination links, or 1 when the page
        has no pagination container at all.
    """
    pages = soup.find('div', class_='ListingPagination-module__container')
    if pages is None:
        # No pagination block found: treat the listing as a single page
        # instead of failing with AttributeError on None.
        return 1
    # The third link from the end is used as the "last page" link
    # (presumably the trailing links are navigation controls - confirm
    # against the site markup); its href ends with '...=<page number>'.
    last_page_link = pages.find_all('a')[-3]
    return int(last_page_link.get('href').split('=')[1])

In [None]:
def get_features(soup, name_feuture):
    """Return the text of the second <span> inside the <li> with the given class.

    Raises whatever the lookups raise (e.g. AttributeError when the <li> is
    missing, IndexError when it has fewer than two spans).
    """
    row = soup.find('li', class_=name_feuture)
    spans = row.find_all('span')
    value_span = spans[1]
    return value_span.text

In [None]:
def _dig(data, *path, default=None):
    """Follow `path` through nested containers; return `default` if any step is missing."""
    try:
        for key in path:
            data = data[key]
    except (KeyError, IndexError, TypeError):
        return default
    return data


def _card_row(soup, css_class, default=None):
    """Read one info row of the listing card via get_features; `default` on any failure."""
    try:
        return get_features(soup, css_class)
    except Exception:
        return default


def _sell_id(soup):
    """Extract the listing id from the href of the stats link; None if it cannot be parsed."""
    try:
        href = soup.find('a', class_='Link PriceUsedOffer-module__stats').get('href')
        # Query string looks like '<key>=<id>-<suffix>'; keep the part before the dash
        return href.split('?')[1].split('=')[1].split('-')[0]
    except Exception:
        return None


def get_params(soup):
    """Build a one-row DataFrame of features for a single car listing.

    Data comes from three places on the page: the JSON in the
    <script id="initial-state"> tag, the JSON in the
    <script type="application/ld+json"> tag, and the info rows of the card
    read with get_features. Any individual field that cannot be found is
    stored as None (the ownership-time field falls back to 0).

    Args:
        soup: Parsed listing page (may be None when the download failed).

    Returns:
        A DataFrame with exactly one row and 33 object-dtype columns, or None
        when either JSON script is missing or cannot be parsed (in that case
        'initial_state - clear' is printed).
    """
    try:
        initial_state = json.loads(str(soup.find('script', {'id': 'initial-state'}).text))
        source = json.loads(soup.find('script', {'type': "application/ld+json"}).text)
    except Exception:
        print('initial_state - clear')
        return None

    vehicle_info = _dig(initial_state, 'card', 'vehicle_info')
    model_info = _dig(vehicle_info, 'model_info')

    # Insertion order of this dict defines the column order of the result.
    record = {
        # Body type
        'bodyType': _dig(source, 'bodyType'),
        # Brand
        'brand': _dig(source, 'brand'),
        # Listing URL
        'car_url': _dig(source, 'offers', 'url'),
        # Colour
        'color': _dig(source, 'color'),
        # Trim level
        'complectation_dict': _dig(vehicle_info, 'complectation'),
        # Seller's description
        'description': _dig(initial_state, 'card', 'description'),
        # Engine
        'engineDisplacement': _dig(source, 'vehicleEngine', 'engineDisplacement'),
        'enginePower': _dig(source, 'vehicleEngine', 'enginePower'),
        'equipment_dict': _dig(vehicle_info, 'equipment'),
        'fuelType': _dig(source, 'fuelType'),
        # Image link
        'image': _dig(source, 'image'),
        # Mileage
        'mileage': _dig(initial_state, 'card', 'state', 'mileage'),
        'modelDate': _dig(source, 'modelDate'),
        # Model description and model name
        'model_info': model_info,
        'model_name': _dig(model_info, 'name'),
        # Human-readable modification name
        'name': _dig(vehicle_info, 'tech_param', 'human_name'),
        # Number of doors
        'numberOfDoors': _dig(source, 'numberOfDoors'),
        # Time of parsing (unix seconds)
        'parsing_unixtime': int(datetime.today().timestamp()),
        # Currency
        'priceCurrency': _dig(source, 'offers', 'priceCurrency'),
        # Production date
        'productionDate': _dig(source, 'productionDate'),
        # Listing id
        'sell_id': _sell_id(soup),
        'super_gen': _dig(vehicle_info, 'super_gen'),
        'vehicleConfiguration': _dig(source, 'vehicleConfiguration'),
        'vehicleTransmission': _dig(source, 'vehicleTransmission'),
        # Vendor
        'vendor': _dig(vehicle_info, 'vendor'),
        # Owners count
        'Владельцы': _card_row(soup, 'CardInfoRow_ownersCount'),
        # Ownership time (falls back to 0, unlike the other fields)
        'Владение': _card_row(soup, 'CardInfoRow_owningTime', default=0),
        # Vehicle passport (PTS)
        'ПТС': _card_row(soup, 'CardInfoRow_pts'),
        # Drive type
        'Привод': _card_row(soup, 'CardInfoRow_drive'),
        # Steering wheel side
        'Руль': _card_row(soup, 'CardInfoRow_wheel'),
        # Condition
        'Состояние': _card_row(soup, 'CardInfoRow_state'),
        # Customs status
        'Таможня': _card_row(soup, 'CardInfoRow_customs'),
        # Price
        'price': _dig(source, 'offers', 'price'),
    }

    # Build the row from a record instead of np.array([[...]]): NumPy rejects
    # list-valued cells as ragged input and may coerce mixed scalars to
    # strings, which used to make such listings disappear silently.
    return pd.DataFrame([record], columns=list(record), dtype=object)

In [None]:
# Browse the listings for Moscow
url = 'https://auto.ru/moskva/'
soup = get_data(url)
if soup is None:
    # get_data returns None when the server does not answer with status 200
    raise RuntimeError('Failed to download {}'.format(url))

# Collect the list of car brands shown on the landing page
marks_block = soup.find('div', class_='IndexMarks')
if marks_block is None:
    raise RuntimeError('Brand list (IndexMarks) not found on {}'.format(url))
marks = marks_block.find_all('a', class_='IndexMarks__item')

In [None]:
# Set need_new to True to re-scrape the site from scratch (takes several hours/days)
need_new = False
if need_new:
    # Per-listing frames are gathered in a list and concatenated once per brand:
    # DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call.
    car_frames = []
    parse_data = pd.DataFrame()
    for mark in marks:
        # Brand name is read here but not used for filtering: every brand is parsed
        mark_name = mark.find('div', class_='IndexMarks__item-name').text
        url = mark.get('href')
        print(url)
        # Find out how many result pages the brand has; skip the brand when its
        # listing cannot be downloaded or has an unexpected layout.
        try:
            soup = get_data(url)
            max_page = get_maxpage(soup)
        except Exception:
            print('Нет данных', url)
            continue
        for page in range(1, max_page + 1):
            new_url = '{}?page={}'.format(url, page)
            # Log every second page to keep the output short
            if page % 2 == 0:
                print(new_url)
            # Collect the listing headers on this page; move on to the next
            # page when the download failed (soup is None) or raised.
            try:
                soup = get_data(new_url)
                cars = soup.find_all('h3')
            except Exception:
                continue
            for car in cars:
                link = car.find('a')
                car_url = link.get('href') if link is not None else None
                if not car_url:
                    # Header without a link: report the page it was found on
                    print('Нет данных', new_url)
                    continue
                try:
                    # Parse the individual listing
                    car_data = get_params(get_data(car_url))
                except Exception:
                    # Exception (not a bare except) keeps Ctrl+C / kernel interrupt working
                    continue
                # Keep the row only when the listing was parsed
                if car_data is not None:
                    car_frames.append(car_data)
            time.sleep(0.5)
        # Checkpoint after every brand so a crash does not lose finished work
        if car_frames:
            parse_data = pd.concat(car_frames)
        parse_data.to_csv('data/parse_data.csv')
        time.sleep(0.5)

else:
    # No re-scrape requested: load the previously saved result
    parse_data = pd.read_csv('data/parse_data.csv')