In [1]:
import requests
from bs4 import BeautifulSoup
import time
from pprint import pprint
from requests.structures import CaseInsensitiveDict
import json
from pathlib import Path
# from tqdm import tqdm
from tqdm.notebook import tqdm
import random




In [12]:
def get_pages(soup):
    """Return the number of result pages advertised by a listing page.

    Args:
        soup: Parsed listing page (a BeautifulSoup-like object exposing
            ``find``).

    Returns:
        int: The value of the last numeric ``Button__text`` label inside the
        pagination block, or 1 when the block is missing, contains no
        buttons, or contains no numeric label.
    """
    pagination = soup.find(
        "span",
        class_="ControlGroup ControlGroup_responsive_no ControlGroup_size_s ListingPagination__pages",
    )
    if pagination is None:
        # No pagination block found: treat the listing as a single page.
        return 1

    # Walk the buttons from the end so the last numeric label wins; labels
    # that are not numbers are skipped instead of raising ValueError.
    for button in reversed(pagination.find_all("span", class_="Button__text")):
        try:
            return int(button.text)
        except ValueError:
            continue

    # Pagination block present but without a usable page number.
    return 1


def get_html(url, timeout=30):
    """Download ``url`` with cache-busting headers and return the response.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without a timeout a stalled connection would block the
            whole scrape indefinitely.

    Returns:
        The ``requests`` response object, with ``encoding`` forced to
        ``"utf8"`` so that ``.text`` is decoded as UTF-8.

    Raises:
        Whatever ``requests.get`` raises, including a timeout error once
        ``timeout`` is exceeded.
    """
    headers = CaseInsensitiveDict(
        {
            "Cache-Control": "no-cache, no-store, must-revalidate",
            "Pragma": "no-cache",
            "Expires": "0",
            "User-Agent": "Mozilla/5.0",
        }
    )
    response = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 decoding of the body regardless of what the server declares.
    response.encoding = "utf8"
    return response


def get_models_link(url, page=1):
    """Fetch one page of a models listing and collect the model links on it.

    Args:
        url: Listing URL; the ``page`` query parameter is appended to it.
        page: 1-based page number to request.

    Returns:
        tuple: ``(total_pages, models_link)`` where ``total_pages`` comes from
        ``get_pages`` and ``models_link`` is the list of ``href`` values of
        the anchors whose class attribute is
        ``"Link ListingItemTitle__link ListingItemGroup__link"``.
    """
    # Use "&" only when the URL already carries a query string; otherwise the
    # page parameter has to open the query with "?".
    separator = "&" if "?" in url else "?"
    raw_html = get_html(f"{url}{separator}page={page}")
    soup = BeautifulSoup(raw_html.text, "html.parser")

    anchors = soup.find_all(
        "a",
        class_="Link ListingItemTitle__link ListingItemGroup__link",
    )
    # Anchors without an href yield None, which callers cannot use as a URL
    # (they run substring checks on every returned link), so drop them.
    hrefs = (anchor.get("href") for anchor in anchors)
    models_link = [href for href in hrefs if href]

    total_pages = get_pages(soup)

    return total_pages, models_link


def get_cars_link(url, page=1):
    """Fetch one page of a cars listing and collect the car links on it.

    Args:
        url: Listing URL; the ``page`` query parameter is appended to it.
        page: 1-based page number to request.

    Returns:
        tuple: ``(total_pages, cars_link)`` where ``total_pages`` comes from
        ``get_pages`` and ``cars_link`` is the list of ``href`` values of the
        anchors whose class attribute is ``"Link ListingItemTitle__link"``.
    """
    # Use "&" only when the URL already carries a query string; otherwise the
    # page parameter has to open the query with "?".
    separator = "&" if "?" in url else "?"
    raw_html = get_html(f"{url}{separator}page={page}")
    soup = BeautifulSoup(raw_html.text, "html.parser")

    anchors = soup.find_all(
        "a",
        class_="Link ListingItemTitle__link",
    )
    # Anchors without an href yield None, which callers cannot use as a URL
    # (the links are later split and fetched), so drop them.
    hrefs = (anchor.get("href") for anchor in anchors)
    cars_link = [href for href in hrefs if href]

    total_pages = get_pages(soup)

    return total_pages, cars_link


def get_car_info(url):
    """Download one car page into the local ``raw_html`` cache.

    The page is stored as ``raw_html/<vendor>/<model>/<car_id>.html`` under
    the current working directory. Vendor and model are taken from positions
    6 and 7 of ``url.split("/")`` and the id from the second-to-last item, so
    the URL is expected to end with a trailing slash
    (presumably ``https://auto.ru/cars/<section>/sale/<vendor>/<model>/<id>/``
    -- TODO confirm against real listing links).

    Args:
        url: Address of the car page.

    Returns:
        bool: ``True`` when a network request was made (the caller should
        throttle before the next one), ``False`` when the page was already
        cached and nothing was fetched.
    """
    url_to_list = url.split("/")
    car_id = url_to_list[-2]
    car_vendor = url_to_list[6]
    car_model = url_to_list[7]

    file_dir = Path("./").absolute() / "raw_html" / car_vendor / car_model
    file_dir.mkdir(exist_ok=True, parents=True)
    filename = file_dir / f"{car_id}.html"
    if filename.is_file():
        return False

    raw_html = get_html(url)
    if raw_html.ok:
        # The response text is decoded as UTF-8, so persist it as UTF-8
        # instead of the platform default, which may not be able to encode it.
        filename.write_text(raw_html.text, encoding="utf-8")
    # A non-OK response is deliberately not cached: a saved error page would
    # pass the is_file() check above and never be fetched again.
    return True
    


In [15]:
def get_cars_by_vendor(vendor):
    """Scrape every car page of ``vendor`` into the local cache.

    Runs three stages, each with its own progress bar: collect the model
    links from every page of the vendor listing, expand each model link into
    car links, then download each car page through ``get_car_info``.

    Args:
        vendor: Vendor slug interpolated into the auto.ru listing URL.

    Returns:
        None. Progress and totals are reported through ``tqdm``.
    """
    url = f"https://auto.ru/cars/{vendor}/all/?damage_group=ANY&customs_state_group=DOESNT_MATTER&output_type=models_list"
    tqdm.write(f"Getting models for {vendor}")
    pages, model_links = get_models_link(url)
    # Context managers close each bar even when a request raises mid-loop.
    with tqdm(total=pages) as pbar:
        pbar.update()  # page 1 was fetched above
        for page in range(2, pages + 1):
            # Random 1.00-5.00 s pause; kept fractional so the delay is not
            # truncated to whole seconds.
            time.sleep(random.randint(100, 500) / 100)
            _, links_to_add = get_models_link(url, page)
            model_links.extend(links_to_add)
            pbar.update()

    tqdm.write(f"Total models: {len(model_links)}")

    car_links = []
    with tqdm(total=len(model_links)) as pbar:
        for model_url in model_links:
            if "/sale/" in model_url:
                # The link already points at a single car.
                car_links.append(model_url)
            else:
                pages, links = get_cars_link(model_url)
                car_links.extend(links)
                for page in range(2, pages + 1):
                    time.sleep(random.randint(100, 500) / 100)
                    _, links_to_add = get_cars_link(model_url, page)
                    car_links.extend(links_to_add)
            pbar.update()

    tqdm.write(f"Total cars: {len(car_links)}")

    with tqdm(total=len(car_links)) as pbar:
        for car_url in car_links:
            # get_car_info is truthy only when it hit the network, so cached
            # pages are skipped without a pause.
            if get_car_info(car_url):
                time.sleep(random.randint(100, 500) / 100)
            pbar.update()


def get_cars_by_model(model_links):
    """Expand model listing links into individual car links.

    Args:
        model_links: Iterable-with-length of URLs. A URL containing
            ``"/sale/"`` is taken as a car link as-is; any other URL is
            treated as a listing and walked page by page via
            ``get_cars_link``.

    Returns:
        list: Car links in the order they were discovered.
    """
    tqdm.write(f"Total models: {len(model_links)}")
    car_links = []
    # The context manager closes the bar even when a request raises mid-loop.
    with tqdm(total=len(model_links)) as pbar:
        for model_url in model_links:
            if "/sale/" in model_url:
                # The link already points at a single car.
                car_links.append(model_url)
            else:
                pages, links = get_cars_link(model_url)
                car_links.extend(links)
                for page in range(2, pages + 1):
                    # Random 1.00-5.00 s pause; kept fractional so the delay
                    # is not truncated to whole seconds.
                    time.sleep(random.randint(100, 500) / 100)
                    _, links_to_add = get_cars_link(model_url, page)
                    car_links.extend(links_to_add)
            pbar.update()

    tqdm.write(f"Total cars: {len(car_links)}")
    return car_links

def get_models_by_vendor(vendor):
    """Collect the model links from every page of a vendor's listing.

    Args:
        vendor: Vendor slug interpolated into the auto.ru listing URL.

    Returns:
        list: Links gathered by ``get_models_link`` across all pages, in page
        order.
    """
    url = f"https://auto.ru/cars/{vendor}/all/?damage_group=ANY&customs_state_group=DOESNT_MATTER&output_type=models_list"
    tqdm.write(f"Getting models for {vendor}")
    pages, model_links = get_models_link(url)
    # The context manager closes the bar even when a request raises mid-loop.
    with tqdm(total=pages) as pbar:
        pbar.update()  # page 1 was fetched above
        for page in range(2, pages + 1):
            # Random 1.00-5.00 s pause; kept fractional so the delay is not
            # truncated to whole seconds.
            time.sleep(random.randint(100, 500) / 100)
            _, links_to_add = get_models_link(url, page)
            model_links.extend(links_to_add)
            pbar.update()

    tqdm.write(f"Total models: {len(model_links)}")

    return model_links

def get_cars_info(url_cars):
    """Download every car page in ``url_cars`` through ``get_car_info``.

    Args:
        url_cars: Iterable-with-length of car page URLs.

    Returns:
        None. Progress is reported through a ``tqdm`` bar.
    """
    tqdm.write(f"Total cars: {len(url_cars)}")
    # The context manager closes the bar even when a download raises mid-loop.
    with tqdm(total=len(url_cars)) as pbar:
        for url in url_cars:
            # get_car_info is truthy only when it hit the network, so cached
            # pages are skipped without a pause.
            if get_car_info(url):
                # Random 1.00-5.00 s pause; kept fractional so the delay is
                # not truncated to whole seconds.
                time.sleep(random.randint(100, 500) / 100)
            pbar.update()

In [4]:
# Collect the model listing links for the "audi" vendor (performs network requests).
url_models = get_models_by_vendor("audi")

Getting models for audi


  0%|          | 0/7 [00:00<?, ?it/s]

Total models: 224


In [14]:
# Expand each collected model link into its individual car links (performs network requests).
url_cars = get_cars_by_model(url_models)

Total models: 224


  0%|          | 0/224 [00:00<?, ?it/s]

Total cars: 9790


In [None]:
# Debug cell: fetch the Audi models listing and inspect the raw response.
url = "https://auto.ru/cars/audi/all/?damage_group=ANY&customs_state_group=DOESNT_MATTER&output_type=models_list"
# Reuse the shared helper so this check sends exactly the same headers and
# applies the same UTF-8 decoding as the scraper functions do.
response = get_html(url)
print(response.status_code)
print(response.text)


In [None]:
# Load one cached car page and extract the JSON state embedded in it.
# The file is read explicitly as UTF-8 rather than with the platform default
# encoding, matching how the response text was decoded when it was fetched.
with open("./raw_html/citroen/c5/1106547075-785b4b04.html", "r", encoding="utf-8") as file_:
    html_from_file = file_.read()

soup = BeautifulSoup(html_from_file, "html.parser")
state_tag = soup.find("script", id="initial-state", type="application/json")
if state_tag is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise ValueError("No <script id='initial-state'> tag found in the cached page")
ads = json.loads(state_tag.text)
pprint(ads)

In [299]:
# pprint(ads["card"]["vehicle_info"]["equipment"])
# pprint(ads["card"]["price_info"]["RUR"])