In [1]:
import requests
from bs4 import BeautifulSoup
import time
from pprint import pprint
from requests.structures import CaseInsensitiveDict
import json
from pathlib import Path
# from tqdm import tqdm
from tqdm.notebook import tqdm
import random




In [12]:
def get_pages(soup):
    """Return the number of result pages advertised by a listing page.

    Looks up the pagination container (the ``span`` carrying the
    ``ListingPagination__pages`` class string) and reads the
    ``span.Button__text`` elements inside it.

    Args:
        soup: Parsed page. Must provide ``find(name, class_=...)`` whose result
            provides ``find_all(name, class_=...)`` (e.g. a ``BeautifulSoup``
            object).

    Returns:
        int: The number on the last numeric page button, or ``1`` when the
        page has no pagination container or no numeric button inside it.
    """
    pagging = soup.find(
        "span",
        class_="ControlGroup ControlGroup_responsive_no ControlGroup_size_s ListingPagination__pages",
    )
    if pagging is None:
        return 1

    buttons = pagging.find_all("span", class_="Button__text")
    # Walk backwards so the last button still wins when it is numeric, while an
    # empty list or a trailing non-numeric button no longer raises.
    for button in reversed(buttons):
        try:
            return int(button.text)
        except (TypeError, ValueError):
            continue
    return 1


def get_html(url, timeout=30):
    """Fetch ``url`` with cache-busting headers and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block the whole scrape indefinitely.

    Returns:
        The ``requests`` response object with ``encoding`` forced to
        ``"utf8"``, so ``response.text`` is decoded as UTF-8. The status code
        is not checked here.
    """
    headers = CaseInsensitiveDict()
    headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
    headers["Pragma"] = "no-cache"
    headers["Expires"] = "0"
    headers["User-Agent"] = "Mozilla/5.0"
    response = requests.get(url, headers=headers, timeout=timeout)
    response.encoding = "utf8"
    return response


def get_models_link(url, page=1):
    """Fetch one page of a models listing and collect the model links on it.

    Args:
        url: Listing URL; ``&page=<page>`` is appended to it as-is.
        page: Page number to request (defaults to the first page).

    Returns:
        tuple: ``(total_pages, hrefs)`` where ``total_pages`` comes from
        ``get_pages`` and ``hrefs`` holds the ``href`` value of every matching
        anchor, in document order.
    """
    response = get_html(f"{url}&page={page}")
    page_soup = BeautifulSoup(response.text, "html.parser")

    anchors = page_soup.find_all(
        "a",
        class_="Link ListingItemTitle__link ListingItemGroup__link",
    )
    hrefs = [anchor.get("href") for anchor in anchors]

    return get_pages(page_soup), hrefs


def get_cars_link(url, page=1):
    """Fetch one page of a car listing and collect the car links on it.

    Args:
        url: Listing URL; ``&page=<page>`` is appended to it as-is.
        page: Page number to request (defaults to the first page).

    Returns:
        tuple: ``(total_pages, hrefs)`` where ``total_pages`` comes from
        ``get_pages`` and ``hrefs`` holds the ``href`` value of every matching
        anchor, in document order.
    """
    response = get_html(f"{url}&page={page}")
    page_soup = BeautifulSoup(response.text, "html.parser")

    anchors = page_soup.find_all("a", class_="Link ListingItemTitle__link")
    hrefs = [anchor.get("href") for anchor in anchors]

    return get_pages(page_soup), hrefs


def get_car_info(url):
    """Download a car listing page into the local ``raw_html`` cache.

    The page is stored as ``raw_html/<vendor>/<model>/<car_id>.html`` under the
    current working directory. Nothing is requested when that file already
    exists.

    Args:
        url: Listing URL. Vendor and model are taken from path segments 6 and 7
            of ``url.split("/")`` and the id from the second-to-last segment,
            which matches URLs such as
            ``https://auto.ru/cars/used/sale/<vendor>/<model>/<id>/``.

    Returns:
        bool: ``True`` when the page was downloaded and written, ``False`` when
        the cached file was already present.
    """
    url_to_list = url.split("/")
    car_id = url_to_list[-2]
    car_vendor = url_to_list[6]
    car_model = url_to_list[7]

    file_dir = Path("./").absolute() / "raw_html" / car_vendor / car_model
    file_dir.mkdir(exist_ok=True, parents=True)
    filename = file_dir / f"{car_id}.html"
    if filename.is_file():
        return False

    raw_html = get_html(url)
    # get_html decodes the body as UTF-8, so write it back as UTF-8 instead of
    # relying on the platform default encoding.
    with open(filename, "w", encoding="utf-8") as file_:
        file_.write(raw_html.text)
    return True
    


In [40]:
def get_cars_by_vendor(vendor):
    """Collect every car listing of ``vendor`` and cache each listing page.

    Runs three stages, each with its own progress bar:

    1. walk all pages of the vendor's models list (``get_models_link``);
    2. turn every model link into car links (``get_cars_link``); a link that
       already contains ``"/sale/"`` is taken as a car link directly;
    3. download each car page with ``get_car_info``.

    Args:
        vendor: Vendor slug inserted into the auto.ru listing URL.

    Returns:
        None. The result is the set of files written by ``get_car_info``.
    """
    url = f"https://auto.ru/cars/{vendor}/all/?damage_group=ANY&customs_state_group=DOESNT_MATTER&output_type=models_list"
    tqdm.write(f"Getting models for {vendor}")
    pages, model_links = get_models_link(url)
    # Context managers close each bar even when a request raises.
    with tqdm(total=pages) as pbar:
        pbar.update()  # the first page was fetched above
        for page in range(2, pages + 1):
            # int() truncates, so the pause is a whole number of seconds (1-5).
            time.sleep(int(random.randint(100, 500) / 100))
            _, links_to_add = get_models_link(url, page)
            model_links.extend(links_to_add)
            pbar.update()

    tqdm.write(f"Total models: {len(model_links)}")

    car_links = []
    with tqdm(total=len(model_links)) as pbar:
        for model_url in model_links:
            if "/sale/" in model_url:
                car_links.append(model_url)
            else:
                model_pages, links = get_cars_link(model_url)
                car_links.extend(links)
                for page in range(2, model_pages + 1):
                    time.sleep(int(random.randint(100, 500) / 100))
                    _, links_to_add = get_cars_link(model_url, page)
                    car_links.extend(links_to_add)
            pbar.update()

    tqdm.write(f"Total cars: {len(car_links)}")

    with tqdm(total=len(car_links)) as pbar:
        for car_url in car_links:
            # Pause only after an actual download; cached pages cost no request.
            if get_car_info(car_url):
                time.sleep(int(random.randint(100, 500) / 100))
            pbar.update()


def get_cars_by_model(model_links):
    """Expand model-listing links into individual car-listing links.

    Args:
        model_links: Sized collection of URLs. A URL containing ``"/sale/"`` is
            kept as a car link; any other URL is treated as a paginated
            listing and walked with ``get_cars_link``.

    Returns:
        list: Car links in the order they were found.
    """
    tqdm.write(f"Total models: {len(model_links)}")
    car_links = []
    # The context manager closes the bar even when a request raises.
    with tqdm(total=len(model_links)) as pbar:
        for model_url in model_links:
            if "/sale/" in model_url:
                car_links.append(model_url)
            else:
                pages, links = get_cars_link(model_url)
                car_links.extend(links)
                for page in range(2, pages + 1):
                    # int() truncates, so the pause is a whole number of seconds (1-5).
                    time.sleep(int(random.randint(100, 500) / 100))
                    _, links_to_add = get_cars_link(model_url, page)
                    car_links.extend(links_to_add)
            pbar.update()

    tqdm.write(f"Total cars: {len(car_links)}")
    return car_links

def get_models_by_vendor(vendor):
    """Collect the model-listing links of ``vendor`` across all result pages.

    Args:
        vendor: Vendor slug inserted into the auto.ru listing URL.

    Returns:
        list: Link values gathered by ``get_models_link`` from every page, in
        page order.
    """
    url = f"https://auto.ru/cars/{vendor}/all/?damage_group=ANY&customs_state_group=DOESNT_MATTER&output_type=models_list"
    tqdm.write(f"Getting models for {vendor}")
    pages, model_links = get_models_link(url)
    # The context manager closes the bar even when a request raises.
    with tqdm(total=pages) as pbar:
        pbar.update()  # the first page was fetched above
        for page in range(2, pages + 1):
            # No pause between page requests here; the throttle is disabled:
            # time.sleep(int(random.randint(100,500)/100))
            _, links_to_add = get_models_link(url, page)
            model_links.extend(links_to_add)
            pbar.update()

    tqdm.write(f"Total models: {len(model_links)}")

    return model_links

def get_cars_info(url_cars):
    """Download and cache the page of every car link in ``url_cars``.

    Args:
        url_cars: Sized collection of car-listing URLs, each passed to
            ``get_car_info``.

    Returns:
        None. The result is the set of files written by ``get_car_info``.
    """
    tqdm.write(f"Total cars: {len(url_cars)}")
    # The context manager closes the bar even when a download raises.
    with tqdm(total=len(url_cars)) as pbar:
        for car_url in url_cars:
            get_car_info(car_url)
            # No pause between downloads here; the throttle is disabled:
            # if get_car_info(url):
            #     time.sleep(int(random.randint(100,200)/100))
            pbar.update()

In [100]:
# Collect the model-listing links for one vendor; the commented line is an
# alternative vendor kept for quick switching.
# url_models = get_models_by_vendor("mercedes")
url_models = get_models_by_vendor("citroen")

Getting models for citroen


  0%|          | 0/3 [00:00<?, ?it/s]

Total models: 85


In [101]:
# Expand only the first model link into its car-listing links.
url_cars = get_cars_by_model([url_models[0]])

Total models: 1


  0%|          | 0/1 [00:00<?, ?it/s]

Total cars: 45


In [102]:
# First collected car-listing URL.
url_cars[0]

'https://auto.ru/cars/used/sale/citroen/c3/1105641590-8468dca7/'

In [68]:
# First collected model-listing URL.
url_models[0]

'https://auto.ru/cars/citroen/c5/4601326/4601327/used/?damage_group=ANY&customs_state_group=DOESNT_MATTER&only_official=false&in_stock=ANY_STOCK&output_type=list'

In [57]:
# All car-listing URLs currently held in url_cars.
url_cars

['https://auto.ru/cars/used/sale/mercedes/s_klasse/1105630711-eaeadc05/',
 'https://auto.ru/cars/used/sale/mercedes/s_klasse/1106178333-8766ae3c/',
 'https://auto.ru/cars/used/sale/mercedes/s_klasse/1105672747-4965b13c/',
 'https://auto.ru/cars/used/sale/mercedes/s_klasse/1106413066-f496770e/']

In [None]:
# Download and cache the page behind every URL in url_cars; pages whose file
# already exists on disk are skipped by get_car_info.
get_cars_info(url_cars)

In [None]:
# Manual probe: request a vendor's models-list page with a custom User-Agent
# and print the status code and body for inspection.
url = "https://auto.ru/cars/ford/all/?damage_group=ANY&customs_state_group=DOESNT_MATTER&output_type=models_list"
# url = url_models[0]
headers = CaseInsensitiveDict()
headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
headers["Pragma"] = "no-cache"
headers["Expires"] = "0"
headers["User-Agent"] = "My agent"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
response.encoding = "utf8"
print(response.status_code)
print(response.text)


In [None]:
# Load one cached listing page and pretty-print the JSON embedded in its
# <script id="initial-state"> tag.
# Read explicitly as UTF-8 instead of the platform default encoding.
with open("./raw_html/citroen/c5/1106547075-785b4b04.html", "r", encoding="utf-8") as file_:
    html_from_file = file_.read()

soup = BeautifulSoup(html_from_file, "html.parser")
state_tag = soup.find("script", id="initial-state", type="application/json")
if state_tag is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError('no <script id="initial-state" type="application/json"> tag in the cached page')
ads = json.loads(state_tag.text)
pprint(ads)

In [299]:
# pprint(ads["card"]["vehicle_info"]["equipment"])
# pprint(ads["card"]["price_info"]["RUR"])

In [70]:
# Request the first model-listing URL through a requests.Session with its
# default headers.
session = requests.Session()
# A timeout keeps the cell from hanging forever on a stalled connection.
response = session.get(url_models[0], timeout=30)


In [None]:
# Show the status code and the UTF-8 decoded body of the current `response`.
response.encoding = "utf8"
print(response.status_code, response.text, sep="\n")


In [None]:
# Same request issued through urllib3, reusing the `headers` mapping built in
# an earlier cell; the decoded body is the cell's displayed value.
import urllib3

http = urllib3.PoolManager()
r = http.request(method="GET", url=url_models[0], headers=headers)
r.data.decode("utf-8")

In [89]:
# First collected model-listing URL.
url_models[0]

'https://auto.ru/cars/citroen/c5/4601326/4601327/used/?damage_group=ANY&customs_state_group=DOESNT_MATTER&only_official=false&in_stock=ANY_STOCK&output_type=list'

In [90]:
# Fetch the same listing path with httpx, using output_type=table in the query.
import httpx

r = httpx.get(
    "https://auto.ru/cars/citroen/c5/4601326/4601327/used/"
    "?damage_group=ANY&customs_state_group=DOESNT_MATTER"
    "&only_official=false&in_stock=ANY_STOCK&output_type=table"
)


In [91]:
# Body text of the response currently held in `r`.
r.text

'<!doctype html><html lang="ru" data-reactroot=""><head><link as="script" rel="preload" href="https://auto.ru/_crpd/1t4WqX778/8d4672MzL/dTOAG7yh2zN0WP8LRswr3ki_vEERGKWi-mbhJvtEVQYbi-7xrMIvq48pJXbGTABzM6Ieu7jGB94ueKnFVJBwP513mSK8pfdjxPX7uzKFzyKMNoQBtfiNkyNzIIeOlVTet4v2qZ02I7v7KIXCIpGwe1uTIqPkWjeKLljFqai4vKcR0632Eb0n_4AE0xP-8a8OgNYYUSUU5TMHdzDbpmkwgsODKqVx8vffu5Al8m4d7AvbM-7H8LK3vvaQ_WWklDw7YdtI" nonce="BZMEOI9SdeDK4U2TTUJ2Kg=="/><meta charSet="utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=edge"/><meta name="format-detection" content="telephone=no"/><meta name="theme-color" content="#fff"/><title>Купить Citroen C5 I Рестайлинг с пробегом лифтбек по цене от 150\xa0000\xa0рублей - более 40 Ситроен C5 I Рестайлинг б/у в кузове лифтбек на Авто.ру</title><meta property="og:title" content="Смотрите, что нашлось на Авто.ру: Citroen C5 I Рестайлинг с пробегом лифтбек по цене от 150\xa0000\xa0рублей - более 40 объявлений"/><meta property="twitter:title" content="Смотрите, что нашлось на 