In [306]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from nesteddictionary import NestedDict 
# from nested_dict import nested_dict

from tqdm.notebook import tqdm

from bs4 import BeautifulSoup  as bs
from pprint import pprint  
import requests  
import json
from multiprocessing import Pool  #предоставляет возможность параллельных процессов
from threading import Thread
from joblib import Parallel, delayed
import itertools

In [307]:
max_colwidth = 1000
pd.set_option("display.max_colwidth", max_colwidth)

In [308]:
!pip freeze > requirements.txt
RANDOM_SEED = 42

# Собираем датасет - парсим AUTO.RU

In [309]:
global df
df = []

def parse_car_page(url):


    def text_rem(tag, rem_text):
        try:
            return tag.text.replace(rem_text, '')
        except Exception:
            return np.nan


    headers = {'User-Agent':'Chrome/47.0.2526.83'}
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    soup = bs(response.text, 'html.parser')
    

    try:
        color = soup.find('li', class_='CardInfoRow CardInfoRow_color').a.text
    except Exception:
        color = np.nan

    engineDisplacement = soup.find('meta', itemprop="engineDisplacement").get('content', np.nan)
    fuelType = soup.find('meta', itemprop="fuelType").get('content', np.nan)
    modelDate = soup.find('meta', itemprop="modelDate").get('content', np.nan)
    numberOfDoors = soup.find('meta', itemprop="numberOfDoors").get('content', np.nan)
    priceCurrency = soup.find('meta', itemprop="priceCurrency").get('content', np.nan)
    bodyType = soup.find('meta', itemprop="bodyType").get('content', np.nan)
    vehicleTransmission = soup.find('meta', itemprop="vehicleTransmission").get('content', np.nan)

    sell_id_tag = soup.find('div', title="Идентификатор объявления")#.text.replace('№ ', '')
    sell_id = text_rem(sell_id_tag, '№ ')

    owners_tag = soup.find('li', class_="CardInfoRow CardInfoRow_ownersCount")#.text.replace('Владельцы', '')
    owners = text_rem(owners_tag, 'Владельцы')

    pts_tag = soup.find('li', class_="CardInfoRow CardInfoRow_pts")#.text.replace('ПТС', '')
    pts = text_rem(pts_tag, 'ПТС')

    owning_tag = soup.find('li', class_="CardInfoRow CardInfoRow_owningTime")#.text.replace('Владение', '')
    owning = text_rem(owning_tag, 'Владение')
    

    privod_tag = soup.find('li', class_="CardInfoRow CardInfoRow_drive")#.text.replace('Привод', '')
    privod = text_rem(privod_tag, 'Привод')

    rul_tag = soup.find('li', class_="CardInfoRow CardInfoRow_wheel")#.text.replace('Руль', '')
    rul = text_rem(rul_tag, 'Руль')

    state_tag = soup.find('li', class_="CardInfoRow CardInfoRow_state")#.text.replace('Состояние', '')
    state = text_rem(state_tag, 'Состояние')

    customs_tag = soup.find('li', class_="CardInfoRow CardInfoRow_customs")#.text.replace('Таможня', '')
    customs = text_rem(customs_tag, 'Таможня')

    s1 = soup.find('div', id='app').find('div', id="sale-data-attributes").get('data-bem')#['data-bem']
    d1 = json.loads(s1).get('sale-data-attributes')#['sale-data-attributes']
    brand = d1.get('mark', np.nan)#['mark']
    model_name = d1.get('model', np.nan)#['model']
    productionDate = d1.get('year', np.nan)#['year']
    enginePower = d1.get('power', np.nan)#['power']
    # image = d1.get('image', np.nan)#['image']
    mileage = d1.get('km-age', np.nan)#['km-age']

    s2 = soup.find('script', id="initial-state").string
    d2 = json.loads(s2)

    # nested_dict = NestedDict( d2 )
    # way_to_equip = nested_dict.findall('equipment')
    
    # equipment_dict = dict(nested_dict[way_to_equip[0]])
    try: equipment_dict = d2['card']['vehicle_info']['equipment']
    except: equipment_dict = np.nan

    s3 = soup.find('div', id='sale-data-attributes')['data-bem']
    d3 = json.loads(s1)['sale-data-attributes']
    price = d3.get('price', np.nan)#d3['price']


    # way_to_super_gen = nested_dict.findall('tech_param')
    # super_gen = dict(nested_dict[way_to_super_gen[0]])
    try: super_gen = d2['card']['vehicle_info']['tech_param']
    except: super_gen = np.nan #nested_dict[way_to_super_gen]


    #return 
    car_params = {'brand': brand, 
                 'model_name': model_name, 
                 'productionDate': productionDate,
                 'enginePower':enginePower,
                #  'image': image,
                 'mileage': mileage,
                 'equipment_dict': equipment_dict,
                 'bodyType': bodyType,
                 'color': color,
                 'engineDisplacement': engineDisplacement,
                 'fuelType': fuelType,
                 'modelDate': modelDate,
                 'numberOfDoors': numberOfDoors,
                 'priceCurrency': priceCurrency,
                 'sell_id': sell_id,
                 'super_gen': super_gen,
                 'vehicleTransmission': vehicleTransmission,
                 'Владельцы': owners,
                 'ПТС': pts,
                 'Владение': owning,
                 'Привод': privod,
                 'Руль': rul,
                 'Состояние': state,
                 'Таможня': customs,
                 'price': price
                 }

    return car_params
    # df.append(car_params)

def parse_catalog_pages(url_catalog):
    response = requests.get(url_catalog)
    response.encoding = 'utf-8'
    soup = bs(response.text, 'html.parser')
    link_tags = soup.find_all('a', class_='Link ListingItemTitle-module__link')
    links = [link['href'] for link in link_tags]

    return links

def find_last_page(first_page_url):
    response = requests.get(first_page_url)
    response.encoding = 'utf-8'
    soup = bs(response.text, 'html.parser')
    all_pages = soup.find('span', class_='ControlGroup ControlGroup_responsive_no ControlGroup_size_s ListingPagination-module__pages')#.text
    page_buttoms = all_pages.find_all('span', class_='Button__text')

    return int(page_buttoms[-1].text)


def pages_list(pages_url):
    pages_list = []
    try: last_page = find_last_page(pages_url)
    except: return [pages_url]
    for page_i in range(1, last_page + 1):
        pages_list.append(f'{pages_url}?page={str(page_i)}&output_type=list')
    return pages_list



def collect_marks_links_list():
    marks_list = []
    catalog_url = 'https://auto.ru/catalog/cars/'
    response = requests.get(catalog_url)
    response.encoding = 'utf-8'
    soup = bs(response.text, 'html.parser')
    all_marks_tags = soup.find('div', class_='search-form-v2-list search-form-v2-list_invisible search-form-v2-list_type_all clearfix')
    marks_list_by_columns = all_marks_tags.find_all('div', class_='search-form-v2-list__text-item')
    marks_links_list = [mark.a['href'].split('/')[-2] for mark in marks_list_by_columns]
    return marks_links_list

def generate_all_pages_list():
    all_pages_list = []
    for mark in collect_marks_links_list():
        for year in range(1960, 2021):
            all_pages_list.append(f'https://auto.ru/moskva/cars/{mark}/{year}-year/all/')

    return all_pages_list

In [310]:
all_pages_list = generate_all_pages_list()

In [311]:
all_pages_list[14697]

'https://auto.ru/moskva/cars/volkswagen/2017-year/all/'

In [312]:
# pages_url = 'https://auto.ru/moskva/cars/all/?output_type=list&page='
# pages_url = 'https://auto.ru/moskva/cars/acura/1998-year/all/'
# threads = []
n_jobs = 50
car_urls = []
car_pages_lists_urls = []

# all_pages_list = generate_all_pages_list()
# print(all_pages_list)
# for page_link in tqdm(all_pages_list[14695:14700:]):
    # print(page_link)
car_pages_lists_urls = Parallel(n_jobs=n_jobs)(delayed(pages_list)(page_link) for page_link in tqdm(all_pages_list))
car_pages_lists_urls = list(itertools.chain(*car_pages_lists_urls))






# for page_link in tqdm(pages_list(pages_url)):
# for page_link in tqdm(all_pages_list[14680:14700:]):
    # for car_link in parse_catalog_pages(page_link):
        # x = Thread(target=parse_car_page, args=[car_link])
        # Parallel(n_jobs = 8)(delayed(parse_car_page)(car_link) for url in tqdm(car_link))
        # threads.append(x)
        # x.start()

HBox(children=(FloatProgress(value=0.0, max=16653.0), HTML(value='')))






In [313]:
full_cars_links = Parallel(n_jobs=n_jobs)(delayed(parse_catalog_pages)(page_link) for page_link in tqdm(car_pages_lists_urls))


HBox(children=(FloatProgress(value=0.0, max=18468.0), HTML(value='')))




In [None]:
full_cars_links = list(itertools.chain(*full_cars_links))

In [None]:
print(full_cars_links[102:110][0])
print(len(full_cars_links))
# # np.unique(np.unique(full_cars_links, return_counts=True)[1], return_counts=True)
# # ser = pd.Series(full_cars_links)
# # ser.value_counts()
# def chunkIt(seq, num):
#     avg = len(seq) / float(num)
#     out = []
#     last = 0.0
#     while last < len(seq):
#         out.append(seq[int(last):int(last + avg)])
#         last += avg
#     return out

# chunked_cars_links = chunkIt(full_cars_links, 8)
# len(chunked_cars_links)

In [None]:
full_cars_links[10750]

In [283]:
full_cars_list = []

# for chunked_cars in tqdm(chunked_cars_links):
#     full_cars_dict.append(Parallel(n_jobs=n_jobs)(delayed(parse_car_page)(page_link) for page_link in tqdm(chunked_cars)))

# full_cars_dict = list(itertools.chain(*full_cars_dict))
# Parallel(n_jobs=32)(delayed(parse_car_page)(page_link) for page_link in tqdm(full_cars_links))

# for page_link in tqdm(all_pages_list[14680:14700:]):
#     for car_link in parse_catalog_pages(page_link):
#         x = Thread(target=parse_car_page, args=[car_link])
#         Parallel(n_jobs = 8)(delayed(parse_car_page)(car_link) for url in tqdm(car_link))
#         threads.append(x)
#         x.start()

# ur = 'https://auto.ru/cars/used/sale/honda/civic/1097454742-c7416dcb/'
# ur = 'https://auto.ru/cars/used/sale/chery/tiggo/1101446096-1b649488/'
# parse_car_page(ur)
# response = requests.get(ur)
# response.encoding = 'utf-8'
# soup = bs(response.text, 'html.parser')
# sell_id = soup.find('div', title="Идентификатор объявления").text.replace('№ ', '')
full_cars_list += (Parallel(n_jobs=n_jobs)(delayed(parse_car_page)(page_link) for page_link in tqdm(full_cars_links)))
# len(chunked_cars_links[1][10800:11100])

HBox(children=(FloatProgress(value=0.0, max=88997.0), HTML(value='')))

AttributeError: 'NoneType' object has no attribute 'text'

In [304]:
# data = pd.DataFrame(full_cars_list)
# data
# full_cars_list[0]['super_gen']
# for keyval in full_cars_list[0]['equipment_dict']:
#     print(keyval)
# full_cars_list += (Parallel(n_jobs=n_jobs)(delayed(parse_car_page)(page_link) for page_link in tqdm(full_cars_links[33474:33475])))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




AttributeError: 'NoneType' object has no attribute 'text'

In [305]:
full_cars_links[33474:33475]

['https://auto.ru/cars/new/group/kia/soul/21551907/21552208/1100903374-39f3a2e7/']

In [None]:
# ur = 'https://auto.ru/cars/used/sale/honda/civic/1097454742-c7416dcb/'
# ur1 = 'https://auto.ru/cars/used/sale/chery/tiggo/1101446096-1b649488/'
# abba = []
# abba.append(parse_car_page(ur))
# # abba.append(parse_car_page(ur1))
# # data = pd.DataFrame(abba)
# # data#['super_gen']
# abba

In [None]:
# chunked_cars_links[1][10715:10720]

In [None]:
# len(full_cars_dict)
# parse_car_page(full_cars_links[0])
# df

# https://auto.ru/moskva/cars/volkswagen/2017-year/all/?page=1&output_type=list

In [281]:
data = pd.DataFrame(full_cars_dict)
data.info()
# print(collect_marks_links_list())
full_cars_list[1]

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrame

{'brand': 'HAVAL',
 'model_name': 'F7',
 'productionDate': 2020,
 'enginePower': 150,
 'mileage': 0,
 'equipment_dict': {'cruise-control': True,
  'asr': True,
  'esp': True,
  'usb': True,
  'multi-wheel': True,
  'audiosystem-tv': True,
  'airbag-passenger': True,
  'front-centre-armrest': True,
  'apple-carplay': True,
  'roof-rails': True,
  'bas': True,
  'lock': True,
  'rear-camera': True,
  'servo': True,
  'electro-mirrors': True,
  'drl': True,
  'steering-wheel-gear-shift-paddles': True,
  'electro-window-back': True,
  'mirrors-heat': True,
  '17-inch-wheels': True,
  'dark-interior': True,
  'rear-seats-heat': True,
  'wheel-heat': True,
  'led-lights': True,
  'park-assist-r': True,
  'glonass': True,
  'airbag-driver': True,
  'isofix': True,
  'aux': True,
  'electro-window-front': True,
  'drive-mode-sys': True,
  'light-sensor': True,
  'hcc': True,
  'automatic-lighting-control': True,
  'passenger-seat-updown': True,
  'computer': True,
  'keyless-entry': True,
  's

In [None]:
# # mark_url = 'https://auto.ru/catalog/cars/uaz/'
# mark_url_name = 'ac'
          
# def collect_models_links_list(mark_url_name):
#     models_links_list = []
#     mark_url = f'https://auto.ru/catalog/cars/{mark_url_name}/'
#     response = requests.get(mark_url)
#     response.encoding = 'utf-8'
#     soup = bs(response.text, 'html.parser')
#     all_models_tags = soup.find('div', class_='search-form-v2-list search-form-v2-list_invisible search-form-v2-list_type_all clearfix')
#     if all_models_tags==None:
#         all_models_tags = soup.find('div', class_='search-form-v2-list search-form-v2-list_type_popular clearfix')
#     models_list_by_columns = all_models_tags.find_all('div', class_='search-form-v2-list__text-item')
#     models_links_list = [mark.a['href'].split('/')[-2] for mark in models_list_by_columns]
#     return models_links_list
# all_models_links = []
# marks_links_list = collect_marks_links_list()

# for mark in tqdm(marks_links_list):
#     x = Thread(target=collect_models_links_list, args=[mark])
#     threads.append(x)
#     x.start()
#     all_models_links.append(collect_models_links_list(mark))

# all_models_links

# # print(collect_models_links_list(mark_url_name))

In [None]:
# def collect_marks():
#     marks_list = []
#     catalog_url = 'https://auto.ru/catalog/cars/bmw/'
#     response = requests.get(catalog_url)
#     response.encoding = 'utf-8'
#     soup = bs(response.text, 'html.parser')
#     all_marks_tags = soup.find('div', class_='search-form-v2-list search-form-v2-list_invisible search-form-v2-list_type_all clearfix')
#     marks_list_by_columns = all_marks_tags.find_all('div', class_='search-form-v2-list__text-item')
#     marks_list.append([mark.text for mark in marks_list_by_columns])
#     return marks_list
# print(collect_marks())

In [None]:
# /html/body/div[4]/div[2]/div[2]/div/div/form/div[1]/div[4]/div/div[3]

In [None]:
len(df)

In [None]:
# data = pd.DataFrame(df)
# data.to_csv(r'parsed_data.csv', index=False)

In [None]:
# l = list(test.columns)
# l.remove('model_name')
# l.remove('brand')
# l.remove('productionDate')
# l.remove('enginePower')
# l.remove('image')
# l.remove('mileage')
# l.remove('equipment_dict')
# l.remove('bodyType')
# l.remove('color')
# l.remove('engineDisplacement')
# l.remove('fuelType')
# l.remove('modelDate')
# l.remove('numberOfDoors')
# l.remove('priceCurrency')
# l.remove('sell_id')
# l.remove('super_gen')
# l.remove('vehicleTransmission')
# l.remove('Владельцы')
# l.remove('ПТС')
# l.remove('Владение')
# l.remove('Привод')
# l.remove('Руль')
# l.remove('Состояние')
# l.remove('Таможня')
# l.remove('Состояние')
# l.remove('Состояние')
# l.remove('Состояние')
# l

In [None]:
# def get_page(url):
#   page = requests.get(url)
#   if page.status_code != 200:
#     raise BaseException("response code " + str(page.status_code))
#   return page
# pages = Parallel(n_jobs = 5)(delayed(get_page)(url) for url in tqdm(urls))