In [1]:
import json
import requests
import time
from pprint import pprint
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm
from bs4 import BeautifulSoup

import warnings
warnings.simplefilter('ignore')

from config import CONFIG

### Загрузка ссылок на резюме

In [5]:
# Collect resume links from the search results, one results page per request.
vacancy_links = []
driver = webdriver.Chrome()
url = 'https://spb.hh.ru/search/resume?clusters=True&area=2&ored_clusters=True&order_by=relevance&items_on_page=50&logic=normal&pos=full_text&exp_period=all_time&exp_company_size=any&exp_industry=any&st=resumeSearch&text=Web+developer&from=suggest_post&page='

try:
    # NOTE(review): pages 1..100 are requested; if the site's `page` parameter
    # is zero-based, the first results page is skipped -- confirm.
    for i in tqdm(range(1, 101)):
        driver.get(url + str(i))
        page = driver.page_source
        soup = BeautifulSoup(page)
        vacancies = (
            soup
            .find_all('a', {'class': 'resume-search-item__name'}, href=True)
        )
        # Keep only anchors whose href actually points at a resume.
        links = [vacancy['href'] for vacancy in vacancies if 'resume' in vacancy['href']]
        vacancy_links.extend(links)
        # Short pause between requests.
        time.sleep(0.1)
finally:
    # Always end the browser session, even if a page load or parse fails
    # part-way through; links gathered so far stay in `vacancy_links`.
    driver.quit()

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [09:42<00:00,  4.89s/it]


0


In [8]:
# Drop duplicate links but keep first-seen order, so the saved file (and any
# later crawl that iterates over it) is the same from run to run.
# A plain set() would return the links in an arbitrary order.
vacancy_links = list(dict.fromkeys(vacancy_links))
with open(f'{CONFIG.DATA_FOLDER}/hh_cv_links.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(vacancy_links))

### Парсинг страницы резюме

In [None]:
def clean_text(word):
    """Normalise whitespace in a piece of scraped text.

    Every run of whitespace (spaces, tabs, line breaks, carriage returns,
    non-breaking spaces, ...) is collapsed into a single space, and leading
    and trailing whitespace is removed.

    Args:
        word: The string to normalise.

    Returns:
        The normalised string; an empty string if ``word`` contains only
        whitespace.
    """
    # str.split() with no arguments already splits on all Unicode whitespace,
    # including '\xa0' and '\r', so no explicit replace() calls are needed.
    # Literal backslash sequences in the text are left untouched.
    return ' '.join(word.split())

def _find_clean_text(node, tag, attrs):
    """Return the cleaned text of the first ``tag`` under ``node`` matching ``attrs``, or ``None`` if absent."""
    match = node.find(tag, attrs)
    if match is None:
        return None
    return clean_text(match.text)


def parse_cv(page):
    """Extract candidate fields from the HTML of a single resume page.

    Every field is optional: a key is added to the result only when the
    corresponding element is found in the markup.

    Args:
        page: HTML source of the resume page as a string.

    Returns:
        A dict that may contain the keys ``gender``, ``age``, ``city``,
        ``position_name``, ``salary``, ``spec_cat``, ``skills``,
        ``driver_exp``, ``education_degree``, ``aboutme_info``,
        ``education_list`` (list of dicts with ``year``/``name``/``type``),
        ``experience_list`` (list of dicts with ``time``/``company_name``/
        ``company_industry``/``title``/``description``) and ``languages``
        (list of strings).
    """
    candidate = dict()
    # No parser is named here, so bs4 chooses one from those installed.
    soup = BeautifulSoup(page)

    # Single-element fields: (result key, tag name, attribute filter).
    simple_fields = (
        ('gender', 'span', {'data-qa': 'resume-personal-gender'}),
        ('age', 'span', {'data-qa': 'resume-personal-age'}),
        ('city', 'span', {'data-qa': 'resume-personal-address'}),
        ('position_name', 'span', {'class': 'resume-block__title-text'}),
        ('salary', 'span', {'class': 'resume-block__salary'}),
        ('spec_cat', 'span', {'data-qa': 'resume-block-specialization-category'}),
    )
    for key, tag, attrs in simple_fields:
        value = _find_clean_text(soup, tag, attrs)
        if value is not None:
            candidate[key] = value

    skills_table = soup.find('div', {'data-qa': 'skills-table'})
    if skills_table is not None:
        skills = skills_table.find_all('span')
        # Only every second <span> (indices 1, 3, 5, ...) is used.
        candidate['skills'] = ' '.join([skill.text for skill in skills[1::2]])

    driver_block = soup.find('div', {'data-qa': 'resume-block-driver-experience'})
    if driver_block is not None:
        value = _find_clean_text(driver_block, 'div', {'class': 'resume-block-item-gap'})
        if value is not None:
            candidate['driver_exp'] = value

    education_block = soup.find('div', {'data-qa': 'resume-block-education'})
    if education_block is not None:
        degree = education_block.find('span', {'class': 'resume-block__title-text'})
        if degree is not None:
            # Stored as-is (not whitespace-normalised).
            candidate['education_degree'] = degree.text

    about_block = soup.find('div', {'data-qa': 'resume-block-skills'})
    if about_block is not None:
        about = about_block.find('div', {'class': 'resume-block-item-gap'})
        if about is not None:
            # Stored as-is (not whitespace-normalised).
            candidate['aboutme_info'] = about.text

    if education_block is not None:
        education = []
        for item in education_block.find_all('div', {'class': 'resume-block-item-gap'}):
            edu_item = dict()
            edu_fields = (
                ('year', {'class': 'bloko-column'}),
                ('name', {'data-qa': 'resume-block-education-name'}),
                ('type', {'data-qa': 'resume-block-education-organization'}),
            )
            for key, attrs in edu_fields:
                value = _find_clean_text(item, 'div', attrs)
                if value is not None:
                    edu_item[key] = value
            education.append(edu_item)
        candidate['education_list'] = education

    experience_block = soup.find('div', {'data-qa': 'resume-block-experience'})
    if experience_block is not None:
        # Individual jobs are looked up inside the first item-gap container.
        jobs_container = experience_block.find('div', {'class': 'resume-block-item-gap'})
        if jobs_container is not None:
            exp = []
            for item in jobs_container.find_all('div', {'class': 'resume-block-item-gap'}):
                exp_item = dict()

                exp_time = item.find('div', {'class': 'bloko-column'})
                if exp_time is not None:
                    exp_item['time'] = clean_text(exp_time.text)
                    duration = exp_time.find('div', {'class': 'bloko-text-tertiary'})
                    if duration is not None:
                        # Strip the nested duration text so only the date range stays.
                        exp_item['time'] = exp_item['time'].replace(clean_text(duration.text), '')

                value = _find_clean_text(item, 'div', {'class': 'bloko-text-emphasis'})
                if value is not None:
                    exp_item['company_name'] = value

                exp_ind = item.find('div', {'class': 'resume-block__experience-industries'})
                if exp_ind is not None:
                    # Look up the <span> once and reuse it; the previous check
                    # searched for a misspelled tag name and never matched.
                    industry = exp_ind.find('span')
                    if industry is not None:
                        exp_item['company_industry'] = clean_text(industry.text)

                value = _find_clean_text(item, 'div', {'data-qa': 'resume-block-experience-position'})
                if value is not None:
                    exp_item['title'] = value

                value = _find_clean_text(item, 'div', {'data-qa': 'resume-block-experience-description'})
                if value is not None:
                    exp_item['description'] = value

                exp.append(exp_item)
            candidate['experience_list'] = exp

    languages_block = soup.find('div', {'data-qa': 'resume-block-languages'})
    if languages_block is not None:
        langs = languages_block.find_all('p', {'data-qa': 'resume-block-language-item'})
        candidate['languages'] = [lang.text for lang in langs]

    return candidate


# Quick manual check of the parser: parse one page and pretty-print the result.
# NOTE(review): `page` is not assigned in this cell -- it is whatever an earlier
# cell left in the kernel, so the output depends on execution order.
candidate = parse_cv(page)
pprint(candidate)

### Скачивание резюме

In [45]:
# Download every collected resume page and parse it into a dict.
cvs = []
driver = webdriver.Chrome()

try:
    for vacancy_link in tqdm(vacancy_links):
        # NOTE(review): `site_name` is not defined in any visible cell; it must
        # already exist in the kernel (presumably the site's base URL) -- confirm
        # and define it explicitly so the notebook survives Restart & Run All.
        current_link = site_name + vacancy_link
        driver.get(current_link)
        page = driver.page_source
        cvs.append(parse_cv(page))
        # Short pause between requests.
        time.sleep(0.3)
finally:
    # Always end the browser session, even if the crawl fails part-way
    # through; resumes parsed so far stay available in `cvs`.
    driver.quit()

100%|████████████████████████████████████████████████████████████████████████████| 4943/4943 [6:02:59<00:00,  4.63s/it]


### Сохранение результатов в файл

In [53]:
# Write all parsed resumes to one pretty-printed UTF-8 JSON file; non-ASCII
# characters are stored as-is rather than as \u escapes.
cvs_path = f'{CONFIG.DATA_FOLDER}/hh_cvs.json'
with open(cvs_path, mode='w', encoding='utf-8') as out_file:
    json.dump(
        cvs,
        out_file,
        indent=4,
        ensure_ascii=False,
    )