In [15]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [16]:
class HeadHunter:
    """Scraper for hh.ru vacancy search result pages.

    Constructing an instance fetches the first results page to read the page
    count from the pager. ``get_vacancies`` downloads one results page and
    ``get_attributes`` flattens a single vacancy card into a plain dict.
    """

    def __init__(self, key, timeout=30):
        """Store the search settings and read the page count from the pager.

        Args:
            key: search text sent as the ``text`` query parameter.
            timeout: seconds to wait for each HTTP response; without it a
                stalled connection would block the notebook indefinitely.
        """
        self.main_link = 'https://spb.hh.ru/search/vacancy/'
        self.user_agent = {'User-agent': 'Mozilla/5.0'}
        self.key = key
        self.timeout = timeout

        params = {'text': self.key}
        html = requests.get(self.main_link, headers=self.user_agent,
                            params=params, timeout=self.timeout).text
        pager_links = bs(html, 'lxml').findAll('a', {'data-qa': 'pager-page'})
        # Indexing [-1] on an empty result list would raise IndexError; fall
        # back to a single page, the same default SuperJob uses.
        self.max_pages = int(pager_links[-1].text) if pager_links else 1

    def get_vacancies(self, page):
        """Download one results page and return its vacancy card elements.

        Args:
            page: value sent as the ``page`` query parameter.

        Returns:
            Every ``div`` with class ``vacancy-serp-item`` found on the page.
        """
        params = {'text': self.key, 'page': page}
        html = requests.get(self.main_link, headers=self.user_agent,
                            params=params, timeout=self.timeout).text
        return bs(html, 'lxml').findAll('div', {'class': 'vacancy-serp-item'})

    @staticmethod
    def _parse_salary(text):
        """Split a salary caption into currency, lower and upper bound.

        Recognised layouts (digit groups may be separated by any whitespace):
        ``"A-B CUR"``, ``"от A CUR"``, ``"до B CUR"`` and ``"от A до B CUR"``.
        A two-word ``"бел. руб."`` currency is reported as ``'BYR'``.

        Returns:
            dict with keys ``currency``, ``min`` and ``max``; all three are
            ``None`` when the caption matches none of the layouts above.
        """
        salary = {'currency': None, 'min': None, 'max': None}
        # Treat en/em dashes like the plain hyphen used as range separator.
        tokens = text.replace('\u2013', '-').replace('\u2014', '-').split()
        if len(tokens) < 2:
            return salary

        if tokens[-2] == 'бел.':
            currency, amount_tokens = 'BYR', tokens[:-2]
        else:
            currency, amount_tokens = tokens[-1], tokens[:-1]
        # Gluing the tokens removes the thousands separators: "100 000" -> "100000".
        amount = ''.join(amount_tokens)

        try:
            if '-' in amount:
                low, high = amount.split('-')[:2]
                lower, upper = int(low), int(high)
            elif amount.startswith('от'):
                low, _, high = amount[len('от'):].partition('до')
                lower, upper = int(low), (int(high) if high else None)
            elif amount.startswith('до'):
                lower, upper = None, int(amount[len('до'):])
            else:
                return salary
        except ValueError:
            # An unexpected caption should not abort the whole scrape;
            # report the salary as unknown instead.
            return salary

        return {'currency': currency, 'min': lower, 'max': upper}

    def get_attributes(self, vacancy):
        """Flatten one vacancy card into a dict of plain values.

        Args:
            vacancy: a card element as returned by ``get_vacancies``.

        Returns:
            dict with keys ``placement``, ``name``, ``employer``,
            ``location``, ``link``, ``salary_currency``, ``salary_min`` and
            ``salary_max``. Salary fields are ``None`` when the card has no
            compensation block or its caption is not recognised.
        """
        title = vacancy.find('a', {'data-qa': 'vacancy-serp__vacancy-title'})
        name = title.text
        link = title['href']

        employer_tag = vacancy.find('a', {'data-qa': 'vacancy-serp__vacancy-employer'})
        if employer_tag is not None:
            employer = employer_tag.text
        else:
            # Cards without an employer link fall back to the meta-info block.
            employer = vacancy.find('div', {'class': 'vacancy-serp-item__meta-info'}).text

        location = vacancy.find('span', {'data-qa': 'vacancy-serp__vacancy-address'}).text

        salary_tag = vacancy.find('div', {'class': 'vacancy-serp-item__compensation'})
        if salary_tag is not None:
            salary_dict = self._parse_salary(salary_tag.text)
        else:
            salary_dict = {'currency': None, 'min': None, 'max': None}

        return {'placement': 'hh.ru',
                'name': name,
                'employer': employer,
                'location': location,
                'link': link,
                'salary_currency': salary_dict['currency'],
                'salary_min': salary_dict['min'],
                'salary_max': salary_dict['max'],
               }

In [17]:
class SuperJob:
    """Scraper for superjob.ru vacancy search result pages.

    Constructing an instance fetches the first results page to read the page
    count from the pager. ``get_vacancies`` downloads one results page and
    ``get_attributes`` flattens a single vacancy card into a plain dict.
    """

    def __init__(self, key, timeout=30):
        """Store the search settings and read the page count from the pager.

        Args:
            key: search text sent as the ``keywords`` query parameter.
            timeout: seconds to wait for each HTTP response; without it a
                stalled connection would block the notebook indefinitely.
        """
        self.main_link = 'https://www.superjob.ru/vacancy/search/'
        self.key = key
        self.user_agent = {'User-agent': 'Mozilla/5.0'}
        self.timeout = timeout

        self.params = {'keywords': self.key}

        html = requests.get(self.main_link, headers=self.user_agent,
                            params=self.params, timeout=self.timeout).text
        page_list = [page.text for page in bs(html, 'lxml').findAll('a', {'class': '_3ze9n'})]
        # The page count is read from the second-to-last pager link
        # (presumably the last one is a "next" control - TODO confirm), so at
        # least two links are required before indexing [-2].
        self.max_pages = int(page_list[-2]) if len(page_list) > 1 else 1

    def get_vacancies(self, page):
        """Download one results page and return its vacancy card elements.

        Args:
            page: value sent as the ``page`` query parameter.

        Returns:
            Every ``div`` carrying the vacancy-item class string on the page.
        """
        # Build a per-request copy so the shared base parameters stay untouched.
        params = {**self.params, 'page': page}
        html = requests.get(self.main_link, headers=self.user_agent,
                            params=params, timeout=self.timeout).text
        return bs(html, 'lxml').findAll('div', {'class': '_3zucV _2GPIV f-test-vacancy-item i6-sc _3VcZr'})

    @staticmethod
    def _parse_salary(text):
        """Split a salary caption into currency, lower and upper bound.

        Recognised layouts (digit groups separated by spaces): ``"A-B CUR"``,
        ``"от A CUR"``, ``"до B CUR"`` and ``"от A до B CUR"``. The last
        whitespace-separated token is taken as the currency.

        Returns:
            dict with keys ``currency``, ``min`` and ``max``; all three are
            ``None`` when the caption matches none of the layouts above.
        """
        salary = {'currency': None, 'min': None, 'max': None}
        # Treat en/em dashes like the plain hyphen used as range separator.
        tokens = text.replace('\u2013', '-').replace('\u2014', '-').split()
        if len(tokens) < 2:
            return salary

        currency = tokens[-1]
        # Join *all* amount tokens: a fixed two-token slice would truncate
        # "1 000 000" to 1000 and swallow the currency sign for "500".
        amount = ''.join(tokens[:-1])

        try:
            if '-' in amount:
                low, high = amount.split('-')[:2]
                lower, upper = int(low), int(high)
            elif amount.startswith('от'):
                low, _, high = amount[len('от'):].partition('до')
                lower, upper = int(low), (int(high) if high else None)
            elif amount.startswith('до'):
                lower, upper = None, int(amount[len('до'):])
            else:
                return salary
        except ValueError:
            # An unexpected caption should not abort the whole scrape;
            # report the salary as unknown instead.
            return salary

        return {'currency': currency, 'min': lower, 'max': upper}

    def get_attributes(self, vacancy):
        """Flatten one vacancy card into a dict of plain values.

        Args:
            vacancy: a card element as returned by ``get_vacancies``.

        Returns:
            dict with keys ``placement``, ``name``, ``employer``,
            ``location``, ``link``, ``salary_currency``, ``salary_min`` and
            ``salary_max``. Salary fields are ``None`` for the
            "По договорённости" caption or an unrecognised one.
        """
        name = vacancy.find('div', {'class': '_3mfro CuJz5 PlM3e _2JVkc _3LJqf'}).text
        employer = vacancy.find('span', {'class': '_3mfro _3Fsn4 f-test-text-vacancy-item-company-name _9fXTd _2JVkc _3e53o _15msI'}).text
        # The location caption is split on ' • ' and the second part is kept.
        location = vacancy.find('span', {'class': '_3mfro f-test-text-company-item-location _9fXTd _2JVkc _3e53o'}).text.split(' • ')[1]

        # The card's href is prefixed with the site origin to form the link.
        link = 'https://www.superjob.ru' + vacancy.find('a', {'class': 'icMQ_'})['href']

        salary_raw = vacancy.find('span', {'class': '_3mfro _2Wp8I f-test-text-company-item-salary PlM3e _2JVkc _2VHxz'}).text.replace('\xa0', ' ')
        if salary_raw != 'По договорённости':
            salary_dict = self._parse_salary(salary_raw)
        else:
            salary_dict = {'currency': None, 'min': None, 'max': None}

        return {'placement': 'superjob.ru',
                'name': name,
                'employer': employer,
                'location': location,
                'link': link,
                'salary_currency': salary_dict['currency'],
                'salary_min': salary_dict['min'],
                'salary_max': salary_dict['max'],
               }

In [18]:
# Search keyword: passed to both scrapers and embedded in the output CSV file name.
key = 'python'

In [19]:
# Scrape the hh.ru results pages and collect one record (dict) per vacancy card.
hh = HeadHunter(key)
hh_records = []
# NOTE(review): this range also requests page number `max_pages`; if hh.ru's
# `page` parameter is 0-based that is one request past the last page - TODO confirm.
for page in range(hh.max_pages + 1):
    for vacancy in hh.get_vacancies(page):
        hh_records.append(hh.get_attributes(vacancy))

# Build the frame once: appending row by row is quadratic, and DataFrame.append
# was removed in pandas 2.0. The 1-based row index is kept.
hh_df = pd.DataFrame(hh_records, index=range(1, len(hh_records) + 1))

In [20]:
# Count how many hh.ru vacancies carry each raw currency label.
hh_df['salary_currency'].value_counts()

руб.    623
USD      44
EUR      23
KZT      13
грн.      2
BYR       1
Name: salary_currency, dtype: int64

In [21]:
# Rewrite the site-specific currency labels to three-letter codes in place,
# then show the updated distribution.
hh_currency_codes = {'руб.': 'RUR', 'грн.': 'UAH', 'сум': 'UZS'}
for raw_label, currency_code in hh_currency_codes.items():
    rows_with_label = hh_df['salary_currency'] == raw_label
    hh_df.loc[rows_with_label, 'salary_currency'] = currency_code

hh_df['salary_currency'].value_counts()

RUR    623
USD     44
EUR     23
KZT     13
UAH      2
BYR      1
Name: salary_currency, dtype: int64

In [22]:
# Scrape every superjob.ru results page (pages 1..max_pages) and collect one
# record (dict) per vacancy card.
sj = SuperJob(key)
sj_records = []
for page in range(1, sj.max_pages + 1):
    for vacancy in sj.get_vacancies(page):
        sj_records.append(sj.get_attributes(vacancy))

# Build the frame once: appending row by row is quadratic, and DataFrame.append
# was removed in pandas 2.0. The 1-based row index is kept.
sj_df = pd.DataFrame(sj_records, index=range(1, len(sj_records) + 1))

In [23]:
# Count how many superjob.ru vacancies carry each raw currency label.
sj_df['salary_currency'].value_counts()

₽    11
Name: salary_currency, dtype: int64

In [24]:
# Replace the ruble sign with the 'RUR' code in place, then show the
# updated distribution.
ruble_rows = sj_df['salary_currency'] == '₽'
sj_df.loc[ruble_rows, 'salary_currency'] = 'RUR'
sj_df['salary_currency'].value_counts()

RUR    11
Name: salary_currency, dtype: int64

In [25]:
# Stack both sources into one table. DataFrame.append was removed in
# pandas 2.0; pd.concat stacks the rows the same way and likewise keeps
# each frame's own index.
multi_df = pd.concat([hh_df, sj_df])
multi_df.shape

(2030, 8)

In [26]:
# Preview the first rows of the combined table.
multi_df.head()

Unnamed: 0,placement,name,employer,location,link,salary_currency,salary_min,salary_max
1,hh.ru,Python Developer,iTechArt Group,"Минск, Институт Культуры",https://spb.hh.ru/vacancy/33738779?query=python,,,
2,hh.ru,Data Scientist,HeadHunter::Analytics/Data Science,"Москва, Алексеевская",https://spb.hh.ru/vacancy/35011897?query=python,RUR,200000.0,
3,hh.ru,Junior Python Developer (Mogilev),iTechArt Group,Могилев,https://spb.hh.ru/vacancy/35581311?query=python,,,
4,hh.ru,Программист Python/Django,ТОО BaiBak Exust,Алматы,https://spb.hh.ru/vacancy/35554799?query=python,,,
5,hh.ru,Python-разработчик в Процессы бэк-офиса,Яндекс,Москва,https://spb.hh.ru/vacancy/34950926?query=python,,,


In [27]:
# to_csv does not create missing directories, so make sure results/ exists
# before writing; otherwise a fresh checkout fails on this cell.
output_path = Path('results') / f'vacancies_{key}.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
multi_df.to_csv(output_path, index=False)