In [23]:
import requests
import pandas as pd
import time
import re
import csv

In [24]:
higher_education_patterns = [
        r"высшее\s+образование",
        r"higher\s+education",
        r"в/о",
        r"ВО\s+обязательно",
        r"университетское\s+образование",
        r"бакалавр|магистр"
    ]

In [39]:
class Parser:
    def __init__(self, query, area=1, pages=1):
        self.query = query
        self.area = area
        self.pages = pages
        self.url = "https://api.hh.ru/vacancies"
        self.headers = {"User-Agent": "Mozilla/5.0"}

        self.vacancies_data = []  

    def search(self):
        for page in range(self.pages):
            params = {
                "text": self.query,
                "area": self.area,
                "per_page": 100,
                "page": page,
                "order_by": "publication_time"
            }
            response = requests.get(self.url, params=params, headers=self.headers)
            if response.status_code == 200:
                data = response.json()
                for item in data["items"]:
                    vacancy_id = item["id"]
                    vacancy_url = f"https://api.hh.ru/vacancies/{vacancy_id}"

                    vacancy_resp = requests.get(vacancy_url, headers=self.headers)
                    if vacancy_resp.status_code == 200:
                        vacancy = vacancy_resp.json()

                        title = vacancy.get("name", "")
                        company = vacancy.get("employer", {}).get("name", "")
                        salary = vacancy.get("salary")
                        if salary:
                            salary_from = salary.get("from")
                            salary_to = salary.get("to")
                            salary_currency = salary.get("currency")
                        else:
                            salary_from = salary_to = salary_currency = None

                        requirements = vacancy.get("snippet", {}).get("requirement", "") or ""
                        responsibilities = vacancy.get("snippet", {}).get("responsibility", "") or ""
                        requirements_lower = requirements.lower()

                        has_higher_education = any(
                            re.search(pattern, requirements_lower)
                            for pattern in higher_education_patterns
                        )

                        skills = [s["name"] for s in vacancy.get("key_skills", [])]

                        experience = vacancy.get("experience", {}).get("name", "")

                        self.vacancies_data.append({
                            "title": title,
                            "company": company,
                            "salary_from": salary_from,
                            "salary_to": salary_to,
                            "salary_currency": salary_currency,
                            "skills": skills,
                            "experience": experience,
                            "has_higher_education": has_higher_education,
                            "requirements": requirements,
                            "responsibilities": responsibilities
                        })

                    time.sleep(0.2)  

            time.sleep(0.2)

    def save_to_csv(self, filename):
        """Сохраняем все данные в CSV"""
        with open(filename, mode="w", newline="", encoding="utf-8-sig") as file:
            writer = csv.writer(file)
            header = [
                "title", "company", "salary_from", "salary_to", "salary_currency",
                "skills", "experience", "has_higher_education",
                "requirements", "responsibilities"
            ]
            writer.writerow(header)
            for v in self.vacancies_data:
                writer.writerow([
                    v["title"], v["company"], v["salary_from"], v["salary_to"], v["salary_currency"],
                    ", ".join(v["skills"]), v["experience"], v["has_higher_education"],
                    v["requirements"], v["responsibilities"]
                ])
        print(f"Данные сохранены в {filename}")

In [40]:
parser = Parser(query="Data Scientist", area=1, pages=2)  
parser.search()
parser.save_to_csv("vacancies.csv")

Данные сохранены в vacancies.csv


In [41]:
df_vacancies = pd.read_csv( "vacancies.csv")

In [44]:
df_vacancies.head()

Unnamed: 0,title,company,salary_from,salary_to,salary_currency,skills,experience,has_higher_education,requirements,responsibilities
0,"Middle +/ Senior Data Scientist (AdTech, предс...",Hyper AdTech,200000.0,300000.0,RUR,"Python, pandas, Numpy, R, Scikit-learn, Matplo...",От 3 до 6 лет,False,,
1,Проджект-менеджер / Системный-аналитик в ИТ-пр...,БСТ Менеджмент-Консалтинг,40000.0,,RUR,"Управление проектами, Бизнес-анализ, Аналитиче...",От 1 года до 3 лет,False,,
2,Разработчик CV БПЛА,Базовые технологии,,,,"Python, Linux, Git, C++, БПЛА, БАС, БЛА, БВС, ...",От 3 до 6 лет,False,,
3,Data Scientist (Senior),IDF Eurasia,,,,"Python, SQL, Scikit-learn, TensorFlow, PyTorch...",От 3 до 6 лет,False,,
4,Data Scientist,Цифровые технологии и платформы,,,,"Python, pandas, SQL",От 1 года до 3 лет,False,,
