In [62]:
import requests                 # Для работы с http-запросами
import fake_useragent           # Для создания заголовков
import json                     # Для работы с форматом json
import time                     # Для задержки между запросами
import psycopg2
from psycopg2 import sql
import pandas as pd
from datetime import datetime

def get_headers():
    """Build HTTP request headers carrying a randomly chosen User-Agent.

    Returns:
        dict: a one-entry mapping ``{'user-agent': <value>}`` where the value
        is whatever ``fake_useragent.UserAgent().random`` yields on this call.
    """
    random_agent = fake_useragent.UserAgent().random
    return {'user-agent': random_agent}

def get_page(filter, period, pg=0, timeout=30):
    """Fetch one page of vacancy search results from the hh.ru API.

    Args:
        filter: search expression sent as the ``text`` query parameter.
            (The name shadows the ``filter`` builtin inside this function;
            it is kept unchanged for compatibility with existing callers.)
        period: value sent as the ``period`` query parameter
            (presumably a look-back window in days - confirm against the API docs).
        pg: page number sent as the ``page`` query parameter; defaults to 0.
        timeout: seconds to wait for the server before giving up; defaults to 30.

    Returns:
        str: the decoded response body (expected to be JSON text; callers
        pass it to ``json.loads``).

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout elapses.
    """
    query = {
        'text': filter,
        'page': pg,
        'per_page': 100,
        'period': period
    }
    url = 'https://api.hh.ru/vacancies'
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would freeze the whole collection loop.
    response = requests.get(url, query, headers=get_headers(), timeout=timeout)
    try:
        print(response)
        return response.content.decode()
    finally:
        # Release the connection even if printing or decoding fails.
        response.close()

def _detect_period(conn):
    """Pick the search period from the 'is_init' row of proc.settings.

    Returns 30 (and inserts the 'is_init' row, committing it) when the row is
    absent; returns 1 when the row already exists.
    """
    with conn.cursor() as cur:
        cur.execute("""SELECT value
                       FROM proc.settings
                       WHERE name = 'is_init';
                    """)
        if cur.fetchall():
            return 1
        cur.execute("""INSERT INTO proc.settings (id, name, value) VALUES (1, 'is_init', 'False');""")
    conn.commit()
    return 30


def _vacancy_to_row(vacancy):
    """Flatten one vacancy dict from the API into the tuple stored in stage.vacancy."""
    employer = vacancy['employer']
    experience = vacancy['experience']
    area = vacancy['area']
    salary = vacancy['salary']
    return (
        int(vacancy['id']),                                      # id
        vacancy['name'],                                         # vacancy_name
        vacancy['published_at'],                                 # published_at
        # The decoded JSON carries a real boolean here; the string form is
        # still accepted so a textual 'true' is not silently lost.
        vacancy['archived'] in (True, 'true'),                   # is_archive
        vacancy['type']['id'] == 'open',                         # is_open
        employer.get('id'),                                      # employer_id
        employer.get('name'),                                    # employer_name
        bool(employer.get('accredited_it_employer')),            # is_accredited_it_employer
        experience.get('id'),                                    # experience_id
        experience.get('name'),                                  # experience_name
        # Area fields are looked up in the area dict itself (not in experience).
        area.get('id'),                                          # area_id
        area.get('name'),                                        # area_name
        salary['from'] if salary is not None else None,          # salary_from
        salary['to'] if salary is not None else None,            # salary_to
        salary['currency'] if salary is not None else None,      # salary_currency
        # A missing salary is stored as False, not NULL (unchanged behavior).
        bool(salary['gross'] if salary is not None else None),   # is_gross
    )


def get_vacancies(period=None):
    """Collect vacancies for the predefined search filters and flatten them.

    Args:
        period: optional value forwarded to ``get_page`` as the search period.
            When omitted, it is derived from proc.settings: 30 if the
            'is_init' row does not exist yet (the row is then inserted),
            otherwise 1. When given, the settings table is neither read nor
            modified.

    Returns:
        set: de-duplicated 16-element tuples in the column order
        id, vacancy_name, published_at, is_archive, is_open, employer_id,
        employer_name, is_accredited_it_employer, experience_id,
        experience_name, area_id, area_name, salary_from, salary_to,
        salary_currency, is_gross.

    Raises:
        KeyError: if a response lacks the 'pages' key or a vacancy lacks one
            of the mandatory fields.
    """
    filters = ['"Data Engineer" OR "Инженер данных" OR "Дата Инженер"'
               , '"Data Analyst" OR "Аналитик данных"'
               , '"Data Scientist"']

    if period is None:
        # NOTE(review): connection parameters are hardcoded; consider moving
        # them to configuration or environment variables.
        conn = psycopg2.connect(host='localhost',
                                port=5430,
                                user='postgres',
                                password='password',
                                dbname='test')
        try:
            period = _detect_period(conn)
        finally:
            # Always release the connection, including on query errors.
            conn.close()

    raw_vacancies = []
    for search_text in filters:
        # At most 25 pages are requested per filter.
        for page in range(0, 25):
            page_dict = json.loads(get_page(search_text, period, page))
            print(page_dict['pages'])
            # 'items' may be absent or None; treat that as an empty page.
            raw_vacancies.extend(page_dict.get('items') or [])

            # Stop once the last page reported by the API has been fetched.
            if (page_dict['pages'] - page) <= 1:
                break

    return {_vacancy_to_row(vacancy) for vacancy in raw_vacancies}

def load_data(vacancies):
    """Insert vacancy rows into the stage.vacancy table.

    Args:
        vacancies: iterable of 16-element tuples whose order matches the
            column list of the INSERT statement below.

    The rows are committed in a single transaction. The connection is closed
    in every case; if the insert fails nothing is committed and the
    exception propagates to the caller.
    """
    # Named insert_sql so it does not shadow the imported psycopg2 `sql` module.
    insert_sql = """INSERT INTO stage.vacancy (   id
                                             , vacancy_name
                                             , published_at
                                             , is_archive
                                             , is_open
                                             , employer_id
                                             , employer_name
                                             , is_accredited_it_employer
                                             , experience_id
                                             , experience_name
                                             , area_id
                                             , area_name
                                             , salary_from
                                             , salary_to
                                             , salary_currency
                                             , is_gross)
                 VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""

    # NOTE(review): connection parameters are hardcoded; consider moving them
    # to configuration or environment variables.
    conn = psycopg2.connect(host='localhost',
                            port=5430,
                            user='postgres',
                            password='password',
                            dbname='test')
    try:
        with conn.cursor() as cur:
            cur.executemany(insert_sql, vacancies)
        conn.commit()
    finally:
        # Closing without a commit discards the open transaction, so a failed
        # insert leaves no partial data and no leaked connection.
        conn.close()


In [63]:


# Collect the flattened vacancy tuples for later loading.
# NOTE(review): an explicit 30 is passed here - presumably the search period;
# confirm that the get_vacancies definition in scope accepts this argument.
vacancies = get_vacancies(30)

<Response [200]>
7
<Response [200]>
7
<Response [200]>
7
<Response [200]>
7
<Response [200]>
7
<Response [200]>
7
<Response [200]>
7
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
17
<Response [200]>
6
<Response [200]>
6
<Response [200]>
6
<Response [200]>
6
<Response [200]>
6
<Response [200]>
6


In [51]:
# Exploratory cell: print every raw vacancy returned for each search filter.
period = 30

filters = ['"Data Engineer" OR "Инженер данных" OR "ETL Developer" OR "ETL разработчик" OR "дата инженер" OR "дата инженер"', '"Data Analyst" OR "Аналитик данных"', '"Data Scientist"']
raw_vacancies = []
# The loop variable is not called `filter`: a top-level name like that would
# rebind the builtin for the rest of the kernel session.
for search_text in filters:
    for page in range(0, 25):
        # Convert the text response of the request into a dict
        page_dict = json.loads(get_page(search_text, period, page))
        for vacancy in page_dict['items']:
            print(vacancy)

        # Stop once the last page reported by the API has been fetched
        if (page_dict['pages'] - page) <= 1:
            break

<Response [200]>
{'id': '86384974', 'premium': False, 'name': 'ETL разработчик', 'department': None, 'has_test': False, 'response_letter_required': False, 'area': {'id': '1', 'name': 'Москва', 'url': 'https://api.hh.ru/areas/1'}, 'salary': None, 'type': {'id': 'open', 'name': 'Открытая'}, 'address': None, 'response_url': None, 'sort_point_distance': None, 'published_at': '2023-09-07T12:24:48+0300', 'created_at': '2023-09-07T12:24:48+0300', 'archived': False, 'apply_alternate_url': 'https://hh.ru/applicant/vacancy_response?vacancyId=86384974', 'show_logo_in_search': None, 'insider_interview': None, 'url': 'https://api.hh.ru/vacancies/86384974?host=hh.ru', 'alternate_url': 'https://hh.ru/vacancy/86384974', 'relations': [], 'employer': {'id': '2009', 'name': 'Antal Talent', 'url': 'https://api.hh.ru/employers/2009', 'alternate_url': 'https://hh.ru/employer/2009', 'logo_urls': {'original': 'https://hhcdn.ru/employer-logo-original/430509.png', '90': 'https://hhcdn.ru/employer-logo/2164106.p

KeyboardInterrupt: 

In [60]:
# `vacancies` is the set returned by get_vacancies, so its size is available
# directly without a manual counting loop.
n = len(vacancies)

print(n)

2062


In [56]:
# Insert the collected vacancy tuples into stage.vacancy via load_data.
load_data(vacancies)

KeyboardInterrupt: 

In [102]:
# Scratch check of the period-selection logic: 30 when proc.settings has no
# 'is_init' row yet (the row is then inserted), otherwise 1.
period = 0

# NOTE(review): connection parameters are hardcoded; consider moving them to
# configuration or environment variables.
conn = psycopg2.connect(host='localhost',
                        port=5430,
                        user='postgres',
                        password='password',
                        dbname='test')

sql1 = """SELECT value
         FROM proc.settings
         WHERE name = 'is_init';
       """

try:
    # The cursor is closed on leaving the `with` block.
    with conn.cursor() as cur:
        cur.execute(sql1)

        select = cur.fetchall()

        if not select:
            period = 30
            sql2 = """INSERT INTO proc.settings (id, name, value) VALUES (1, 'is_init', 'False');"""
            cur.execute(sql2)
            conn.commit()
        else:
            period = 1
finally:
    # Release the connection even if one of the statements fails.
    conn.close()
print(period)

1
