In [None]:
!pip install -U deep-translator

from google.colab import drive
import os
import json
import shutil
import re
from deep_translator import GoogleTranslator


# Translator configured with source='ru' and target='en'; used by the string-building cells below.
# NOTE(review): the name is misspelled ("translator"), but later cells reference
# `transtator`, so it has to stay as-is unless every use is renamed together.
transtator = GoogleTranslator(source='ru', target='en')
# Mount Google Drive so the data folder below becomes reachable from the Colab runtime.
drive.mount('/content/drive')
path = r"/content/drive/MyDrive/data" # Folder with the data files
shutil.unpack_archive(os.path.join(path, 'vacancies_20240227.zip'), 'vacancies') # Unpack the ZIP archive with vacancies into the local 'vacancies' folder
shutil.unpack_archive(os.path.join(path, 'resumes_20240227.zip'), 'resumes') # Unpack the ZIP archive with resumes into the local 'resumes' folder


In [None]:
#@title Резюме (словарь с одним уровнем вложенности)

# Walks every element of a multi-level nested JSON object and builds a dictionary with a
# single level of nesting, where each key is the chain of nested keys joined by underscores.
def flatten_json(json_data, parent_key='', flattened_dict=None):
    """Flatten a nested JSON-like dict into a single-level dict.

    Keys of the result are the chain of nested keys joined with underscores;
    list elements contribute their index to the chain (e.g. ``skills_0``).

    Args:
        json_data: Dict to flatten. Values may be dicts, lists or scalars.
        parent_key: Key prefix accumulated from the enclosing levels.
        flattened_dict: Optional dict to write into. When omitted, a fresh
            dict is created for this call, so results never leak between calls.

    Returns:
        The dict holding the flattened key/value pairs (the same object as
        ``flattened_dict`` when one was passed in).

    Note:
        Non-dict items found inside a list (including nested lists) are stored
        as-is under ``<key>_<index>``. Colliding flattened keys overwrite.
    """
    # A mutable default ({}) would be created once and shared by every call,
    # so a second call would return the first call's keys as well.
    if flattened_dict is None:
        flattened_dict = {}

    for key, value in json_data.items():
        new_key = parent_key + '_' + key if parent_key else key
        if isinstance(value, dict):
            flatten_json(value, new_key, flattened_dict)
        elif isinstance(value, list):
            for i, item in enumerate(value):
                if isinstance(item, dict):
                    flatten_json(item, f"{new_key}_{i}", flattened_dict)
                else:
                    flattened_dict[f"{new_key}_{i}"] = item
        else:
            flattened_dict[new_key] = value
    return flattened_dict


# Any resume will do, for example
resume = 'dd57-491f-8c06-b734c4534323.json'
# Decode explicitly as UTF-8 instead of relying on the platform's locale default.
with open(os.path.join('resumes', resume), 'r', encoding='utf-8') as f:
    resume_json = json.load(f)  # JSON with fields: lists, dicts, lists of dicts, nested dicts

# Flatten the nested structure and print every resulting key/value pair.
flattened_resume = flatten_json(resume_json)
for k, v in flattened_resume.items():
    print(f'{k}: {v}')

In [None]:
#@title Вакансия (словарь с одним уровнем вложенности)

# Walks every element of a multi-level nested JSON object and builds a dictionary with a
# single level of nesting, where each key is the chain of nested keys joined by underscores.
def flatten_json(json_data, parent_key='', flattened_dict=None):
    """Flatten a nested JSON-like dict into a single-level dict.

    Keys of the result are the chain of nested keys joined with underscores;
    list elements contribute their index to the chain (e.g. ``skills_0``).

    Args:
        json_data: Dict to flatten. Values may be dicts, lists or scalars.
        parent_key: Key prefix accumulated from the enclosing levels.
        flattened_dict: Optional dict to write into. When omitted, a fresh
            dict is created for this call, so results never leak between calls.

    Returns:
        The dict holding the flattened key/value pairs (the same object as
        ``flattened_dict`` when one was passed in).

    Note:
        Non-dict items found inside a list (including nested lists) are stored
        as-is under ``<key>_<index>``. Colliding flattened keys overwrite.
    """
    # A mutable default ({}) would be created once and shared by every call,
    # so a second call would return the first call's keys as well.
    if flattened_dict is None:
        flattened_dict = {}

    for key, value in json_data.items():
        new_key = parent_key + '_' + key if parent_key else key
        if isinstance(value, dict):
            flatten_json(value, new_key, flattened_dict)
        elif isinstance(value, list):
            for i, item in enumerate(value):
                if isinstance(item, dict):
                    flatten_json(item, f"{new_key}_{i}", flattened_dict)
                else:
                    flattened_dict[f"{new_key}_{i}"] = item
        else:
            flattened_dict[new_key] = value
    return flattened_dict


# Any vacancy will do, for example
vacancy = 'Java_разработчик-6e8ea5e3-47fb-4a15-84fa-6ab618dcbbe0.json'
# Decode explicitly as UTF-8 instead of relying on the platform's locale default.
with open(os.path.join('vacancies', vacancy), 'r', encoding='utf-8') as f:
    vacancy_json = json.load(f)

# Flatten the nested structure and print every resulting key/value pair.
flattened_vacancy = flatten_json(vacancy_json)
for k, v in flattened_vacancy.items():
    print(f'{k}: {v}')

In [None]:
#@title Формирую строки по нужным полям Резюме для дальнейшей обработки

def create_resume_string(flattened_resume,
                         is_title=True,
                         is_professional_roles=True,
                         is_skill_set=True,
                         is_skills=True,
                         is_experience=True,
                         is_lang=True):
    """Assemble a one-line text description of a flattened resume and translate it.

    Values are picked from ``flattened_resume`` by inspecting its keys and are
    grouped into sections (position, professional roles, skills, additional
    skills, experience, languages). Each ``is_*`` flag switches the matching
    section on or off in the final text.

    Args:
        flattened_resume: Single-level dict whose keys are underscore-joined paths.
        is_title: Include the "Position: " section (keys containing 'title').
        is_professional_roles: Include professional role names.
        is_skill_set: Include the "Skills: " section (keys containing 'skill_set').
        is_skills: Include the "Additional skills: " section (keys containing 'skills').
        is_experience: Include job positions and their descriptions.
        is_lang: Include the "Languages: " section.

    Returns:
        Dict with the assembled text under 'ru' and the output of the
        module-level ``transtator.translate`` for that text under 'en'.
    """
    # All patterns are used through .match(), i.e. anchored at the start of the key.
    role_name_key = re.compile(r'^professional_roles.*name$')
    job_position_key = re.compile(r'^experience.*position$')
    job_description_key = re.compile(r'^experience.*description$')
    language_name_key = re.compile(r'language.*?(\d+_name)')
    language_level_key = re.compile(r'language.*level_name')

    # Each section starts with its label (if it has one) and grows piece by piece.
    title_parts = ['Position: ']
    role_parts = []
    skill_set_parts = ['Skills: ']
    extra_skill_parts = ['Additional skills: ']
    experience_parts = []
    language_parts = ['Languages: ']

    # The checks are deliberately independent: one key may feed several sections.
    for field, content in flattened_resume.items():
        if 'title' in field:
            title_parts.append(f'{content}. ')
        if role_name_key.match(field):
            role_parts.append(f'{content}. ')
        if 'skill_set' in field:
            skill_set_parts.append(f'{content}, ')
        if 'skills' in field:
            extra_skill_parts.append(f'{content}. ')
        if job_position_key.match(field):
            experience_parts.append(f'Job experience as a {content}: ')
        if job_description_key.match(field):
            experience_parts.append(f'{content}. ')
        if language_name_key.match(field):
            language_parts.append(f'{content} - ')
        if language_level_key.match(field):
            language_parts.append(f'level: {content}. ')
        # ... and so on, as needed

    # Section order in the final text is fixed; disabled sections are dropped entirely.
    ordered_sections = (
        (is_title, title_parts),
        (is_professional_roles, role_parts),
        (is_skill_set, skill_set_parts),
        (is_skills, extra_skill_parts),
        (is_experience, experience_parts),
        (is_lang, language_parts),
    )
    combined = ''.join(
        ''.join(parts) for enabled, parts in ordered_sections if enabled
    )

    # Collapse every run of whitespace into a single space before translating.
    text_line = re.sub(r'\s+', ' ', combined)
    return {'ru': text_line, 'en': transtator.translate(text_line)}


# Example of a prepared resume string, for building chunks or for a ChatGPT request
create_resume_string(flattened_resume, is_professional_roles=False)['en']

'Position: Java developer. Skills: Java, Spring Framework, Hibernate ORM, ORACLE, PostgreSQL, Git, SQL, Linux, Docker, springboot, kubernetes, Kafka, apache Kafka, RabbitMQ, REST, SOAP, Additional skills: I am always looking for the best approach to solving a given problem.. SQL. Setting the tasks to developers. Remote work. PostgreSQL is. Automating the process. Refactoring the code. Java. The CI/CD. Docker. The CI. Unit Testing. The Gitlab CI. Reengineering of business processes. Business processes are analyzed. Modeling of processes. Project documentation. Docker is. GitLab. Kafka. The Kubernetes. Microservices. Gitlab. Supporting the software. Apache Kafka. Spring. micro-services. Spring Boot. Hibernate. CoreData. JPA. Camunda. The Principle. The kubernetes. Integration testing. Docker-compose. Postgres. Technical task. Process management. Support for users. Optimizing the processes. Optimizing business processes. Modeling of business processes. The development. Business processes 

In [None]:
#@title Формирую строки по нужным полям Вакансии для дальнейшей обработки

def create_vacancy_string(flattened_vacancy,
                          is_position=True,
                          is_skills=True,
                          is_mandatory_requirements=True,
                          is_experience_level=True,
                          is_project_tasks=True):
    """Assemble a one-line text description of a flattened vacancy and translate it.

    Values are picked from ``flattened_vacancy`` by substring checks on its
    keys and grouped into labelled sections. Each ``is_*`` flag switches the
    matching section on or off in the final text.

    Args:
        flattened_vacancy: Single-level dict whose keys are underscore-joined paths.
        is_position: Include the "Position: " section (keys containing 'position').
        is_skills: Include the "Skills: " section (keys containing 'skills').
        is_mandatory_requirements: Include the "Mandatory requirements: " section
            (keys containing 'mandatoryRequirements').
        is_experience_level: Include the "Experience Levels: " section
            (keys containing 'experienceLevels').
        is_project_tasks: Include the "Project tasks: " section
            (keys containing 'projectTasks').

    Returns:
        Dict with the assembled text under 'ru' and the output of the
        module-level ``transtator.translate`` for that text under 'en'.
    """
    # Every section starts with its label; matching values are appended below.
    position_parts = ['Position: ']
    skill_parts = ['Skills: ']
    requirement_parts = ['Mandatory requirements: ']
    level_parts = ['Experience Levels: ']
    task_parts = ['Project tasks: ']

    # (substring looked up in the key, separator written after the value, target section)
    rules = (
        ('position', '. ', position_parts),
        ('skills', ', ', skill_parts),
        ('mandatoryRequirements', ' ', requirement_parts),
        ('experienceLevels', '. ', level_parts),
        ('projectTasks', ' ', task_parts),
    )

    # Rules are independent: one key may contribute to several sections.
    for field, content in flattened_vacancy.items():
        for marker, separator, bucket in rules:
            if marker in field:
                bucket.append(f'{content}{separator}')
        # ... and so on, as needed

    # Section order in the final text is fixed; disabled sections are dropped entirely.
    ordered_sections = (
        (is_position, position_parts),
        (is_skills, skill_parts),
        (is_mandatory_requirements, requirement_parts),
        (is_experience_level, level_parts),
        (is_project_tasks, task_parts),
    )
    combined = ''.join(
        ''.join(parts) for enabled, parts in ordered_sections if enabled
    )

    # Collapse every run of whitespace into a single space before translating.
    text_line = re.sub(r'\s+', ' ', combined)
    return {'ru': text_line, 'en': transtator.translate(text_line)}


# Example of a prepared vacancy string, for building chunks or for a ChatGPT request
create_vacancy_string(flattened_vacancy)['en']

"Position: Java developer. Skills: SQL, Setting the tasks to developers, Remote work, PostgreSQL is, Automating the process, Refactoring the code, Java, The CI/CD, Docker, The CI, Unit Testing, The Gitlab CI, Reengineering of business processes, Business processes are analyzed, Modeling of processes, Project documentation, Docker is, GitLab, Kafka, The Kubernetes, Microservices, Gitlab, Supporting the software, Apache Kafka, Spring, micro-services, Spring Boot, Hibernate, CoreData, JPA, Camunda, The Principle , The kubernetes, Integration testing, Docker-compose, Postgres, Technical task, Process management, Support for users, Optimizing the processes, Optimizing business processes, Modeling of business processes, The development, Business processes are described, data integration, Core data, Spring Cloud, Project work, Docker/Kubernetes, my uncle., Developing technical tasks, Optimizing the code, Project activities, Formation of technical tasks for development, Testing, System integra