### Импорт библиотек, вспомогательные функции, подготовка модели и данных.


In [None]:
!pip install transformers
!pip install --upgrade pandas==1.4.3
!pip install sentence-transformers
!pip install datasets
!pip install yadisk
!rm -rf sample_data

In [3]:
!nvidia-smi

Thu Mar 30 14:14:47 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
import pandas as pd
import numpy as np
from torch import nn
from sentence_transformers import SentenceTransformer, models
import torch
from transformers import AutoTokenizer, AutoModel
import yadisk 
from tqdm import tqdm
from scipy import spatial
from sentence_transformers import InputExample
from torch.utils.data import DataLoader
from sentence_transformers import losses

from config import token_key, resumes_bert_ya_path, resumes_bert_local_path, \
                              vacancies_bert_ya_path, vacancies_bert_local_path, \
                              model_ya_path, model_local_path

In [5]:
# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Device: {device}')

Device: cuda


In [6]:
%%time
# Open a Yandex Disk client authenticated with the token imported from config.
y = yadisk.YaDisk(token=token_key)
# Download the resume and vacancy files to the local paths given in config.
# The second call is the cell's last expression, so its return value is what
# the cell displays.
y.download(resumes_bert_ya_path, resumes_bert_local_path)
y.download(vacancies_bert_ya_path, vacancies_bert_local_path)

CPU times: user 13.5 s, sys: 8.36 s, total: 21.9 s
Wall time: 1min 2s


<ResourceLinkObject{'href': 'https://cloud-api.yandex.net/v1/disk/resources?path=disk%3A%2Fhh_bert%2Fvacancies_bert.pkl', 'method': 'GET', 'templated': False, 'path': 'disk:/hh_bert/vacancies_bert.pkl', 'public_key': None, 'public_url': None}>

In [7]:
# Read the frames from the same local paths the download cell wrote to, so the
# file names cannot drift from the download targets configured in `config`.
# NOTE(review): read_pickle can execute code embedded in the file — only load
# pickles that come from a storage location you trust.
df_vacancies = pd.read_pickle(vacancies_bert_local_path)
df_resumes = pd.read_pickle(resumes_bert_local_path)

# Keep only the rows that actually carry labels.
df_vacancies = df_vacancies[df_vacancies.labels.notna()]
df_resumes = df_resumes[df_resumes.labels.notna()]

# One row per (text, label) pair. explode() repeats the original index value
# for every label of a row. Presumably `labels` holds a list of label ids per
# row — TODO confirm against the code that produced the pickles.
df_vacancies = df_vacancies.explode('labels')
df_resumes = df_resumes.explode('labels')

# Every role keyword known to the labelling scheme. The raw list names
# 'architect' twice; np.unique removes the duplicate and sorts the names, so a
# role's label id is simply its position in the sorted list.
ROLES = list(np.unique([
    'dwh', 'db developer', 'db architect', 'c++', 'c#', 'java', 'golang', 'python',
    'kotlin', 'mobile', 'ios', 'android', 'web', 'js', '.net', 'big data', 'game',
    'php', 'bi analyst', 'business analyst', 'web analyst', 'data analyst', 'system analyst',
    'product analyst', 'project manager', 'product manager',
    'release manager', 'product owner', 'db administrator', 'system administrator',
    'qa', 'backend', 'frontend', 'fullstack', 'uxui', 'devops',
    'swift', 'ruby', 'scala', 'etl', 'perl', 'agile', 'scrum master',
    'network administrator', 'cyber security', 'sql', 'machine learning', 'deep learning',
    'computer vision', 'natural language processing', 'data scientist', 'data engineer',
    'react', 'vue', 'native', 'tech writer', '1c', 'architect', 'rust', 'erlang',
    'analyst', 'architect', 'cto', 'sap', 'unity', 'it', 'seosmm', 'tech support', 'developer', 'db',
]))

# Every role whose name contains the substring 'analyst' (this includes the
# bare 'analyst' entry itself).
SPECIFIC_ANALYST_ROLES = list(filter(lambda role: 'analyst' in role, ROLES))

# Two-way lookup between numeric label ids and role names.
LABEL_ROLE_MAP = {label: role for label, role in zip(np.arange(len(ROLES)), ROLES)}
ROLE_LABEL_MAP = {role: label for label, role in LABEL_ROLE_MAP.items()}

# Roles kept for the similarity experiments: the full role list without the
# generic 'architect', 'analyst', 'it' and 'developer' entries.
SELECTED_ROLES = [
    'dwh', 'db developer', 'db architect', 'c++', 'c#', 'java', 'golang', 'python',
    'kotlin', 'mobile', 'ios', 'android', 'web', 'js', '.net', 'big data', 'game',
    'php', 'bi analyst', 'business analyst', 'web analyst', 'data analyst', 'system analyst',
    'product analyst', 'project manager', 'product manager',
    'release manager', 'product owner', 'db administrator', 'system administrator',
    'qa', 'backend', 'frontend', 'fullstack', 'uxui', 'devops',
    'swift', 'ruby', 'scala', 'etl', 'perl', 'agile', 'scrum master',
    'network administrator', 'cyber security', 'sql', 'machine learning', 'deep learning',
    'computer vision', 'natural language processing', 'data scientist', 'data engineer',
    'react', 'vue', 'native', 'tech writer', '1c', 'rust', 'erlang',
    'cto', 'sap', 'unity', 'seosmm', 'tech support', 'db',
]

# Numeric label ids of the selected roles, in ascending order.
SELECTED_LABELS = sorted(ROLE_LABEL_MAP[role] for role in SELECTED_ROLES)

# Keep only rows whose label belongs to one of the selected roles. Series.isin
# does the membership test in one vectorised, hash-based pass instead of
# scanning the SELECTED_LABELS list from a Python lambda for every row.
df_vacancies = df_vacancies[df_vacancies.labels.isin(SELECTED_LABELS)]
df_resumes = df_resumes[df_resumes.labels.isin(SELECTED_LABELS)]

In [None]:
# Replace the index of both frames with a fresh 0..n-1 range, discarding the
# old index values.
df_vacancies, df_resumes = (
    frame.reset_index(drop=True) for frame in (df_vacancies, df_resumes)
)

In [9]:
def get_base_model(extend_words=None):
    """Assemble a SentenceTransformer on top of `cointegrated/rubert-tiny2`.

    Args:
        extend_words: optional iterable of extra tokens. When it is not None,
            the tokens are added to the tokenizer, the embedding matrix is
            resized to the new vocabulary size and the size change is printed.

    Returns:
        A SentenceTransformer made of the transformer module followed by a
        pooling module created with its default settings.
    """
    transformer = models.Transformer('cointegrated/rubert-tiny2')

    if extend_words is not None:
        tokenizer = transformer.tokenizer
        size_before = len(tokenizer)
        tokenizer.add_tokens(list(extend_words))
        size_after = len(tokenizer)
        # The embedding matrix has to match the enlarged vocabulary.
        transformer.auto_model.resize_token_embeddings(size_after)
        print(f'Tokenizer vocabulary was extended from {size_before} to {size_after}')

    pooling = models.Pooling(transformer.get_word_embedding_dimension())
    return SentenceTransformer(modules=[transformer, pooling])

In [10]:
# Fetch the saved weights from Yandex Disk to the local path from config.
y.download(model_ya_path, model_local_path)
model = get_base_model(None)
# map_location='cpu' lets a checkpoint that was saved from a GPU also load on
# a CPU-only runtime; the model is moved to `device` right below.
# NOTE(review): torch.load unpickles the file — only load checkpoints that
# come from a trusted source.
model.load_state_dict(torch.load(model_local_path, map_location='cpu'))

model.to(device)
# Switch to inference mode; as the last expression this also displays the model.
model.eval()

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 2048, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 312, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [57]:
def get_cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors (1 minus the cosine distance)."""
    return 1 - spatial.distance.cosine(vec1, vec2)


def get_texts_similarity(model, text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        model: object exposing `encode(text)` that returns an embedding vector.
        text1: first text to embed.
        text2: second text to embed.

    Returns:
        The cosine similarity of the two embeddings.
    """
    # No gradients are needed for inference-only encoding.
    with torch.no_grad():
        emb1 = model.encode(text1)
        emb2 = model.encode(text2)

    # The helper is spelled `get_cosine_similarity`; calling the misspelled
    # `get_cosine_simmilarity` raised NameError on a fresh kernel.
    return get_cosine_similarity(emb1, emb2)

In [39]:
def _sample_texts_by_role(df, role, n=1, random_state=None):
    """Sample `n` `text_bert_input` values from the rows of `df` labelled with `role`.

    Shared implementation behind `sample_resume_by_role` and
    `sample_vacancy_by_role`, which previously duplicated this logic.
    """
    label = ROLE_LABEL_MAP[role]
    sampled = df[df.labels == label].sample(n, random_state=random_state).text_bert_input
    if n == 1:
        # Unwrap the one-element Series so callers get the text itself.
        sampled = sampled.item()
    return sampled


def sample_resume_by_role(role, n=1, random_state=None):
    """Sample resume texts for a role from `df_resumes`.

    Args:
        role: role name; must be a key of ROLE_LABEL_MAP (KeyError otherwise).
        n: number of resumes to draw.
        random_state: optional seed forwarded to `DataFrame.sample` to make
            the draw reproducible; None keeps the previous unseeded behaviour.

    Returns:
        The single text when `n == 1`, otherwise a Series of `n` texts.
    """
    return _sample_texts_by_role(df_resumes, role, n, random_state)


def sample_vacancy_by_role(role, n=1, random_state=None):
    """Sample vacancy texts for a role from `df_vacancies`.

    Args:
        role: role name; must be a key of ROLE_LABEL_MAP (KeyError otherwise).
        n: number of vacancies to draw.
        random_state: optional seed forwarded to `DataFrame.sample` to make
            the draw reproducible; None keeps the previous unseeded behaviour.

    Returns:
        The single text when `n == 1`, otherwise a Series of `n` texts.
    """
    return _sample_texts_by_role(df_vacancies, role, n, random_state)

In [133]:
def calculate_similarity_table(vacancy_role, resumes_roles):
    """Rank randomly sampled resumes against one randomly sampled vacancy.

    Args:
        vacancy_role: role name used to sample the single vacancy.
        resumes_roles: role names; one resume is sampled per entry.

    Returns:
        A pair `(vacancy_df, res_df)`: a one-row frame with the vacancy text
        and role, and a frame of resume texts, roles and similarities sorted
        by similarity in descending order with a fresh index.
    """
    vacancy = sample_vacancy_by_role(vacancy_role)

    # Draw all resumes first, then score each one against the vacancy with
    # the notebook-level `model`.
    resumes = list(map(sample_resume_by_role, resumes_roles))
    sims = [get_texts_similarity(model, vacancy, text) for text in resumes]

    records = list(zip(resumes, resumes_roles, sims))
    ranked = pd.DataFrame(records, columns=['text', 'role', 'similarity'])
    ranked = ranked.sort_values(by='similarity', axis=0, ascending=False)
    ranked = ranked.reset_index(drop=True)

    vacancy_df = pd.DataFrame({'text': [vacancy], 'role': [vacancy_role]})

    return vacancy_df, ranked

In [132]:
def display_similarity_result(sim_df):
    """Print a heading and display each of the first two items of `sim_df`.

    Args:
        sim_df: indexable whose item 0 is shown under the vacancy heading and
            item 1 under the ranked-resumes heading, e.g. the pair returned
            by `calculate_similarity_table`.
    """
    sections = (('Вакансия:', 0), ('\nРанжированные резюме:', 1))
    for heading, position in sections:
        print(heading)
        display(sim_df[position])

### Данные

#### Резюме

In [26]:
# Preview five random resume rows; the draw is unseeded, so the rows differ between runs.
df_resumes.sample(5)

Unnamed: 0,text_bert_input,labels
94374,Директор по информационным технологиям. Agile ...,1
57940,"Руководитель проекта, Руководитель отдела, Зам...",47
24638,IT-специалист / Системный администратор. MS SQ...,0
66403,"Frontend developer. Automated Testing, Bootstr...",28
15929,"Системный администратор. Asterisk, Esxi, Hyper...",60


#### Вакансии

In [25]:
# Preview five random vacancy rows; the draw is unseeded, so the rows differ between runs.
df_vacancies.sample(5)

Unnamed: 0,text_bert_input,labels
504355,"Программист 1С. 1С программирование, 1С: Зарпл...",1
558587,Старший специалист технической поддержки клиен...,62
150365,"Senior Java Developer (card processing). Git, ...",34
303986,"Бизнес-аналитик DWH-BI. Cognos, MS SQL, Navisi...",25
349272,"PHP-программист. Apache HTTP Server, MySQL, Ng...",43


### Вычисление схожести пар резюме-вакансия

#### Пример №1

In [140]:
# Example 1: one sampled 'ios' vacancy scored against sampled resumes —
# three 'ios' resumes plus one resume for each of the other listed roles.
sim_table = calculate_similarity_table(vacancy_role='ios',
                                       resumes_roles=['ios', 'ios', 'ios', 'android', 'data scientist', 'scrum master', 'vue',
                                                      'etl','c++','rust', 'js', 'frontend', 'backend', 'product manager', 'web analyst'])

In [141]:
display_similarity_result(sim_table)

Вакансия:


Unnamed: 0,text,role
0,Regular / Senior iOS Developer. Опис проекту: ...,ios



Ранжированные резюме:


Unnamed: 0,text,role,similarity
0,"Программист iOS. ARC, AutoLayout, GCD, Git, MV...",ios,0.957424
1,"IOS developer. C++, Objective-C, Xcode, iOS, О...",ios,0.945775
2,IOS developer - Looking for new opportunities ...,ios,0.941138
3,"Desktop Developer.NET Framework, ADO.NET, AWS,...",android,0.503109
4,"Junior JavaScript Developer. JavaScript, Node....",js,0.097058
5,"Python Middle. AngularJS, Bash, CSS, Celery, D...",backend,0.020762
6,Программист разработчик Frontend удаленно. Ang...,vue,-0.008571
7,Scrum Master / Скрам-Мастер. Atlassian Conflue...,scrum master,-0.022907
8,Продакт-менеджер. Продакт-менеджер. Не работаю...,product manager,-0.049588
9,Front-end Developer. Front-end developer. js-c...,frontend,-0.071538


#### Пример №2

In [147]:
# Example 2: one sampled 'php' vacancy scored against sampled resumes —
# three 'php' resumes plus one resume for each of the other listed roles.
sim_table = calculate_similarity_table(vacancy_role='php',
                                       resumes_roles=['php', 'php', 'php', 'fullstack', '1c', 'data scientist', 'scrum master', 'vue',
                                                      'etl','c++','rust', 'js', 'frontend', 'backend', 'product manager', 'web analyst'])

In [148]:
display_similarity_result(sim_table)

Вакансия:


Unnamed: 0,text,role
0,"PHP-разработчик. 1С-Битрикс, Ajax, CSS, CSS3, ...",php



Ранжированные резюме:


Unnamed: 0,text,role,similarity
0,"PHP Developer. ClickHouse, Git, Golang, Kafka,...",php,0.982938
1,"PHP developer / PHP разработчик. Ajax, Amazon ...",php,0.77598
2,"php программист. дизайнер, верстальщик, програ...",php,0.543818
3,"Python разработчик. FastApi, Git, Linux, Postg...",backend,0.46086
4,"Консультант 1С. 1С: Предприятие 8, Банк-клиент...",1c,0.353734
5,"Software developer.NET Framework, ADO.NET, Ang...",fullstack,0.238939
6,"Программист-разработчик. Adobe Illustrator, Ad...",web analyst,0.200561
7,"Front-End Developer (React developer). Ajax, C...",js,0.103175
8,"Junior Frontend Developer. CSS, Git, HTML, Jav...",frontend,0.094661
9,"Software Engineer. Atlassian Jira, Bash, Boost...",c++,0.046354


#### Пример №3

In [162]:
# Example 3: one sampled 'tech writer' vacancy scored against sampled resumes —
# three 'tech writer' resumes, two each for project and product managers, and
# one resume for each of the listed developer roles.
sim_table = calculate_similarity_table(vacancy_role='tech writer',
                                       resumes_roles=['tech writer', 'tech writer', 'tech writer', 'project manager',
                                                      'product manager', 'project manager', 'product manager', 'frontend',
                                                      'backend', 'ios', 'android', 'php', 'c++'])

In [163]:
display_similarity_result(sim_table)

Вакансия:


Unnamed: 0,text,role
0,Технический писатель. Компания ООО «Сигма» ище...,tech writer



Ранжированные резюме:


Unnamed: 0,text,role,similarity
0,"Технический писатель. MS Excel, MS Office, MS ...",tech writer,0.974519
1,"Технический писатель. Atlassian Jira, ERP, Git...",tech writer,0.956454
2,"Технический писатель, переводчик. Atlassian Co...",tech writer,0.95093
3,"Инженер-программист. Borland C++, CSS, HTML, M...",php,0.105271
4,"Менеджер проекта. AutoCAD, Ввод в эксплуатацию...",project manager,0.040131
5,"Product manager. JTBD, OKR, PMBOK, Product Hyp...",product manager,-0.025455
6,project manager. Product manager. Разработка р...,product manager,-0.040637
7,"IOS developer. Agile Project Management, Code ...",ios,-0.106395
8,"программист C/C++. AutoCAD, C#, C++, Flash Act...",c++,-0.123417
9,"начальник отдела ИТ. Главный иженер, Ведущий с...",project manager,-0.131767


#### Пример №4

In [168]:
# Example 4: one sampled 'computer vision' vacancy scored against sampled
# resumes from the same role and from other data/ML roles.
sim_table = calculate_similarity_table(vacancy_role='computer vision',
                                       resumes_roles=['computer vision', 'computer vision', 'data scientist', 'natural language processing',
                                                      'natural language processing', 'big data', 'big data', 'deep learning'])

In [169]:
display_similarity_result(sim_table)

Вакансия:


Unnamed: 0,text,role
0,CV (Computer Vision) специалист. Deep Learning...,computer vision



Ранжированные резюме:


Unnamed: 0,text,role,similarity
0,"Computer Vision Engineer. Boost, C++1, Caffe, ...",computer vision,0.935924
1,"Computer Vision Engineer. Data Science, Deep L...",computer vision,0.909287
2,Deep Learning Engineer. Deep Learning Engineer...,deep learning,0.708397
3,"Data Scientist. Bash, C/C++, Computer vision, ...",natural language processing,0.370417
4,"Руководитель проектов. BPWin, MS Access, MS Of...",big data,0.348476
5,"Аналитик. Airflow, C++, Git, Kubeflow, Linux, ...",data scientist,0.300276
6,Data Analyst. BMC Remedy Action Request System...,natural language processing,0.237864
7,"Ведущий программист Java EE, Архитектор, Тим л...",big data,-0.008374


#### Пример №5

In [174]:
# Example 5: one sampled 'dwh' vacancy scored against sampled resumes —
# three 'dwh' and two 'etl' resumes plus one resume for each other listed role.
sim_table = calculate_similarity_table(vacancy_role='dwh',
                                       resumes_roles=['dwh', 'dwh', 'dwh', 'etl', 'etl', 'frontend', 'backend',
                                                      'data scientist', 'big data', 'tech writer', 'scrum master'])

In [175]:
display_similarity_result(sim_table)

Вакансия:


Unnamed: 0,text,role
0,"Системный аналитик DWH. DWH, ETL, SQL, Анализ ...",dwh



Ранжированные резюме:


Unnamed: 0,text,role,similarity
0,"Ведущий инженер-программист. ASH, Atlassian Ji...",dwh,0.805334
1,"Senior Software Engineer(DWH, BI, ETL ). Busin...",etl,0.737059
2,"ETL\DWH инженер, BI инженер. Исполнительность....",dwh,0.712773
3,Senior Data Warehouse/BI developer. Business I...,dwh,0.700525
4,Senior SAP ABAP Consultant. ABAP. ETL consulta...,etl,0.594102
5,Технический писатель (Technical Writer). Adobe...,tech writer,0.267142
6,SENIOR DATA / MACHINE LEARNING ENGINEER / BIG ...,big data,0.054569
7,"Data Scientist. Git, GreenPlum, PySpark, Pytho...",data scientist,-0.024785
8,"Scrum Master. Atlassian Jira, Kanban, PSM1, Sc...",scrum master,-0.088918
9,"Java-программист. Game Programming, Java, MS V...",backend,-0.289199


#### Пример №6

In [180]:
# Example 6: one sampled 'product manager' vacancy scored against sampled
# resumes from the same role and from other management-related roles.
sim_table = calculate_similarity_table(vacancy_role='product manager',
                                       resumes_roles=['product manager', 'product manager', 'product manager', 'project manager',
                                                      'release manager', 'scrum master', 'tech writer', 'scrum master'])

In [181]:
display_similarity_result(sim_table)

Вакансия:


Unnamed: 0,text,role
0,"Product Manager/Owner (Кипр, Лимассол). Крупна...",product manager



Ранжированные резюме:


Unnamed: 0,text,role,similarity
0,"Product owner. Agile Project Management, Event...",product manager,0.973883
1,"Product Manager. Agile Project Management, Atl...",product manager,0.956208
2,Продакт-менеджер. Продакт менеджер. Обязанност...,product manager,0.809983
3,Помощник руководителя (Бизнес ассистент). Бизн...,project manager,0.312084
4,"Scrum Master. Agile, Agile Project Management,...",scrum master,0.220706
5,Специалист отдела информационных технологий. Р...,release manager,0.206099
6,Software engineer C / Team Leader / Scrum mast...,scrum master,0.151159
7,Бизнес-аналитик / Консультант ИТ / Логист. Тех...,tech writer,0.055449


#### Пример №7

In [185]:
# Example 7: one sampled 'devops' vacancy scored against sampled resumes —
# three 'devops' resumes plus resumes for the other listed roles
# ('scrum master' is listed twice, so two such resumes are drawn).
sim_table = calculate_similarity_table(vacancy_role='devops',
                                       resumes_roles=['devops', 'devops', 'devops', 'js', 'backend', 'frontend', 'data engineer', '.net', 'web',
                                                      'data scientist', 'bi analyst', 'release manager', 'scrum master', 'tech writer', 'scrum master'])

In [186]:
display_similarity_result(sim_table)

Вакансия:


Unnamed: 0,text,role
0,"DevOps/Build Engineer. AWS, Azure, Bash, C#, D...",devops



Ранжированные резюме:


Unnamed: 0,text,role,similarity
0,"DevOps / системный администратор Linux. Bash, ...",devops,0.890392
1,"DevOps. Active Directory, AngularJS, Apache HT...",devops,0.884493
2,"DevOps Support. Bash, Linux. DevOps. Working w...",devops,0.875111
3,"PHP-программист. Ajax, CSS3, Git, HTML5, JavaS...",backend,0.19919
4,"Release manager. Ведение переговоров, Деловая ...",release manager,0.051729
5,Менеджер ИТ проектов и команды разработки (Scr...,scrum master,0.050774
6,"Data Engineer. AWS, AWS Boto3, AWS Glue, AWS L...",data engineer,-0.010077
7,"Network engineer. Cisco, Cisco ASA, Cisco NEXU...",.net,-0.025141
8,"Веб-разработчик NodeJS. C++1, C/C++1, JavaScri...",js,-0.027075
9,"Sсrum master. Atlassian Jira, GrowthHacking, K...",scrum master,-0.029025
