In [1]:
from catboost import CatBoostRanker, Pool, MetricVisualizer
from copy import deepcopy
import numpy as np
import os
import pandas as pd
import json

In [2]:
def get_llm_extracted_data(vacancies):

    llm = open('./data/llm_vacancies.json')
    llm_vacancies = json.load(llm)
    
    for i in range(len(vacancies)):
        llm_vacancies[i]['uuid'] = vacancies[i]['vacancy']['uuid']

    return llm_vacancies

In [3]:
import importlib 
from data.heuristic_extractor import HeuristicExtractor
from data.resume_extractor import ResumeExtractor
from data.llm_extractor import LlmExtractor


orig = open('./data/case_2_data_for_members.json')
vacancies = json.load(orig)

heuristic_extractor = HeuristicExtractor(vacancies)
resume_extractor = ResumeExtractor(vacancies)
llm_extractor = LlmExtractor(vacancies)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aaderevyagin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/aaderevyagin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


get_grade data extractor fullness: 14/29
get_work_type data extractor fullness: 19/29
get_experience data extractor fullness: 12/29
get_experience data extractor fullness: 29/29


In [4]:
heuristic_dataset = heuristic_extractor.form_dataset()
llm_dataset = llm_extractor.form_dataset()

In [5]:
llm_dataset

Unnamed: 0,vacancy_uuid,hard_skills,extra_skills,benefits,experience,work_type,monthly_wage,is_junior,is_middle,is_senior,is_teamlead
0,779f3a59-206a-3241-adc4-d7db504f960b,"[java, spring boot, quarkus, micronaut, vert x...",[],7,3,distance,450000.0,False,True,True,False
1,7a4813fc-43bc-3896-a607-4c8682b01002,"[rest, grpc, kafka, rabbitmq, sql, uml, bpmn]",[],0,3,,350000.0,False,True,True,False
2,c03085c3-9b1e-3564-bb1e-59aa72e5fbca,"[oracle, sql, pl sql, ms sql reporting service...",[],9,3,distance,300000.0,False,True,False,False
3,a8dd83c3-178d-3c70-90c2-7c3648f6b96a,"[mq очередь, kafka, sql, jira, confluence, xml...","[умение управлять ожидание заказчик, коммуника...",5,2,flexible,150000.0,False,True,False,False
4,9d98eba0-13bb-38d3-b742-4fd445954b3d,[],"[гибкость, обучаемость, структурность, проакти...",5,3,flexible,630000.0,False,False,False,True
5,4e2299c0-13fc-301d-8f3c-3ccfd0281ce6,"[java 8, sql, intellij idea, json, maven, grad...",[],6,1,,150000.0,True,False,False,False
6,a8f56ed3-3ef3-365d-ade4-2df4db5d4af8,"[java spring boot, postgresql, react js, sql, ...","[гибкость, обучаемость, коммуникабельность]",7,3,flexible,150000.0,False,False,False,False
7,cdbe4d64-991a-35a1-8b71-a05a5ce24123,"[java, spring, redis, kafka, spark streaming, ...","[лидерский качество, способность обучаться обу...",5,3,flexible,150000.0,False,False,True,False
8,7eca3dc1-2108-3152-85be-cd0ee7aa1493,"[teamcity, jenkins, helm, ansible, kubernetes,...","[коммуникативный навык, внимательность деталь,...",0,2-4,,,False,False,False,False
9,01713376-e04d-3f9d-9287-7a5ff74918c3,"[uml, rest, брокер сообщение, mq, sql, xml, json]","[коммуникативный навык, аналитический мышление...",8,3,flexible,150000.0,False,True,True,False


In [6]:
heuristic_dataset

Unnamed: 0,uuid,is_junior,is_middle,is_senior,is_teamlead,work_type,experience,skills
0,779f3a59-206a-3241-adc4-d7db504f960b,True,False,False,False,distance,3.0,"['бизнес', 'kafka', 'обучение', 'quarkus', 'my..."
1,7a4813fc-43bc-3896-a607-4c8682b01002,False,True,False,False,,2.0,"['etl', 'бизнес', 'rest', 'kafka', 'rabbitmq',..."
2,c03085c3-9b1e-3564-bb1e-59aa72e5fbca,,,,,distance,,"['разработка', 'бизнес', 'sql', 'исследование'..."
3,a8dd83c3-178d-3c70-90c2-7c3648f6b96a,,,,,flexible,2.0,"['коммуникабельность', 'postgresql', 'разработ..."
4,9d98eba0-13bb-38d3-b742-4fd445954b3d,False,False,False,False,flexible,,"['банк', 'hr', 'ux', 'скоринг', 'аналитика']"
5,4e2299c0-13fc-301d-8f3c-3ccfd0281ce6,,,,,,1.0,"['javascript', 'java', 'docker', 'spark', 'раз..."
6,a8f56ed3-3ef3-365d-ade4-2df4db5d4af8,,,,,flexible,,"['бизнес', 'postgresql', 'обучение', 'it', 'sp..."
7,cdbe4d64-991a-35a1-8b71-a05a5ce24123,False,True,False,False,flexible,3.0,"['kafka', 'обучение', 'it', 'spring', 'банк', ..."
8,7eca3dc1-2108-3152-85be-cd0ee7aa1493,,,,,office,4.0,"['postgresql', 'nexus', 'java', 'terraform', '..."
9,01713376-e04d-3f9d-9287-7a5ff74918c3,False,True,False,False,flexible,,"['бизнес', 'ответственность', 'rest', 'xsd', '..."


In [7]:
resume_dataset = resume_extractor.form_dataset()

In [8]:
resume_dataset

Unnamed: 0,vacancy_uuid,is_junior,is_middle,is_senior,is_teamlead,relevance,positions,key_skills,extra_skills,working_months,uuid,age
0,779f3a59-206a-3241-adc4-d7db504f960b,False,True,False,False,1,"{java-developer, java-разработчик}","[rest, postgresql, apachekafka, camunda, javas...","[review, avaya, jasperreports, time, to, maven...",74,8c8cf797-2c6b-3f4b-b28b-20d57bd88b82,37.0
1,779f3a59-206a-3241-adc4-d7db504f960b,False,True,False,False,1,"{Head Programmer, Software Java engineer, Desi...","[opencv, analyticalskills, androidsdk, c c, at...","[of, window, door, network, models, reports, t...",150,74221d62-5ea6-3a68-8849-25acd97e208b,36.0
2,779f3a59-206a-3241-adc4-d7db504f960b,False,True,False,False,1,{Java-разработчик},"[ansible, html, postgresql, apachetomcat, juni...","[javafx, voip, mysql, firebird, source, delphi...",115,9d7eae36-11f8-3cac-9cb2-4cb0ff9d0ae7,34.0
3,779f3a59-206a-3241-adc4-d7db504f960b,False,True,False,False,1,"{Наставник, Lead Software Developer}","[webpack, веб разработка, javascript, springbo...","[self, jquery, junit, thymeleaf, frontend, vau...",96,da4c44dd-7c00-3f75-98b4-096b533488a4,28.0
4,779f3a59-206a-3241-adc4-d7db504f960b,False,True,False,False,1,"{Teamlead, Java developer}","[postgresql, алгоритмыиструктурыдавать, apache...","[stomp, rabbitmq, websocket, service, ignite, ...",46,73d59615-b5b2-35fd-a15d-28963fe143d1,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...
651,259bf318-e6a7-3b6c-93f9-e1804a89ee63,False,True,False,False,0,"{Electro Mechanical Technician, Frontend Devel...","[swiper, scrum, javascript, css3, vuex, html5,...","[network, enhance, the, hubs, html, to, worked...",132,8f753cad-0ec3-3948-b01b-cc816770451e,45.0
652,259bf318-e6a7-3b6c-93f9-e1804a89ee63,False,True,False,False,0,{Frontend developer},"[tailwindcss, graphql, axios, docker, nodejs, ...","[pepsico, storybook, socket, react, module, ra...",54,a184f447-01c5-3a47-94e7-47e27d65541f,22.0
653,259bf318-e6a7-3b6c-93f9-e1804a89ee63,False,True,False,False,0,{Frontend-разработчик},"[typescript, javascript, nuxtjs, stylus, git, ...","[review, jquery, js, nuxt, frontend, code, com...",38,917840a8-3eeb-30f9-86a2-3a853c78ab22,26.0
654,259bf318-e6a7-3b6c-93f9-e1804a89ee63,False,True,False,False,0,{Frontend-разработчик},"[pinia, react, javascript, nuxtjs, html5, jira...","[api, figma, cloud, js, node, agile, jwt, merg...",37,abebef46-9820-311d-8d43-0d29edb8e739,23.0


In [323]:
vacancies_compress = {}
for index, item in enumerate(vacancies):
    vacancies_compress[item['vacancy']['uuid']] = index

def form_train_sequence(vacancy_dataset, user_dataset, index, is_train=True):
    count_skipped = 0
    user_dict = user_dataset.iloc[index]
    vacancy_uuid = user_dict['vacancy_uuid']
    vacancy_dict = vacancy_dataset[vacancy_dataset['vacancy_uuid'] == vacancy_uuid].iloc[0]

    result_dict = {}

    # Formulate features

    for grade_type in ['junior', 'middle', 'senior']:
        result_dict[f'{grade_type}_match'] = (user_dict[f'is_{grade_type}'] == vacancy_dict[f'is_{grade_type}'])

    user_hard_skills = set(user_dict['key_skills'])
    user_extra_skills = set(user_dict['extra_skills'])
    vacancy_hard_skills = set(vacancy_dict['hard_skills'])
    vacancy_extra_skills = set(vacancy_dict['extra_skills'])

    result_dict['hard_skills_intersection'] = len(vacancy_hard_skills) / max(len(vacancy_hard_skills.union(user_hard_skills)), 1)
    # result_dict['soft_skills_intersection'] = len(vacancy_extra_skills) / max(len(vacancy_extra_skills.union(user_extra_skills)), 1)
    result_dict['working_months'] = int(user_dict['working_months'])
    if vacancy_dict['monthly_wage'] is not None and vacancy_dict['monthly_wage'] != 'null':
        result_dict['monthly_wage'] = int(vacancy_dict['monthly_wage']) / 1000
    else:
        count_skipped += 1
        result_dict['monthly_wage'] = None
    try:
        result_dict['experience'] = int(vacancy_dict['experience'])
    except ValueError:
        count_skipped += 1
        result_dict['experience'] = None
    result_dict['age'] = user_dict['age']
    result_dict['benefits'] = vacancy_dict['benefits']
    if is_train:
        result_dict['relevance'] = user_dict['relevance']
        result_dict['group'] = vacancies_compress[user_dict['vacancy_uuid']]
        result_dict['weights'] = np.float64(3 if result_dict['monthly_wage'] is not None and result_dict['monthly_wage'] >= 200 else 1)

    return result_dict


In [303]:
cumulative_data = [form_train_sequence(llm_dataset, resume_dataset, index) for index in range(len(resume_dataset))]

preprocessed_train = pd.DataFrame.from_dict(cumulative_data).sort_values(by=['group'])

preprocessed_train

Unnamed: 0,junior_match,middle_match,senior_match,hard_skills_intersection,working_months,monthly_wage,experience,age,benefits,relevance,group,weights
0,True,True,False,0.578947,74,450.0,3.0,37.0,7,1,0,3.0
17,True,True,False,0.407407,89,450.0,3.0,30.0,7,0,0,3.0
16,True,True,False,0.305556,51,450.0,3.0,32.0,7,0,0,3.0
15,True,True,False,0.500000,182,450.0,3.0,37.0,7,0,0,3.0
13,True,True,False,0.478261,51,450.0,3.0,36.0,7,0,0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
641,True,True,True,0.708333,84,150.0,3.0,32.0,12,1,28,1.0
640,True,True,True,0.653846,61,150.0,3.0,26.0,12,1,28,1.0
639,False,False,True,0.629630,91,150.0,3.0,28.0,12,1,28,1.0
646,True,True,True,0.404762,66,150.0,3.0,32.0,12,1,28,1.0


In [304]:
from sklearn.model_selection import train_test_split
import numpy as np

indexes = np.arange(len(vacancies_compress))
np.random.shuffle(indexes)
group_tr, group_te = indexes[:25], indexes[25:]
train_indexes, test_indexes = [], []
for index, row in preprocessed_train.iterrows():
    if index in group_tr:
        train_indexes.append(index)
    else:
        test_indexes.append(index)

train_indexes = np.array(train_indexes)
test_indexes = np.array(test_indexes)

X_train, X_test = preprocessed_train.iloc[train_indexes], preprocessed_train.iloc[test_indexes]

y_train = X_train['relevance']
train_groups = X_train['group']
train_weights = X_train['weights']
X_train.drop(columns=['relevance', 'group', 'weights'], inplace=True)

y_test = X_test['relevance']
test_groups = X_test['group']
test_weights = X_test['weights']
X_test.drop(columns=['relevance', 'group', 'weights'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns=['relevance', 'group', 'weights'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(columns=['relevance', 'group', 'weights'], inplace=True)


In [305]:
train = Pool(
    data=X_train,
    label=y_train,
    group_id=train_groups,
)

test = Pool(
    data=X_test,
    label=y_test,
    group_id=test_groups,
)

In [309]:
default_parameters = {
    'iterations': 400,
    'custom_metric': ['NDCG', 'PFound', 'AverageGain:top=10'],
    'verbose': False,
    'random_seed': 0,
    'max_depth': 6,
    'learning_rate': 1 * 10 ** -4
}

parameters = {}

In [307]:
def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=test):
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function

    if additional_params is not None:
        parameters.update(additional_params)

    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=True)

    return model

In [310]:
def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=test):
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function

    if additional_params is not None:
        parameters.update(additional_params)

    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool, plot=True)

    return model

model = fit_model('PairLogit', {'custom_metric': ['PrecisionAt:top=5', 'RecallAt:top=5', 'MAP:top=5', 'NDCG:top=10']})

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [312]:
test_index = group_te[0]

vacancies[test_index]

{'vacancy': {'uuid': 'c03085c3-9b1e-3564-bb1e-59aa72e5fbca',
  'name': 'Ведущий/ Главный аналитик DWH',
  'keywords': None,
  'description': ' Желательно знания Oracle   сегодня — не просто банк. Это сервисы, которыми пользуются люди с разными потребностями, интересами и желаниями. И для каждого нашего клиента мы стараемся сделать удобные и качественные ИТ-продукты. Без сильной, экспертной ИТ-команды это было бы невозможно, поэтому у нас всегда есть интересные проекты для проверки твоих скиллов, собственная IT-академия для начинающих и классный мерч, который можно получить за хорошую работу. Сейчас у нас открыта вакансия Ведущий аналитик DWH в департамент аналитики. Чем предстоит заниматься: – Участие в обсуждении инициатив бизнес-пользователей с точки зрения их возможной реализации в информационных системах; – Интервьюирование пользователей, сбор бизнес-требований; – Исследование информационных систем Банка и интервьюирование технических владельцев для определения источников данных, и

In [313]:
failed_resume = vacancies[test_index]['failed_resumes'][0]

failed_resume

{'uuid': '8746a855-022c-34d4-9b55-58da5483c255',
 'first_name': 'Иммануил',
 'last_name': 'Соболева',
 'birth_date': '1995-01-12',
 'country': 'Россия',
 'city': 'Казань',
 'about': ' Увлекаюсь психологией и спортом. Есть своя любительская команда по мини-футболу. Умение развиваться и быть открытым ко всему, считаю полезными качествами, чтобы был личностный рост. Классно развиваться в профессиональной сфере и видеть результат о проделанной работы, а также помогать развиваться другим. Нотации: IDEF0, UML, BPMN. СУБД: - MS SQL, Netezza, Greenplum BI инструменты: - Cognos BI, Tableau, BAT (Business Analysis Tool), Reporting Инструменты: - Jira, Confluence, YouTrack, Минерва Методологии управления проектами: - Kanban (KMP 1), Scrum ',
 'key_skills': 'Agile, Project management, Data Governance, SQL, IDEF0, UML, BPMN, MS SQL, Netezza, Greenplum, Cognos BI, Tableau, BAT (Business Analysis Tool), Reporting, Jira, Confluence, YouTrack, Kanban (KMP 1), Scrum.',
 'experienceItem': [{'starts': '20

In [324]:
vacancy_dset = llm_dataset[llm_dataset['vacancy_uuid'] == vacancies[test_index]['vacancy']['uuid']].iloc[0]
user_dset = resume_dataset[resume_dataset['uuid'] == failed_resume['uuid']]

fail_description = form_train_sequence(llm_dataset, user_dset, 0, is_train=False)

fail_description

{'junior_match': True,
 'middle_match': True,
 'senior_match': True,
 'hard_skills_intersection': 0.34615384615384615,
 'working_months': 203,
 'monthly_wage': 300.0,
 'experience': 3,
 'age': 29.0,
 'benefits': 9}

In [326]:
model.predict(pd.DataFrame([fail_description]))

array([0.0117843])

In [329]:
accepted_resume = vacancies[test_index]['failed_resumes'][2]

def predict_rank(resume):
    user_dset = resume_dataset[resume_dataset['uuid'] == resume['uuid']]    
    description = form_train_sequence(llm_dataset, user_dset, 0, is_train=False)
    return model.predict(pd.DataFrame([description]))

predict_rank(accepted_resume)

array([-0.0213782])