In [44]:
%load_ext autoreload
%autoreload 2

import ast
import pickle

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from core import ROOT_PATH
from core.embedding_models import EmbeddingModel, FastTextEmbeddingModel
from core.models.clustering_model import ClusteringModel
from core.models.clustering_model import StackedModels, hyperparameters_tuning

# Register tqdm's pandas integration so that `Series.progress_apply`
# (used by the cluster-label cells below) is available.
tqdm.pandas()

In [6]:
# Embedding backend used by every embedding cell visible in this notebook.
ft_embedding_model = FastTextEmbeddingModel()
# NOTE(review): `hf_embedding_model` is not referenced by any other visible
# cell — confirm it is still needed before paying for its construction.
hf_embedding_model = EmbeddingModel()

In [16]:
# Load the pre-processed vacancies sample; the first CSV column is used as the
# row index.
data_path = ROOT_PATH / "data"
data_vacancies_path = data_path / "data_vacancies_processed_1k.csv"
df_vacancies = pd.read_csv(data_vacancies_path, index_col=0)

# `work_skills` arrives from the CSV as text that is expected to be the repr of
# a Python list. Parse it with `ast.literal_eval`, which accepts only literals,
# rather than `eval`, which would execute any expression found in the data file.
df_vacancies["work_skills"] = df_vacancies.work_skills.apply(ast.literal_eval)

df_vacancies.head()

Unnamed: 0,id,custom_position,schedule,salary_from,salary_to,offer_education_id,education_name,education_is_base,education_order_num,city_id,list_regions,work_skills,tags_id
0,48202096,Сварщик-сборщик,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[сварочные работы, сборка изделий по чертежам,...",
1,48202097,Сварщик-монтажник,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[монтажные работы, строительные работы, электр...",
2,48202098,Слесарь-сборщик,полный рабочий день,60000,80000,0,любое,True,0,2,[4],"[работа на фрезерных станках, слесарный ремонт...",
3,48202356,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,1,[3],"[комплектация товара, маркировка, стрессоустой...","[6, 9]"
4,48202357,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,57,"[181, 182, 183, 185, 186, 187, 188, 189, 190, ...","[маркировка, стрессоустойчивость, погрузочно-р...","[6, 9]"


# FastText skill embeddings

In [17]:
# Collect every distinct skill across all vacancies so each one is embedded
# exactly once, regardless of how many vacancies mention it.
unique_skills = df_vacancies.work_skills.explode().unique()

unique_skills_embeddings_fasttext = {}
for skill in tqdm(unique_skills):
    unique_skills_embeddings_fasttext[skill] = ft_embedding_model.generate(skill)

# Persist the skill -> embedding lookup next to the other data files.
path = data_path / "unique_skills_embeddings_fasttext_dict.pkl"
with open(str(path), "wb") as pickle_file:
    pickle.dump(unique_skills_embeddings_fasttext, pickle_file)

  0%|          | 0/896 [00:00<?, ?it/s]

In [19]:
def _mean_skill_embedding(skills):
    """Average the cached embeddings of the given skills, element-wise."""
    skill_vectors = [unique_skills_embeddings_fasttext.get(skill) for skill in skills]
    return np.mean(skill_vectors, axis=0)


# One averaged embedding per vacancy, built from the per-skill lookup table.
df_vacancies["mean_skills_embedding"] = df_vacancies["work_skills"].apply(
    _mean_skill_embedding
)
df_vacancies.head()

Unnamed: 0,id,custom_position,schedule,salary_from,salary_to,offer_education_id,education_name,education_is_base,education_order_num,city_id,list_regions,work_skills,tags_id,mean_skills_embedding
0,48202096,Сварщик-сборщик,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[сварочные работы, сборка изделий по чертежам,...",,"[0.038997207, 0.0051816893, 0.011276969, 0.044..."
1,48202097,Сварщик-монтажник,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[монтажные работы, строительные работы, электр...",,"[0.02835547, 0.0015033595, 0.011215197, 0.0164..."
2,48202098,Слесарь-сборщик,полный рабочий день,60000,80000,0,любое,True,0,2,[4],"[работа на фрезерных станках, слесарный ремонт...",,"[0.009839139, 0.009206571, 0.026602346, 0.0124..."
3,48202356,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,1,[3],"[комплектация товара, маркировка, стрессоустой...","[6, 9]","[0.043267135, -0.01055224, 0.015114057, -0.004..."
4,48202357,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,57,"[181, 182, 183, 185, 186, 187, 188, 189, 190, ...","[маркировка, стрессоустойчивость, погрузочно-р...","[6, 9]","[0.043267135, -0.01055224, 0.015114057, -0.004..."


## Position embedding

In [21]:
# Embed each vacancy's position title with the FastText model.
position_titles = df_vacancies["custom_position"]
df_vacancies["position_embedding"] = position_titles.apply(ft_embedding_model.generate)
df_vacancies.head()

Unnamed: 0,id,custom_position,schedule,salary_from,salary_to,offer_education_id,education_name,education_is_base,education_order_num,city_id,list_regions,work_skills,tags_id,mean_skills_embedding,position_embedding
0,48202096,Сварщик-сборщик,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[сварочные работы, сборка изделий по чертежам,...",,"[0.038997207, 0.0051816893, 0.011276969, 0.044...","[0.025920482, -0.0037517531, -0.039083816, -0...."
1,48202097,Сварщик-монтажник,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[монтажные работы, строительные работы, электр...",,"[0.02835547, 0.0015033595, 0.011215197, 0.0164...","[0.015991792, -0.012740483, -0.021870257, 0.00..."
2,48202098,Слесарь-сборщик,полный рабочий день,60000,80000,0,любое,True,0,2,[4],"[работа на фрезерных станках, слесарный ремонт...",,"[0.009839139, 0.009206571, 0.026602346, 0.0124...","[0.0334669, -0.006879592, -0.030826846, -0.006..."
3,48202356,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,1,[3],"[комплектация товара, маркировка, стрессоустой...","[6, 9]","[0.043267135, -0.01055224, 0.015114057, -0.004...","[0.0063961684, 0.013592867, -0.02881398, -0.00..."
4,48202357,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,57,"[181, 182, 183, 185, 186, 187, 188, 189, 190, ...","[маркировка, стрессоустойчивость, погрузочно-р...","[6, 9]","[0.043267135, -0.01055224, 0.015114057, -0.004...","[0.0063961684, 0.013592867, -0.02881398, -0.00..."


In [22]:
# Midpoint of the advertised salary range; used later as the regression target.
salary_bounds_sum = df_vacancies["salary_to"].add(df_vacancies["salary_from"])
df_vacancies["mean_salary"] = salary_bounds_sum.div(2)
df_vacancies.head()

Unnamed: 0,id,custom_position,schedule,salary_from,salary_to,offer_education_id,education_name,education_is_base,education_order_num,city_id,list_regions,work_skills,tags_id,mean_skills_embedding,position_embedding,mean_salary
0,48202096,Сварщик-сборщик,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[сварочные работы, сборка изделий по чертежам,...",,"[0.038997207, 0.0051816893, 0.011276969, 0.044...","[0.025920482, -0.0037517531, -0.039083816, -0....",90000.0
1,48202097,Сварщик-монтажник,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[монтажные работы, строительные работы, электр...",,"[0.02835547, 0.0015033595, 0.011215197, 0.0164...","[0.015991792, -0.012740483, -0.021870257, 0.00...",90000.0
2,48202098,Слесарь-сборщик,полный рабочий день,60000,80000,0,любое,True,0,2,[4],"[работа на фрезерных станках, слесарный ремонт...",,"[0.009839139, 0.009206571, 0.026602346, 0.0124...","[0.0334669, -0.006879592, -0.030826846, -0.006...",70000.0
3,48202356,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,1,[3],"[комплектация товара, маркировка, стрессоустой...","[6, 9]","[0.043267135, -0.01055224, 0.015114057, -0.004...","[0.0063961684, 0.013592867, -0.02881398, -0.00...",32500.0
4,48202357,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,57,"[181, 182, 183, 185, 186, 187, 188, 189, 190, ...","[маркировка, стрессоустойчивость, погрузочно-р...","[6, 9]","[0.043267135, -0.01055224, 0.015114057, -0.004...","[0.0063961684, 0.013592867, -0.02881398, -0.00...",32500.0


## All text embeddings

In [40]:
# Flatten each vacancy's skill list into a single comma-separated string.
skills_separator = ", "
df_vacancies["work_skills_concat"] = df_vacancies["work_skills"].apply(
    skills_separator.join
)

# Build one text document per vacancy: the selected text fields, one per line.
columns_separator = "\n"
columns_to_concat = ["custom_position", "schedule", "education_name", "work_skills_concat"]
text_fields = df_vacancies[columns_to_concat]
df_vacancies["vacancy_concat"] = text_fields.apply(columns_separator.join, axis=1)

# Embed the combined text with the FastText model.
df_vacancies["vacancy_concat_embedding"] = df_vacancies["vacancy_concat"].apply(
    ft_embedding_model.generate
)
df_vacancies.head()

Unnamed: 0,id,custom_position,schedule,salary_from,salary_to,offer_education_id,education_name,education_is_base,education_order_num,city_id,...,work_skills,tags_id,mean_skills_embedding,position_embedding,mean_salary,cluster_label_based_on_mean_skills_embedding,cluster_label_based_on_position_embedding,work_skills_concat,vacancy_concat,vacancy_concat_embedding
0,48202096,Сварщик-сборщик,полный рабочий день,60000,120000,0,любое,True,0,2,...,"[сварочные работы, сборка изделий по чертежам,...",,"[0.038997207, 0.0051816893, 0.011276969, 0.044...","[0.025920482, -0.0037517531, -0.039083816, -0....",90000.0,1,6,"сварочные работы, сборка изделий по чертежам, ...",Сварщик-сборщик\nполный рабочий день\nлюбое\nс...,"[0.03263462, 0.004344731, 0.011846015, 0.01777..."
1,48202097,Сварщик-монтажник,полный рабочий день,60000,120000,0,любое,True,0,2,...,"[монтажные работы, строительные работы, электр...",,"[0.02835547, 0.0015033595, 0.011215197, 0.0164...","[0.015991792, -0.012740483, -0.021870257, 0.00...",90000.0,1,6,"монтажные работы, строительные работы, электро...",Сварщик-монтажник\nполный рабочий день\nлюбое\...,"[0.025237968, -0.007739478, 0.0049960185, 0.00..."
2,48202098,Слесарь-сборщик,полный рабочий день,60000,80000,0,любое,True,0,2,...,"[работа на фрезерных станках, слесарный ремонт...",,"[0.009839139, 0.009206571, 0.026602346, 0.0124...","[0.0334669, -0.006879592, -0.030826846, -0.006...",70000.0,1,6,"работа на фрезерных станках, слесарный ремонт,...",Слесарь-сборщик\nполный рабочий день\nлюбое\nр...,"[0.020780401, 0.0021014009, 0.009381698, 0.005..."
3,48202356,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,1,...,"[комплектация товара, маркировка, стрессоустой...","[6, 9]","[0.043267135, -0.01055224, 0.015114057, -0.004...","[0.0063961684, 0.013592867, -0.02881398, -0.00...",32500.0,10,6,"комплектация товара, маркировка, стрессоустойч...",Грузчик-упаковщик\nчастичная занятость\nлюбое\...,"[0.029587226, -0.014430203, 0.01080143, -0.005..."
4,48202357,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,57,...,"[маркировка, стрессоустойчивость, погрузочно-р...","[6, 9]","[0.043267135, -0.01055224, 0.015114057, -0.004...","[0.0063961684, 0.013592867, -0.02881398, -0.00...",32500.0,10,6,"маркировка, стрессоустойчивость, погрузочно-ра...",Грузчик-упаковщик\nчастичная занятость\nлюбое\...,"[0.02958723, -0.014430203, 0.010801432, -0.005..."


## Save DataFrame with embeddings

In [23]:
# Persist the vacancies frame in its current state (including whichever
# embedding columns have been added so far) as a pickle under `data_path`.
path = data_path / "vacancies_with_skills_and_positions_embeddings_fasttext.pkl"
df_vacancies.to_pickle(path)

## Training the clustering model

In [34]:
# Directory that receives the saved clustering and stacked-model checkpoints.
checkpoints_path = ROOT_PATH / "checkpoints"

In [35]:
# Fit a clustering model on the averaged skill embeddings and checkpoint it.
# NOTE(review): the cell that clusters `position_embedding` repeats this logic
# with only the column name changed; a shared helper would remove the copy.
model = ClusteringModel()

column_to_clusterize = "mean_skills_embedding"
if column_to_clusterize not in df_vacancies:
    raise KeyError(f"{column_to_clusterize} not presented in the DataFrame")

# Gather the per-row embedding vectors into a single array for training.
embedding_rows = df_vacancies[column_to_clusterize].tolist()
embeddings = np.array(embedding_rows)
clustering_model = model.train(embeddings)

checkpoint_name = f"clustering_model_{column_to_clusterize}.pkl"
path = checkpoints_path / checkpoint_name
clustering_model.save_model(path)

  super()._check_params_vs_input(X, default_n_init=10)


Model saved at /Users/Konstantin.Grotov/Documents/programming/projects/jbr/salary-prediction/checkpoints/clustering_model_mean_skills_embedding.pkl


In [33]:
# Label every vacancy with the cluster predicted from its embedding; the
# progress bar comes from tqdm's `progress_apply`.
label_column = f"cluster_label_based_on_{column_to_clusterize}"
embedding_series = df_vacancies[column_to_clusterize]
df_vacancies[label_column] = embedding_series.progress_apply(clustering_model.predict)
df_vacancies.head()

  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,id,custom_position,schedule,salary_from,salary_to,offer_education_id,education_name,education_is_base,education_order_num,city_id,list_regions,work_skills,tags_id,mean_skills_embedding,position_embedding,mean_salary,cluster_label_based_on_mean_skills_embedding
0,48202096,Сварщик-сборщик,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[сварочные работы, сборка изделий по чертежам,...",,"[0.038997207, 0.0051816893, 0.011276969, 0.044...","[0.025920482, -0.0037517531, -0.039083816, -0....",90000.0,1
1,48202097,Сварщик-монтажник,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[монтажные работы, строительные работы, электр...",,"[0.02835547, 0.0015033595, 0.011215197, 0.0164...","[0.015991792, -0.012740483, -0.021870257, 0.00...",90000.0,1
2,48202098,Слесарь-сборщик,полный рабочий день,60000,80000,0,любое,True,0,2,[4],"[работа на фрезерных станках, слесарный ремонт...",,"[0.009839139, 0.009206571, 0.026602346, 0.0124...","[0.0334669, -0.006879592, -0.030826846, -0.006...",70000.0,1
3,48202356,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,1,[3],"[комплектация товара, маркировка, стрессоустой...","[6, 9]","[0.043267135, -0.01055224, 0.015114057, -0.004...","[0.0063961684, 0.013592867, -0.02881398, -0.00...",32500.0,10
4,48202357,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,57,"[181, 182, 183, 185, 186, 187, 188, 189, 190, ...","[маркировка, стрессоустойчивость, погрузочно-р...","[6, 9]","[0.043267135, -0.01055224, 0.015114057, -0.004...","[0.0063961684, 0.013592867, -0.02881398, -0.00...",32500.0,10


In [36]:
# Fit a clustering model on the position-title embeddings and checkpoint it.
# NOTE(review): this repeats the `mean_skills_embedding` clustering cell with
# only the column name changed; a shared helper would remove the copy.
model = ClusteringModel()

column_to_clusterize = "position_embedding"
if column_to_clusterize not in df_vacancies:
    raise KeyError(f"{column_to_clusterize} not presented in the DataFrame")

# Gather the per-row embedding vectors into a single array for training.
embedding_rows = df_vacancies[column_to_clusterize].tolist()
embeddings = np.array(embedding_rows)
clustering_model = model.train(embeddings)

checkpoint_name = f"clustering_model_{column_to_clusterize}.pkl"
path = checkpoints_path / checkpoint_name
clustering_model.save_model(path)

  super()._check_params_vs_input(X, default_n_init=10)


Model saved at /Users/Konstantin.Grotov/Documents/programming/projects/jbr/salary-prediction/checkpoints/clustering_model_position_embedding.pkl


In [37]:
# Label every vacancy with the cluster predicted from its embedding; the
# progress bar comes from tqdm's `progress_apply`.
label_column = f"cluster_label_based_on_{column_to_clusterize}"
embedding_series = df_vacancies[column_to_clusterize]
df_vacancies[label_column] = embedding_series.progress_apply(clustering_model.predict)
df_vacancies.head()

  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,id,custom_position,schedule,salary_from,salary_to,offer_education_id,education_name,education_is_base,education_order_num,city_id,list_regions,work_skills,tags_id,mean_skills_embedding,position_embedding,mean_salary,cluster_label_based_on_mean_skills_embedding,cluster_label_based_on_position_embedding
0,48202096,Сварщик-сборщик,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[сварочные работы, сборка изделий по чертежам,...",,"[0.038997207, 0.0051816893, 0.011276969, 0.044...","[0.025920482, -0.0037517531, -0.039083816, -0....",90000.0,1,6
1,48202097,Сварщик-монтажник,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[монтажные работы, строительные работы, электр...",,"[0.02835547, 0.0015033595, 0.011215197, 0.0164...","[0.015991792, -0.012740483, -0.021870257, 0.00...",90000.0,1,6
2,48202098,Слесарь-сборщик,полный рабочий день,60000,80000,0,любое,True,0,2,[4],"[работа на фрезерных станках, слесарный ремонт...",,"[0.009839139, 0.009206571, 0.026602346, 0.0124...","[0.0334669, -0.006879592, -0.030826846, -0.006...",70000.0,1,6
3,48202356,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,1,[3],"[комплектация товара, маркировка, стрессоустой...","[6, 9]","[0.043267135, -0.01055224, 0.015114057, -0.004...","[0.0063961684, 0.013592867, -0.02881398, -0.00...",32500.0,10,6
4,48202357,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,57,"[181, 182, 183, 185, 186, 187, 188, 189, 190, ...","[маркировка, стрессоустойчивость, погрузочно-р...","[6, 9]","[0.043267135, -0.01055224, 0.015114057, -0.004...","[0.0063961684, 0.013592867, -0.02881398, -0.00...",32500.0,10,6


# Prepare the dataset for training the regression model

In [41]:
# Choose which cluster labelling and which embedding feed the regression model.
clustering_train_column = "cluster_label_based_on_mean_skills_embedding"
embedding_train_column = "vacancy_concat_embedding"

# Output column name -> source column in `df_vacancies` (order is preserved).
source_columns = {
    "cluster_label": clustering_train_column,
    "emb": embedding_train_column,
    "target": "mean_salary",
}
data = {
    output_name: df_vacancies[source_name].tolist()
    for output_name, source_name in source_columns.items()
}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,cluster_label,emb,target
0,1,"[0.03263462, 0.004344731, 0.011846015, 0.01777...",90000.0
1,1,"[0.025237968, -0.007739478, 0.0049960185, 0.00...",90000.0
2,1,"[0.020780401, 0.0021014009, 0.009381698, 0.005...",70000.0
3,10,"[0.029587226, -0.014430203, 0.01080143, -0.005...",32500.0
4,10,"[0.02958723, -0.014430203, 0.010801432, -0.005...",32500.0


In [42]:
# Persist the training frame; the file name records which cluster-label and
# embedding columns it was built from.
# NOTE(review): unlike the other pickles written by this notebook, this file
# name has no `.pkl` suffix — confirm downstream readers before renaming it.
path = (
    data_path
    / f"train_data_clustering_{clustering_train_column}_embedding_{embedding_train_column}"
)
df.to_pickle(path)

# Training stacked regression model

In [52]:
# Split the training frame by cluster, keeping only clusters that contain at
# least three rows.
dict_of_dfs = {
    cluster_label: cluster_rows
    for cluster_label, cluster_rows in df.groupby("cluster_label")
    if len(cluster_rows) >= 3
}
dict_of_dfs.keys()

dict_keys([0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16])

In [56]:
# Hyperparameters forwarded to `StackedModels`.
# NOTE(review): the fractional values look like the result of a tuning run —
# record where they came from.
params = dict(
    iterations=1000,
    learning_rate=0.07060926866932606,
    depth=10,
    subsample=0.710251337503286,
    colsample_bylevel=0.6883135875379539,
    min_data_in_leaf=1,
)

sample_model = StackedModels(**params)

# Split every cluster's frame into a (train, test) pair, then separate the
# pairs into two dictionaries keyed by cluster label.
split_dataset_dict = sample_model.split_dict_dataset(dict_of_dfs, test_size=0.2)
train_dataset_dict = {
    cluster_label: train_part
    for cluster_label, (train_part, _) in split_dataset_dict.items()
}
test_dataset_dict = {
    cluster_label: test_part
    for cluster_label, (_, test_part) in split_dataset_dict.items()
}

# The hold-out split was already made above, so train with `test_size=0.0`.
sample_model = sample_model.train(train_dataset_dict, test_size=0.0)

metric = sample_model.evaluate(dataset_dict=test_dataset_dict)
print(metric)

{}
0:	learn: 35192.3456804	total: 1.2ms	remaining: 1.2s
1:	learn: 34812.9219189	total: 1.75ms	remaining: 874ms
2:	learn: 33854.6365539	total: 2.43ms	remaining: 807ms
3:	learn: 33051.7612430	total: 3.3ms	remaining: 822ms
4:	learn: 32512.3107085	total: 4.13ms	remaining: 821ms
5:	learn: 31713.6440638	total: 4.89ms	remaining: 810ms
6:	learn: 31232.9141158	total: 5.59ms	remaining: 794ms
7:	learn: 30514.1147376	total: 7.04ms	remaining: 874ms
8:	learn: 29758.4815679	total: 8.42ms	remaining: 927ms
9:	learn: 29080.9456557	total: 9.86ms	remaining: 976ms
10:	learn: 28486.1561201	total: 10.7ms	remaining: 959ms
11:	learn: 27679.7589340	total: 11.7ms	remaining: 960ms
12:	learn: 27094.1249762	total: 12.3ms	remaining: 932ms
13:	learn: 26552.3500032	total: 13ms	remaining: 914ms
14:	learn: 25961.3416401	total: 14ms	remaining: 916ms
15:	learn: 25436.3908116	total: 14.7ms	remaining: 904ms
16:	learn: 24890.6206000	total: 16.1ms	remaining: 930ms
17:	learn: 24313.2138248	total: 16.9ms	remaining: 922ms
18:	le

In [57]:
# Report the score on the held-out split again, then checkpoint the trained
# stacked model.
holdout_score = sample_model.evaluate(test_dataset_dict)
print(holdout_score)

stacked_model_path = checkpoints_path / "stacked_model_fasttext.pkl"
sample_model.save_model(stacked_model_path)

{
    "0": 12999.731445613912,
    "1": 19235.84647051514,
    "2": 10117.717632611397,
    "3": 37343.228610438455,
    "5": 10676.066667776178,
    "6": 922.3505654416513,
    "7": 6.906491762492806e-05,
    "8": 14448.014911907838,
    "9": 14331.125753458662,
    "10": 15357.797002716696,
    "11": 8807.656479506897,
    "12": 0.00016832412802614272,
    "13": 27370.88305013086,
    "14": 3883.380065807658,
    "15": 3560.3949560898786,
    "16": 21318.781334113195
}
12523.310948969842
Model saved at /Users/Konstantin.Grotov/Documents/programming/projects/jbr/salary-prediction/checkpoints/stacked_model_fasttext.pkl
