In [72]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE
from tqdm import tqdm

from core import ROOT_PATH
from core.embedding_models import FastTextEmbeddingModel
from core.models.clustering_model import ClusteringModel

In [73]:
fast_text_embedding = FastTextEmbeddingModel()

In [74]:
data_path = Path("data/data_vacancies_processed_1k.csv")
df = pd.read_csv(ROOT_PATH / data_path, index_col=0)
df["work_skills"] = df.work_skills.apply(eval)
df.head()

Unnamed: 0,id,custom_position,schedule,salary_from,salary_to,offer_education_id,education_name,education_is_base,education_order_num,city_id,list_regions,work_skills,tags_id
0,48202096,Сварщик-сборщик,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[сварочные работы, сборка изделий по чертежам,...",
1,48202097,Сварщик-монтажник,полный рабочий день,60000,120000,0,любое,True,0,2,[4],"[монтажные работы, строительные работы, электр...",
2,48202098,Слесарь-сборщик,полный рабочий день,60000,80000,0,любое,True,0,2,[4],"[работа на фрезерных станках, слесарный ремонт...",
3,48202356,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,1,[3],"[комплектация товара, маркировка, стрессоустой...","[6, 9]"
4,48202357,Грузчик-упаковщик,частичная занятость,30000,35000,0,любое,True,0,57,"[181, 182, 183, 185, 186, 187, 188, 189, 190, ...","[маркировка, стрессоустойчивость, погрузочно-р...","[6, 9]"


In [75]:
unique_skills = df.work_skills.explode().unique()
unique_skills_embeddings_fasttext = {
    k: fast_text_embedding.generate(k) for k in tqdm(unique_skills)
}

100%|██████████| 896/896 [00:00<00:00, 112450.52it/s]


In [76]:
df_skills_embeddings = pd.DataFrame(
    list(unique_skills_embeddings_fasttext.items()), columns=["name", "embedding"]
)
df_skills_embeddings.head()

Unnamed: 0,name,embedding
0,сварочные работы,"[0.042372044, 0.0050952206, 0.008594746, -0.01..."
1,сборка изделий по чертежам,"[0.026281675, 0.05808256, 0.044726472, 0.05175..."
2,ручная дуговая сварка,"[0.033271495, 0.002705688, -0.030051084, 0.085..."
3,электродуговая сварка,"[0.057141438, 0.016982837, -0.020811101, 0.077..."
4,аргонодуговая сварка,"[0.06702779, 0.0045817997, -0.012593956, 0.076..."


In [77]:
model_path = ROOT_PATH / "checkpoints/clustering_model_fasttext.pkl"
clustering_model = ClusteringModel()
clustering_model = clustering_model.load_model(model_path)

Model successfully loaded


In [78]:
df_skills_embeddings["cluster_label"] = df_skills_embeddings.embedding.apply(
    clustering_model.predict
)
df_skills_embeddings.head()

Unnamed: 0,name,embedding,cluster_label
0,сварочные работы,"[0.042372044, 0.0050952206, 0.008594746, -0.01...",13
1,сборка изделий по чертежам,"[0.026281675, 0.05808256, 0.044726472, 0.05175...",6
2,ручная дуговая сварка,"[0.033271495, 0.002705688, -0.030051084, 0.085...",13
3,электродуговая сварка,"[0.057141438, 0.016982837, -0.020811101, 0.077...",13
4,аргонодуговая сварка,"[0.06702779, 0.0045817997, -0.012593956, 0.076...",13


In [79]:
df_skills_embeddings.to_pickle(ROOT_PATH / "data/df_skills_embeddings.pkl")

In [80]:
sample_name = "Программист джанго"
sample_embedding = fast_text_embedding.generate(sample_name)
cluster_label = clustering_model.predict(sample_embedding)

df_sample = (
    pd.Series(
        {
            "name": sample_name,
            "embedding": sample_embedding,
            "cluster_label": cluster_label,
        }
    )
    .to_frame()
    .T
)

In [81]:
df_skills_embeddings = pd.concat([df_sample, df_skills_embeddings], axis=0)
df_skills_embeddings.head()

Unnamed: 0,name,embedding,cluster_label
0,Программист джанго,"[0.07448509, 0.02585837, -0.0040324545, 0.0017...",9
0,сварочные работы,"[0.042372044, 0.0050952206, 0.008594746, -0.01...",13
1,сборка изделий по чертежам,"[0.026281675, 0.05808256, 0.044726472, 0.05175...",6
2,ручная дуговая сварка,"[0.033271495, 0.002705688, -0.030051084, 0.085...",13
3,электродуговая сварка,"[0.057141438, 0.016982837, -0.020811101, 0.077...",13


In [82]:
tsne = TSNE(n_components=2)
embeddings = np.array(df_skills_embeddings["embedding"].tolist())
embeddings_2d = tsne.fit_transform(embeddings)
df_skills_embeddings.loc[:, ["x", "y"]] = embeddings_2d

In [84]:
fig = px.scatter(
    df_skills_embeddings.iloc[1:],
    x="x",
    y="y",
    color="cluster_label",
    hover_name="name",
    title="Scatter Plot of Vacancy Embeddings Colored by Cluster Label",
)
x, y = df_skills_embeddings.iloc[0][["x", "y"]]

fig.add_scatter(
    x=[x],
    y=[y],
    marker=dict(color="red", size=20),
    name=sample_name,
)

fig.update_layout(hovermode="closest")
fig.show()