In [1]:
!pip install fasttext
import ast

import fasttext
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from google.colab import drive
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199772 sha256=476a7e36a3be38fe1302d08136d6ed23035df39ab2a45d26f6bab60ea1fed68c
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [2]:
# Mount Google Drive into the Colab filesystem, then copy the processed vacancy
# CSV into the current working directory so it can be read by a relative path.
drive.mount('/content/drive')
!cp /content/drive/MyDrive/data_vacancies_processed.csv .

Mounted at /content/drive


In [3]:
# Load the processed vacancy table. Later cells read the columns 'id',
# 'custom_position', 'work_skills', 'salary_from' and 'salary_to' from it.
df = pd.read_csv('data_vacancies_processed.csv')

# Train a fastText model on skills

In [4]:
# Select the salary bounds and the skill list, then dump that slice to disk;
# the resulting file is the input the fastText training cell reads.
# NOTE(review): to_csv also writes the header row and the row index, so those
# end up in the training text together with the salary values - confirm that
# this is intended.
df_skills = df.loc[:, ['salary_from', 'salary_to', 'work_skills']]
df_skills.to_csv(path_or_buf='skill_embeddings.csv')

In [5]:
# Make embedding on skill_embeddings.csv using fasttext
# Train unsupervised fastText skip-gram embeddings (32 dimensions) on the text
# of skill_embeddings.csv, then persist the trained model to skill_embeddings.bin.
# NOTE(review): the input is a raw CSV dump, so the vocabulary presumably
# contains CSV punctuation and fragments of multi-word skills rather than clean
# skill names - verify against how skills are looked up later.
model = fasttext.train_unsupervised('skill_embeddings.csv', model='skipgram', dim=32)

model.save_model('skill_embeddings.bin')

In [6]:
# Collect every distinct skill that appears in any vacancy.
# Each 'work_skills' cell is a string that evaluates to an iterable of skills
# (presumably the repr of a Python list - confirm). ast.literal_eval parses
# such literals without executing arbitrary code, unlike eval().
unique_skills = set()

for skills_list in df['work_skills']:
    unique_skills.update(ast.literal_eval(skills_list))

# Downstream cells expect a list.
unique_skills = list(unique_skills)

In [7]:
# Map each distinct skill string to the vector the trained fastText model
# returns for it.
skill_embeddings = dict(zip(unique_skills, map(model.get_word_vector, unique_skills)))

# Get embeddings for each vacancy and perform clustering

In [8]:
# Number of clusters used for the final vacancy clustering.
# NOTE(review): the saved output of the elbow cell reports 18, not 17 - confirm
# that 17 is the intended value.
num_clusters = 17  # Adjusted based on elbow point
# A fixed random_state makes the centroid initialisation, and therefore the
# cluster labels, reproducible across Restart & Run All.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

In [9]:
# Build one embedding per vacancy by averaging the vectors of its skills.
vacancy_embeddings = df[['id', 'custom_position', 'work_skills']].copy()
vacancy_embeddings['embedding'] = None

rows_without_embedding = []
for index, skills in vacancy_embeddings['work_skills'].items():
    # 'work_skills' holds a string that evaluates to an iterable of skills;
    # ast.literal_eval parses it without executing arbitrary code.
    skill_vectors = [
        skill_embeddings[skill]
        for skill in ast.literal_eval(skills)
        if skill in skill_embeddings
    ]
    if skill_vectors:
        # Aggregate the skill vectors by taking their element-wise mean.
        vacancy_embeddings.at[index, 'embedding'] = np.mean(skill_vectors, axis=0)
    else:
        rows_without_embedding.append(index)

# A vacancy with no usable skills would keep embedding=None, which cannot be
# stacked into the numeric matrix that the clustering cells build next, so
# such rows are dropped here (and reported rather than removed silently).
if rows_without_embedding:
    print(f'Dropping {len(rows_without_embedding)} vacancies without any skill embedding')
    vacancy_embeddings = vacancy_embeddings.drop(index=rows_without_embedding)

In [10]:
# Stack the per-vacancy vectors into a single array (one row per vacancy) and
# fit the previously configured KMeans model on it.
embedding_rows = vacancy_embeddings['embedding'].tolist()
embeddings = np.array(embedding_rows)

kmeans.fit(embeddings)



In [11]:
# Label every vacancy with the index of its nearest KMeans centroid.
# A single batched predict() over the stacked embedding matrix replaces one
# predict() call per row; the labels are the same, stored as an integer column.
embedding_matrix = np.stack(vacancy_embeddings['embedding'].tolist())
vacancy_embeddings['cluster_label'] = kmeans.predict(embedding_matrix)

# Transform embeddings to 2D coordinates

In [12]:
# Project the vacancy embeddings to two dimensions for plotting.
# t-SNE is stochastic; a fixed random_state keeps the layout reproducible
# across Restart & Run All.
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)


In [13]:
# Add x and y from embeddings_2d to vacancy_embeddings
vacancy_embeddings['x'] = [coord[0] for coord in embeddings_2d]
vacancy_embeddings['y'] = [coord[1] for coord in embeddings_2d]

In [14]:
# plot x, y from df and color it according to cluster_label
# Scatter plot of the 2D vacancy coordinates, coloured by cluster.
# Cluster ids are categories, not magnitudes: casting them to strings makes
# Plotly Express use a discrete colour per cluster (with a legend entry each)
# instead of a continuous colour bar. The cast is done on a plotting copy so
# the 'cluster_label' column of vacancy_embeddings keeps its numeric dtype.
plot_df = vacancy_embeddings.assign(
    cluster_label=vacancy_embeddings['cluster_label'].astype(int).astype(str)
)
# Order the legend numerically (string sort would put '10' before '2').
cluster_order = sorted(plot_df['cluster_label'].unique(), key=int)

fig = px.scatter(plot_df, x="x", y="y", color="cluster_label", hover_name="custom_position",
                 category_orders={"cluster_label": cluster_order},
                 title="Scatter Plot of Vacancy Embeddings Colored by Cluster Label")

fig.update_layout(hovermode="closest")

fig.show()

In [15]:
# Calculate SSE
# Sweep the number of clusters and record the SSE (KMeans inertia) for each k;
# sse[i] corresponds to k = i + 1.
X = embeddings
# KMeans cannot use more clusters than there are samples, so cap the sweep.
max_k = min(100, len(X))

# Each candidate model is fitted and discarded inside the comprehension, so the
# fitted `kmeans` used for the vacancy cluster labels is not overwritten.
# A fixed random_state keeps the curve (and the elbow) reproducible.
sse = [
    KMeans(n_clusters=k, n_init='auto', random_state=42).fit(X).inertia_
    for k in range(1, max_k + 1)
]


In [16]:
# Determine elbow point on sse

# Find the first and last points of the straight line
# Elbow detection: draw the straight line through the first and last points of
# the SSE curve and pick the k whose point lies farthest from that line.
x1, y1 = 1, sse[0]
x2, y2 = len(sse), sse[-1]

# Slope and intercept of the reference line y = m * x + b.
m = (y2 - y1) / (x2 - x1)
b = y1 - (m * x1)

# Distance of every (k, sse) point to the line, computed for all k at once;
# k runs from 1 to len(sse).
k_values = np.arange(1, len(sse) + 1)
sse_values = np.asarray(sse, dtype=float)
distances = (np.abs(sse_values - (m * k_values + b)) / np.sqrt(m ** 2 + 1)).tolist()

# argmax returns the first position of the maximum; k is that position plus one.
max_distance_index = int(np.argmax(distances))
elbow_point = max_distance_index + 1

print('Elbow point:', elbow_point)

Elbow point: 18


In [17]:
# Plot the SSE curve against the number of clusters and mark the elbow.
# sse[i] was computed for k = i + 1, so the x values must start at 1; using
# range(len(sse)) would shift every point (and the elbow marker) one to the left.
cluster_counts = list(range(1, len(sse) + 1))

fig = go.Figure()

fig.add_trace(go.Scatter(x=cluster_counts, y=sse, mode='markers', name='SSE'))
fig.add_trace(go.Scatter(x=cluster_counts, y=sse, mode='lines', name='SSE', line=dict(color='blue')))

fig.update_layout(title='Elbow Method', xaxis_title='Number of Clusters', yaxis_title='SSE')

# Highlight the elbow point: x is the cluster count itself, y is its SSE
# (stored at list position elbow_point - 1).
elbow_sse = sse[elbow_point - 1]
fig.add_trace(go.Scatter(x=[elbow_point], y=[elbow_sse], mode='markers', name='Elbow Point', marker=dict(color='red')))
fig.add_annotation(x=elbow_point, y=elbow_sse, text='Elbow Point', showarrow=True, arrowhead=1)

fig.show()