In [1]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('../data/data_vacancies.csv')

df['custom_position_tokens'] = df['custom_position'].apply(lambda x: word_tokenize(x.lower()))

word2vec_model = Word2Vec(sentences=df['custom_position_tokens'], vector_size=100, window=5, min_count=1, workers=4)

def get_sentence_vector(tokens):
    vectors = [word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(word2vec_model.vector_size)

df['custom_position_vector'] = df['custom_position_tokens'].apply(get_sentence_vector)

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

from sklearn.metrics import mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(df['custom_position_vector'].tolist(), df['salary_to'], test_size=0.2, random_state=42)

knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 2044275318.2940893


In [12]:
mae = mean_absolute_error(y_test, y_pred)

In [13]:
mae

25430.00548999487

In [6]:
X_train

[array([-3.18270385e-01,  4.15981293e-01,  5.27401939e-02, -1.11546163e-02,
         1.24575257e-01, -2.70157307e-01,  1.36146218e-01,  5.60481608e-01,
        -2.73143262e-01, -3.75014216e-01, -5.43964729e-02, -5.84321380e-01,
        -4.95843263e-03,  3.13082397e-01,  1.72030613e-01, -1.12069748e-01,
         7.21027851e-02, -1.67778283e-01,  9.32875723e-02, -6.14499092e-01,
         3.17856878e-01,  2.11690754e-01,  1.88155621e-01, -2.29949713e-01,
        -1.07157782e-01,  5.95640251e-03, -1.61426634e-01, -1.11408256e-01,
        -3.34405750e-01,  1.29636571e-01,  2.55559564e-01, -9.51201767e-02,
        -3.25834379e-02, -3.21185917e-01, -1.24094382e-01,  4.82195944e-01,
         1.84575655e-02, -9.05840546e-02, -3.03268488e-02, -4.30304468e-01,
         1.57630593e-01, -2.48869732e-01, -2.10075304e-01, -2.39056498e-02,
         2.24702448e-01,  1.32151395e-02, -2.12827399e-01, -1.83563799e-01,
         1.22925594e-01,  1.27953559e-01,  3.60782802e-01, -1.13715723e-01,
        -1.3

In [10]:
def predict_salary_to(custom_position):
    tokens = word_tokenize(custom_position.lower())
    vector = get_sentence_vector(tokens)
    prediction = knn_model.predict([vector])
    return prediction[0]

new_custom_position = "водитель"
predicted_salary_to = predict_salary_to(new_custom_position)
print(f'Predicted salary_to for {new_custom_position}: {predicted_salary_to}')

Predicted salary_to for водитель: 138600.0
