In [83]:
import pandas as pd
from tqdm import tqdm

from core.embedding_models import FastTextEmbeddingModel, EmbeddingModel
from core.models import LinearRegressionModel, CatboostRegressionModel

# Register tqdm's pandas integration so `progress_apply` is available
# on Series/DataFrame objects (used below when generating embeddings).
tqdm.pandas()

# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [84]:
# Load the raw vacancy postings from a relative path and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index — consider `index_col=0` if downstream code does not rely on it.
vacancies_csv = "../data/data_vacancies.csv"
df = pd.read_csv(vacancies_csv)
df.head()

Unnamed: 0,id,custom_position,schedule,salary_from,salary_to,salary_pay_type,offer_education_id,education_name,education_is_base,education_order_num,city_id,list_regions,work_skills,tags_id
0,48202096,Сварщик-сборщик,полный рабочий день,60000,120000,,0,любое,True,0,2,[4],"['сварочные работы', 'сборка изделий по чертеж...",
1,48202097,Сварщик-монтажник,полный рабочий день,60000,120000,,0,любое,True,0,2,[4],"['монтажные работы', 'строительные работы', 'э...",
2,48202098,Слесарь-сборщик,полный рабочий день,60000,80000,,0,любое,True,0,2,[4],"['работа на фрезерных станках', 'слесарный рем...",
3,48202356,Грузчик-упаковщик,частичная занятость,30000,35000,,0,любое,True,0,1,[3],"['комплектация товара', 'маркировка', 'стрессо...","[6, 9]"
4,48202357,Грузчик-упаковщик,частичная занятость,30000,35000,,0,любое,True,0,57,"[181, 182, 183, 185, 186, 187, 188, 189, 190, ...","['маркировка', 'стрессоустойчивость', 'погрузо...","[6, 9]"


In [85]:
# Regression target: midpoint of the advertised salary range.
df["salary_mean"] = (df.salary_from + df.salary_to) / 2
df["target"] = df.salary_mean

# Keep only rows strictly below the 95th percentile of the mean salary
# (rows whose mean is NaN fail the comparison and are dropped as well).
# `.copy()` makes the filtered frame independent of the original, so the
# column assignment below is not a chained assignment on a slice
# (which is what triggers pandas' SettingWithCopyWarning).
# NOTE: `df` is overwritten here, so re-running this cell without reloading
# the data trims the top 5% again.
df = df[df.salary_mean < df.salary_mean.quantile(0.95)].copy()

# Text passed to the embedding model: the listed columns joined by newlines.
cols_to_concat = ["custom_position"]
df["combined_string"] = df[cols_to_concat].agg("\n".join, axis=1)


In [86]:
# Work on an independent copy rather than a full-slice view of `df`, so that
# columns added to `dataset` later (e.g. the embeddings) are never written
# through to, or flagged as chained assignment on, the original frame.
dataset = df.copy()

# Embedding backend handed to the regressors in the following cells.
# `EmbeddingModel()` (imported at the top) can be swapped in here instead.
embedding_model = FastTextEmbeddingModel()

In [87]:
# Linear-regression baseline, constructed with the embedding model created above.
lr_model = LinearRegressionModel(embedding_model=embedding_model)


In [88]:
# One embedding per row, generated from the combined text; `progress_apply`
# shows a tqdm progress bar while the rows are processed.
dataset["emb"] = dataset["combined_string"].progress_apply(embedding_model.generate)


100%|██████████| 18468/18468 [00:00<00:00, 36190.55it/s]


In [89]:
# Sanity check: shape of the first row's embedding vector.
first_embedding = dataset["emb"].iloc[0]
first_embedding.shape

(300,)

In [90]:
# Fit the linear regressor on `dataset`; the call prints a test score
# (the metric and the train/test split live in `core.models` — TODO confirm which).
lr_model.train(dataset)

Model trained
Test score is 16277.157743659482


In [91]:
# Persist the trained linear-regression model to disk
# (presumably pickled, judging by the .pkl extension — TODO confirm).
lr_model.save_model("../data/linreg_fasttext.pkl")


Model saved at ../data/linreg_fasttext.pkl


In [92]:
# CatBoost-based regressor, given the same embedding model as the linear baseline.
cb_model = CatboostRegressionModel(embedding_model=embedding_model)


In [93]:
# Fit the CatBoost regressor on the same `dataset`.
# With metric_period=100 the training log in the output advances in steps of
# 100 iterations; the call then prints a test score like the linear model does.
cb_model.train(dataset, metric_period=100)

Learning rate set to 0.062656
0:	learn: 26545.2642466	total: 31.5ms	remaining: 31.4s
100:	learn: 19490.3459576	total: 1.28s	remaining: 11.4s
200:	learn: 18005.7842333	total: 2.53s	remaining: 10.1s
300:	learn: 16894.1840994	total: 3.72s	remaining: 8.63s
400:	learn: 16115.8247947	total: 4.96s	remaining: 7.4s
500:	learn: 15548.9084739	total: 6.26s	remaining: 6.24s
600:	learn: 15090.3833873	total: 7.42s	remaining: 4.92s
700:	learn: 14724.8946470	total: 8.56s	remaining: 3.65s
800:	learn: 14430.0173454	total: 9.7s	remaining: 2.41s
900:	learn: 14181.4079883	total: 10.8s	remaining: 1.19s
999:	learn: 13981.6242004	total: 11.9s	remaining: 0us
Model trained
Test score is 12499.325032788483


In [94]:
# Persist the trained CatBoost model to disk
# (presumably pickled, judging by the .pkl extension — TODO confirm).
cb_model.save_model("../data/catboost_fasttext.pkl")


Model saved at ../data/catboost_fasttext.pkl
