In [1]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Load the vacancies dataset; the path is relative to the notebook's working directory.
vacancies_csv = '../data/data_vacancies.csv'
df = pd.read_csv(vacancies_csv)

In [3]:
# Count missing values per column (isna is the canonical name of isnull).
df.isna().sum()

id                         0
custom_position            0
schedule                   0
salary_from                0
salary_to                  0
salary_pay_type        19383
offer_education_id         0
education_name             0
education_is_base          0
education_order_num        0
city_id                    0
list_regions               0
work_skills                0
tags_id                 5999
dtype: int64

In [4]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,id,custom_position,schedule,salary_from,salary_to,salary_pay_type,offer_education_id,education_name,education_is_base,education_order_num,city_id,list_regions,work_skills,tags_id
0,48202096,Сварщик-сборщик,полный рабочий день,60000,120000,,0,любое,True,0,2,[4],"['сварочные работы', 'сборка изделий по чертеж...",
1,48202097,Сварщик-монтажник,полный рабочий день,60000,120000,,0,любое,True,0,2,[4],"['монтажные работы', 'строительные работы', 'э...",
2,48202098,Слесарь-сборщик,полный рабочий день,60000,80000,,0,любое,True,0,2,[4],"['работа на фрезерных станках', 'слесарный рем...",
3,48202356,Грузчик-упаковщик,частичная занятость,30000,35000,,0,любое,True,0,1,[3],"['комплектация товара', 'маркировка', 'стрессо...","[6, 9]"
4,48202357,Грузчик-упаковщик,частичная занятость,30000,35000,,0,любое,True,0,57,"[181, 182, 183, 185, 186, 187, 188, 189, 190, ...","['маркировка', 'стрессоустойчивость', 'погрузо...","[6, 9]"


In [5]:
# Features: three descriptors of a vacancy; target: the upper salary bound.
# Columns left out for now:
# 'salary_from', 'offer_education_id', 'work_skills',  'list_regions', 'tags_id'
feature_columns = ['custom_position', 'schedule', 'city_id']
target_column = 'salary_to'

X = df[feature_columns]
y = df[target_column]

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Disabled numeric branch, kept for reference: 'salary_from' is not among the
# selected feature columns, and the matching ('num', ...) entry of the
# ColumnTransformer is commented out as well.
# numeric_features = ['salary_from']
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())
# ])

In [7]:
# Columns routed through the one-hot branch; 'city_id' is treated as a category too.
# Not used at the moment: 'list_regions' 'offer_education_id'
categorical_features = ['custom_position', 'schedule', 'city_id']

# Categorical branch: fill gaps with each column's most frequent value, then
# one-hot encode. handle_unknown='ignore' makes categories that were not seen
# during fit encode as all zeros instead of raising at predict time.
mode_imputer = SimpleImputer(strategy='most_frequent')
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[
    ('imputer', mode_imputer),
    ('onehot', one_hot_encoder),
])

In [8]:
# Map each group of columns to its preprocessing branch. Only the categorical
# branch is active; the numeric one stays commented out.
column_branches = [
    # ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [9]:
# Full model: column preprocessing followed by a k-nearest-neighbours regressor (k = 5).
neighbors_regressor = KNeighborsRegressor(n_neighbors=5)
knn_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', neighbors_regressor),
    ]
)

In [10]:
# Fit the preprocessing steps and the regressor on the training split only.
knn_model.fit(X=X_train, y=y_train)

In [11]:
# Predict salary_to for the held-out rows.
y_pred = knn_model.predict(X=X_test)

In [12]:
# Inspect the predicted salary_to values for the test split.
y_pred

array([113800., 136000.,  55000., ...,  57600.,  67000.,  81488.])

In [13]:
# Mean absolute error on the held-out split, expressed in the same units as
# the target column (salary_to).
test_mae = mean_absolute_error(y_test, y_pred)
print(test_mae)

30371.66146741919


In [14]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,custom_position,schedule,city_id
0,Сварщик-сборщик,полный рабочий день,2
1,Сварщик-монтажник,полный рабочий день,2
2,Слесарь-сборщик,полный рабочий день,2
3,Грузчик-упаковщик,частичная занятость,1
4,Грузчик-упаковщик,частичная занятость,57


In [15]:
# Use the first feature row as a template for a hand-built query; copy it so
# later edits do not touch X.
test_value = X.iloc[:1].copy()

In [16]:
# Build the query row: keep the model's three input columns and overwrite the
# position and the city, leaving 'schedule' as it was in the template row.
# NOTE(review): the encoder uses handle_unknown='ignore', so if this exact
# string is not among the positions seen during fit it is silently encoded as
# all zeros -- confirm that the value exists in the training data.
overrides = {'custom_position': 'строитель', 'city_id': 2}
test_value = test_value.loc[:, ['custom_position', 'schedule', 'city_id']].copy()
for column, value in overrides.items():
    test_value.loc[:, column] = value

In [17]:
# Show the hand-built single-row input before passing it to the model.
test_value

Unnamed: 0,custom_position,schedule,city_id
0,строитель,полный рабочий день,2


In [18]:
# Predict salary_to for the single hand-built row.
knn_model.predict(X=test_value)

array([92000.])

In [19]:
# Persist the fitted pipeline (preprocessing + regressor) as one pickle file.
# The target directory is created first, so a fresh checkout without
# ../models does not fail here after the model has already been trained.
# NOTE: only unpickle this file from a trusted source and with a compatible
# scikit-learn version.
model_path = Path('../models/knn_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump(knn_model, f)