In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import cross_validate

**Загружаем датасет**

In [7]:
ratings_data = pd.read_csv('./rabota_test.csv')
ratings_data

Unnamed: 0,cookie_id,vacancy_id_,rating
0,0000c4548c3944c08972bbdc1fa4eb85,137659,0
1,0000c4548c3944c08972bbdc1fa4eb85,153975,0
2,0000c4548c3944c08972bbdc1fa4eb85,174953,0
3,0000c4548c3944c08972bbdc1fa4eb85,176171,0
4,0000c4548c3944c08972bbdc1fa4eb85,182445,0
...,...,...,...
4651701,ffffdb17f45b4032b386d691d52e6c00,107125,1
4651702,ffffdb17f45b4032b386d691d52e6c00,107893,1
4651703,ffffdb17f45b4032b386d691d52e6c00,150843,1
4651704,ffffdb17f45b4032b386d691d52e6c00,160164,1


In [8]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_data[['cookie_id', 'vacancy_id_', 'rating']], reader)

**Обучаем модель**

In [9]:
svd = SVD(verbose=True, n_epochs=10)
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.0432  1.0398  1.0453  1.0428  0.0023  
MAE (testset)     0.9105  0.9104  0.9114  0.9107  0.0004  
Fit time          32.57   30.45   30.21   31.08   1.06    
Test time         20.97   17.94   22.04   20.32   1.74    


{'test_rmse': array([1.04319032, 1.03977983, 1.04530311]),
 'test_mae': array([0.91049776, 0.91037315, 0.91135305]),
 'fit_time': (32.56586933135986, 30.454617500305176, 30.213212490081787),
 'test_time': (20.97262406349182, 17.939523458480835, 22.04299283027649)}

In [10]:
trainset = data.build_full_trainset()
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x21cdf87be20>

**Тестовое предсказание**

In [13]:
svd.predict(uid='000cd76cd33f43d4a1ac1d16d10f8bf7', iid=222177).est

1

**Пока задал тестовый список cookie_id и тестовый список вакансий в get_vacancy_list_for_user на основе mfti_public**

In [13]:
user_id_list_test_list = ['0000c4548c3944c08972bbdc1fa4eb85',
                          '0000d7508334414ca792c5ff66eb8c14',
                          'ffff8283d29644e69ad6d08cc3ca7fb7']

In [17]:
def get_vacancy_list_for_user(user_id):
    """
    Returns list of vacancies for user_id, exclude those user interacted with 
    """
    res_list=[]
    res_list_test = [174953,
                     176171,
                     187529,
                     153975,
                     137659,
                     182445,
                     209629,
                     250729,
                     135961,
                     138688,
                     138502,
                     243795]
    res_list = res_list_test
    return res_list


def get_recommendations(user_id_list, model, number_of_vacancies=5):
    """
    Returns list with 'number_of_vacancies' vacancies for each user_id in user_id_list
    """    
    res_dict = {}
    for user_id in user_id_list:
        vacancies_id_list = get_vacancy_list_for_user(user_id)
        vacancies_ratings = []
        for vacancy_id in vacancies_id_list:
            vacancies_ratings.append(model.predict(uid=user_id, iid=vacancy_id).est)
        sorted_vacancies = [vacancies_id_list for _, vacancies_id_list in sorted(zip(vacancies_ratings, vacancies_id_list))]
        res_dict[user_id] = sorted_vacancies[:number_of_vacancies]
    return res_dict

In [20]:
get_recommendations(user_id_list_test_list, svd)

{'0000c4548c3944c08972bbdc1fa4eb85': [135961, 137659, 138502, 138688, 153975],
 '0000d7508334414ca792c5ff66eb8c14': [135961, 137659, 138502, 138688, 153975],
 'ffff8283d29644e69ad6d08cc3ca7fb7': [135961, 137659, 138502, 138688, 153975]}

In [21]:
get_recommendations(user_id_list_test_list, svd, number_of_vacancies=3)

{'0000c4548c3944c08972bbdc1fa4eb85': [135961, 137659, 138502],
 '0000d7508334414ca792c5ff66eb8c14': [135961, 137659, 138502],
 'ffff8283d29644e69ad6d08cc3ca7fb7': [135961, 137659, 138502]}