In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
from surprise import Dataset, Reader
from surprise import SVD, SVDpp, BaselineOnly

from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

In [6]:
from scipy.sparse import csr_matrix

from implicit.als import AlternatingLeastSquares
from implicit.lmf import LogisticMatrixFactorization

In [7]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 30)

In [8]:
random_state_value = 42

id_column_name = 'common_id'
rating_column_name = 'rating'

**Загружаем датасет**

In [4]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [9]:
data_dir = 'Data_Rabota'

In [10]:
train_df = pd.read_parquet(f"{data_dir}/Processed_dataset.parquet")
train_df

Unnamed: 0,common_id,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type,common_id_2
15,2,2022-09-05,1662374899,0.0,192850,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14
16,2,2022-09-05,1662374899,498.0,192850,0000d7508334414ca792c5ff66eb8c14,,show_vacancy,0000d7508334414ca792c5ff66eb8c14
17,2,2022-09-05,1662375397,1.0,230581,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14
18,2,2022-09-05,1662375398,205.0,230581,0000d7508334414ca792c5ff66eb8c14,,show_vacancy,0000d7508334414ca792c5ff66eb8c14
19,2,2022-09-05,1662375603,1.0,250327,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14
...,...,...,...,...,...,...,...,...,...
12292571,367640,2022-09-26,1664169217,1.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,preview_click_vacancy,82a5e80f91b144f596496d6d831b49d3
12292572,367640,2022-09-26,1664169218,90.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,show_vacancy,82a5e80f91b144f596496d6d831b49d3
12292573,367640,2022-09-26,1664169308,210911.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,click_contacts,82a5e80f91b144f596496d6d831b49d3
12292574,367640,2022-09-28,1664380219,132.0,233452,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,click_contacts,82a5e80f91b144f596496d6d831b49d3


In [11]:
test_df = pd.read_parquet(f"{data_dir}/test_public_mfti.parquet")
test_df  

Unnamed: 0,cookie_id,vacancy_id_
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 228125, 225604, 235544, 119314]"
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 113482, 240245, 221376, 242642, 235561, 191512, 222762, 231446, 109269, 205578]"
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 240655, 240659, 237341, 176563, 255299, 245637, 109275, 111505, 212953, 217549, 207108, 200475, 211355, 211449, 108956, 111071, 253239, 109649, 221735, 222030, 222174, 231207, 231022, 234380, 106944, 203948, 175325, 193534, 197505, 234769, 238933, 225709, 184466, 242642, 203015]"
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]"
4,01de50c280794cec8804f16f45f847b7,"[219070, 251469, 166899, 212703, 214561]"
...,...,...
767,fdbcda17f22f406486837059e76c7fed,"[207851, 254989, 213344, 214180, 222146]"
768,fe6193ab26494ace9be5aae36e507618,"[115352, 230546, 225527, 120188, 109360, 232128, 237590, 259570, 244126, 229677, 135447]"
769,fe95b2826ee1452b81201ed3f4c3294d,"[240362, 114852, 253946, 251081, 127546, 244688]"
770,ff1aef256a49481698bb2e938510ff36,"[231194, 236363, 220747, 244688, 100094, 240521, 179568, 184858, 100103]"


In [12]:
nonrel_act_list = ['show_vacancy',
                'preview_click_vacancy',
                'click_favorite',
                'preview_click_favorite']

rel_act_list = ['click_response',
                'preview_click_response',
                'click_contacts',
                'preview_click_contacts',
                'click_phone',
                'preview_click_phone']

In [13]:
def get_top_n_vacancies(rating_act_list, pos_vacancies_start=0, pos_vacancies_end=100):
    show_act = train_df[train_df['event_type'].isin(rating_act_list)].groupby('vacancy_id_')['event_type'].count()
    top_n_with_event_quant = show_act.sort_values(ascending=False).iloc[pos_vacancies_start:pos_vacancies_end].to_frame().reset_index()
    return top_n_with_event_quant['vacancy_id_'].unique()

In [14]:
# ТОП-80 для рейтинга по просмотрам

vac_start = 0
vac_end = 80
rating_act = ['show_vacancy']

top80vac = get_top_n_vacancies(rating_act,
                               pos_vacancies_start=vac_start,
                               pos_vacancies_end=vac_end)
top80vac

array([260154, 198114, 203404, 202608, 116823, 164602, 207423, 148714,
       111505, 182870, 250327, 111867, 110421, 108242, 242642, 217683,
       258441, 158242, 162187, 174953, 176141, 207108, 247535, 110792,
       237341, 113305, 240744, 149024, 244077, 113482, 180382, 114583,
       182100, 113707, 105907, 210628, 227708, 110793, 247276, 181745,
       164481, 193331, 246509, 243868, 190030, 153245, 138634, 169194,
       115924, 126251, 164588, 154411, 155539, 120252, 230707, 127352,
       214513, 182084, 249571, 207156, 168935, 239021, 184440, 106944,
       187360, 206350, 117525, 111592, 150283, 253678, 112506, 117532,
       220718, 151616, 109079, 111837, 136266, 164246, 182439, 111890],
      dtype=int64)

In [15]:
# Топ вакансий с 10 до 95

vac_start = 10
vac_end = 95
rating_act = ['show_vacancy']

top95vac_from10 = get_top_n_vacancies(rating_act,
                                                      pos_vacancies_start=vac_start,
                                                      pos_vacancies_end=vac_end)
top95vac_from10

array([250327, 111867, 110421, 108242, 242642, 217683, 258441, 158242,
       162187, 174953, 176141, 207108, 247535, 110792, 237341, 113305,
       240744, 149024, 244077, 113482, 180382, 114583, 182100, 113707,
       105907, 210628, 227708, 110793, 247276, 181745, 164481, 193331,
       246509, 243868, 190030, 153245, 138634, 169194, 115924, 126251,
       164588, 154411, 155539, 120252, 230707, 127352, 214513, 182084,
       249571, 207156, 168935, 239021, 184440, 106944, 187360, 206350,
       117525, 111592, 150283, 253678, 112506, 117532, 220718, 151616,
       109079, 111837, 136266, 164246, 182439, 111890, 257631, 143721,
       248852, 171332, 128183, 140917, 111941, 212141, 102794, 212325,
       229689, 258378, 106293, 176171, 129787], dtype=int64)

In [16]:
# Создаем колонку с типом действия для рейтинга
# 1 - целевое действие, 0 - остальные действия

def check_is_purpose(event):
    if event in rel_act_list:
        return 1
    else:
        return 0

train_df[rating_column_name] = train_df['event_type'].apply(check_is_purpose)
train_df

Unnamed: 0,common_id,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type,common_id_2,rating
15,2,2022-09-05,1662374899,0.0,192850,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14,0
16,2,2022-09-05,1662374899,498.0,192850,0000d7508334414ca792c5ff66eb8c14,,show_vacancy,0000d7508334414ca792c5ff66eb8c14,0
17,2,2022-09-05,1662375397,1.0,230581,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14,0
18,2,2022-09-05,1662375398,205.0,230581,0000d7508334414ca792c5ff66eb8c14,,show_vacancy,0000d7508334414ca792c5ff66eb8c14,0
19,2,2022-09-05,1662375603,1.0,250327,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14,0
...,...,...,...,...,...,...,...,...,...,...
12292571,367640,2022-09-26,1664169217,1.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,preview_click_vacancy,82a5e80f91b144f596496d6d831b49d3,0
12292572,367640,2022-09-26,1664169218,90.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,show_vacancy,82a5e80f91b144f596496d6d831b49d3,0
12292573,367640,2022-09-26,1664169308,210911.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,click_contacts,82a5e80f91b144f596496d6d831b49d3,1
12292574,367640,2022-09-28,1664380219,132.0,233452,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,click_contacts,82a5e80f91b144f596496d6d831b49d3,1


In [17]:
# Готовим датафрейм для обучения модели:

train_matrix = train_df.groupby([id_column_name, 'vacancy_id_'], as_index=False)[rating_column_name].sum()
train_matrix

Unnamed: 0,common_id,vacancy_id_,rating
0,1,137659,0
1,1,153975,0
2,1,174953,0
3,1,176171,0
4,1,182445,0
...,...,...,...
4723184,367641,107125,1
4723185,367641,107893,1
4723186,367641,150843,1
4723187,367641,160164,1


In [18]:
# Функция для создания рейтинга на основе порогового значения для целевых действий

def create_rating(input_rating, thresholds):
    rating = input_rating
    if rating >= max(rating_thresholds):
        rating = thresholds[max(rating_thresholds)]
    else:
        rating = thresholds[rating_thresholds[rating]]
    return rating

In [19]:
# Создаем рейтинг
# Принципы построения
# 1. rating >= 1 - rating = 1
# 2. rating = 0 - оставляем без изменения

rating_thresholds = {0: 0, 1: 1}

train_matrix_0_1 = train_matrix.copy()
train_matrix_0_1[rating_column_name] = train_matrix_0_1[rating_column_name].apply(create_rating, thresholds = rating_thresholds)
train_matrix_0_1
train_matrix_0_1[rating_column_name].unique()

array([0, 1], dtype=int64)

In [20]:
# Готовим данные для обучения модели

reader = Reader(rating_scale=(0, 1))
train_data_0_1 = Dataset.load_from_df(train_matrix_0_1[[id_column_name, 'vacancy_id_', rating_column_name]], reader)

In [21]:
# Обучаем модель для датасета с рейтингом 0 или 1

svd_20epochs_0_1 = SVD(verbose=True, n_epochs=20)
trainset_0_1 = train_data_0_1.build_full_trainset()
svd_20epochs_0_1.fit(trainset_0_1)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x27b848c0cd0>

In [22]:
col_names ={
    'id_col_name': id_column_name,
    'vac_col_name': 'vacancy_id_',
    'rating_col_name': rating_column_name
}

In [24]:
def transform_into_sparse_matrix(input_df, col_names):
    input_df_fixed = input_df.copy()
    input_df_fixed[col_names['id_col_name']] = input_df_fixed[col_names['id_col_name']].astype('category')
    input_df_fixed[col_names['vac_col_name']] = input_df_fixed[col_names['vac_col_name']].astype('category')
    input_df_fixed[col_names['rating_col_name']] = input_df_fixed[col_names['rating_col_name']].astype('uint8')
    user_item_matrix = csr_matrix((input_df_fixed[col_names['rating_col_name']], 
                                (input_df_fixed[col_names['id_col_name']].cat.codes, 
                                 input_df_fixed[col_names['vac_col_name']].cat.codes)))
    id_cat_codes = dict(zip(input_df_fixed[col_names['id_col_name']], input_df_fixed[col_names['id_col_name']].cat.codes))
    vac_cat_codes = dict(zip(input_df_fixed[col_names['vac_col_name']], input_df_fixed[col_names['vac_col_name']].cat.codes))
    cat_vac_codes = dict(zip(input_df_fixed[col_names['vac_col_name']].cat.codes, input_df_fixed[col_names['vac_col_name']]))
    return (user_item_matrix, id_cat_codes, vac_cat_codes, cat_vac_codes)


In [25]:
Impl_0_1_set = transform_into_sparse_matrix(train_matrix_0_1, col_names)

In [26]:
Impl_ALS_0_1 = AlternatingLeastSquares(factors=100,
                                       regularization=0.05,
                                       calculate_training_loss=True,
                                       random_state=random_state_value) # , alpha=3.0
Impl_ALS_0_1.fit(Impl_0_1_set[0])



  0%|          | 0/15 [00:00<?, ?it/s]

In [27]:
def common_for_cookie(cookie_id, n_predict = 5):
    """Функция для определения common_id от cookie_id для обучения рекомендательной системы"""
    common_freq = train_df[train_df['cookie_id'] == cookie_id].common_id.value_counts(normalize=True)
    common_ids = {com_id: round(freq*n_predict) for com_id, freq in zip(common_freq.index.to_list(), common_freq.to_list())}
    return common_ids

In [32]:
# Вспомогательные функции

def calc_correct_recomend_number(predict_col, vac_col):
    """Функция для определения количества совпадающих вакансий в двух списках"""
    return len(set(predict_col) & set(vac_col))


def common_for_cookie(cookie_id, n_predict = 5):
    """Функция для определения common_id от cookie_id для обучения рекомендательной системы"""
    common_freq = train_df[train_df['cookie_id'] == cookie_id].common_id.value_counts(normalize=True)
    common_ids = {com_id: round(freq*n_predict) for com_id, freq in zip(common_freq.index.to_list(), common_freq.to_list())}
    return common_ids


def get_prec_n(res_df, num_corr_column, n_predict = 5):
    """Функция для расчета precision@n метрики"""
    corr_recommend = res_df[num_corr_column].sum()
    prec_5 = corr_recommend / (n_predict * res_df.shape[0])
    return {'num_corr_recomend': corr_recommend, f'precision@{n_predict}': prec_5}

In [33]:
# Копируем датасет чтобы не менять изначальный

test_df_with_pred = test_df.copy()

In [34]:
res_dict = {}

SVD_0_1

In [44]:
def get_predictions_surprise(cookie_id, model, vac_set, n_predict = 5):
    top_n_vacancies = []
    common_ids = common_for_cookie(cookie_id)    
    for com_id, n in common_ids.items():
        if n != 0:
            # Получим все вакансии с которыми взаимодействовал cookie_id:
            interacted_items = train_matrix[train_matrix[id_column_name] == com_id]['vacancy_id_'].unique()
            
            # Удалим из этого перечня вакансии с которыми контактировал юзер:
            items_to_predict = np.setdiff1d(vac_set, interacted_items)
                        
            # Создадим тест датасет для данного юзера:
            test_set = [[cookie_id, item, 4.] for item in vac_set]        

            # получим топ-n вакансий для данного юзера:
            predictions = model.test(test_set)
            top_n = sorted(predictions, key=lambda x: x.est, reverse=True)[:n_predict]            
            top_n_vacancies.extend([(pred.iid, pred.est) for pred in top_n])    
        
    return top_n_vacancies

In [45]:
model_name = 'SVD_20ep_rat0-1'
model = svd_20epochs_0_1
set_name = 'top95_10'
dataset = top95vac_from10
n_pred = 5

# Создаем столбец с предсказаниями
predict_col_name = f'predict_id_{model_name}_{set_name}_n_{n_pred}'            
test_df_with_pred[predict_col_name] = test_df_with_pred['cookie_id'].apply(get_predictions_surprise,
                                                                           model = model,
                                                                           vac_set = dataset,
                                                                           n_predict = n_pred)

Implicit_ALS_0_1

In [54]:
def get_predictions_implicit(cookie_id, model, vac_set, impl_set, n_predict = 5):  
    top_n_vacancies = []
    common_ids = common_for_cookie(cookie_id)    
    for com_id, n in common_ids.items():               
        if n != 0:
            # Получим все вакансии с которыми взаимодействовал cookie_id:
            interacted_items = train_matrix[train_matrix[id_column_name] == com_id]['vacancy_id_'].unique()
            
            # Удалим из этого перечня вакансии с которыми контактировал юзер:
            items_to_predict = np.setdiff1d(vac_set, interacted_items)
            
            # получим топ-n вакансий для данного юзера:
            
            user_item = impl_set[0]
            id_cat_code = impl_set[1][com_id]
            vac_cat_codes = impl_set[2]
            cat_vac_codes = impl_set[3]
            
            filter_items = np.array([vac_cat_codes[i] for i in vac_set])
            ids, scores = model.recommend(id_cat_code,
                                          user_item[id_cat_code],
                                          N=n_predict,
                                          filter_items=filter_items,
                                          filter_already_liked_items=True)                                                
            top_n_vacancies.extend([(cat_vac_codes[i], s) for i, s in zip(ids, scores)])    
    return top_n_vacancies

In [55]:
model_name = 'Impl_ALS_rat0-1'
model = Impl_ALS_0_1
set_name = 'top95_10'
dataset = top95vac_from10
impl_set = Impl_0_1_set
n_pred = 5

# Создаем столбец с предсказаниями
predict_col_name = f'predict_id_{model_name}_{set_name}_n_{n_pred}'            
test_df_with_pred[predict_col_name] = test_df_with_pred['cookie_id'].apply(get_predictions_implicit,
                                                                           model = model,
                                                                           vac_set = dataset,
                                                                           impl_set = impl_set,
                                                                           n_predict = n_pred)