In [23]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [24]:
from scipy.sparse import csr_matrix

from implicit.als import AlternatingLeastSquares
from implicit.lmf import LogisticMatrixFactorization

In [25]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 30)

In [26]:
random_state_value = 42

id_column_name = 'cookie_id'

**Загружаем датасет**

In [7]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [27]:
data_dir = 'Data_Rabota'

In [28]:
train_df = pd.read_parquet(f"{data_dir}/Processed_dataset.parquet")
train_df

Unnamed: 0,common_id,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type,common_id_2
15,2,2022-09-05,1662374899,0.0,192850,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14
16,2,2022-09-05,1662374899,498.0,192850,0000d7508334414ca792c5ff66eb8c14,,show_vacancy,0000d7508334414ca792c5ff66eb8c14
17,2,2022-09-05,1662375397,1.0,230581,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14
18,2,2022-09-05,1662375398,205.0,230581,0000d7508334414ca792c5ff66eb8c14,,show_vacancy,0000d7508334414ca792c5ff66eb8c14
19,2,2022-09-05,1662375603,1.0,250327,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14
...,...,...,...,...,...,...,...,...,...
12292571,367640,2022-09-26,1664169217,1.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,preview_click_vacancy,82a5e80f91b144f596496d6d831b49d3
12292572,367640,2022-09-26,1664169218,90.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,show_vacancy,82a5e80f91b144f596496d6d831b49d3
12292573,367640,2022-09-26,1664169308,210911.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,click_contacts,82a5e80f91b144f596496d6d831b49d3
12292574,367640,2022-09-28,1664380219,132.0,233452,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,click_contacts,82a5e80f91b144f596496d6d831b49d3


In [29]:
test_df = pd.read_parquet(f"{data_dir}/test_public_mfti.parquet")
test_df  

Unnamed: 0,cookie_id,vacancy_id_
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 228125, 225604, 235544, 119314]"
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 113482, 240245, 221376, 242642, 235561, 191512, 222762, 231446, 109269, 205578]"
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 240655, 240659, 237341, 176563, 255299, 245637, 109275, 111505, 212953, 217549, 207108, 200475, 211355, 211449, 108956, 111071, 253239, 109649, 221735, 222030, 222174, 231207, 231022, 234380, 106944, 203948, 175325, 193534, 197505, 234769, 238933, 225709, 184466, 242642, 203015]"
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]"
4,01de50c280794cec8804f16f45f847b7,"[219070, 251469, 166899, 212703, 214561]"
...,...,...
767,fdbcda17f22f406486837059e76c7fed,"[207851, 254989, 213344, 214180, 222146]"
768,fe6193ab26494ace9be5aae36e507618,"[115352, 230546, 225527, 120188, 109360, 232128, 237590, 259570, 244126, 229677, 135447]"
769,fe95b2826ee1452b81201ed3f4c3294d,"[240362, 114852, 253946, 251081, 127546, 244688]"
770,ff1aef256a49481698bb2e938510ff36,"[231194, 236363, 220747, 244688, 100094, 240521, 179568, 184858, 100103]"


In [30]:
nonrel_act_list = ['show_vacancy',
                'preview_click_vacancy',
                'click_favorite',
                'preview_click_favorite']

rel_act_list = ['click_response',
                'preview_click_response',
                'click_contacts',
                'preview_click_contacts',
                'click_phone',
                'preview_click_phone']

In [31]:
# Создаем колонку с типом действия для рейтинга
# 1 - целевое действие, 0 - остальные действия

rating_column_name = 'rating'

def check_is_purpose(event):
    if event in rel_act_list:
        return 1
    else:
        return 0

train_df[rating_column_name] = train_df['event_type'].apply(check_is_purpose)
train_df

Unnamed: 0,common_id,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type,common_id_2,rating
15,2,2022-09-05,1662374899,0.0,192850,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14,0
16,2,2022-09-05,1662374899,498.0,192850,0000d7508334414ca792c5ff66eb8c14,,show_vacancy,0000d7508334414ca792c5ff66eb8c14,0
17,2,2022-09-05,1662375397,1.0,230581,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14,0
18,2,2022-09-05,1662375398,205.0,230581,0000d7508334414ca792c5ff66eb8c14,,show_vacancy,0000d7508334414ca792c5ff66eb8c14,0
19,2,2022-09-05,1662375603,1.0,250327,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14,0
...,...,...,...,...,...,...,...,...,...,...
12292571,367640,2022-09-26,1664169217,1.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,preview_click_vacancy,82a5e80f91b144f596496d6d831b49d3,0
12292572,367640,2022-09-26,1664169218,90.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,show_vacancy,82a5e80f91b144f596496d6d831b49d3,0
12292573,367640,2022-09-26,1664169308,210911.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,click_contacts,82a5e80f91b144f596496d6d831b49d3,1
12292574,367640,2022-09-28,1664380219,132.0,233452,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,click_contacts,82a5e80f91b144f596496d6d831b49d3,1


**Для рейтинга 0-6**

In [32]:
train_df_short = train_df[[id_column_name, 'vacancy_id_', 'event_type', rating_column_name]]
train_matrix_rating0_6 = train_df_short.drop_duplicates(keep='first', ignore_index=True).sort_values(by=[id_column_name, 'vacancy_id_'], ignore_index=True)
train_matrix_rating0_6 = train_matrix_rating0_6.groupby(by=[id_column_name, 'vacancy_id_'], as_index=False).agg({rating_column_name: 'sum'})
train_matrix_rating0_6

Unnamed: 0,cookie_id,vacancy_id_,rating
0,0000c4548c3944c08972bbdc1fa4eb85,137659,0
1,0000c4548c3944c08972bbdc1fa4eb85,153975,0
2,0000c4548c3944c08972bbdc1fa4eb85,174953,0
3,0000c4548c3944c08972bbdc1fa4eb85,176171,0
4,0000c4548c3944c08972bbdc1fa4eb85,182445,0
...,...,...,...
4678580,ffffdb17f45b4032b386d691d52e6c00,107125,1
4678581,ffffdb17f45b4032b386d691d52e6c00,107893,1
4678582,ffffdb17f45b4032b386d691d52e6c00,150843,1
4678583,ffffdb17f45b4032b386d691d52e6c00,160164,1


In [33]:
f'min rating = {train_matrix_rating0_6[rating_column_name].min()}, max rating = {train_matrix_rating0_6[rating_column_name].max()}'

'min rating = 0, max rating = 6'

In [34]:
# Вспомогательные функции

def get_predictions(cookie_id, model_info, initial_set, n_predict = 5):
    # Получим все вакансии с которыми взаимодействовал cookie_id:
    interacted_items = train_matrix_rating0_6[train_matrix_rating0_6['cookie_id'] == cookie_id]['vacancy_id_'].unique()
    
    # Удалим из мешка вакансии с которыми контактировал юзер:
    items_to_predict = np.setdiff1d(initial_set, interacted_items)
    
    model = model_info[0]
    model_type = model_info[1]  
    
    # получим топ-n вакансий для данного юзера:
    if model_type == 'surprise':
        top_n_vacancies = get_predictions_surprise(cookie_id, model, items_to_predict, n_predict)
    
    if model_type == 'implicit':
        filtered_set = train_matrix_rating0_6[~train_matrix_rating0_6['vacancy_id_'].isin(items_to_predict)]['vacancy_id_'].unique()
        top_n_vacancies = get_predictions_implicit(cookie_id, model, filtered_set, model_info[2], n_predict)
    
    return top_n_vacancies


def get_predictions_surprise(user_id, model, vac_set, n_predict = 5):
    # Создадим тест датасет для данного юзера:
    test_set = [[user_id, item, 4.] for item in vac_set]        
    
    predictions = model.test(test_set)
    top_n = sorted(predictions, key=lambda x: x.est, reverse=True)[:n_predict]
    return [pred.iid for pred in top_n]


def get_predictions_implicit(user_id, model, filtered_set, impl_set, n_predict = 5):
    user_item = impl_set[0]
    id_cat_code = impl_set[1][user_id]
    vac_cat_codes = impl_set[2]
    cat_vac_codes = impl_set[3]
    filter_items = np.array([vac_cat_codes[i] for i in filtered_set])
    ids, scores = model.recommend(id_cat_code,
                                  user_item[id_cat_code],
                                  N=n_predict,
                                  filter_items=filter_items,
                                  filter_already_liked_items=True)
    return [cat_vac_codes[i] for i in ids]    


def calc_correct_recomend_number(predict_col, vac_col):
    """Функция для определения количества совпадающих вакансий в двух списках"""
    return len(set(predict_col) & set(vac_col))


def get_prec_n(res_df, num_corr_column, n_predict = 5):
    """Функция для расчета precision@n метрики"""
    corr_recommend = res_df[num_corr_column].sum()
    prec_5 = corr_recommend / (n_predict * res_df.shape[0])
    return {'num_corr_recomend': corr_recommend, f'precision@{n_predict}': prec_5}

In [35]:
# Импортируем tqdm для progress bar
from tqdm.auto import tqdm

def get_res_df(models_dict, set_dict, n_predict = 5):
    res_dict = {}
    for model_name in tqdm(models_dict):        
        for set_name in tqdm(set_dict):        
            # Создаем столбец с предсказаниями
            predict_col_name = f'predict_id_{model_name}_{set_name}_n_{n_pred}'            
            test_df_with_pred[predict_col_name] = test_df_with_pred['cookie_id'].apply(get_predictions,
                                                                                       model_info = models_dict[model_name],
                                                                                       initial_set = set_dict[set_name],
                                                                                       n_predict = n_pred)                  
            
            # Создаем столбец с количеством правильных предсказаний для каждого id
            num_corr_col_name = f'num_corr_{model_name}_{set_name}_n_{n_pred}'        
            test_df_with_pred[num_corr_col_name] = test_df_with_pred.apply(
                lambda x: calc_correct_recomend_number(x[predict_col_name], x['vacancy_id_']), axis=1
            )
            
            # Считаем precision@n метрику
            res_dict[model_name+'_'+set_name] = get_prec_n(test_df_with_pred,
                                                           num_corr_column = num_corr_col_name,
                                                           n_predict = n_pred)
    return pd.DataFrame.from_dict(res_dict, orient='index')

## Часть Implicit

In [36]:
col_names ={
    'id_col_name': id_column_name,
    'vac_col_name': 'vacancy_id_',
    'rating_col_name': rating_column_name
}

In [37]:
def transform_into_sparse_matrix(input_df, col_names):
    input_df_fixed = input_df.copy()
    input_df_fixed[col_names['id_col_name']] = input_df_fixed[col_names['id_col_name']].astype('category')
    input_df_fixed[col_names['vac_col_name']] = input_df_fixed[col_names['vac_col_name']].astype('category')
    input_df_fixed[col_names['rating_col_name']] = input_df_fixed[col_names['rating_col_name']].astype('uint8')
    user_item_matrix = csr_matrix((input_df_fixed[col_names['rating_col_name']], 
                                (input_df_fixed[col_names['id_col_name']].cat.codes, 
                                 input_df_fixed[col_names['vac_col_name']].cat.codes)))
    id_cat_codes = dict(zip(input_df_fixed[col_names['id_col_name']], input_df_fixed[col_names['id_col_name']].cat.codes))
    vac_cat_codes = dict(zip(input_df_fixed[col_names['vac_col_name']], input_df_fixed[col_names['vac_col_name']].cat.codes))
    cat_vac_codes = dict(zip(input_df_fixed[col_names['vac_col_name']].cat.codes, input_df_fixed[col_names['vac_col_name']]))
    return (user_item_matrix, id_cat_codes, vac_cat_codes, cat_vac_codes)


In [38]:
Impl_rat06_set = transform_into_sparse_matrix(train_matrix_rating0_6, col_names)

In [39]:
# Создание списка неактульных вакансий, с которомы давно не было взаимодействий
# 78 973 вакансии не были провзаимодействованы за последние 2 недели рассматриваемого периода (prec = 0.031)
# 99 046 вакансии не были провзаимодействованы за последнюю неделю (prec = 0.033)
# 117 944 вакансии не были провзаимодействованы за последние 3 дня (prec = 0.036)
# 127 405 вакансии не были провзаимодействованы за последние 2 дня (prec = 0.038)
# 140 242 вакансии не были провзаимодействованы за последний 1 день - 0.04
non_valid_vacs = []
for vac, group in train_df.groupby('vacancy_id_'):
    max_date = group['event_date'].max()
    if max_date <= '2022-09-30':
        non_valid_vacs.append(vac)
len(non_valid_vacs)

140242

In [40]:
valid_vacs = train_matrix_rating0_6[~train_matrix_rating0_6['vacancy_id_'].isin(non_valid_vacs)]['vacancy_id_'].unique()
len(valid_vacs)

19925

In [None]:
# Задаем словари с моделями и датасетами чтобы перебрать их все
set_dict_Impl = {
    'non_val_vacs': valid_vacs
}

In [None]:
# Копируем датасет чтобы не менять изначальный

test_df_with_pred = test_df.copy()

In [41]:
parameters = {'factors': np.arange(50, 250, 50),
             'regularization': np.arange(0.01, 0.1, 0.02),
             'iterations': np.arange(10, 20, 5)}

In [42]:
models_dict_Impl = {}

for factor in parameters['factors']:
    models_dict_Impl[f"Impl_ALS_rat06_fac_{factor}"] = list()
    models_dict_Impl[f"Impl_ALS_rat06_fac_{factor}"].append(AlternatingLeastSquares(factors=factor,
                                                                                  regularization=0.05,
                                                                                  calculate_training_loss=True,
                                                                                  random_state=random_state_value))
    models_dict_Impl[f"Impl_ALS_rat06_fac_{factor}"].append('implicit')
    models_dict_Impl[f"Impl_ALS_rat06_fac_{factor}"].append(Impl_rat06_set)
    models_dict_Impl[f"Impl_ALS_rat06_fac_{factor}"][0].fit(Impl_rat06_set[0])

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
n_pred = 5

res_df = get_res_df(models_dict_Impl, set_dict_Impl, n_predict = n_pred)
res_df

In [None]:
pd.set_option('display.max_rows', None)
res_df.sort_values(by=['precision@5'], ascending=False)

In [43]:
models_dict_Impl = {}

for reg_val in parameters['regularization']:
    models_dict_Impl[f"Impl_ALS_rat06_reg_{reg_val}"] = list()
    models_dict_Impl[f"Impl_ALS_rat06_reg_{reg_val}"].append(AlternatingLeastSquares(factors=100,
                                                                                     regularization=reg_val,
                                                                                     calculate_training_loss=True,
                                                                                     random_state=random_state_value))
    models_dict_Impl[f"Impl_ALS_rat06_reg_{reg_val}"].append('implicit')
    models_dict_Impl[f"Impl_ALS_rat06_reg_{reg_val}"].append(Impl_rat06_set)
    models_dict_Impl[f"Impl_ALS_rat06_reg_{reg_val}"][0].fit(Impl_rat06_set[0])

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
n_pred = 5

res_df = get_res_df(models_dict_Impl, set_dict_Impl, n_predict = n_pred)
res_df

In [None]:
pd.set_option('display.max_rows', None)
res_df.sort_values(by=['precision@5'], ascending=False)

In [44]:
models_dict_Impl = {}

for iter_val in parameters['iterations']:
    models_dict_Impl[f"Impl_ALS_rat06_iter_{iter_val}"] = list()
    models_dict_Impl[f"Impl_ALS_rat06_iter_{iter_val}"].append(AlternatingLeastSquares(factors=100,
                                                                                       regularization=0.05,
                                                                                       iterations=iter_val,
                                                                                       calculate_training_loss=True,
                                                                                       random_state=random_state_value))
    models_dict_Impl[f"Impl_ALS_rat06_iter_{iter_val}"].append('implicit')
    models_dict_Impl[f"Impl_ALS_rat06_iter_{iter_val}"].append(Impl_rat06_set)
    models_dict_Impl[f"Impl_ALS_rat06_iter_{iter_val}"][0].fit(Impl_rat06_set[0])

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

In [None]:
n_pred = 5

res_df = get_res_df(models_dict_Impl, set_dict_Impl, n_predict = n_pred)
res_df

In [None]:
pd.set_option('display.max_rows', None)
res_df.sort_values(by=['precision@5'], ascending=False)