In [1]:
import pandas as pd
import numpy as np
import datetime
import time

**train_mfti.parquet** – сырые данные, которые можно использовать для обучения модели

* event_date – дата взаимодейтсвия
* event_timestamp – timestamp взаимодействия в секундах 
* vacancy_id_ - id вакансии, с которой было взаимодействие
* cookie_id – id пользователя по его браузеру/ip/устройству
* user_id – id пользователя на сайте rabota.ru (есть только для зарегистрированных пользователей)
* event_type – тип взаимодействия

In [2]:
# Загружаем датасет

data_folder = 'Data_Rabota'
df = pd.read_parquet(f"{data_folder}/train_mfti.parquet")

In [3]:
df.head()

Unnamed: 0,event_date,event_timestamp,vacancy_id_,cookie_id,user_id,event_type
0,2022-08-01,1659323026,129850,97990f1a021d4be19aa3f955b7eacab4,951f53de61764ea0b51317200a0dbbfc,show_vacancy
1,2022-08-01,1659377255,108347,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
2,2022-08-01,1659376695,109069,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
3,2022-08-01,1659376722,171425,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
4,2022-08-01,1659374929,252384,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy


In [4]:
# Сортируем датасет по столбцу 'cookie_id', а потом по 'event_timestamp'
sorted_df = df.sort_values(['cookie_id','event_timestamp'])

# Добавляем столбец "action_time" и заполняем 0
sorted_df.insert(2, "action_time", 0)

# Добавляем новые индексы
sorted_df = sorted_df.reset_index()
sorted_df.head(10)

Unnamed: 0,index,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type
0,4550690,2022-08-30,1661866548,0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
1,4653413,2022-08-30,1661866548,0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
2,4650401,2022-08-30,1661866559,0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
3,4693679,2022-08-30,1661866559,0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
4,4650417,2022-08-30,1661866564,0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
5,4693678,2022-08-30,1661866564,0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
6,4510912,2022-08-30,1661866577,0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
7,4649889,2022-08-30,1661866577,0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
8,11346982,2022-08-30,1661866594,0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,click_favorite
9,4511048,2022-08-30,1661866625,0,176171,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy


In [5]:
# Получаем разницу по времени между двумя событиями 

sorted_df['action_time'] = sorted_df['event_timestamp'].diff(periods=1)
sorted_df[0:25]

Unnamed: 0,index,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type
0,4550690,2022-08-30,1661866548,,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
1,4653413,2022-08-30,1661866548,0.0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
2,4650401,2022-08-30,1661866559,11.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
3,4693679,2022-08-30,1661866559,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
4,4650417,2022-08-30,1661866564,5.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
5,4693678,2022-08-30,1661866564,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
6,4510912,2022-08-30,1661866577,13.0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
7,4649889,2022-08-30,1661866577,0.0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
8,11346982,2022-08-30,1661866594,17.0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,click_favorite
9,4511048,2022-08-30,1661866625,31.0,176171,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy


In [6]:
sorted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12292588 entries, 0 to 12292587
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   index            int64  
 1   event_date       object 
 2   event_timestamp  int64  
 3   action_time      float64
 4   vacancy_id_      int64  
 5   cookie_id        object 
 6   user_id          object 
 7   event_type       object 
dtypes: float64(1), int64(3), object(4)
memory usage: 750.3+ MB


In [7]:
# Столбец 'action_time' преобразуем в int32 и заполняем NaN нулями

sorted_df['action_time'] = pd.to_numeric(sorted_df['action_time'], errors='coerce').fillna(0).astype(int)

In [8]:
sorted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12292588 entries, 0 to 12292587
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   index            int64 
 1   event_date       object
 2   event_timestamp  int64 
 3   action_time      int32 
 4   vacancy_id_      int64 
 5   cookie_id        object
 6   user_id          object
 7   event_type       object
dtypes: int32(1), int64(3), object(4)
memory usage: 703.4+ MB


In [9]:
# Отслеживаем смену cookie_id, чтобы обнулить значение между соседними cookie_id

sorted_df['shifted_cookie_id'] = sorted_df['cookie_id'].shift()
sorted_df['fixed_action_time'] = np.where(sorted_df['cookie_id'] != sorted_df['shifted_cookie_id'], 0, sorted_df['action_time'])
sorted_df['action_time'] = sorted_df['fixed_action_time']
sorted_df = sorted_df.drop(columns=['shifted_cookie_id', 'fixed_action_time'])
sorted_df[:25]

Unnamed: 0,index,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type
0,4550690,2022-08-30,1661866548,0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
1,4653413,2022-08-30,1661866548,0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
2,4650401,2022-08-30,1661866559,11,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
3,4693679,2022-08-30,1661866559,0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
4,4650417,2022-08-30,1661866564,5,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
5,4693678,2022-08-30,1661866564,0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
6,4510912,2022-08-30,1661866577,13,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
7,4649889,2022-08-30,1661866577,0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
8,11346982,2022-08-30,1661866594,17,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,click_favorite
9,4511048,2022-08-30,1661866625,31,176171,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy


In [10]:
# Удаляем колонку со старым индексом

sorted_df = sorted_df.drop(columns=['index'])

In [11]:
# Сдвигаем солонку action_time на одну ячейку вверх, чтобы в текущей строке в солонке action_time 
# время которое прошло с момента текущего события до следующего события 

sorted_df['action_time'] = sorted_df['action_time'].shift(-1)
sorted_df.iloc[[-1], [2]] = 0
sorted_df

Unnamed: 0,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type
0,2022-08-30,1661866548,0.0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
1,2022-08-30,1661866548,11.0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
2,2022-08-30,1661866559,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
3,2022-08-30,1661866559,5.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
4,2022-08-30,1661866564,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
...,...,...,...,...,...,...,...
12292583,2022-08-22,1661164755,0.0,160164,ffffdb17f45b4032b386d691d52e6c00,,preview_click_vacancy
12292584,2022-08-22,1661164755,791.0,160164,ffffdb17f45b4032b386d691d52e6c00,,preview_click_contacts
12292585,2022-08-22,1661165546,1.0,185412,ffffdb17f45b4032b386d691d52e6c00,,preview_click_vacancy
12292586,2022-08-22,1661165547,5.0,185412,ffffdb17f45b4032b386d691d52e6c00,,show_vacancy


In [12]:
train_mfti_df_up = sorted_df.copy()
train_mfti_df_up.count()

event_date         12292588
event_timestamp    12292588
action_time        12292588
vacancy_id_        12292588
cookie_id          12292588
user_id             8711177
event_type         12292588
dtype: int64

In [13]:
# Назначаем уникальный индекс для каждой пары (куки+юзер) и их дубликатов

train_mfti_df_up.insert(loc=0, column='common_id', value=train_mfti_df_up.set_index(['cookie_id','user_id']).index.factorize()[0]+1)
train_mfti_df_up

Unnamed: 0,common_id,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type
0,1,2022-08-30,1661866548,0.0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
1,1,2022-08-30,1661866548,11.0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
2,1,2022-08-30,1661866559,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
3,1,2022-08-30,1661866559,5.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
4,1,2022-08-30,1661866564,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
...,...,...,...,...,...,...,...,...
12292583,367641,2022-08-22,1661164755,0.0,160164,ffffdb17f45b4032b386d691d52e6c00,,preview_click_vacancy
12292584,367641,2022-08-22,1661164755,791.0,160164,ffffdb17f45b4032b386d691d52e6c00,,preview_click_contacts
12292585,367641,2022-08-22,1661165546,1.0,185412,ffffdb17f45b4032b386d691d52e6c00,,preview_click_vacancy
12292586,367641,2022-08-22,1661165547,5.0,185412,ffffdb17f45b4032b386d691d52e6c00,,show_vacancy


In [14]:
# Отображаем общий идентификатор на примере конкретного user_id и cookie_id

train_mfti_df_up[(train_mfti_df_up['user_id']=='f5a2326a17484330aa8cb4019f1b1960')&(train_mfti_df_up['cookie_id']=='03bf8c511fa949c79845a5d81b09aa1d')]

Unnamed: 0,common_id,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type
174110,5420,2022-08-01,1659359891,0.0,148714,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
174111,5420,2022-08-01,1659359891,13688.0,148714,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,preview_click_vacancy
174112,5420,2022-08-01,1659373579,0.0,258441,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
174113,5420,2022-08-01,1659373579,39.0,258441,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,preview_click_vacancy
174114,5420,2022-08-01,1659373618,0.0,102914,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
...,...,...,...,...,...,...,...,...
174193,5420,2022-08-02,1659449607,1.0,126251,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,preview_click_vacancy
174194,5420,2022-08-02,1659449608,5.0,126251,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
174195,5420,2022-08-02,1659449613,1218.0,126251,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
174196,5420,2022-08-02,1659450831,0.0,136081,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,preview_click_vacancy


In [15]:
# Создаем common_id_2 на основе следующего подхода:
# 1. если user_id != NaN, common_id_2 = user_id
# 2. если User Id == NaN, common_id_2 = cookie_id


train_mfti_df_up_only_isna_user=train_mfti_df_up[train_mfti_df_up['user_id'].isna()]
train_mfti_df_up_only_isna_user['common_id_2']=train_mfti_df_up_only_isna_user.cookie_id.astype(str)
display(train_mfti_df_up_only_isna_user.head())

train_mfti_df_up_only_notna_user=train_mfti_df_up[train_mfti_df_up['user_id'].notna()]
train_mfti_df_up_only_notna_user['common_id_2']=train_mfti_df_up_only_notna_user.user_id.astype(str)
display(train_mfti_df_up_only_notna_user.head())


train_mfti_df_up=pd.concat([train_mfti_df_up_only_isna_user,train_mfti_df_up_only_notna_user])
train_mfti_df_up

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_mfti_df_up_only_isna_user['common_id_2']=train_mfti_df_up_only_isna_user.cookie_id.astype(str)


Unnamed: 0,common_id,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type,common_id_2
15,2,2022-09-05,1662374899,0.0,192850,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14
16,2,2022-09-05,1662374899,498.0,192850,0000d7508334414ca792c5ff66eb8c14,,show_vacancy,0000d7508334414ca792c5ff66eb8c14
17,2,2022-09-05,1662375397,1.0,230581,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14
18,2,2022-09-05,1662375398,205.0,230581,0000d7508334414ca792c5ff66eb8c14,,show_vacancy,0000d7508334414ca792c5ff66eb8c14
19,2,2022-09-05,1662375603,1.0,250327,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_mfti_df_up_only_notna_user['common_id_2']=train_mfti_df_up_only_notna_user.user_id.astype(str)


Unnamed: 0,common_id,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type,common_id_2
0,1,2022-08-30,1661866548,0.0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy,2eb30fab80244cdebbafd5ed096bc08f
1,1,2022-08-30,1661866548,11.0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy,2eb30fab80244cdebbafd5ed096bc08f
2,1,2022-08-30,1661866559,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy,2eb30fab80244cdebbafd5ed096bc08f
3,1,2022-08-30,1661866559,5.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy,2eb30fab80244cdebbafd5ed096bc08f
4,1,2022-08-30,1661866564,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy,2eb30fab80244cdebbafd5ed096bc08f


Unnamed: 0,common_id,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type,common_id_2
15,2,2022-09-05,1662374899,0.0,192850,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14
16,2,2022-09-05,1662374899,498.0,192850,0000d7508334414ca792c5ff66eb8c14,,show_vacancy,0000d7508334414ca792c5ff66eb8c14
17,2,2022-09-05,1662375397,1.0,230581,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14
18,2,2022-09-05,1662375398,205.0,230581,0000d7508334414ca792c5ff66eb8c14,,show_vacancy,0000d7508334414ca792c5ff66eb8c14
19,2,2022-09-05,1662375603,1.0,250327,0000d7508334414ca792c5ff66eb8c14,,preview_click_vacancy,0000d7508334414ca792c5ff66eb8c14
...,...,...,...,...,...,...,...,...,...
12292571,367640,2022-09-26,1664169217,1.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,preview_click_vacancy,82a5e80f91b144f596496d6d831b49d3
12292572,367640,2022-09-26,1664169218,90.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,show_vacancy,82a5e80f91b144f596496d6d831b49d3
12292573,367640,2022-09-26,1664169308,210911.0,197828,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,click_contacts,82a5e80f91b144f596496d6d831b49d3
12292574,367640,2022-09-28,1664380219,132.0,233452,ffffd6d4f2134dd6aa80ff71e942b508,82a5e80f91b144f596496d6d831b49d3,click_contacts,82a5e80f91b144f596496d6d831b49d3


In [16]:
train_mfti_df_up.to_parquet(f"{data_folder}/Processed_dataset.parquet",compression='BROTLI')

Добавляем **user_id**, **common_id** и **common_id_2** в **test_public** и **test_private** датасеты

**test_public_mfti.parquet** – часть теста, с открытым таргетом, для проверки работоспособности решений.

* cookie_id - id пользователя по его браузеру/ip/устройству
* vacancy_id_ - список вакансий, на которые пользователь откликнулся или позвонил в течение месяца после окончания данных train

In [17]:
test_public_mfti_df = pd.read_parquet(f"{data_folder}/test_public_mfti.parquet", engine='pyarrow')
test_public_mfti_df

Unnamed: 0,cookie_id,vacancy_id_
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 22812..."
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 11348..."
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 24065..."
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]"
4,01de50c280794cec8804f16f45f847b7,"[219070, 251469, 166899, 212703, 214561]"
...,...,...
767,fdbcda17f22f406486837059e76c7fed,"[207851, 254989, 213344, 214180, 222146]"
768,fe6193ab26494ace9be5aae36e507618,"[115352, 230546, 225527, 120188, 109360, 23212..."
769,fe95b2826ee1452b81201ed3f4c3294d,"[240362, 114852, 253946, 251081, 127546, 244688]"
770,ff1aef256a49481698bb2e938510ff36,"[231194, 236363, 220747, 244688, 100094, 24052..."


In [18]:
test_public_mfti_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772 entries, 0 to 771
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cookie_id    772 non-null    object
 1   vacancy_id_  772 non-null    object
dtypes: object(2)
memory usage: 12.2+ KB


**test_private_users_mfti.parquet** – часть теста, с закрытым таргетом для итоговой проверки решений

* cookie_id - id пользователя по его браузеру/ip/устройству

In [19]:
test_private_users_mfti_df = pd.read_parquet(f"{data_folder}/test_private_users_mfti.parquet", engine='pyarrow')
test_private_users_mfti_df

Unnamed: 0,cookie_id
0,0018914ba3e54011b28fa715583d3354
1,0035c298d8c64f368ae730a9cca9bb20
2,00956458877448ec9fba87fb97443fdf
3,0099387c921b41e7bae6c99dd8254b60
4,009f65e8ae99413a8da94a491320580a
...,...
3081,ffadd195859444d2ade2479b0611c5c1
3082,ffbc08b528c64f22996873fc63872202
3083,ffdeaf3c34544529880aebf17c103f6c
3084,ffefa79a74804ee69e6c131e0d05b948


In [20]:
test_private_users_mfti_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3086 entries, 0 to 3085
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   cookie_id  3086 non-null   object
dtypes: object(1)
memory usage: 24.2+ KB


In [21]:
# Создаем промежуточный датафрейм 

v = train_mfti_df_up[['cookie_id',
                      'user_id',
                      'common_id',
                      'common_id_2']].groupby(['cookie_id',
                                               'common_id',
                                               'common_id_2',
                                               'user_id'],
                                              dropna=False).size().reset_index()

In [25]:
# Добавляем user_id, common_id и common_id_2 в test_public датасет

test_public_mfti_df_up=test_public_mfti_df[['cookie_id',
                                            'vacancy_id_']].merge(v[['cookie_id',
                                                                     'user_id',
                                                                     'common_id',
                                                                     'common_id_2']],
                                                                  how='left',
                                                                  indicator=False)
display(test_public_mfti_df_up.head())
display(test_public_mfti_df_up.info())

Unnamed: 0,cookie_id,vacancy_id_,user_id,common_id,common_id_2
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 22812...",477e719bfbcc4071a8acc1b357492b00,63,477e719bfbcc4071a8acc1b357492b00
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 11348...",753a2a268ae84ab49475e62736c02860,296,753a2a268ae84ab49475e62736c02860
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 24065...",,910,00a6c5a64a274c55a836402bdeb3b2c4
3,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 24065...",d01c76b282364fbb8195f326de3e893c,911,d01c76b282364fbb8195f326de3e893c
4,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]",,1926,015937a125b14e74bdff1cddc49f9172


<class 'pandas.core.frame.DataFrame'>
Int64Index: 907 entries, 0 to 906
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cookie_id    907 non-null    object
 1   vacancy_id_  907 non-null    object
 2   user_id      665 non-null    object
 3   common_id    907 non-null    int64 
 4   common_id_2  907 non-null    object
dtypes: int64(1), object(4)
memory usage: 42.5+ KB


None

In [26]:
# Добавляем user_id, common_id и common_id_2 в test_private датасет

test_private_users_mfti_df_up=test_private_users_mfti_df[['cookie_id']].merge(v[['cookie_id',
                                                                                 'user_id',
                                                                                 'common_id',
                                                                                 'common_id_2']],
                                                                              how='left',
                                                                              indicator=False)
display(test_private_users_mfti_df_up.head())
display(test_private_users_mfti_df_up.info())

Unnamed: 0,cookie_id,user_id,common_id,common_id_2
0,0018914ba3e54011b28fa715583d3354,,123,0018914ba3e54011b28fa715583d3354
1,0018914ba3e54011b28fa715583d3354,1706e3869fc6432b8d7e04c613fefb4d,124,1706e3869fc6432b8d7e04c613fefb4d
2,0035c298d8c64f368ae730a9cca9bb20,535c21153f7f442bb09de85f0d00dbd6,306,535c21153f7f442bb09de85f0d00dbd6
3,00956458877448ec9fba87fb97443fdf,9e642ae3300e459f8648e573f105fa43,808,9e642ae3300e459f8648e573f105fa43
4,0099387c921b41e7bae6c99dd8254b60,0c69442f665b4daf8616223323e234a0,832,0c69442f665b4daf8616223323e234a0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3562 entries, 0 to 3561
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cookie_id    3562 non-null   object
 1   user_id      2678 non-null   object
 2   common_id    3562 non-null   int64 
 3   common_id_2  3562 non-null   object
dtypes: int64(1), object(3)
memory usage: 139.1+ KB


None

In [27]:
# Сохраняем дополненные датасеты

test_private_users_mfti_df_up.to_parquet(f"{data_folder}/test_private_users_mfti_up.parquet", compression='BROTLI')
test_public_mfti_df_up.to_parquet(f"{data_folder}/test_public_mfti_up.parquet", compression='BROTLI')