In [38]:
import pandas as pd
import numpy as np
import datetime
import time

**train_mfti.parquet** – сырые данные, которые можно использовать для обучения модели

* event_date – дата взаимодействия
* event_timestamp – timestamp взаимодействия в секундах 
* vacancy_id_ – id вакансии, с которой было взаимодействие
* cookie_id – id пользователя по его браузеру/ip/устройству
* user_id – id пользователя на сайте rabota.ru (есть только для зарегистрированных пользователей)
* event_type – тип взаимодействия

In [39]:
# Load the raw interaction log from the data folder

data_folder = 'Data_Rabota'
train_path = f"{data_folder}/train_mfti.parquet"
df = pd.read_parquet(train_path)

In [40]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,event_date,event_timestamp,vacancy_id_,cookie_id,user_id,event_type
0,2022-08-01,1659323026,129850,97990f1a021d4be19aa3f955b7eacab4,951f53de61764ea0b51317200a0dbbfc,show_vacancy
1,2022-08-01,1659377255,108347,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
2,2022-08-01,1659376695,109069,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
3,2022-08-01,1659376722,171425,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
4,2022-08-01,1659374929,252384,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy


In [41]:
# Order the events by 'cookie_id' and then by 'event_timestamp';
# reset_index() renumbers the rows and keeps the pre-sort row labels
# in a new leading 'index' column
sorted_df = (
    df
    .sort_values(by=['cookie_id', 'event_timestamp'])
    .reset_index()
)

# Add a zero-filled "action_time" column right after 'event_timestamp'
# (position 3, because 'index' now occupies position 0)
sorted_df.insert(loc=3, column='action_time', value=0)

sorted_df.head(10)

Unnamed: 0,index,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type
0,4550690,2022-08-30,1661866548,0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
1,4653413,2022-08-30,1661866548,0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
2,4650401,2022-08-30,1661866559,0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
3,4693679,2022-08-30,1661866559,0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
4,4650417,2022-08-30,1661866564,0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
5,4693678,2022-08-30,1661866564,0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
6,4510912,2022-08-30,1661866577,0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
7,4649889,2022-08-30,1661866577,0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
8,11346982,2022-08-30,1661866594,0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,click_favorite
9,4511048,2022-08-30,1661866625,0,176171,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy


In [42]:
# Time difference between each event and the event in the previous row.
# The very first row has no predecessor and therefore gets NaN.
# The difference is taken over the whole frame, so it also spans rows
# that belong to different cookie_id values.

event_ts = sorted_df['event_timestamp']
sorted_df['action_time'] = event_ts.diff()
sorted_df.head(25)

Unnamed: 0,index,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type
0,4550690,2022-08-30,1661866548,,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
1,4653413,2022-08-30,1661866548,0.0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
2,4650401,2022-08-30,1661866559,11.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
3,4693679,2022-08-30,1661866559,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
4,4650417,2022-08-30,1661866564,5.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
5,4693678,2022-08-30,1661866564,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
6,4510912,2022-08-30,1661866577,13.0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
7,4649889,2022-08-30,1661866577,0.0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
8,11346982,2022-08-30,1661866594,17.0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,click_favorite
9,4511048,2022-08-30,1661866625,31.0,176171,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy


In [43]:
# Print the column dtypes and the memory usage of the sorted frame
sorted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12292588 entries, 0 to 12292587
Data columns (total 8 columns):
 #   Column           Dtype  
---  ------           -----  
 0   index            int64  
 1   event_date       object 
 2   event_timestamp  int64  
 3   action_time      float64
 4   vacancy_id_      int64  
 5   cookie_id        object 
 6   user_id          object 
 7   event_type       object 
dtypes: float64(1), int64(3), object(4)
memory usage: 750.3+ MB


In [44]:
# Replace the NaN left by diff() with 0 and store 'action_time' as int32.
# The width is spelled out explicitly: plain `int` maps to a platform-dependent
# integer size, so the resulting dtype would differ between machines.
# The column is already numeric at this point, so no coercion step is needed
# (coercing would also hide malformed values by turning them into 0).

sorted_df['action_time'] = sorted_df['action_time'].fillna(0).astype('int32')

In [45]:
# Re-check the dtypes and memory usage after converting 'action_time'
sorted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12292588 entries, 0 to 12292587
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   index            int64 
 1   event_date       object
 2   event_timestamp  int64 
 3   action_time      int32 
 4   vacancy_id_      int64 
 5   cookie_id        object
 6   user_id          object
 7   event_type       object
dtypes: int32(1), int64(3), object(4)
memory usage: 703.4+ MB


In [46]:
# Detect the rows where cookie_id differs from the previous row and set
# 'action_time' to 0 there, so that no time difference is carried over
# between neighbouring cookie_id values. The first row is compared against
# a missing value and is therefore also reset.

previous_cookie = sorted_df['cookie_id'].shift()
starts_new_cookie = sorted_df['cookie_id'].ne(previous_cookie)
sorted_df['action_time'] = np.where(starts_new_cookie, 0, sorted_df['action_time'])
sorted_df.head(25)

Unnamed: 0,index,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type
0,4550690,2022-08-30,1661866548,0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
1,4653413,2022-08-30,1661866548,0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
2,4650401,2022-08-30,1661866559,11,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
3,4693679,2022-08-30,1661866559,0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
4,4650417,2022-08-30,1661866564,5,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
5,4693678,2022-08-30,1661866564,0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
6,4510912,2022-08-30,1661866577,13,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
7,4649889,2022-08-30,1661866577,0,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
8,11346982,2022-08-30,1661866594,17,174953,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,click_favorite
9,4511048,2022-08-30,1661866625,31,176171,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy


In [47]:
# Remove the column that holds the old (pre-sort) index.
# errors='ignore' keeps the cell re-runnable: once the column is gone,
# running the cell again is a no-op instead of raising KeyError.

sorted_df = sorted_df.drop(columns=['index'], errors='ignore')

In [48]:
# Make 'action_time' hold the time that passes from the current event until
# the next event of the same cookie_id.
#
# The value is derived directly from 'event_timestamp' instead of shifting the
# existing 'action_time' column up by one row:
#   * the cell is idempotent - re-running it cannot shift the column twice;
#   * the column is addressed by name, not by position, so the result does not
#     depend on the current column order.
# The last event of every cookie_id has no successor: the per-cookie shift
# yields NaN there, which is replaced with 0. The result is a float column.

next_event_ts = sorted_df.groupby('cookie_id', sort=False)['event_timestamp'].shift(-1)
sorted_df['action_time'] = (next_event_ts - sorted_df['event_timestamp']).fillna(0)
sorted_df

Unnamed: 0,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type
0,2022-08-30,1661866548,0.0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
1,2022-08-30,1661866548,11.0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
2,2022-08-30,1661866559,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
3,2022-08-30,1661866559,5.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
4,2022-08-30,1661866564,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
...,...,...,...,...,...,...,...
12292583,2022-08-22,1661164755,0.0,160164,ffffdb17f45b4032b386d691d52e6c00,,preview_click_vacancy
12292584,2022-08-22,1661164755,791.0,160164,ffffdb17f45b4032b386d691d52e6c00,,preview_click_contacts
12292585,2022-08-22,1661165546,1.0,185412,ffffdb17f45b4032b386d691d52e6c00,,preview_click_vacancy
12292586,2022-08-22,1661165547,5.0,185412,ffffdb17f45b4032b386d691d52e6c00,,show_vacancy


In [49]:
# Continue on an independent (deep) copy of the sorted frame and show the
# number of non-null values in every column
train_mfti_df_up = sorted_df.copy(deep=True)
train_mfti_df_up.count()

event_date         12292588
event_timestamp    12292588
action_time        12292588
vacancy_id_        12292588
cookie_id          12292588
user_id             8711177
event_type         12292588
dtype: int64

In [50]:
# Give every distinct (cookie_id, user_id) pair one shared identifier,
# numbered from 1; all rows with the same pair receive the same value.
# The identifier is stored as the first column, 'common_id'.

pair_index = train_mfti_df_up.set_index(['cookie_id', 'user_id']).index
pair_codes, _ = pair_index.factorize()
train_mfti_df_up.insert(loc=0, column='common_id', value=pair_codes + 1)
train_mfti_df_up

Unnamed: 0,common_id,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type
0,1,2022-08-30,1661866548,0.0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
1,1,2022-08-30,1661866548,11.0,153975,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
2,1,2022-08-30,1661866559,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
3,1,2022-08-30,1661866559,5.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,preview_click_vacancy
4,1,2022-08-30,1661866564,0.0,182445,0000c4548c3944c08972bbdc1fa4eb85,2eb30fab80244cdebbafd5ed096bc08f,show_vacancy
...,...,...,...,...,...,...,...,...
12292583,367641,2022-08-22,1661164755,0.0,160164,ffffdb17f45b4032b386d691d52e6c00,,preview_click_vacancy
12292584,367641,2022-08-22,1661164755,791.0,160164,ffffdb17f45b4032b386d691d52e6c00,,preview_click_contacts
12292585,367641,2022-08-22,1661165546,1.0,185412,ffffdb17f45b4032b386d691d52e6c00,,preview_click_vacancy
12292586,367641,2022-08-22,1661165547,5.0,185412,ffffdb17f45b4032b386d691d52e6c00,,show_vacancy


In [51]:
# Show the common identifier for one specific user_id / cookie_id pair

is_example_user = train_mfti_df_up['user_id'].eq('f5a2326a17484330aa8cb4019f1b1960')
is_example_cookie = train_mfti_df_up['cookie_id'].eq('03bf8c511fa949c79845a5d81b09aa1d')
train_mfti_df_up.loc[is_example_user & is_example_cookie]

Unnamed: 0,common_id,event_date,event_timestamp,action_time,vacancy_id_,cookie_id,user_id,event_type
174110,5420,2022-08-01,1659359891,0.0,148714,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
174111,5420,2022-08-01,1659359891,13688.0,148714,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,preview_click_vacancy
174112,5420,2022-08-01,1659373579,0.0,258441,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
174113,5420,2022-08-01,1659373579,39.0,258441,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,preview_click_vacancy
174114,5420,2022-08-01,1659373618,0.0,102914,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
...,...,...,...,...,...,...,...,...
174193,5420,2022-08-02,1659449607,1.0,126251,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,preview_click_vacancy
174194,5420,2022-08-02,1659449608,5.0,126251,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
174195,5420,2022-08-02,1659449613,1218.0,126251,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
174196,5420,2022-08-02,1659450831,0.0,136081,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,preview_click_vacancy


In [52]:
# Write the processed frame to a Brotli-compressed parquet file in the data folder
processed_path = f"{data_folder}/Processed_dataset.parquet"
train_mfti_df_up.to_parquet(processed_path, compression='BROTLI')