## Подготовка датасета TTRS

In [28]:
from datetime import datetime
from collections import Counter, defaultdict

import os
import re
import pandas as pd
import numpy as np

In [29]:
# Dataset name; it is also the sub-directory under ./data/ holding its files.
DATASET = 'ttrs'
# Directory the raw CSV is read from and the generated files are written to.
RAW_PATH = os.path.join('./data/', DATASET)

# Number of negative items sampled for every dev/test interaction.
NEG_ITEMS = 99
# Seed passed to NumPy's global random generator.
RANDOM_SEED = 0

In [30]:
# Show the resolved dataset directory.
RAW_PATH

'./data/ttrs'

### Загрузка данных

In [32]:
# Load the raw transaction log; rows with a missing value in any column are dropped.
file_path = os.path.join(RAW_PATH, 'test_transactions.csv')
data = pd.read_csv(file_path).dropna().reset_index(drop=True)
# Parse the dates in one vectorised call instead of one strptime call per row.
# The explicit format keeps the parse strict: a malformed date still raises.
data['transaction_dttm'] = pd.to_datetime(data['transaction_dttm'], format='%Y-%m-%d')
# Keep purchases only ('Покупка' is Russian for "Purchase").
data = data[data.transaction_type_desc == 'Покупка']
# Interaction log: who bought from which merchant group, and when.
data_df = data[['party_rk', 'merchant_group_rk', 'transaction_dttm']]
# Item metadata: merchant group together with its category label.
item_df = data[['merchant_group_rk', 'category']]

In [33]:
# Preview the last rows of the purchase-only transaction table.
data.tail()

Unnamed: 0,party_rk,account_rk,financial_account_type_cd,transaction_dttm,transaction_type_desc,transaction_amt_rur,merchant_rk,merchant_type,merchant_group_rk,category
991468,92092,118227,2,2020-02-29,Покупка,74.98,1735901.0,286.0,878.0,Супермаркеты
991469,27858,26443,2,2020-02-29,Покупка,587.96,2392692.0,336.0,930.0,Супермаркеты
991470,71242,85948,2,2020-02-29,Покупка,60.5,96766.0,286.0,1350.0,Супермаркеты
991471,49382,124549,1,2020-02-29,Покупка,161.0,800756.0,286.0,866.0,Супермаркеты
991474,89344,118613,2,2020-02-29,Покупка,378.0,883431.0,330.0,1996.0,Фаст Фуд


In [34]:
# Switch to the generic column names used by the rest of the notebook.
data_df = data_df.rename(columns={
    'party_rk': 'user_id',
    'merchant_group_rk': 'item_id',
    'transaction_dttm': 'time',
})
item_df = item_df.rename(columns={
    'merchant_group_rk': 'item_id',
    'category': 'i_category',
})
data_df.head()

Unnamed: 0,user_id,item_id,time
0,56472,425.0,2020-01-01
1,49587,461.0,2020-01-01
2,13825,904.0,2020-01-01
3,44451,546.0,2020-01-01
4,90431,341.0,2020-01-01


In [35]:
# Keep only the items and customers that have at least `min_num` transactions.
# Dropping rare users can push an item below the threshold (and the other way
# round), so both filters are repeated until the row count stops changing.
min_num = 10

print('Filter before:', len(data_df))
filter_before = -1
while filter_before != len(data_df):
    filter_before = len(data_df)
    for stage in ['user_id', 'item_id']:
        # Frequency of each row's own key, aligned with the rows. This replaces
        # building a counts frame and left-merging it on every pass.
        stage_cnt = data_df[stage].map(data_df[stage].value_counts())
        data_df = data_df[stage_cnt >= min_num]
data_df = data_df.reset_index(drop=True)
print('Filter after:', len(data_df))

# Drop metadata of items that did not survive the filter, then keep one row
# per distinct (item_id, i_category) pair.
item_df = item_df[item_df['item_id'].isin(data_df['item_id'])]
item_df = item_df.drop_duplicates().reset_index(drop=True)

Filter before: 948891
Filter after: 894949


### Статистики

In [36]:
# Basic size figures of the filtered interaction log.
n_users = data_df['user_id'].nunique()
n_items = data_df['item_id'].nunique()
n_clicks = len(data_df)
# Earliest and latest transaction date.
min_time, max_time = data_df['time'].min(), data_df['time'].max()

In [37]:
time_format = '%Y-%m-%d'
span_start = min_time.strftime(time_format)
span_end = max_time.strftime(time_format)
# A "bucket" is everything one user has on one date; its length is the row count.
bucket_sizes = data_df.groupby(['user_id', 'time'])['item_id'].count()

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print(f'Time Span: {span_start} - {span_end}')
print(f"Average bucket length: {bucket_sizes.values.mean()}")

# Users: 20464
# Items: 1354
# Interactions: 894949
Time Span: 2020-01-01 - 2020-02-29
Average bucket length: 1.9693056866667107


### Формирование датасета

#### По взаимодействиям

In [38]:
# Seed NumPy's global RNG; the negative sampling below draws from it.
np.random.seed(RANDOM_SEED)

In [39]:
# One row per distinct (user, item, date), ordered by date and then by user.
out_df = (
    data_df[['user_id', 'item_id', 'time']]
    .drop_duplicates(subset=['user_id', 'item_id', 'time'])
    .sort_values(by=['time', 'user_id'], kind='mergesort')
    .reset_index(drop=True)
)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,14,811.0,2020-01-01
1,32,878.0,2020-01-01
2,61,1310.0,2020-01-01
3,64,2213.0,2020-01-01
4,64,779.0,2020-01-01


In [40]:
# Replace raw ids with consecutive integers starting from 1, numbered in
# ascending order of the raw id.
uids = sorted(out_df['user_id'].unique())
iids = sorted(out_df['item_id'].unique())
user2id = {raw_uid: new_uid for new_uid, raw_uid in enumerate(uids, start=1)}
item2id = {raw_iid: new_iid for new_iid, raw_iid in enumerate(iids, start=1)}

out_df['user_id'] = out_df['user_id'].map(user2id)
out_df['item_id'] = out_df['item_id'].map(item2id)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,2,351,2020-01-01
1,5,383,2020-01-01
2,13,657,2020-01-01
3,14,1112,2020-01-01
4,14,337,2020-01-01


In [41]:
# leave-one-out splitting
# Every item each user has anywhere in out_df; sampled negatives must avoid these.
clicked_item_set = {
    user_id: set(seq_df['item_id'].tolist())
    for user_id, seq_df in out_df.groupby('user_id')
}
    
def generate_dev_test(data_df):
    """Peel off each user's last two rows as evaluation sets with negatives.

    Two passes are made. Each pass takes the final remaining row of every
    user (in the frame's current row order), removes those rows from the
    working frame and attaches ``NEG_ITEMS`` randomly drawn item ids to each
    of them. A drawn id that is in the user's entry of the module-level
    ``clicked_item_set`` is redrawn until it is not; the same id may still
    occur more than once within one list.

    Args:
        data_df: interactions with ``user_id`` and ``item_id`` columns.
            Ids are drawn from ``1 .. number of distinct item ids`` in this
            frame — presumably item ids are 1-based and contiguous; verify
            against the caller.

    Returns:
        ``(result_dfs, data_df)``: a list with the two held-out frames in the
        order they were taken (the very last rows first), each carrying a
        ``neg_items`` column, and the input frame without those rows.
    """
    held_out = []
    item_count = data_df['item_id'].nunique()
    for _ in range(2):
        last_rows = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(last_rows.index)
        sampled = np.random.randint(1, item_count + 1, (len(last_rows), NEG_ITEMS))
        # Each `row` is a view into `sampled`, so the redraws update it in place.
        for row, uid in zip(sampled, last_rows['user_id'].values):
            seen = clicked_item_set[uid]
            for col in range(row.shape[0]):
                while row[col] in seen:
                    row[col] = np.random.randint(1, item_count + 1)
        last_rows['neg_items'] = sampled.tolist()
        held_out.append(last_rows)
    return held_out, data_df

In [42]:
# Each user's first row always stays in the training set.
leave_df = out_df.groupby('user_id').head(1)

# The first frame returned holds every user's last row (test), the second the
# row before it (dev); whatever remains goes to training.
held_out, data_df = generate_dev_test(out_df.drop(leave_df.index))
test_df, dev_df = held_out
train_df = pd.concat([leave_df, data_df]).sort_index()

len(train_df), len(dev_df), len(test_df)

(689468, 20461, 20464)

In [43]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,user_id,item_id,time
0,2,351,2020-01-01
1,5,383,2020-01-01
2,13,657,2020-01-01
3,14,1112,2020-01-01
4,14,337,2020-01-01


In [44]:
# Preview the first rows of the test split, including the sampled negatives.
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
27613,1030,442,2020-01-04,"[685, 560, 1217, 836, 764, 1034, 278, 600, 109..."
29124,2538,285,2020-01-04,"[369, 918, 202, 384, 371, 556, 955, 24, 131, 3..."
55059,7205,291,2020-01-06,"[292, 1023, 702, 728, 556, 33, 1036, 213, 1163..."
56197,8358,397,2020-01-06,"[1118, 751, 1112, 161, 148, 585, 456, 1112, 10..."
57646,9712,521,2020-01-06,"[1173, 1009, 1284, 127, 578, 805, 1349, 977, 1..."


In [45]:
# After splitting, replace the datetime values with numeric timestamps.
# NOTE(review): the values are timezone-naive; confirm which timezone
# `.timestamp()` assumes for them.
for split_df in (train_df, dev_df, test_df):
    split_df['time'] = split_df['time'].apply(lambda moment: moment.timestamp())

In [46]:
# Write the three splits next to the raw data as tab-separated files.
for file_name, split_frame in (
    ('train.csv', train_df),
    ('dev.csv', dev_df),
    ('test.csv', test_df),
):
    split_frame.to_csv(os.path.join(RAW_PATH, file_name), sep='\t', index=False)

### Граф знаний

In [47]:
# Number the category labels from 1, in sorted order of the label.
cids = sorted(item_df['i_category'].unique())
cat2id = {label: code for code, label in enumerate(cids, start=1)}

In [48]:
# Encode categories with cat2id and items with the same item2id mapping that
# was applied to the interactions.
item_df['i_category'] = item_df['i_category'].map(cat2id)
# Plain dict lookup: an item id missing from item2id raises KeyError.
item_df['item_id'] = [item2id[raw_iid] for raw_iid in item_df['item_id']]

In [49]:
# Preview the item metadata after id encoding.
item_df.head()

Unnamed: 0,item_id,i_category
0,183,21
1,201,29
2,400,29
3,237,27
4,146,27


In [50]:
# complements: for every item, the 5 items that most often share a user with it.
# A user's basket is the set of distinct items they have in train_df.
bought_item_set = {
    user_id: list(set(seq_df['item_id'].values.tolist()))
    for user_id, seq_df in train_df.groupby('user_id')
}
items = set(item_df['item_id'].unique())

# Count co-occurrences as we go instead of first concatenating, per item, the
# other items of every basket. The counters see the elements in the same order
# the concatenated lists had, so most_common() breaks ties the same way.
occurences_dict = defaultdict(Counter)
for ls in bought_item_set.values():
    for item in ls:
        occurences_dict[item].update(x for x in ls if x != item)
occurences_dict = dict(occurences_dict)

most_related_items = {
    k: [x for x, _ in v.most_common(5)]
    for k, v in occurences_dict.items()
}

# substitutes: up to 10 items of the item's category, the item itself excluded.
# NOTE(review): item_df holds distinct (item_id, i_category) rows at this
# point, so these value_counts look like they are all 1 and head(10) would not
# be a popularity ranking — confirm that an arbitrary 10 per category is intended.
category_items = {}
for category in item_df.i_category.unique():
    in_category = item_df[item_df.i_category == category]['item_id']
    category_items[category] = list(in_category.value_counts().head(10).index)

# An item may have several rows (one per category); as before, the category of
# its first row is used. One drop_duplicates pass replaces a full-frame scan
# per item.
first_rows = item_df.drop_duplicates(subset='item_id')
item_substitutes = {}
for item, category in zip(first_rows['item_id'].values, first_rows['i_category'].values):
    substitutes = category_items.get(category, [])
    item_substitutes[item] = [x for x in substitutes if x != item]

In [51]:
# Attach both relation lists to the metadata; items without an entry get [].
item_df['r_complement'] = item_df['item_id'].map(lambda iid: most_related_items.get(iid, []))
item_df['r_substitute'] = item_df['item_id'].map(lambda iid: item_substitutes.get(iid, []))

In [52]:
# Preview the item metadata with the complement/substitute lists attached.
item_df.head()

Unnamed: 0,item_id,i_category,r_complement,r_substitute
0,183,21,"[383, 146, 70, 293, 418]","[168, 172, 134, 678, 164, 675, 157, 1172, 657,..."
1,201,29,"[383, 70, 293, 146, 397]","[1279, 1278, 1168, 1130, 677, 422, 1063, 1320,..."
2,400,29,"[383, 437, 70, 293, 146]","[1279, 1278, 1168, 1130, 677, 422, 1063, 1320,..."
3,237,27,"[405, 383, 315, 154, 508]","[1066, 693, 154, 1179, 668, 363, 671, 128, 163..."
4,146,27,"[383, 418, 70, 293, 397]","[1066, 693, 154, 1179, 668, 363, 671, 128, 163..."


In [53]:
# Number of metadata rows.
# NOTE(review): rows are distinct (item_id, i_category) pairs, so an item
# listed under several categories appears more than once — confirm the
# consumer of item_meta.csv accepts repeated item ids.
len(item_df)

2422

In [54]:
# Write the item metadata next to the splits as a tab-separated file.
item_meta_path = os.path.join(RAW_PATH, 'item_meta.csv')
item_df.to_csv(item_meta_path, sep='\t', index=False)