## Подготовка датасета Ta-Feng

In [1]:
from datetime import datetime
from collections import Counter, defaultdict

import os
import re
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Sampling settings: seed for NumPy's RNG and the number of negative items
# drawn for every held-out interaction.
NEG_ITEMS = 99
RANDOM_SEED = 0

# Dataset name and the directory holding its raw and generated files.
DATASET = 'ta_feng'
RAW_PATH = os.path.join('./data/', DATASET)

In [3]:
# Echo the resolved dataset directory as the cell output.
RAW_PATH

'./data/ta_feng'

### Загрузка данных

In [5]:
# Read the raw Ta-Feng transaction log; rows with any missing value are dropped
# and the index is renumbered from 0.
file_path = os.path.join(RAW_PATH, 'ta_feng.csv')
data = pd.read_csv(file_path).dropna().reset_index(drop=True)

# Take explicit copies so that later renames / column assignments operate on
# independent frames rather than on slices of `data` (avoids chained assignment).
data_df = data[['CUSTOMER_ID', 'PRODUCT_ID', 'TRANSACTION_DT']].copy()
item_df = data[['PRODUCT_ID', 'PRODUCT_SUBCLASS']].copy()

In [6]:
# Work on independent copies: the frames may be column slices of the raw table,
# and assigning into a slice triggers pandas' chained-assignment warning.
data_df = data_df.copy()
data_df.columns = ['user_id', 'item_id', 'time']
# Parse the month/day/year date strings in a single vectorized call.
data_df['time'] = pd.to_datetime(data_df['time'], format='%m/%d/%Y')

item_df = item_df.copy()
item_df.columns = ['item_id', 'i_category']
data_df.head()

Unnamed: 0,user_id,item_id,time
0,1104905,4710199010372,2000-11-01
1,418683,4710857472535,2000-11-01
2,1057331,4710043654103,2000-11-01
3,1849332,4710126092129,2000-11-01
4,1981995,4710176021445,2000-11-01


In [7]:
# Keep only users and items that have at least `min_num` transactions.
min_num = 10

print('Filter before:', len(data_df))
# Dropping sparse users can push items below the threshold (and vice versa),
# so both filters are repeated until a full pass removes nothing.
filter_before = -1
while filter_before != len(data_df):
    filter_before = len(data_df)
    for stage in ['user_id', 'item_id']:
        # Per-row occurrence count of the row's own user/item id; mapping the
        # value counts back avoids building and merging a temporary frame.
        cnt = data_df[stage].map(data_df[stage].value_counts())
        data_df = data_df[cnt >= min_num]
# Renumber the surviving rows 0..n-1 (row order is unchanged).
data_df = data_df.reset_index(drop=True)
print('Filter after:', len(data_df))

# Drop metadata rows of items that did not survive the filtering, then
# remove duplicated metadata rows.
item_df = item_df[item_df['item_id'].isin(data_df['item_id'])]
item_df = item_df.drop_duplicates().reset_index(drop=True)

Filter before: 795379
Filter after: 686390


### Статистики

In [8]:
# Summary statistics of the filtered log: distinct users and items, number of
# interactions, and the earliest and latest transaction time.
user_col, item_col, time_col = data_df['user_id'], data_df['item_id'], data_df['time']
n_clicks = len(data_df)
n_users, n_items = user_col.nunique(), item_col.nunique()
min_time, max_time = time_col.min(), time_col.max()

In [9]:
# Display the filtered interaction table as the cell output.
data_df

Unnamed: 0,user_id,item_id,time
0,1104905,4710199010372,2000-11-01
1,418683,4710857472535,2000-11-01
2,1057331,4710043654103,2000-11-01
3,1849332,4710126092129,2000-11-01
4,1981995,4710176021445,2000-11-01
...,...,...,...
686385,234658,4710168182031,2001-02-28
686386,556941,8888021800401,2001-02-28
686387,57486,4710731060124,2001-02-28
686388,733526,4716340052307,2001-02-28


In [10]:
# Print the dataset statistics; dates are rendered as year-month-day.
time_format = '%Y-%m-%d'

for label, value in (
    ('# Users:', n_users),
    ('# Items:', n_items),
    ('# Interactions:', n_clicks),
):
    print(label, value)

span_start = min_time.strftime(time_format)
span_end = max_time.strftime(time_format)
print('Time Span: {}/{}'.format(span_start, span_end))

# Users: 19360
# Items: 10561
# Interactions: 686390
Time Span: 2000-11-01/2001-02-28


### Формирование датасета

#### По взаимодействиям

In [11]:
# Seed NumPy's global random generator with RANDOM_SEED.
np.random.seed(RANDOM_SEED)

In [12]:
# Interaction log used for splitting: unique (user, item, time) rows ordered by
# time and then by user, with a fresh 0..n-1 index.
key_cols = ['user_id', 'item_id', 'time']
out_df = (
    data_df[key_cols]
    .drop_duplicates(key_cols)
    .sort_values(by=['time', 'user_id'], kind='mergesort')
    .reset_index(drop=True)
)
print(len(out_df))
out_df.head()

686390


Unnamed: 0,user_id,item_id,time
0,45902,4710147100018,2000-11-01
1,45902,4710088434692,2000-11-01
2,45902,4710594912028,2000-11-01
3,45902,4710285000126,2000-11-01
4,45957,4710265849066,2000-11-01


In [13]:
# Re-index users and items to consecutive integer ids starting at 1, assigned
# in ascending order of the raw ids.
uids = sorted(out_df['user_id'].unique())
iids = sorted(out_df['item_id'].unique())
user2id = {raw_id: new_id for new_id, raw_id in enumerate(uids, start=1)}
item2id = {raw_id: new_id for new_id, raw_id in enumerate(iids, start=1)}

# Look every raw id up in its mapping (an unknown id raises KeyError).
out_df['user_id'] = out_df['user_id'].map(user2id.__getitem__)
out_df['item_id'] = out_df['item_id'].map(item2id.__getitem__)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,214,3294,2000-11-01
1,214,2598,2000-11-01
2,214,5001,2000-11-01
3,214,3910,2000-11-01
4,215,3885,2000-11-01


In [14]:
# leave-one-out splitting
# For every user, the set of all item ids the user interacted with.
clicked_item_set = {
    user_id: set(seq_df['item_id'].values.tolist())
    for user_id, seq_df in out_df.groupby('user_id')
}
    
def generate_dev_test(data_df, clicked_items=None, n_neg=None):
    """Split off two leave-one-out evaluation sets and sample negatives for them.

    Two rounds are performed. In each round the last remaining row of every
    user (in the current row order of ``data_df``) is moved into a held-out
    frame. For every held-out row, ``n_neg`` item ids are drawn uniformly from
    ``1..n_items`` and re-drawn until none of them is in that user's clicked
    set. Sampling uses NumPy's global random generator.

    Args:
        data_df: Interactions with ``user_id`` and ``item_id`` columns.
            Held-out rows are removed by index label, so the index is assumed
            to be unique. The caller's frame is not modified.
        clicked_items: Mapping ``user_id -> set of item ids`` that must not be
            sampled as negatives. Defaults to the notebook-level
            ``clicked_item_set``.
        n_neg: Number of negatives per held-out row. Defaults to the
            notebook-level ``NEG_ITEMS``.

    Returns:
        ``(result_dfs, remaining_df)``: ``result_dfs`` is a list with the
        held-out frame of the first and of the second round, each carrying an
        extra ``neg_items`` column of lists; ``remaining_df`` is ``data_df``
        without the held-out rows.

    Raises:
        ValueError: If a user has clicked every id in ``1..n_items``, so that
            no valid negative exists and re-drawing would never terminate.
    """
    if clicked_items is None:
        clicked_items = clicked_item_set
    if n_neg is None:
        n_neg = NEG_ITEMS

    # NOTE(review): the sampling range is the number of distinct items in
    # `data_df`; it covers all item ids only if ids are consecutive from 1 and
    # every item occurs in `data_df` - confirm against the caller.
    n_items = data_df['item_id'].nunique()

    result_dfs = []
    for _ in range(2):
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, n_items + 1, (len(result_df), n_neg))
        for row, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked_items[uid]
            # Cheap length test first; the full scan only runs for users whose
            # history is at least as large as the candidate range.
            if n_neg > 0 and len(user_clicked) >= n_items and all(
                    item in user_clicked for item in range(1, n_items + 1)):
                raise ValueError(
                    'user {} has interacted with every candidate item; '
                    'cannot sample negatives'.format(uid)
                )
            for col in range(n_neg):
                # Re-draw one value at a time until it is not a clicked item.
                while neg_items[row, col] in user_clicked:
                    neg_items[row, col] = np.random.randint(1, n_items + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [15]:
# Each user's first interaction is set aside and always ends up in the
# training data; dev/test rows are taken from the remaining interactions.
leave_df = out_df.groupby('user_id').head(1)
data_df = out_df.drop(index=leave_df.index)

held_out_dfs, data_df = generate_dev_test(data_df)
test_df, dev_df = held_out_dfs
# Training data = set-aside first interactions + rows left after the split,
# restored to the original row order.
train_df = pd.concat((leave_df, data_df)).sort_index()

len(train_df), len(dev_df), len(test_df)

(647670, 19360, 19360)

In [16]:
# after splitting, convert time to timestamp
# (each value is replaced by the result of its `.timestamp()` call)
for split_df in (train_df, dev_df, test_df):
    split_df['time'] = split_df['time'].map(lambda ts: ts.timestamp())

In [17]:
# save results: one tab-separated file per split, written without the index
for file_name, split_df in (
    ('train.csv', train_df),
    ('dev.csv', dev_df),
    ('test.csv', test_df),
):
    split_df.to_csv(os.path.join(RAW_PATH, file_name), sep='\t', index=False)

### Граф знаний

In [18]:
# Map every category value to a consecutive integer id starting at 1,
# assigned in ascending order of the raw values.
cids = sorted(item_df['i_category'].unique())
cat2id = {raw_cat: cat_id for cat_id, raw_cat in enumerate(cids, start=1)}

In [19]:
# Replace raw category and item ids in the metadata by their re-indexed ids
# (an id missing from a mapping raises KeyError).
item_df['i_category'] = item_df['i_category'].map(cat2id.__getitem__)
item_df['item_id'] = item_df['item_id'].map(item2id.__getitem__)

In [20]:
# Largest item id present in the training split.
train_df['item_id'].values.max()

10561

In [23]:
# complement
# Distinct items bought by each user in the training split.
bought_item_set = {
    user_id: list(set(seq_df['item_id'].values.tolist()))
    for user_id, seq_df in train_df.groupby('user_id')
}

# For every item, count how many users bought each other item together with
# it. Counting incrementally keeps memory proportional to the number of
# distinct item pairs instead of materializing one long list per item.
occurences_dict = defaultdict(Counter)
for user_items in bought_item_set.values():
    for item in user_items:
        occurences_dict[item].update(x for x in user_items if x != item)
occurences_dict = dict(occurences_dict)

# Complements of an item: its (up to) five most frequently co-bought items.
most_related_items = {
    item: [other for other, _ in co_counts.most_common(5)]
    for item, co_counts in occurences_dict.items()
}

# substitutes
# NOTE(review): if `item_df` holds one row per (item, category) pair, every
# count below is 1 and `head(10)` just picks ten items by tie order rather
# than by popularity - confirm this is intended.
category_items = {
    category: list(group['item_id'].value_counts().head(10).index)
    for category, group in item_df.groupby('i_category', sort=False)
}

# Category of each item, taken from the item's first metadata row; a single
# lookup table replaces a full-frame scan per item.
first_category = item_df.drop_duplicates('item_id').set_index('item_id')['i_category']

# Substitutes of an item: the items selected for its category, minus itself.
item_substitutes = {
    item: [x for x in category_items.get(category, []) if x != item]
    for item, category in first_category.items()
}

In [24]:
# Attach the relation lists to the metadata; items without an entry get an
# empty list.
item_df['r_complement'] = item_df['item_id'].map(
    lambda item_id: most_related_items.get(item_id, [])
)
item_df['r_substitute'] = item_df['item_id'].map(
    lambda item_id: item_substitutes.get(item_id, [])
)

In [25]:
# Preview the first rows of the item metadata.
item_df.head()

Unnamed: 0,item_id,i_category,r_complement,r_substitute
0,3638,208,"[8154, 2459, 6013, 2498, 1674]","[3020, 2553, 2460, 2522, 3827, 2459, 1426, 245..."
1,5515,263,"[8154, 6013, 6163, 6167, 8967]","[8255, 6301, 8641, 6466, 6467, 5510, 5511, 173..."
2,2061,61,"[6013, 8154, 5580, 1980, 3051]","[6709, 2062, 4757, 4051, 10258, 10257, 10256, ..."
3,3173,264,"[6173, 8154, 1486, 6171, 1477]","[2618, 2622, 6529, 7873, 9143, 8328, 7564, 756..."
4,3488,18,"[1707, 8154, 6013, 1477, 5785]","[9626, 1030, 10413, 10412, 10411, 10410, 10409..."


In [26]:
# save results: item metadata as a tab-separated file, written without the index
item_meta_path = os.path.join(RAW_PATH, 'item_meta.csv')
item_df.to_csv(item_meta_path, sep='\t', index=False)