In [1]:
import pandas as pd
import numpy as np

import torch

from tqdm.autonotebook import tqdm


import glob
import os

import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
import yaml

from recbole.data.interaction import Interaction
from recbole.model.sequential_recommender import CORE

np.random.seed(1488)

  from tqdm.autonotebook import tqdm


In [2]:
# Directory whose immediate subfolders are searched for a `data.json` file to load.
DATA_DIR = "hw/train_data"

In [3]:
# Find every `<DATA_DIR>/<subfolder>/data.json`, read each as JSON Lines and
# stack the resulting frames into one.
log_paths = glob.glob(DATA_DIR + "/*/data.json")
log_frames = [pd.read_json(log_path, lines=True) for log_path in log_paths]
data = pd.concat(log_frames)

# Cast timestamps to int64 and integer-divide by 10**6
# (presumably nanoseconds -> milliseconds; confirm against the parsed dtype).
timestamp_as_int = data['timestamp'].astype('int64')
data['timestamp'] = timestamp_as_int // 10**6

In [4]:
# Display the concatenated frame.
data

Unnamed: 0,message,timestamp,user,track,time,latency,recommendation,experiments
0,next,1744372533675,4578,44152,1.00,0.004421,30726.0,{'PERSONALIZED': 'T1'}
1,next,1744372533692,5029,941,0.97,0.000878,16711.0,{'PERSONALIZED': 'T1'}
2,next,1744372533702,5009,44461,0.00,0.000712,44654.0,{'PERSONALIZED': 'C'}
3,next,1744372533712,5975,31045,0.08,0.000787,8676.0,{'PERSONALIZED': 'T1'}
4,next,1744372533722,3998,4803,0.61,0.000725,24918.0,{'PERSONALIZED': 'C'}
...,...,...,...,...,...,...,...,...
365482,next,1744375060812,3867,30823,0.56,0.000482,19608.0,{'PERSONALIZED': 'C'}
365483,next,1744375060841,3495,9025,0.00,0.000510,28456.0,{'PERSONALIZED': 'T1'}
365484,last,1744375060871,7584,28549,0.00,0.000133,,{'PERSONALIZED': 'C'}
365485,next,1744375060905,4009,29513,0.00,0.000453,896.0,{'PERSONALIZED': 'T1'}


In [58]:
# Read the track catalogue (JSON Lines) and keep one row per `track` value.
all_tracks = pd.read_json("botify/data/tracks.json", lines=True)
track_metadata = all_tracks.drop_duplicates(subset=["track"])
track_metadata

Unnamed: 0,artist,album,title,genre,pop,duration,track
0,Михаил Бублик,ART-Обстрел I-часть,Сорок тысяч верст,"[1, 47]",-0.500252,282,41164
1,Xamdam Sobirov,Baxtli Bo'lolmadik,Baxtli Bo'lolmadik,[1],-0.942953,205,27544
2,Сергей Какенов,Ишимская шпана,Крутые лагеря,[147],-0.801382,252,34702
3,Loc-Dog,Electrodog 2,Еду убивать,[17],-0.577525,276,45907
4,Gafur,Февраль,Февраль,[1],-0.738636,160,14978
...,...,...,...,...,...,...,...
49995,Дмитрий Гревцев,Вечер под луной,Вечер под луной,[81],-0.572433,186,1794
49996,TSOY,Торнадо,Торнадо,[1],-1.171082,162,6622
49997,Тимати,Транзит,Хавчик,[10],-0.239359,185,25165
49998,Краски,Он не знает ничего (DFM Mix),Он не знает ничего (DFM Mix),[1],-0.028292,160,7780


In [39]:
# Keep only rows whose `time` value exceeds 0.8, restricted to the four
# columns that are later written to the interaction file.
kept_rows = data[data['time'] > 0.8]
data_filtered = kept_rows[['timestamp', 'user', 'track', 'time']]
data_filtered

Unnamed: 0,timestamp,user,track,time
0,1744372533675,4578,44152,1.00
1,1744372533692,5029,941,0.97
5,1744372533734,5630,18177,1.00
11,1744372533774,3169,32980,1.00
16,1744372533814,6915,16370,1.00
...,...,...,...,...
365454,1744375060416,5206,14284,1.00
365461,1744375060483,1419,16203,0.83
365462,1744375060493,1992,15672,0.89
365472,1744375060591,7485,4494,1.00


In [None]:
def prepare_recbole_atomic_file(df, feature_type_dict):
    """
    Convert DataFrame to RecBole atomic file format

    Every column ``col`` of ``df`` is renamed to ``"{col}:{type}"``, where
    ``type`` is looked up in ``feature_type_dict``. The input frame itself is
    left untouched; a renamed copy is returned.

    Args:
        df (pandas.DataFrame): Original DataFrame
        feature_type_dict (dict): Mapping from column name to feature type
                                 (token, token_seq, float, float_seq)

    Returns:
        pandas.DataFrame: Copy of ``df`` with columns renamed to RecBole format

    Raises:
        KeyError: If a column of ``df`` has no entry in ``feature_type_dict``.
    """
    # Fail early with a message that names every column lacking a type.
    missing = [col for col in df.columns if col not in feature_type_dict]
    if missing:
        raise KeyError(f"No feature type given for column(s): {missing}")

    recbole_df = df.copy()

    # Build the header positionally (a list, not a dict) so that repeated
    # column names still yield exactly one header entry per column.
    recbole_df.columns = [f"{col}:{feature_type_dict[col]}" for col in df.columns]

    return recbole_df


def save_as_atomic_file(df, dataset_name, atomic_type, output_dir):
    """
    Write a DataFrame to ``<output_dir>/<dataset_name>.<atomic_type>``.

    The output directory is created if it does not exist yet. The frame is
    written tab-separated, with its header row and without its index.

    Args:
        df (pandas.DataFrame): DataFrame with RecBole-formatted columns
        dataset_name (str): Name of the dataset (used as the file stem)
        atomic_type (str): Type of atomic file (inter, user, item, kg, link, net),
                           used as the file extension
        output_dir (str): Directory to save the file
    """
    os.makedirs(output_dir, exist_ok=True)

    atomic_file_name = "{}.{}".format(dataset_name, atomic_type)
    target_path = os.path.join(output_dir, atomic_file_name)

    df.to_csv(target_path, sep='\t', index=False)


In [42]:
# RecBole feature type for each column of the interaction (.inter) file.
inter_feature_types = {
    'user': 'token',
    'track': 'token',
    'time': 'float',
    'timestamp': 'float',
}

# Pick the columns in the order they should appear in the file, tag each
# header with its feature type, then write the atomic file.
inter_columns = ['user', 'track', 'time', 'timestamp']
recbole_inter_df = prepare_recbole_atomic_file(
    data_filtered[inter_columns],
    inter_feature_types,
)

save_as_atomic_file(
    recbole_inter_df,
    dataset_name='botify_filtered',
    atomic_type='inter',
    output_dir='./dataset/botify_filtered',
)

In [43]:
# Flatten each genre list into a space-separated string.
# Values that are already strings are passed through unchanged, so re-running
# this cell does not re-join (and thereby corrupt) an already-converted column.
track_metadata['genre'] = track_metadata['genre'].apply(
    lambda genres: genres if isinstance(genres, str) else ' '.join(map(str, genres))
)

In [44]:
# RecBole feature type for each column of the item (.item) file.
track_feature_types = {
    'artist': 'token_seq',
    'album': 'token_seq',
    'title': 'token_seq',
    'genre': 'token_seq',
    'pop': 'float',
    'track': 'token',
}

# Pick the metadata columns in file order, tag each header with its feature
# type, then write the atomic file next to the interaction file.
item_columns = ['artist', 'album', 'title', 'genre', 'pop', 'track']
recbole_track_df = prepare_recbole_atomic_file(
    track_metadata[item_columns],
    track_feature_types,
)

save_as_atomic_file(
    recbole_track_df,
    dataset_name='botify_filtered',
    atomic_type='item',
    output_dir='./dataset/botify_filtered',
)

In [59]:
# Columns to load from each atomic file; the names match the headers written
# to the .inter and .item files.
load_columns = {
    'inter': ['timestamp', 'user', 'track', 'time'],
    'item': ['artist', 'album', 'title', 'genre', 'pop', 'track'],
}

# Value passed through as the `eval_args` setting.
evaluation_args = {
    'split': {'RS': [9, 1, 0]},
    'group_by': 'user',
    'order': 'TO',
    'mode': 'full',
}

config_dict = {
    # training
    'epochs': 5,
    'seed': 1488,
    'train_batch_size': 2048,
    'eval_batch_size': 4096,
    'train_neg_sample_args': None,
    'loss_type': 'CE',
    # field names
    'USER_ID_FIELD': 'user',
    'ITEM_ID_FIELD': 'track',
    'TIME_FIELD': 'timestamp',
    # filtering
    'user_inter_num_interval': "[3,inf)",
    'item_inter_num_interval': "[3,inf)",
    # file parsing
    'load_col': load_columns,
    'field_separator': '\t',
    'encoding': 'utf-8',
    'seq_separator': " ",
    'numerical_features': ['pop'],
    # evaluation and sequence length
    'eval_args': evaluation_args,
    'MAX_ITEM_LIST_LENGTH': 50,
}

# Serialise the settings to YAML and store them where the Config cell reads them.
yaml_string = yaml.dump(config_dict)
with open("seq_config.yaml", "w") as config_file:
    config_file.write(yaml_string)

In [12]:
config = Config(model='CORE', dataset='botify_filtered', config_file_list=["seq_config.yaml"])

# init random seed
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()

# Attach a console handler only when the root logger does not already have one.
# Adding it unconditionally made every record print twice in this notebook
# (init_logger appears to install its own console handler) and stacked one more
# duplicate each time the cell was re-run.
# logging.FileHandler subclasses StreamHandler, so it is excluded from the check.
has_console_handler = any(
    isinstance(handler, logging.StreamHandler)
    and not isinstance(handler, logging.FileHandler)
    for handler in logger.handlers
)
if not has_console_handler:
    c_handler = logging.StreamHandler()
    c_handler.setLevel(logging.INFO)
    logger.addHandler(c_handler)

# write config info into log
logger.info(config)

11 Apr 17:34    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 1488
state = INFO
reproducibility = True
data_path = dataset/botify_filtered
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 5
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [9, 1, 0]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = True
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameter

In [13]:
# Build the dataset object from `config` and log its summary.
dataset = create_dataset(config)
logger.info(dataset)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[field].fillna(value="", inplace=True)
  split_point = np.cumsum(feat[field].agg(len))[:-1]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermed

In [14]:
# Split `dataset` into train / validation / test parts as configured
# (presumably driven by the `eval_args` split setting; confirm).
train_data, valid_data, test_data = data_preparation(config, dataset)

11 Apr 17:35    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}]
[Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}]
11 Apr 17:35    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 1, 0]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 1, 0]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]


In [15]:
# model loading and initialization: build CORE on the training dataset and
# move it to the configured device
model = CORE(config, train_data.dataset).to(config['device'])
logger.info(model)

# trainer loading and initialization
trainer = Trainer(config, model)

# model training
# NOTE(review): `valid_data` is not passed to `fit`, so the returned
# best_valid_* values are presumably not backed by any validation run.
# Confirm whether `trainer.fit(train_data, valid_data)` was intended.
best_valid_score, best_valid_result = trainer.fit(train_data)

11 Apr 17:36    INFO  CORE(
  (sess_dropout): Dropout(p=0.2, inplace=False)
  (item_dropout): Dropout(p=0.2, inplace=False)
  (item_embedding): Embedding(36063, 64, padding_idx=0)
  (net): TransNet(
    (position_embedding): Embedding(50, 64)
    (trm_encoder): TransformerEncoder(
      (layer): ModuleList(
        (0-1): 2 x TransformerLayer(
          (multi_head_attention): MultiHeadAttention(
            (query): Linear(in_features=64, out_features=64, bias=True)
            (key): Linear(in_features=64, out_features=64, bias=True)
            (value): Linear(in_features=64, out_features=64, bias=True)
            (softmax): Softmax(dim=-1)
            (attn_dropout): Dropout(p=0.5, inplace=False)
            (dense): Linear(in_features=64, out_features=64, bias=True)
            (LayerNorm): LayerNorm((64,), eps=1e-12, elementwise_affine=True)
            (out_dropout): Dropout(p=0.5, inplace=False)
          )
          (feed_forward): FeedForward(
            (dense_1): Linear(i

In [23]:
# Map every internal user id back to its external token. The first element of
# the returned array is 'PAD' (RecBole's default), so it is dropped.
all_internal_uids = list(range(dataset.user_num))
all_user_tokens = dataset.id2token(dataset.uid_field, all_internal_uids)
external_user_ids = all_user_tokens[1:]
external_user_ids

array(['4578', '5029', '5630', ..., '7388', '6011', '6980'], dtype='<U5')

In [31]:
def add_last_item(old_interaction, last_item_id, max_len=50):
    """
    Return the last item sequence of ``old_interaction`` extended by one item.

    If the recorded length of the last row is below ``max_len``, the item is
    written at position ``item_length``. Otherwise the sequence is shifted left
    by one and the item is placed in the final slot.

    Args:
        old_interaction: Object indexable by ``'track_list'`` (tensor of item-id
            sequences, one per row) and ``'item_length'`` (tensor of sequence
            lengths); only the last row of each is used.
        last_item_id (int): Item id to append to the sequence.
        max_len (int): Maximum sequence length.

    Returns:
        torch.Tensor: New tensor of shape ``(1, seq_len)``. The tensors held by
        ``old_interaction`` are not modified.
    """
    # Clone first: integer indexing returns a view, so writing into it would
    # silently alter the caller's `track_list` tensor.
    new_seq_items = old_interaction['track_list'][-1].clone()
    current_len = old_interaction['item_length'][-1].item()
    if current_len < max_len:
        new_seq_items[current_len] = last_item_id
    else:
        # Sequence is full: shift left by one, then overwrite the final slot.
        new_seq_items = torch.roll(new_seq_items, -1)
        new_seq_items[-1] = last_item_id
    return new_seq_items.view(1, len(new_seq_items))

def predict_for_all_item(external_user_id, dataset, model, k=100):
    """
    Score every item for one user and return the ``k`` best.

    The user's rows are selected from ``dataset``; the last row's item sequence
    is extended with that row's own ``'track'`` value (via ``add_last_item``)
    and the result is scored with ``model.full_sort_predict``.

    Everything is derived from the arguments: the device comes from the model's
    parameters and the item count from ``dataset``, so the function no longer
    relies on the notebook-level ``config`` and ``test_data`` variables.

    Args:
        external_user_id: External user token, as returned by ``dataset.id2token``.
        dataset: Dataset providing ``token2id``, ``uid_field``, ``inter_feat``,
            ``item_num`` and row selection by boolean mask.
        model: Trained model providing ``max_seq_length`` and ``full_sort_predict``.
        k (int): Number of top-scoring items to return. Defaults to 100.

    Returns:
        The result of ``torch.topk``: ``(values, indices)`` over the score matrix.
        The indices are internal item ids; the padding id 0 is excluded by
        setting its score to ``-inf``.
    """
    model.eval()
    with torch.no_grad():
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        index = np.isin(dataset.inter_feat[dataset.uid_field].numpy(), uid_series)
        input_interaction = dataset[index]

        last_len = input_interaction['item_length'][-1].item()
        last_item = input_interaction['track'][-1].item()
        test = {
            'track_list': add_last_item(input_interaction, last_item, model.max_seq_length),
            # The sequence grows by one item but is capped at max_seq_length.
            'item_length': torch.tensor([min(last_len + 1, model.max_seq_length)]),
        }

        # Put the inputs on whatever device the model's weights live on.
        device = next(model.parameters()).device
        new_inter = Interaction(test).to(device)

        new_scores = model.full_sort_predict(new_inter)
        new_scores = new_scores.view(-1, dataset.item_num)
        new_scores[:, 0] = -np.inf  # set scores of [pad] to -inf
    return torch.topk(new_scores, k)

In [33]:
# For each user: score all items, take the indices of the last score row,
# and translate those internal item ids back to external track tokens.
topk_items = []
for user_token in tqdm(external_user_ids):
    topk_result = predict_for_all_item(user_token, dataset, model)
    ranked_item_ids = topk_result[1][-1].cpu()
    ranked_tokens = dataset.id2token(dataset.iid_field, ranked_item_ids)
    topk_items.append(ranked_tokens.tolist())
print(len(topk_items))

  0%|          | 0/10000 [00:00<?, ?it/s]

10000


In [41]:
# One row per user: the external user token plus its recommended tracks,
# with every track token converted to int.
tracks_as_int = [[int(track_token) for track_token in user_recs] for user_recs in topk_items]
result = pd.DataFrame(external_user_ids, columns=['user']).assign(tracks=tracks_as_int)
result.head()

Unnamed: 0,user,tracks
0,4578,"[3790, 49513, 44152, 652, 45500, 45505, 35289,..."
1,5029,"[5556, 1170, 10548, 1009, 1239, 1172, 5120, 15..."
2,5630,"[48797, 18177, 12746, 4963, 23720, 28424, 4913..."
3,3169,"[5131, 32980, 33449, 26180, 24929, 33116, 2622..."
4,6915,"[7121, 13510, 16370, 8109, 26884, 6491, 4584, ..."


In [42]:
# Write the recommendations as JSON Lines: one record (row) per line.
result.to_json('./botify/data/recommendations_core.json', orient='records', lines=True)