In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
train_df = pd.read_csv('./data/mind/train/behaviors.tsv', sep='\t', names=['impression_id', 'uid', 'time', 'history', 'impressions'])

In [3]:
train_df

Unnamed: 0,impression_id,uid,time,history,impressions
0,1,U87243,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,2,U598644,11/12/2019 1:45:29 PM,N56056 N8726 N70353 N67998 N83823 N111108 N107...,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,3,U532401,11/13/2019 11:23:03 AM,N128643 N87446 N122948 N9375 N82348 N129412 N5...,N103852-0 N53474-0 N127836-0 N47925-1
3,4,U593596,11/12/2019 12:24:09 PM,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...
4,5,U239687,11/14/2019 8:03:01 PM,N65250 N122359 N71723 N53796 N41663 N41484 N11...,N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3...
...,...,...,...,...,...
2232743,2232744,U316192,11/13/2019 6:50:02 PM,N122359 N37069 N95876 N28787 N73408 N11266 N61321,N113723-0 N123683-1 N5287-0 N76677-0 N53474-0
2232744,2232745,U451238,11/12/2019 8:54:06 AM,N12575 N93816 N71643 N87236 N87236,N18861-0 N20990-0 N43085-0 N7937-1
2232745,2232746,U151246,11/13/2019 12:42:51 PM,N27587 N49668,N39887-1 N22811-0 N110709-1 N1923-0 N24001-1 N...
2232746,2232747,U330725,11/12/2019 1:22:57 PM,N121944 N91510 N42280 N60061 N63032 N125223 N4...,N18947-0 N88808-1 N10012-0 N38902-0 N33078-0 N...


In [4]:
old_interactions = train_df[~train_df['history'].isna()].groupby('uid')['history'].apply(lambda x: x.iloc[0].split()).explode()

In [5]:
old_interactions

uid
U0         N39011
U0        N112324
U0         N78884
U0        N111503
U0         N63941
           ...   
U99998     N53265
U99999     N85484
U99999     N34094
U99999     N17892
U99999     N12027
Name: history, Length: 13350150, dtype: object

In [6]:
import gc
gc.collect()

0

In [7]:
train_df['impressions'] = train_df['impressions'].apply(lambda x: [imp[:-2] for imp in x.split() if imp[-1] == '1'])

In [8]:
new_interactions = train_df[['uid', 'time', 'impressions']].explode('impressions')
new_interactions

Unnamed: 0,uid,time,impressions
0,U87243,11/10/2019 11:30:54 AM,N94157
0,U87243,11/10/2019 11:30:54 AM,N78699
0,U87243,11/10/2019 11:30:54 AM,N71090
0,U87243,11/10/2019 11:30:54 AM,N31174
1,U598644,11/12/2019 1:45:29 PM,N25587
...,...,...,...
2232745,U151246,11/13/2019 12:42:51 PM,N24001
2232745,U151246,11/13/2019 12:42:51 PM,N127572
2232745,U151246,11/13/2019 12:42:51 PM,N10285
2232746,U330725,11/12/2019 1:22:57 PM,N88808


In [9]:
old_interactions = old_interactions.reset_index(drop=False)
old_interactions

Unnamed: 0,uid,history
0,U0,N39011
1,U0,N112324
2,U0,N78884
3,U0,N111503
4,U0,N63941
...,...,...
13350145,U99998,N53265
13350146,U99999,N85484
13350147,U99999,N34094
13350148,U99999,N17892


In [10]:
new_interactions['dttm'] = pd.to_datetime(new_interactions['time'], format='%m/%d/%Y %I:%M:%S %p')
new_interactions['ts'] = new_interactions['dttm'].values.astype(np.int64) // 10 ** 9
new_interactions

Unnamed: 0,uid,time,impressions,dttm,ts
0,U87243,11/10/2019 11:30:54 AM,N94157,2019-11-10 11:30:54,1573385454
0,U87243,11/10/2019 11:30:54 AM,N78699,2019-11-10 11:30:54,1573385454
0,U87243,11/10/2019 11:30:54 AM,N71090,2019-11-10 11:30:54,1573385454
0,U87243,11/10/2019 11:30:54 AM,N31174,2019-11-10 11:30:54,1573385454
1,U598644,11/12/2019 1:45:29 PM,N25587,2019-11-12 13:45:29,1573566329
...,...,...,...,...,...
2232745,U151246,11/13/2019 12:42:51 PM,N24001,2019-11-13 12:42:51,1573648971
2232745,U151246,11/13/2019 12:42:51 PM,N127572,2019-11-13 12:42:51,1573648971
2232745,U151246,11/13/2019 12:42:51 PM,N10285,2019-11-13 12:42:51,1573648971
2232746,U330725,11/12/2019 1:22:57 PM,N88808,2019-11-12 13:22:57,1573564977


In [11]:
old_interactions['ts'] = 0

In [12]:
old_interactions.rename({'history': 'news_id:token', 'uid': 'user_id:token', 'ts': 'timestamp:float'}, axis=1)

Unnamed: 0,user_id:token,news_id:token,timestamp:float
0,U0,N39011,0
1,U0,N112324,0
2,U0,N78884,0
3,U0,N111503,0
4,U0,N63941,0
...,...,...,...
13350145,U99998,N53265,0
13350146,U99999,N85484,0
13350147,U99999,N34094,0
13350148,U99999,N17892,0


In [13]:
new_interactions[['impressions', 'uid', 'ts']].rename({'impressions': 'news_id:token', 'uid': 'user_id:token', 'ts': 'timestamp:float'}, axis=1)

Unnamed: 0,news_id:token,user_id:token,timestamp:float
0,N94157,U87243,1573385454
0,N78699,U87243,1573385454
0,N71090,U87243,1573385454
0,N31174,U87243,1573385454
1,N25587,U598644,1573566329
...,...,...,...
2232745,N24001,U151246,1573648971
2232745,N127572,U151246,1573648971
2232745,N10285,U151246,1573648971
2232746,N88808,U330725,1573564977


In [14]:
interactions_df =  pd.concat(
    [
        old_interactions.rename({'history': 'news_id:token', 'uid': 'user_id:token', 'ts': 'timestamp:float'}, axis=1),
        new_interactions[['impressions', 'uid', 'ts']].rename({'impressions': 'news_id:token', 'uid': 'user_id:token', 'ts': 'timestamp:float'}, axis=1)
    ],
    axis=0
)
interactions_df

Unnamed: 0,user_id:token,news_id:token,timestamp:float
0,U0,N39011,0
1,U0,N112324,0
2,U0,N78884,0
3,U0,N111503,0
4,U0,N63941,0
...,...,...,...
2232745,U151246,N24001,1573648971
2232745,U151246,N127572,1573648971
2232745,U151246,N10285,1573648971
2232746,U330725,N88808,1573564977


In [15]:
# !mkdir ./recbox_data
interactions_df.to_csv('./recbox_data/recbox_data.inter', index=False, sep='\t')

In [16]:
del interactions_df, train_df
gc.collect()

0

In [21]:
import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4Rec, FPMC
from recbole.model.general_recommender import BPR, FISM
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger

In [22]:
parameter_dict = {
    'data_path': './',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'news_id',
    'TIME_FIELD': 'timestamp',
    'load_col': {'inter': ['user_id', 'news_id', 'timestamp']},
    'log_wandb': True,
    'train_neg_sample_args': {
        "uniform" : 0.1
    },
    'epochs': 50,
    'eval_args': {
        'split': {'RS': [8, 1, 1]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'}
}


# config = Config(model='GRU4Rec', dataset='recbox_data', config_dict=parameter_dict)
config = Config(model='BPR', dataset='recbox_data', config_dict=parameter_dict)

# init random seed
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()
# Create handlers
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
logger.addHandler(c_handler)

# write config info into log
logger.info(config)

08 May 02:48    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = ./recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = True

Training Hyper Parameters:
epochs = 50
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [8, 1, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_

In [23]:
config


[1;35mGeneral Hyper Parameters:
[0m[1;36mgpu_id[0m =[1;33m 0[0m
[1;36muse_gpu[0m =[1;33m True[0m
[1;36mseed[0m =[1;33m 2020[0m
[1;36mstate[0m =[1;33m INFO[0m
[1;36mreproducibility[0m =[1;33m True[0m
[1;36mdata_path[0m =[1;33m ./recbox_data[0m
[1;36mcheckpoint_dir[0m =[1;33m saved[0m
[1;36mshow_progress[0m =[1;33m True[0m
[1;36msave_dataset[0m =[1;33m False[0m
[1;36mdataset_save_path[0m =[1;33m None[0m
[1;36msave_dataloaders[0m =[1;33m False[0m
[1;36mdataloaders_save_path[0m =[1;33m None[0m
[1;36mlog_wandb[0m =[1;33m True[0m

[1;35mTraining Hyper Parameters:
[0m[1;36mepochs[0m =[1;33m 50[0m
[1;36mtrain_batch_size[0m =[1;33m 2048[0m
[1;36mlearner[0m =[1;33m adam[0m
[1;36mlearning_rate[0m =[1;33m 0.001[0m
[1;36mtrain_neg_sample_args[0m =[1;33m {'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}[0m
[1;36meval_step[0m =[1;33m 1[0m
[1;36mstopping_step[

In [24]:
dataset = create_dataset(config)
logger.info(dataset)

08 May 02:49    INFO  recbox_data
The number of users: 711223
Average actions of users: 23.5282457516781
The number of items: 94443
Average actions of items: 177.1860612862921
The number of inters: 16733806
The sparsity of the dataset: 99.97508739380336%
Remain Fields: ['user_id', 'news_id', 'timestamp']
recbox_data
The number of users: 711223
Average actions of users: 23.5282457516781
The number of items: 94443
Average actions of items: 177.1860612862921
The number of inters: 16733806
The sparsity of the dataset: 99.97508739380336%
Remain Fields: ['user_id', 'news_id', 'timestamp']


In [26]:
# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

08 May 02:56    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
[Training]: train_batch_size = [2048] train_neg_sample_args: [{'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
08 May 02:56    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [8, 1, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [8, 1, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]


In [27]:
train_data._batch_size

2048

In [29]:
len(train_data), len(test_data)

(6635, 709551)

In [28]:
test_data._batch_size

94443

In [34]:
# model loading and initialization
# model = GRU4Rec(config, train_data.dataset).to(config['device'])
model = BPR(config, train_data.dataset).to(config['device'])
logger.info(model)

# trainer loading and initialization
trainer = Trainer(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(train_data, 
                                                  valid_data=valid_data,
                                                  show_progress=True)

08 May 01:32    INFO  BPR(
  (user_embedding): Embedding(711223, 64)
  (item_embedding): Embedding(94443, 64)
  (loss): BPRLoss()
)
Trainable parameters: 51562624
BPR(
  (user_embedding): Embedding(711223, 64)
  (item_embedding): Embedding(94443, 64)
  (loss): BPRLoss()
)
Trainable parameters: 51562624
BPR(
  (user_embedding): Embedding(711223, 64)
  (item_embedding): Embedding(94443, 64)
  (loss): BPRLoss()
)
Trainable parameters: 51562624
BPR(
  (user_embedding): Embedding(711223, 64)
  (item_embedding): Embedding(94443, 64)
  (loss): BPRLoss()
)
Trainable parameters: 51562624
BPR(
  (user_embedding): Embedding(711223, 64)
  (item_embedding): Embedding(94443, 64)
  (loss): BPRLoss()
)
Trainable parameters: 51562624
Train     0:   0%|                                                         | 0/6635 [00:00<?, ?it/s]