Some code is from a team member, which is based on https://www.kaggle.com/code/astrung/recbole-lstm-sequential-for-recomendation-tutorial

# Imports

In [1]:
!pip install recbole

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting recbole
  Downloading recbole-1.1.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting colorama==0.4.4
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting colorlog==4.7.2
  Downloading colorlog-4.7.2-py2.py3-none-any.whl (10 kB)
Collecting thop>=0.1.1.post2207130030
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Installing collected packages: colorlog, colorama, thop, recbole
Successfully installed colorama-0.4.4 colorlog-4.7.2 recbole-1.1.1 thop-0.1.1.post2209072238


In [2]:
from google.colab import drive

import zipfile

import pandas

import numpy

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


# Load Data

In [28]:
%cd 'drive/MyDrive/Colab Notebooks/351/Project'

[Errno 2] No such file or directory: 'drive/MyDrive/Colab Notebooks/351/Project'
/content/drive/MyDrive/Colab Notebooks/351/Project


In [29]:
df = pandas.read_csv('Tenrec/sbr_data_1M.csv', nrows = 1500000)
df

Unnamed: 0,user_id,item_id,click,follow,like,share,video_category,watching_times,gender,age
0,1,1,1,0,0,0,1,1,1,4
1,1,2,1,0,0,0,1,1,1,4
2,1,3,1,0,0,0,0,1,1,4
3,1,80936,1,0,0,0,1,1,1,4
4,1,781,1,0,0,0,1,1,1,4
...,...,...,...,...,...,...,...,...,...,...
1499995,32436,14085,1,0,0,0,1,2,1,3
1499996,32436,7034,1,0,0,0,0,1,1,3
1499997,32436,236774,1,0,0,0,1,1,1,3
1499998,32436,2982,1,0,0,0,0,4,1,3


In [30]:
# Remove last user because there could be more rows that were cut off due to RAM limit
df = df[df['user_id'] != 109973]

In [31]:
df

Unnamed: 0,user_id,item_id,click,follow,like,share,video_category,watching_times,gender,age
0,1,1,1,0,0,0,1,1,1,4
1,1,2,1,0,0,0,1,1,1,4
2,1,3,1,0,0,0,0,1,1,4
3,1,80936,1,0,0,0,1,1,1,4
4,1,781,1,0,0,0,1,1,1,4
...,...,...,...,...,...,...,...,...,...,...
1499995,32436,14085,1,0,0,0,1,2,1,3
1499996,32436,7034,1,0,0,0,0,1,1,3
1499997,32436,236774,1,0,0,0,1,1,1,3
1499998,32436,2982,1,0,0,0,0,4,1,3


# Data Exploration

Fix video_category column

In [32]:
df['video_category'].unique()

array(['1', '0', '\\N'], dtype=object)

In [33]:
df['video_category'] = numpy.where(df['video_category'] == '1', 1, df['video_category'])
df['video_category'] = numpy.where(df['video_category'] == '0', 0, df['video_category'])
df['video_category'] = numpy.where(df['video_category'] == '\\N', numpy.nan, df['video_category'])
df['video_category'] = df['video_category'].astype('float')

In [34]:
for c in df.columns:
  print(c)
  print('unique values:', numpy.unique(df[c]).shape[0])
  print()

user_id
unique values: 32436

item_id
unique values: 276483

click
unique values: 1

follow
unique values: 2

like
unique values: 2

share
unique values: 2

video_category
unique values: 3

watching_times
unique values: 184

gender
unique values: 3

age
unique values: 8



Note: The rows do not represent users, they represent behaviours/interactions with different items (videos).

# Pre-Processing

In [35]:
df['timestamp'] = df.index

mapping = {'user_id': 'user_id:token', 'item_id': 'item_id:token', 'timestamp': 'timestamp:float'}
df = df.rename(columns = mapping)

In [39]:
# Only keep users with 35 to 55 rows
for user in numpy.unique(df['user_id:token']):
  if len(df[df['user_id:token'] == user]) not in range(35, 56):
    df = df.drop(df[df['user_id:token'] == user].index)

In [40]:
# Split dataset according to age
df_ages = []
df_ages.append(df[df['age'] == 0])
df_ages.append(df[df['age'] == 1])
df_ages.append(df[df['age'] == 2])
df_ages.append(df[df['age'] == 3])
df_ages.append(df[df['age'] == 4])
df_ages.append(df[df['age'] == 5])
df_ages.append(df[df['age'] == 6])
df_ages.append(df[df['age'] == 7])

# Split dataset according to gender
df_genders = []
df_genders.append(df[df['gender'] == 0])
df_genders.append(df[df['gender'] == 1])
df_genders.append(df[df['gender'] == 2])

# Remove some features from subsets
for i in range(len(df_ages)):
  df_ages[i] = df_ages[i].drop(columns = ['click', 'follow', 'like', 'share', 'video_category', 'watching_times', 'age', 'gender'])
for i in range(len(df_genders)):
  df_genders[i] = df_genders[i].drop(columns = ['click', 'follow', 'like', 'share', 'video_category', 'watching_times', 'age', 'gender'])

df_all = df.drop(columns = ['click', 'follow', 'like', 'share', 'video_category', 'watching_times', 'age', 'gender'])

In [41]:
# Print sizes of subsets
for i in range(len(df_ages)):
  print('age = ' + str(i))
  print('\tnumber of rows: ' + str(df_ages[i].shape[0]))
  print('\tunique users:   ' + str(numpy.unique(df_ages[i]['user_id:token']).shape[0]))
  print('\tunique items:   ' + str(numpy.unique(df_ages[i]['item_id:token']).shape[0]))

print()

for i in range(len(df_genders)):
  print('gender = ' + str(i))
  print('\tnumber of rows: ' + str(df_genders[i].shape[0]))
  print('\tunique users:   ' + str(numpy.unique(df_genders[i]['user_id:token']).shape[0]))
  print('\tunique items:   ' + str(numpy.unique(df_genders[i]['item_id:token']).shape[0]))

age = 0
	number of rows: 56597
	unique users:   1284
	unique items:   31742
age = 1
	number of rows: 3332
	unique users:   75
	unique items:   2842
age = 2
	number of rows: 72596
	unique users:   1656
	unique items:   33247
age = 3
	number of rows: 99289
	unique users:   2260
	unique items:   43509
age = 4
	number of rows: 63813
	unique users:   1459
	unique items:   33726
age = 5
	number of rows: 9825
	unique users:   224
	unique items:   7819
age = 6
	number of rows: 2077
	unique users:   47
	unique items:   1871
age = 7
	number of rows: 88
	unique users:   2
	unique items:   88

gender = 0
	number of rows: 57045
	unique users:   1294
	unique items:   31892
gender = 1
	number of rows: 184771
	unique users:   4204
	unique items:   65649
gender = 2
	number of rows: 65801
	unique users:   1509
	unique items:   33931


# Modeling

In [15]:
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4Rec, BERT4Rec
from recbole.trainer import Trainer
from recbole.utils import init_seed

from recbole.utils.case_study import full_sort_topk

## GRU4Rec

In [69]:
MAX_ITEM = 20

parameter_dict = {
    'data_path': '/content/drive/MyDrive/Colab Notebooks/351/Project',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    'user_inter_num_interval': '[0,Inf)',
    'item_inter_num_interval': '[0,Inf)',
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp'], 'item': ['item_id', 'item_emb']},
    'train_neg_sample_args': None,
    'epochs': 10,
    'stopping_step': 3,
    'eval_batch_size': 1024,
    'train_batch_size': 1024,
    'MAX_ITEM_LIST_LENGTH': MAX_ITEM,
    'eval_args': {
        'split': {'RS': [9, 1, 0]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'},
    'shuffle': False
}

config = Config(model = 'GRU4Rec', dataset = 'data', config_dict = parameter_dict)

init_seed(config['seed'], config['reproducibility'])

In [70]:
data = df_all
data.to_csv('data/data.inter', index = False, sep = '\t')

dataset = create_dataset(config)

train_data, valid_data, test_data = data_preparation(config, dataset)

model = GRU4Rec(config, train_data.dataset).to(config['device'])
trainer = Trainer(config, model)
best_valid_score, best_valid_result = trainer.fit(train_data, valid_data)

print(best_valid_score)
print(best_valid_result)

KeyboardInterrupt: ignored

In [71]:
for i in range(len(df_ages)):
  data = df_ages[i]
  data.to_csv('data/data.inter', index = False, sep = '\t')

  dataset = create_dataset(config)

  train_data, valid_data, test_data = data_preparation(config, dataset)

  model = GRU4Rec(config, train_data.dataset).to(config['device'])
  trainer = Trainer(config, model)
  best_valid_score, best_valid_result = trainer.fit(train_data, valid_data)

  print(i)
  print(best_valid_score)
  print(best_valid_result)
  print()

0
0.0039
OrderedDict([('recall@10', 0.0103), ('mrr@10', 0.0039), ('ndcg@10', 0.0054), ('hit@10', 0.0103), ('precision@10', 0.001)])

1
0.0065
OrderedDict([('recall@10', 0.0172), ('mrr@10', 0.0065), ('ndcg@10', 0.0091), ('hit@10', 0.0172), ('precision@10', 0.0017)])

2
0.0049
OrderedDict([('recall@10', 0.0141), ('mrr@10', 0.0049), ('ndcg@10', 0.007), ('hit@10', 0.0141), ('precision@10', 0.0014)])

3
0.004
OrderedDict([('recall@10', 0.0118), ('mrr@10', 0.004), ('ndcg@10', 0.0058), ('hit@10', 0.0118), ('precision@10', 0.0012)])

4
0.0044
OrderedDict([('recall@10', 0.0126), ('mrr@10', 0.0044), ('ndcg@10', 0.0063), ('hit@10', 0.0126), ('precision@10', 0.0013)])

5
0.0037
OrderedDict([('recall@10', 0.007), ('mrr@10', 0.0037), ('ndcg@10', 0.0045), ('hit@10', 0.007), ('precision@10', 0.0007)])

6
0.0
OrderedDict([('recall@10', 0.0), ('mrr@10', 0.0), ('ndcg@10', 0.0), ('hit@10', 0.0), ('precision@10', 0.0)])

7
0.0
OrderedDict([('recall@10', 0.0), ('mrr@10', 0.0), ('ndcg@10', 0.0), ('hit@10', 0

## BERT4Rec

In [72]:
MAX_ITEM = 20

parameter_dict = {
    'data_path': '/content/drive/MyDrive/Colab Notebooks/351/Project',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    'user_inter_num_interval': '[1,Inf)',
    'item_inter_num_interval': '[1,Inf)',
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp'], 'item': ['item_id', 'item_emb']},
    'train_neg_sample_args': None,
    'epochs': 10,
    'stopping_step': 3,
    'eval_batch_size': 1024,
    'train_batch_size': 1024,
    'MAX_ITEM_LIST_LENGTH': MAX_ITEM,
    'eval_args': {
        'split': {'RS': [9, 1, 0]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'},
    'shuffle': False
}

config = Config(model = 'BERT4Rec', dataset = 'data', config_dict = parameter_dict)

init_seed(config['seed'], config['reproducibility'])

In [None]:
data = df_all
data.to_csv('data/data.inter', index = False, sep = '\t')

dataset = create_dataset(config)

train_data, valid_data, test_data = data_preparation(config, dataset)

model = BERT4Rec(config, train_data.dataset).to(config['device'])
trainer = Trainer(config, model)
best_valid_score, best_valid_result = trainer.fit(train_data, valid_data)

print(best_valid_score)
print(best_valid_result)

In [None]:
for i in range(len(df_ages)):
  data = df_ages[i]
  data.to_csv('data/data.inter', index = False, sep = '\t')

  dataset = create_dataset(config)

  train_data, valid_data, test_data = data_preparation(config, dataset)

  model = BERT4Rec(config, train_data.dataset).to(config['device'])
  trainer = Trainer(config, model)
  best_valid_score, best_valid_result = trainer.fit(train_data, valid_data)

  print(i)
  print(best_valid_score)
  print(best_valid_result)
  print()

0
0.0039
OrderedDict([('recall@10', 0.0101), ('mrr@10', 0.0039), ('ndcg@10', 0.0053), ('hit@10', 0.0101), ('precision@10', 0.001)])

1
0.0025
OrderedDict([('recall@10', 0.0172), ('mrr@10', 0.0025), ('ndcg@10', 0.0058), ('hit@10', 0.0172), ('precision@10', 0.0017)])

