In [1]:
import argparse
import numpy as np
import torch
from torch.autograd import Variable
from utils.GraphMaker import GraphMaker
from model.trainer import CrossTrainer
import os
import json
import sys
import pickle
import pdb
import time
import copy

True
0
NVIDIA GeForce GTX 1650


In [2]:
sys.path.insert(1, 'utils')
from utils.data import *

In [3]:
def create_arg_parser():
    """Create argument parser for our baseline. """
    parser = argparse.ArgumentParser('WSDM')

    # DATA  Arguments
    parser.add_argument('--domains', type=str, default='m1_m2_m3_m4_m5', help='specify none ("none") or a few source markets ("-" seperated) to augment the data for training')
    parser.add_argument('--task', type=str, default='multi-item-intra', help='dual-user-intra, dual-user-inter, multi-item-intra, multi-user-intra')

    # MODEL Arguments
    parser.add_argument('--model', type=str, default='UniCDR', help='right model name')
    parser.add_argument('--mask_rate', type=float, default=0.1, help='mask rate of interactions')
    parser.add_argument('--num_epoch', type=int, default=100, help='number of epoches')
    parser.add_argument('--aggregator', type=str, default='item_similarity', help='switching the user-item aggregation')
    parser.add_argument('--batch_size', type=int, default=1024, help='batch size')
    parser.add_argument('--optim', choices=['sgd', 'adagrad', 'adam', 'adamax'], default='adam',
                        help='Optimizer: sgd, adagrad, adam or adamax.')
    parser.add_argument('--lr', type=float, default=0.005, help='learning rate')
    parser.add_argument('--l2_reg', type=float, default=1e-6, help='the L2 weight')
    parser.add_argument('--lr_decay', type=float, default=0.98, help='decay learning rate')
    parser.add_argument('--weight_decay', type=float, default=1e-5, help='decay learning rate')
    parser.add_argument('--latent_dim', type=int, default=128, help='latent dimensions')
    parser.add_argument('--num_negative', type=int, default=10, help='num of negative samples during training')
    parser.add_argument('--maxlen', type=int, default=10, help='num of item sequence')
    parser.add_argument('--dropout', type=float, default=0.3, help='random drop out rate')
    parser.add_argument('--save',default='save', action='store_true', help='save model?')
    parser.add_argument('--lambda', type=float, default=50, help='the parameter of EASE')
    parser.add_argument('--lambda_a', type=float, default=0.5, help='for our aggregators')
    parser.add_argument('--lambda_loss', type=float, default=0.4, help='the parameter of loss function')
    parser.add_argument('--static_sample', default='static_sample',action='store_true', help='accelerate the dataloader')

    # others
    parser.add_argument('--cuda',default='cuda', action='store_true', help='use of cuda')
    parser.add_argument('--seed', type=int, default=42, help='manual seed init')
    parser.add_argument('--decay_epoch', type=int, default=10, help='Decay learning rate after this epoch.')

    return parser

In [4]:
parser = create_arg_parser()
opt, _ = parser.parse_known_args()  # Ignore unknown arguments
opt = vars(opt)

opt["device"] = torch.device('cuda' if torch.cuda.is_available() and opt["cuda"] else 'cpu')



In [5]:
def print_config(config):
        info = "Running with the following configs:\n"
        for k, v in config.items():
            info += "\t{} : {}\n".format(k, str(v))
        print("\n" + info + "\n")

if opt["task"] == "multi-user-intra":
        opt["maxlen"] = 50

print_config(opt)


Running with the following configs:
	domains : m1_m2_m3_m4_m5
	task : multi-item-intra
	model : UniCDR
	mask_rate : 0.1
	num_epoch : 100
	aggregator : item_similarity
	batch_size : 1024
	optim : adam
	lr : 0.005
	l2_reg : 1e-06
	lr_decay : 0.98
	weight_decay : 1e-05
	latent_dim : 128
	num_negative : 10
	maxlen : 10
	dropout : 0.3
	save : save
	lambda : 50
	lambda_a : 0.5
	lambda_loss : 0.4
	static_sample : static_sample
	cuda : cuda
	seed : 42
	decay_epoch : 10
	device : cuda




In [6]:
print(f'Running experiment on device: {opt["device"]}')


Running experiment on device: cuda


In [7]:
def seed_everything(seed=1111):
        random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

seed_everything(opt["seed"])

In [8]:
opt["domains"]

'm1_m2_m3_m4_m5'

In [9]:
if "dual" in opt["task"]:
        filename = opt["domains"].split("_")
        opt["domains"] = []
        opt["domains"].append(filename[0] + "_" + filename[1])
        opt["domains"].append(filename[1] + "_" + filename[0])


else:
        opt["domains"] = opt["domains"].split('_')

print("Loading domains:", opt["domains"])

Loading domains: ['m1', 'm2', 'm3', 'm4', 'm5']


In [10]:
domain_list = opt["domains"]
opt["user_max"] = []
opt["item_max"] = []
task_gen_all = {}
domain_id = {}
all_domain_list = []
all_domain_set = []
all_inter = 0

In [11]:
for idx, cur_domain in enumerate(domain_list):
        cur_src_data_dir = os.path.join("datasets/"+str(opt["task"]) + "/dataset/", cur_domain + "/train.txt")
        print(f'Loading {cur_domain}: {cur_src_data_dir}')
        
        all_domain_list.append({})
        all_domain_set.append({})
        max_user = 0
        max_item = 0
        print(cur_src_data_dir)
        print(cur_domain)
        print(opt["domains"])
        with codecs.open(cur_src_data_dir, "r", encoding="utf-8") as infile:
            for line in infile:
                all_inter+=1
                line = line.strip().split("\t")
                user = int(line[0])
                item = int(line[1]) + 1
                max_user = max(max_user, user)
                max_item = max(max_item, item)
                if user not in all_domain_list[idx].keys():
                    all_domain_list[idx][user] = []
                    all_domain_set[idx][user] = set()
                if item not in all_domain_set[idx][user]:
                    all_domain_list[idx][user].append(item)
                    all_domain_set[idx][user].add(item)

        opt["user_max"].append(max_user + 1)
        opt["item_max"].append(max_item + 1)

total_graphs = GraphMaker(opt, all_domain_list)

Loading m1: datasets/multi-item-intra/dataset/m1/train.txt
datasets/multi-item-intra/dataset/m1/train.txt
m1
['m1', 'm2', 'm3', 'm4', 'm5']


Loading m2: datasets/multi-item-intra/dataset/m2/train.txt
datasets/multi-item-intra/dataset/m2/train.txt
m2
['m1', 'm2', 'm3', 'm4', 'm5']
Loading m3: datasets/multi-item-intra/dataset/m3/train.txt
datasets/multi-item-intra/dataset/m3/train.txt
m3
['m1', 'm2', 'm3', 'm4', 'm5']
Loading m4: datasets/multi-item-intra/dataset/m4/train.txt
datasets/multi-item-intra/dataset/m4/train.txt
m4
['m1', 'm2', 'm3', 'm4', 'm5']
Loading m5: datasets/multi-item-intra/dataset/m5/train.txt
datasets/multi-item-intra/dataset/m5/train.txt
m5
['m1', 'm2', 'm3', 'm4', 'm5']
begin graphmaker................
The alignment id 0 0
7109 2199
Start the EASE
load
15625582
49169


  r_inv = np.power(rowsum, -1).flatten()
  return torch.sparse.FloatTensor(indices, values, shape)


EASE End
The alignment id 7109 0
2697 2693
Start the EASE
load
3592711
17953
EASE End
The alignment id 9806 0
3328 2979
Start the EASE
load
4143360
23363
EASE End
The alignment id 13134 0
5482 4931
Start the EASE
load
15990994
31907
EASE End
The alignment id 18616 0
6466 12016
Start the EASE
load
63121092
77136
EASE End
graphmaker done.........


In [12]:
all_domain_list = []
all_domain_set = []
all_inter = 0

In [13]:
for idx, cur_domain in enumerate(domain_list):
        cur_src_data_dir = os.path.join("datasets/" + str(opt["task"]) + "/dataset/", cur_domain + "/train.txt")
        print(f'Loading {cur_domain}: {cur_src_data_dir}')

        if opt["aggregator"] == "item_similarity":
            ease_dense = total_graphs.ease[idx].to_dense()

        all_domain_list.append({})
        all_domain_set.append({})
        
        with codecs.open(cur_src_data_dir, "r", encoding="utf-8") as infile:
            for line in infile:
                all_inter += 1
                line = line.strip().split("\t")
                user = int(line[0])
                item = int(line[1]) + 1
                if user not in all_domain_list[idx].keys():
                    all_domain_list[idx][user] = []
                    all_domain_set[idx][user] = set()
                if item not in all_domain_set[idx][user]:
                    if opt["aggregator"] == "item_similarity":
                        all_domain_list[idx][user].append([item, ease_dense[user][item]])
                    else:
                        all_domain_list[idx][user].append([item, 1])
                    all_domain_set[idx][user].add(item)

        print(f'Loading {cur_domain}: {cur_src_data_dir}')
        cur_src_task_generator = TaskGenerator(cur_src_data_dir, opt, all_domain_list, all_domain_set, idx,
                                               total_graphs)
        task_gen_all[idx] = cur_src_task_generator
        domain_id[cur_domain] = idx



Loading m1: datasets/multi-item-intra/dataset/m1/train.txt
Loading m1: datasets/multi-item-intra/dataset/m1/train.txt
the min/max user/item number of  datasets/multi-item-intra/dataset/m1/train.txt
user: 0 7108
item: 1 2198
Loading m2: datasets/multi-item-intra/dataset/m2/train.txt
Loading m2: datasets/multi-item-intra/dataset/m2/train.txt
the min/max user/item number of  datasets/multi-item-intra/dataset/m2/train.txt
user: 0 2696
item: 1 2692
Loading m3: datasets/multi-item-intra/dataset/m3/train.txt
Loading m3: datasets/multi-item-intra/dataset/m3/train.txt
the min/max user/item number of  datasets/multi-item-intra/dataset/m3/train.txt
user: 0 3327
item: 1 2978
Loading m4: datasets/multi-item-intra/dataset/m4/train.txt
Loading m4: datasets/multi-item-intra/dataset/m4/train.txt
the min/max user/item number of  datasets/multi-item-intra/dataset/m4/train.txt
user: 0 5481
item: 2 4930
Loading m5: datasets/multi-item-intra/dataset/m5/train.txt
Loading m5: datasets/multi-item-intra/dataset

In [14]:
train_domains = MetaDomain_Dataset(task_gen_all, num_negatives=opt["num_negative"], meta_split='train')
train_dataloader = MetaDomain_DataLoader(train_domains, sample_batch_size=opt["batch_size"] // len(domain_list), shuffle=True)
opt["num_domains"] = train_dataloader.num_domains
opt["domain_id"] = domain_id



In [15]:
############
## Validation and Test
############
if "inter" in opt["task"]:
    opt["shared_user"] = 1e9
valid_dataloader = {}
test_dataloader = {}
for cur_domain in domain_list:
        if opt["task"] == "dual-user-intra":
            domain_valid = os.path.join("datasets/" + str(opt["task"]) + "/dataset/", cur_domain + "/test.txt")
        else:
            domain_valid = os.path.join("datasets/" + str(opt["task"]) + "/dataset/", cur_domain + "/valid.txt")
        domain_test = os.path.join("datasets/"+str(opt["task"]) + "/dataset/", cur_domain + "/test.txt")
        valid_dataloader[cur_domain] = task_gen_all[domain_id[cur_domain]].instance_a_valid_dataloader(
            domain_valid, 100)
        test_dataloader[cur_domain] = task_gen_all[domain_id[cur_domain]].instance_a_valid_dataloader(
            domain_test, 100)

print("the user number of different domains", opt["user_max"])
print("the item number of different domains", opt["item_max"])

the evaluation data:  datasets/multi-item-intra/dataset/m1/valid.txt
the evaluation data:  datasets/multi-item-intra/dataset/m1/test.txt
the evaluation data:  datasets/multi-item-intra/dataset/m2/valid.txt
the evaluation data:  datasets/multi-item-intra/dataset/m2/test.txt
the evaluation data:  datasets/multi-item-intra/dataset/m3/valid.txt
the evaluation data:  datasets/multi-item-intra/dataset/m3/test.txt
the evaluation data:  datasets/multi-item-intra/dataset/m4/valid.txt
the evaluation data:  datasets/multi-item-intra/dataset/m4/test.txt
the evaluation data:  datasets/multi-item-intra/dataset/m5/valid.txt
the evaluation data:  datasets/multi-item-intra/dataset/m5/test.txt
the user number of different domains [7109, 2697, 3328, 5482, 6466]
the item number of different domains [2199, 2693, 2979, 4931, 12016]


# Inference Code

In [16]:
import torch
from utils.GraphMaker import GraphMaker
from model.trainer import CrossTrainer
import os
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define paths to saved weights and dataset
model_weights_path = 'Trained_Model\model_m1m2m3m4m5_epoch_120.pt'


# Initialize model with the same architecture as training
model = CrossTrainer(opt)  # `opt` should be the same dictionary used during training



# Load the model using the custom `load()` method of CrossTrainer
model.load(model_weights_path)

# Set model to evaluation mode
model.model.eval()

# Load and preprocess the dataset using GraphMaker
graph_maker = total_graphs
UV_adj, VU_adj, ease_pred = graph_maker.UV, graph_maker.VU, graph_maker.ease

  model_weights_path = 'Trained_Model\model_m1m2m3m4m5_epoch_120.pt'
  checkpoint = torch.load(filename)


#### Inference on particular example

In [17]:
# Define user index and item indices for inference
user_idx = torch.tensor([1], dtype=torch.long).to(device)  # Move user index to the same device
item_indices = torch.tensor([1, 2,4, 6], dtype=torch.long).to(device)  # Move item indices to the same device
print(f"Length of UV_adj: {len(UV_adj)}")
print(f"Length of VU_adj: {len(VU_adj)}")
print(f"Length of ease_pred: {len(ease_pred)}")
print(f"User index: {user_idx}")

Length of UV_adj: 5
Length of VU_adj: 5
Length of ease_pred: 5
User index: tensor([1], device='cuda:0')


In [18]:
opt['domain_id']

{'m1': 0, 'm2': 1, 'm3': 2, 'm4': 3, 'm5': 4}

In [19]:



# Ensure item embeddings are initialized (fix for con_item_emb_list)
model.model.item_embedding_select()

# Specify a domain to run inference on, here it's 'game_video' and 'video_game'
domain_id = opt['domain_id']['m1']  # Change based on domain

# Define user index and item indices for inference
user_idx = torch.tensor([0], dtype=torch.long).to(device)  # Move user index to the same device
item_indices = torch.tensor([1,2, 4, 6], dtype=torch.long).to(device)  # Move item indices to the same device

# Ensure user_idx is valid
if user_idx >= len(UV_adj) or user_idx >= len(VU_adj) or user_idx >= len(ease_pred):
    raise ValueError(f"user_idx {user_idx} is out of bounds for the dataset.")
# Define dummy context and global scores, and move them to the correct device
context_item = torch.zeros((1, 10), dtype=torch.long).to(device)  # Adjust based on your model
context_score = torch.zeros((1, 10), dtype=torch.float).to(device)  # Adjust based on your model
global_item = torch.zeros((1, len(item_indices), len(item_indices)), dtype=torch.long).to(device)  # Adjust shape
global_score = torch.zeros((1, len(item_indices)), dtype=torch.float).to(device)  # Adjust shape

# Get user embeddings using the model
user_embedding = model.model.forward_user(domain_id, user_idx, context_item, context_score, global_item, global_score)

# Get item embeddings using the model
item_embeddings = model.model.forward_item(domain_id, item_indices)

# Use the predict_dot method to generate the predictions (dot product of user and item embeddings)
scores = model.model.predict_dot(user_embedding, item_embeddings)

# Sort the scores to get top recommended items
top_items = torch.argsort(scores, descending=True)

# Display or return top recommended items
print("Top recommended items for user:", top_items)

Top recommended items for user: tensor([2, 0, 1, 3], device='cuda:0')


#### Inference on  whole testing data

In [20]:
opt['domain_id']

{'m1': 0, 'm2': 1, 'm3': 2, 'm4': 3, 'm5': 4}

In [21]:
test_dataloader

{'m1': <torch.utils.data.dataloader.DataLoader at 0x29471a0abd0>,
 'm2': <torch.utils.data.dataloader.DataLoader at 0x294425bf050>,
 'm3': <torch.utils.data.dataloader.DataLoader at 0x29430c66ff0>,
 'm4': <torch.utils.data.dataloader.DataLoader at 0x2947d214470>,
 'm5': <torch.utils.data.dataloader.DataLoader at 0x29465b14b30>}

In [22]:
metrics_00=model.predict(opt['domain_id']['m1'],test_dataloader['m1'])
print(metrics_00)

+++++++++++++++++++++++++++++++++++
{'NDCG_10': 0.6085969339044249, 'HT_10': 0.6877459246767847}


In [23]:
metrics_11=model.predict(opt['domain_id']['m2'],test_dataloader['m2'])
print(metrics_11)

+++++++++++++
{'NDCG_10': 0.4789027750796816, 'HT_10': 0.5770992366412214}


In [24]:
metrics_22=model.predict(opt['domain_id']['m3'],test_dataloader['m3'])
print(metrics_22) 

++++++++++++++++
{'NDCG_10': 0.5262964111285614, 'HT_10': 0.6388557806912991}


In [25]:
metrics_33=model.predict(opt['domain_id']['m4'],test_dataloader['m4'])
print(metrics_33 ) 

+++++++++++++++++++++++++++
{'NDCG_10': 0.41693872967824097, 'HT_10': 0.46461312797946464}


In [26]:
metrics_44=model.predict(opt['domain_id']['m5'],test_dataloader['m5'])
print(metrics_44 )

+++++++++++++++++++++++++++++++
{'NDCG_10': 0.14830805677220618, 'HT_10': 0.18262523779327838}


| **Dataset** | **Metric** | **Paper Results** | **Our Results** |
|-------------|------------|-------------------|-------------------------------------|
| **M1**      | NDCG@10   | 55.83             | 60.86                               |
|             | HT@10     | 69.08             | 68.77                               |
| **M2**      | NDCG@10   | 40.04             | 47.89                               |
|             | HT@10     | 58.01             | 57.71                               |
| **M3**      | NDCG@10   | 43.35             | 52.63                               |
|             | HT@10     | 64.60             | 63.89                               |
| **M4**      | NDCG@10   | 42.54             | 41.69                               |
|             | HT@10     | 47.52             | 46.46                               |
| **M5**      | NDCG@10   | 17.04             | 14.83                               |
|             | HT@10     | 19.78             | 18.26                               |

**Table**: Comparison of multi-item intra recommendation results for M1, M2, M3, M4, and M5 datasets.
