In [1]:
import argparse
import numpy as np
import torch
from torch.autograd import Variable
from utils.GraphMaker import GraphMaker
from model.trainer import CrossTrainer
import os
import json
import sys
import pickle
import pdb
import time
import copy

True
0
NVIDIA GeForce GTX 1650


In [2]:
sys.path.insert(1, 'utils')
from utils.data import *

In [18]:
def create_arg_parser():
    """Create argument parser for our baseline. """
    parser = argparse.ArgumentParser('WSDM')

    # DATA  Arguments
    parser.add_argument('--domains', type=str, default='d1_d2_d3', help='specify none ("none") or a few source markets ("-" seperated) to augment the data for training')
    parser.add_argument('--task', type=str, default='multi-user-intra', help='dual-user-intra, dual-user-inter, multi-item-intra, multi-user-intra')

    # MODEL Arguments
    parser.add_argument('--model', type=str, default='UniCDR', help='right model name')
    parser.add_argument('--mask_rate', type=float, default=0.1, help='mask rate of interactions')
    parser.add_argument('--num_epoch', type=int, default=100, help='number of epoches')
    parser.add_argument('--aggregator', type=str, default='mean', help='switching the user-item aggregation')
    parser.add_argument('--batch_size', type=int, default=1024, help='batch size')
    parser.add_argument('--optim', choices=['sgd', 'adagrad', 'adam', 'adamax'], default='adam',
                        help='Optimizer: sgd, adagrad, adam or adamax.')
    parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
    parser.add_argument('--l2_reg', type=float, default=1e-7, help='the L2 weight')
    parser.add_argument('--lr_decay', type=float, default=0.98, help='decay learning rate')
    parser.add_argument('--weight_decay', type=float, default=1e-5, help='decay learning rate')
    parser.add_argument('--latent_dim', type=int, default=128, help='latent dimensions')
    parser.add_argument('--num_negative', type=int, default=10, help='num of negative samples during training')
    parser.add_argument('--maxlen', type=int, default=10, help='num of item sequence')
    parser.add_argument('--dropout', type=float, default=0.3, help='random drop out rate')
    parser.add_argument('--save',default='save', action='store_true', help='save model?')
    parser.add_argument('--lambda', type=float, default=50, help='the parameter of EASE')
    parser.add_argument('--lambda_a', type=float, default=0.5, help='for our aggregators')
    parser.add_argument('--lambda_loss', type=float, default=0.4, help='the parameter of loss function')
    parser.add_argument('--static_sample', default='static_sample',action='store_true', help='accelerate the dataloader')

    # others
    parser.add_argument('--cuda',default='cuda', action='store_true', help='use of cuda')
    parser.add_argument('--seed', type=int, default=42, help='manual seed init')
    parser.add_argument('--decay_epoch', type=int, default=10, help='Decay learning rate after this epoch.')

    return parser

In [19]:
parser = create_arg_parser()
opt, _ = parser.parse_known_args()  # Ignore unknown arguments
opt = vars(opt)

opt["device"] = torch.device('cuda' if torch.cuda.is_available() and opt["cuda"] else 'cpu')



In [20]:
def print_config(config):
        info = "Running with the following configs:\n"
        for k, v in config.items():
            info += "\t{} : {}\n".format(k, str(v))
        print("\n" + info + "\n")

if opt["task"] == "multi-user-intra":
        opt["maxlen"] = 50

print_config(opt)


Running with the following configs:
	domains : d1_d2_d3
	task : multi-user-intra
	model : UniCDR
	mask_rate : 0.1
	num_epoch : 100
	aggregator : mean
	batch_size : 1024
	optim : adam
	lr : 0.001
	l2_reg : 1e-07
	lr_decay : 0.98
	weight_decay : 1e-05
	latent_dim : 128
	num_negative : 10
	maxlen : 50
	dropout : 0.3
	save : save
	lambda : 50
	lambda_a : 0.5
	lambda_loss : 0.4
	static_sample : static_sample
	cuda : cuda
	seed : 42
	decay_epoch : 10
	device : cuda




In [21]:
print(f'Running experiment on device: {opt["device"]}')


Running experiment on device: cuda


In [22]:
def seed_everything(seed=1111):
        random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

seed_everything(opt["seed"])

In [23]:
opt["domains"]

'd1_d2_d3'

In [24]:
if "dual" in opt["task"]:
        filename = opt["domains"].split("_")
        opt["domains"] = []
        opt["domains"].append(filename[0] + "_" + filename[1])
        opt["domains"].append(filename[1] + "_" + filename[0])


else:
        opt["domains"] = opt["domains"].split('_')

print("Loading domains:", opt["domains"])

Loading domains: ['d1', 'd2', 'd3']


In [25]:
domain_list = opt["domains"]
opt["user_max"] = []
opt["item_max"] = []
task_gen_all = {}
domain_id = {}
all_domain_list = []
all_domain_set = []
all_inter = 0

In [26]:
for idx, cur_domain in enumerate(domain_list):
        cur_src_data_dir = os.path.join("datasets/"+str(opt["task"]) + "/dataset/", cur_domain + "/train.txt")
        print(f'Loading {cur_domain}: {cur_src_data_dir}')
        
        all_domain_list.append({})
        all_domain_set.append({})
        max_user = 0
        max_item = 0
        print(cur_src_data_dir)
        print(cur_domain)
        print(opt["domains"])
        with codecs.open(cur_src_data_dir, "r", encoding="utf-8") as infile:
            for line in infile:
                all_inter+=1
                line = line.strip().split("\t")
                user = int(line[0])
                item = int(line[1]) + 1
                max_user = max(max_user, user)
                max_item = max(max_item, item)
                if user not in all_domain_list[idx].keys():
                    all_domain_list[idx][user] = []
                    all_domain_set[idx][user] = set()
                if item not in all_domain_set[idx][user]:
                    all_domain_list[idx][user].append(item)
                    all_domain_set[idx][user].add(item)

        opt["user_max"].append(max_user + 1)
        opt["item_max"].append(max_item + 1)

total_graphs = GraphMaker(opt, all_domain_list)

Loading d1: datasets/multi-user-intra/dataset/d1/train.txt
datasets/multi-user-intra/dataset/d1/train.txt
d1
['d1', 'd2', 'd3']
Loading d2: datasets/multi-user-intra/dataset/d2/train.txt
datasets/multi-user-intra/dataset/d2/train.txt
d2
['d1', 'd2', 'd3']
Loading d3: datasets/multi-user-intra/dataset/d3/train.txt
datasets/multi-user-intra/dataset/d3/train.txt
d3
['d1', 'd2', 'd3']
begin graphmaker................
The alignment id 0 0
231444 2097


  r_inv = np.power(rowsum, -1).flatten()


The alignment id 0 2096
647482 596
The alignment id 0 2691
1104098 1313
graphmaker done.........


In [27]:
all_domain_list = []
all_domain_set = []
all_inter = 0

In [28]:
for idx, cur_domain in enumerate(domain_list):
        cur_src_data_dir = os.path.join("datasets/" + str(opt["task"]) + "/dataset/", cur_domain + "/train.txt")
        print(f'Loading {cur_domain}: {cur_src_data_dir}')

        if opt["aggregator"] == "item_similarity":
            ease_dense = total_graphs.ease[idx].to_dense()

        all_domain_list.append({})
        all_domain_set.append({})
        
        with codecs.open(cur_src_data_dir, "r", encoding="utf-8") as infile:
            for line in infile:
                all_inter += 1
                line = line.strip().split("\t")
                user = int(line[0])
                item = int(line[1]) + 1
                if user not in all_domain_list[idx].keys():
                    all_domain_list[idx][user] = []
                    all_domain_set[idx][user] = set()
                if item not in all_domain_set[idx][user]:
                    if opt["aggregator"] == "item_similarity":
                        all_domain_list[idx][user].append([item, ease_dense[user][item]])
                    else:
                        all_domain_list[idx][user].append([item, 1])
                    all_domain_set[idx][user].add(item)

        print(f'Loading {cur_domain}: {cur_src_data_dir}')
        cur_src_task_generator = TaskGenerator(cur_src_data_dir, opt, all_domain_list, all_domain_set, idx,
                                               total_graphs)
        task_gen_all[idx] = cur_src_task_generator
        domain_id[cur_domain] = idx



Loading d1: datasets/multi-user-intra/dataset/d1/train.txt
Loading d1: datasets/multi-user-intra/dataset/d1/train.txt
the min/max user/item number of  datasets/multi-user-intra/dataset/d1/train.txt
user: 0 231443
item: 1 2096
Loading d2: datasets/multi-user-intra/dataset/d2/train.txt
Loading d2: datasets/multi-user-intra/dataset/d2/train.txt
the min/max user/item number of  datasets/multi-user-intra/dataset/d2/train.txt
user: 4 647481
item: 1 595
Loading d3: datasets/multi-user-intra/dataset/d3/train.txt
Loading d3: datasets/multi-user-intra/dataset/d3/train.txt
the min/max user/item number of  datasets/multi-user-intra/dataset/d3/train.txt
user: 4 1104097
item: 1 1312


In [29]:
train_domains = MetaDomain_Dataset(task_gen_all, num_negatives=opt["num_negative"], meta_split='train')
train_dataloader = MetaDomain_DataLoader(train_domains, sample_batch_size=opt["batch_size"] // len(domain_list), shuffle=True)
opt["num_domains"] = train_dataloader.num_domains
opt["domain_id"] = domain_id



In [30]:
############
## Validation and Test
############
if "inter" in opt["task"]:
    opt["shared_user"] = 1e9
valid_dataloader = {}
test_dataloader = {}
for cur_domain in domain_list:
        if opt["task"] == "dual-user-intra":
            domain_valid = os.path.join("datasets/" + str(opt["task"]) + "/dataset/", cur_domain + "/test.txt")
        else:
            domain_valid = os.path.join("datasets/" + str(opt["task"]) + "/dataset/", cur_domain + "/valid.txt")
        domain_test = os.path.join("datasets/"+str(opt["task"]) + "/dataset/", cur_domain + "/test.txt")
        valid_dataloader[cur_domain] = task_gen_all[domain_id[cur_domain]].instance_a_valid_dataloader(
            domain_valid, 100)
        test_dataloader[cur_domain] = task_gen_all[domain_id[cur_domain]].instance_a_valid_dataloader(
            domain_test, 100)

print("the user number of different domains", opt["user_max"])
print("the item number of different domains", opt["item_max"])

the evaluation data:  datasets/multi-user-intra/dataset/d1/valid.txt
datasets/multi-user-intra/dataset/d1/valid.txt valid user:  10548
the evaluation data:  datasets/multi-user-intra/dataset/d1/test.txt
datasets/multi-user-intra/dataset/d1/test.txt valid user:  10549
the evaluation data:  datasets/multi-user-intra/dataset/d2/valid.txt
datasets/multi-user-intra/dataset/d2/valid.txt valid user:  31572
the evaluation data:  datasets/multi-user-intra/dataset/d2/test.txt
datasets/multi-user-intra/dataset/d2/test.txt valid user:  31573
the evaluation data:  datasets/multi-user-intra/dataset/d3/valid.txt
datasets/multi-user-intra/dataset/d3/valid.txt valid user:  80457
the evaluation data:  datasets/multi-user-intra/dataset/d3/test.txt
datasets/multi-user-intra/dataset/d3/test.txt valid user:  80457
the user number of different domains [231444, 647482, 1104098]
the item number of different domains [2097, 596, 1313]


# Inference Code

In [31]:
import torch
from utils.GraphMaker import GraphMaker
from model.trainer import CrossTrainer
import os
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define paths to saved weights and dataset
model_weights_path = 'Trained_Model\model_d1d2d3_epoch_5.pt'


# Initialize model with the same architecture as training
model = CrossTrainer(opt)  # `opt` should be the same dictionary used during training



# Load the model using the custom `load()` method of CrossTrainer
model.load(model_weights_path)

# Set model to evaluation mode
model.model.eval()

# Load and preprocess the dataset using GraphMaker
graph_maker = total_graphs
UV_adj, VU_adj, ease_pred = graph_maker.UV, graph_maker.VU, graph_maker.ease

#### Inference on particular example

In [32]:
# Define user index and item indices for inference
user_idx = torch.tensor([1], dtype=torch.long).to(device)  # Move user index to the same device
item_indices = torch.tensor([1, 2,4, 6], dtype=torch.long).to(device)  # Move item indices to the same device
print(f"Length of UV_adj: {len(UV_adj)}")
print(f"Length of VU_adj: {len(VU_adj)}")
print(f"Length of ease_pred: {len(ease_pred)}")
print(f"User index: {user_idx}")

Length of UV_adj: 3
Length of VU_adj: 3
Length of ease_pred: 3
User index: tensor([1], device='cuda:0')


In [33]:
opt['domain_id']

{'d1': 0, 'd2': 1, 'd3': 2}

In [34]:



# Ensure item embeddings are initialized (fix for con_item_emb_list)
model.model.item_embedding_select()

# Specify a domain to run inference on, here it's 'game_video' and 'video_game'
domain_id = opt['domain_id']['d1']  # Change based on domain

# Define user index and item indices for inference
user_idx = torch.tensor([0], dtype=torch.long).to(device)  # Move user index to the same device
item_indices = torch.tensor([1,2, 4, 6], dtype=torch.long).to(device)  # Move item indices to the same device

# Ensure user_idx is valid
if user_idx >= len(UV_adj) or user_idx >= len(VU_adj) or user_idx >= len(ease_pred):
    raise ValueError(f"user_idx {user_idx} is out of bounds for the dataset.")
# Define dummy context and global scores, and move them to the correct device
context_item = torch.zeros((1, 10), dtype=torch.long).to(device)  # Adjust based on your model
context_score = torch.zeros((1, 10), dtype=torch.float).to(device)  # Adjust based on your model
global_item = torch.zeros((1, len(item_indices), len(item_indices)), dtype=torch.long).to(device)  # Adjust shape
global_score = torch.zeros((1, len(item_indices)), dtype=torch.float).to(device)  # Adjust shape

# Get user embeddings using the model
user_embedding = model.model.forward_user(domain_id, user_idx, context_item, context_score, global_item, global_score)

# Get item embeddings using the model
item_embeddings = model.model.forward_item(domain_id, item_indices)

# Use the predict_dot method to generate the predictions (dot product of user and item embeddings)
scores = model.model.predict_dot(user_embedding, item_embeddings)

# Sort the scores to get top recommended items
top_items = torch.argsort(scores, descending=True)

# Display or return top recommended items
print("Top recommended items for user:", top_items)

Top recommended items for user: tensor([2, 1, 0, 3], device='cuda:0')


#### Inference on  whole testing data

In [35]:
opt['domain_id']

{'d1': 0, 'd2': 1, 'd3': 2}

In [36]:
test_dataloader

{'d1': <torch.utils.data.dataloader.DataLoader at 0x23f47975d00>,
 'd2': <torch.utils.data.dataloader.DataLoader at 0x23fd0f93c20>,
 'd3': <torch.utils.data.dataloader.DataLoader at 0x23f35d53ce0>}

In [37]:
metrics_00=model.predict_full_rank(opt['domain_id']['d1'], test_dataloader['d1'], all_domain_set[opt['domain_id']['d1']], task_gen_all[opt['domain_id']['d1']].eval_set)
print(metrics_00)

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
{'HT_10': 0.14456346573134896, 'NDCG_10': 0.07255207233380863, 'F_10': 0.027287686039604377}


In [38]:
metrics_11=model.predict_full_rank(opt['domain_id']['d2'], test_dataloader['d2'], all_domain_set[opt['domain_id']['d2']], task_gen_all[opt['domain_id']['d2']].eval_set)

print(metrics_11)

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
{'HT_10': 0.5175624742659868, 'NDCG_10': 0.2843181192016272, 'F_10': 0.09608248518453952}


In [39]:
metrics_11=model.predict_full_rank(opt['domain_id']['d3'], test_dataloader['d3'], all_domain_set[opt['domain_id']['d3']], task_gen_all[opt['domain_id']['d3']].eval_set)

print(metrics_11)

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
{'HT_10': 0.662900679866264, 'NDCG_10': 0.34982383574615095, 'F_10': 0.12316479269315736}


| Dataset | Metric   | Paper Results | Our Results |
|---------|----------|---------------|------------|
| D1      | HT@10    | 32.60         | 14.46     |
|         | NDCG@10  | 13.56         | 07.26     |
| D2      | HT@10    | 64.37         | 51.76     |
|         | NDCG@10  | 50.48         | 28.43     |
| D3      | HT@10    | 73.89         | 66.29     |
|         | NDCG@10  | 59.15         | 34.98     |
