In [1]:
import argparse
import numpy as np
import torch
from torch.autograd import Variable
from utils.GraphMaker import GraphMaker
from model.trainer import CrossTrainer
import os
import json
import sys
import pickle
import pdb
import time
import copy
import csv

True
0
NVIDIA GeForce GTX 1650


In [2]:
sys.path.insert(1, 'utils')
from utils.data import *

--static_sample --cuda --domains sport_cloth --aggregator user_attention  > dual_user_intra_sport_cloth.log 2>&1&

--static_sample --cuda --domains game_video --task dual-user-inter --aggregator mean > dual_item_inter_game_video.log 2>&1&

In [3]:
def create_arg_parser():
    """Build the argument parser for the training run.

    Returns:
        argparse.ArgumentParser: parser exposing the data, model and runtime
        options read by the rest of the notebook through ``vars(opt)``.

    Note:
        ``--save``, ``--static_sample`` and ``--cuda`` are ``store_true`` flags
        whose defaults are non-empty strings. A non-empty string is truthy, so
        these options are effectively always enabled: omitting the flag yields
        the string default, passing it yields ``True``. The defaults are kept
        as they are because the notebook parses an empty command line and
        relies on them being on.
    """
    parser = argparse.ArgumentParser('WSDM')

    # DATA  Arguments
    # The domain string is later split on "_" (not "-"), so the help text says so.
    parser.add_argument('--domains', type=str, default='m1_m2_m3_m4_m5', help='specify none ("none") or a few source markets ("_" separated) to augment the data for training')
    parser.add_argument('--task', type=str, default='multi-item-intra', help='dual-user-intra, dual-user-inter, multi-item-intra, multi-user-intra')

    # MODEL Arguments
    parser.add_argument('--model', type=str, default='UniCDR', help='right model name')
    parser.add_argument('--mask_rate', type=float, default=0.1, help='mask rate of interactions')
    parser.add_argument('--num_epoch', type=int, default=100, help='number of epochs')
    parser.add_argument('--aggregator', type=str, default='item_similarity', help='switching the user-item aggregation')
    parser.add_argument('--batch_size', type=int, default=1024, help='batch size')
    parser.add_argument('--optim', choices=['sgd', 'adagrad', 'adam', 'adamax'], default='adam',
                        help='Optimizer: sgd, adagrad, adam or adamax.')
    parser.add_argument('--lr', type=float, default=0.005, help='learning rate')
    parser.add_argument('--l2_reg', type=float, default=1e-6, help='the L2 weight')
    parser.add_argument('--lr_decay', type=float, default=0.98, help='decay learning rate')
    # Previously carried the same help text as --lr_decay.
    parser.add_argument('--weight_decay', type=float, default=1e-5, help='weight decay coefficient')
    parser.add_argument('--latent_dim', type=int, default=128, help='latent dimensions')
    parser.add_argument('--num_negative', type=int, default=10, help='num of negative samples during training')
    parser.add_argument('--maxlen', type=int, default=10, help='num of item sequence')
    parser.add_argument('--dropout', type=float, default=0.3, help='random drop out rate')
    parser.add_argument('--save',default='save', action='store_true', help='save model?')
    # "lambda" is a Python keyword, so this option is only reachable as opt["lambda"] after vars().
    parser.add_argument('--lambda', type=float, default=50, help='the parameter of EASE')
    parser.add_argument('--lambda_a', type=float, default=0.5, help='for our aggregators')
    parser.add_argument('--lambda_loss', type=float, default=0.4, help='the parameter of loss function')
    parser.add_argument('--static_sample', default='static_sample',action='store_true', help='accelerate the dataloader')

    # others
    parser.add_argument('--cuda',default='cuda', action='store_true', help='use of cuda')
    parser.add_argument('--seed', type=int, default=42, help='manual seed init')
    parser.add_argument('--decay_epoch', type=int, default=10, help='Decay learning rate after this epoch.')

    return parser

In [4]:
# Parse the command line into a plain dict of options. parse_known_args() is
# used instead of parse_args() so unrecognised entries in sys.argv (e.g. those
# present when running inside a notebook kernel) do not abort parsing.
parser = create_arg_parser()
opt, _ = parser.parse_known_args()  # Ignore unknown arguments
opt = vars(opt)

# Run on the GPU only when CUDA is available and the "cuda" option is truthy; otherwise use the CPU.
opt["device"] = torch.device('cuda' if torch.cuda.is_available() and opt["cuda"] else 'cpu')



In [5]:
def print_config(config):
    """Print every entry of ``config`` as a tab-indented ``key : value`` line.

    Args:
        config: mapping of option names to values; values are rendered with ``str``.
    """
    rows = ["\t{} : {}\n".format(name, str(value)) for name, value in config.items()]
    report = "Running with the following configs:\n" + "".join(rows)
    # One blank line before and after the report (print adds the final newline).
    print("\n" + report + "\n")

# The multi-user-intra task overrides the item-sequence length ("maxlen") to 50,
# regardless of what was parsed from the command line.
if opt["task"] == "multi-user-intra":
        opt["maxlen"] = 50

# Echo the effective configuration so it is recorded alongside the run output.
print_config(opt)


Running with the following configs:
	domains : m1_m2_m3_m4_m5
	task : multi-item-intra
	model : UniCDR
	mask_rate : 0.1
	num_epoch : 100
	aggregator : item_similarity
	batch_size : 1024
	optim : adam
	lr : 0.005
	l2_reg : 1e-06
	lr_decay : 0.98
	weight_decay : 1e-05
	latent_dim : 128
	num_negative : 10
	maxlen : 10
	dropout : 0.3
	save : save
	lambda : 50
	lambda_a : 0.5
	lambda_loss : 0.4
	static_sample : static_sample
	cuda : cuda
	seed : 42
	decay_epoch : 10
	device : cuda




In [6]:
# Report which torch device was selected for this run.
print(f'Running experiment on device: {opt["device"]}')


Running experiment on device: cuda


In [7]:
def seed_everything(seed=1111):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: integer seed applied to every random number generator.
    """
    # Imported here because the notebook's import cell never imports `random`;
    # without this the call only works if a star import happens to expose it.
    import random

    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    # NOTE: hash randomisation is fixed at interpreter start-up, so setting this
    # here only affects child processes, not the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed every RNG from the "seed" option before any data loading or model construction.
seed_everything(opt["seed"])

In [8]:
# Display the raw "_"-joined domain string before it is turned into a list.
opt["domains"]

'm1_m2_m3_m4_m5'

In [9]:
# Turn the raw "_"-joined domain string into the list of dataset names to load.
# The isinstance guard makes this cell safe to re-run: once opt["domains"] is
# already a list, calling .split() on it again would raise AttributeError.
if isinstance(opt["domains"], str):
    if "dual" in opt["task"]:
        filename = opt["domains"].split("_")
        if len(filename) < 2:
            raise ValueError(
                "dual tasks need two '_'-separated domain names, got: {!r}".format(opt["domains"]))
        # Dual tasks load the two directed pairs "a_b" and "b_a" built from the first two names.
        opt["domains"] = [
            filename[0] + "_" + filename[1],
            filename[1] + "_" + filename[0],
        ]
    else:
        opt["domains"] = opt["domains"].split('_')

print("Loading domains:", opt["domains"])

Loading domains: ['m1', 'm2', 'm3', 'm4', 'm5']


In [10]:
# Bookkeeping containers filled by the domain-loading loops that follow.
domain_list = opt["domains"]
# Per-domain sizes: (largest user id + 1) and (largest shifted item id + 1).
opt["user_max"] = []
opt["item_max"] = []
# domain index -> TaskGenerator for that domain.
task_gen_all = {}
# domain name -> domain index.
domain_id = {}
# One dict per domain: user -> list of that user's items / user -> set of the same items.
all_domain_list = []
all_domain_set = []
# Running count of interaction lines read from the train files.
all_inter = 0

In [11]:
# First pass over each domain's train.txt: collect every user's distinct items,
# record the per-domain id bounds in opt["user_max"] / opt["item_max"], and then
# build the graphs from the collected interactions.
for idx, cur_domain in enumerate(domain_list):
    cur_src_data_dir = os.path.join("datasets/"+str(opt["task"]) + "/dataset/", cur_domain + "/train.txt")
    print(f'Loading {cur_domain}: {cur_src_data_dir}')

    all_domain_list.append({})
    all_domain_set.append({})
    max_user = 0
    max_item = 0
    print(cur_src_data_dir)
    print(cur_domain)
    print(opt["domains"])
    # Built-in open() with an explicit encoding replaces codecs.open(): `codecs`
    # is never imported by this notebook, and the lines are stripped and parsed
    # as tab-separated integers, so the result is the same.
    with open(cur_src_data_dir, "r", encoding="utf-8") as infile:
        for line in infile:
            all_inter += 1
            line = line.strip().split("\t")
            user = int(line[0])
            # Item ids are shifted up by one (presumably to reserve id 0 — TODO confirm).
            item = int(line[1]) + 1
            max_user = max(max_user, user)
            max_item = max(max_item, item)
            if user not in all_domain_list[idx]:
                all_domain_list[idx][user] = []
                all_domain_set[idx][user] = set()
            # The set gives O(1) duplicate detection; the list keeps first-seen order.
            if item not in all_domain_set[idx][user]:
                all_domain_list[idx][user].append(item)
                all_domain_set[idx][user].add(item)

    # Store sizes (max id + 1) rather than the max ids themselves.
    opt["user_max"].append(max_user + 1)
    opt["item_max"].append(max_item + 1)

total_graphs = GraphMaker(opt, all_domain_list)

Loading m1: datasets/multi-item-intra/dataset/m1/train.txt
datasets/multi-item-intra/dataset/m1/train.txt
m1
['m1', 'm2', 'm3', 'm4', 'm5']


Loading m2: datasets/multi-item-intra/dataset/m2/train.txt
datasets/multi-item-intra/dataset/m2/train.txt
m2
['m1', 'm2', 'm3', 'm4', 'm5']
Loading m3: datasets/multi-item-intra/dataset/m3/train.txt
datasets/multi-item-intra/dataset/m3/train.txt
m3
['m1', 'm2', 'm3', 'm4', 'm5']
Loading m4: datasets/multi-item-intra/dataset/m4/train.txt
datasets/multi-item-intra/dataset/m4/train.txt
m4
['m1', 'm2', 'm3', 'm4', 'm5']
Loading m5: datasets/multi-item-intra/dataset/m5/train.txt
datasets/multi-item-intra/dataset/m5/train.txt
m5
['m1', 'm2', 'm3', 'm4', 'm5']
begin graphmaker................
The alignment id 0 0
7109 2199
Start the EASE
load
15625582
49169


  r_inv = np.power(rowsum, -1).flatten()
  return torch.sparse.FloatTensor(indices, values, shape)


EASE End
The alignment id 7109 0
2697 2693
Start the EASE
load
3592711
17953
EASE End
The alignment id 9806 0
3328 2979
Start the EASE
load
4143360
23363
EASE End
The alignment id 13134 0
5482 4931
Start the EASE
load
15990994
31907
EASE End
The alignment id 18616 0
6466 12016
Start the EASE
load
63121092
77136
EASE End
graphmaker done.........


In [12]:
# Reset the per-domain interaction containers and the line counter so the
# second loading pass rebuilds them from scratch (there each list entry is an
# [item, weight] pair instead of a bare item id).
all_domain_list = []
all_domain_set = []
all_inter = 0

In [13]:
# Second pass over each domain's train.txt: rebuild the per-user interaction
# lists as [item, weight] pairs and create one TaskGenerator per domain.
for idx, cur_domain in enumerate(domain_list):
    cur_src_data_dir = os.path.join("datasets/" + str(opt["task"]) + "/dataset/", cur_domain + "/train.txt")
    print(f'Loading {cur_domain}: {cur_src_data_dir}')

    if opt["aggregator"] == "item_similarity":
        # Densify this domain's EASE result once so it can be indexed per interaction.
        ease_dense = total_graphs.ease[idx].to_dense()

    all_domain_list.append({})
    all_domain_set.append({})

    # Built-in open() with an explicit encoding replaces codecs.open(): `codecs`
    # is never imported by this notebook, and the lines are stripped and parsed
    # as tab-separated integers, so the result is the same.
    with open(cur_src_data_dir, "r", encoding="utf-8") as infile:
        for line in infile:
            all_inter += 1
            line = line.strip().split("\t")
            user = int(line[0])
            # Item ids are shifted up by one, matching the first loading pass.
            item = int(line[1]) + 1
            if user not in all_domain_list[idx]:
                all_domain_list[idx][user] = []
                all_domain_set[idx][user] = set()
            if item not in all_domain_set[idx][user]:
                if opt["aggregator"] == "item_similarity":
                    # Weight is the [user][item] entry of the dense EASE matrix.
                    # NOTE(review): this is presumably a 0-dim tensor rather than a
                    # Python float — confirm that is what TaskGenerator expects.
                    all_domain_list[idx][user].append([item, ease_dense[user][item]])
                else:
                    # Every other aggregator uses a constant weight of 1.
                    all_domain_list[idx][user].append([item, 1])
                all_domain_set[idx][user].add(item)

    print(f'Loading {cur_domain}: {cur_src_data_dir}')
    cur_src_task_generator = TaskGenerator(cur_src_data_dir, opt, all_domain_list, all_domain_set, idx,
                                           total_graphs)
    task_gen_all[idx] = cur_src_task_generator
    domain_id[cur_domain] = idx



Loading m1: datasets/multi-item-intra/dataset/m1/train.txt
Loading m1: datasets/multi-item-intra/dataset/m1/train.txt
the min/max user/item number of  datasets/multi-item-intra/dataset/m1/train.txt
user: 0 7108
item: 1 2198
Loading m2: datasets/multi-item-intra/dataset/m2/train.txt
Loading m2: datasets/multi-item-intra/dataset/m2/train.txt
the min/max user/item number of  datasets/multi-item-intra/dataset/m2/train.txt
user: 0 2696
item: 1 2692
Loading m3: datasets/multi-item-intra/dataset/m3/train.txt
Loading m3: datasets/multi-item-intra/dataset/m3/train.txt
the min/max user/item number of  datasets/multi-item-intra/dataset/m3/train.txt
user: 0 3327
item: 1 2978
Loading m4: datasets/multi-item-intra/dataset/m4/train.txt
Loading m4: datasets/multi-item-intra/dataset/m4/train.txt
the min/max user/item number of  datasets/multi-item-intra/dataset/m4/train.txt
user: 0 5481
item: 2 4930
Loading m5: datasets/multi-item-intra/dataset/m5/train.txt
Loading m5: datasets/multi-item-intra/dataset

In [14]:
# Wrap the per-domain task generators into one training dataset / dataloader.
train_domains = MetaDomain_Dataset(task_gen_all, num_negatives=opt["num_negative"], meta_split='train')
# The global batch size is divided evenly (integer division) across the domains.
train_dataloader = MetaDomain_DataLoader(train_domains, sample_batch_size=opt["batch_size"] // len(domain_list), shuffle=True)
# Record the domain count and the name -> index map in the options dict,
# which is later passed to CrossTrainer.
opt["num_domains"] = train_dataloader.num_domains
opt["domain_id"] = domain_id


In [15]:

############
## Validation and Test
############
# Inter-domain tasks set "shared_user" to a very large value.
# NOTE(review): its meaning is defined by the model code, not visible here.
if "inter" in opt["task"]:
    opt["shared_user"] = 1e9

dataset_root = "datasets/" + str(opt["task"]) + "/dataset/"
# dual-user-intra validates on test.txt; every other task validates on valid.txt.
# NOTE(review): confirm that reusing the test split for validation is intentional.
valid_split = "/test.txt" if opt["task"] == "dual-user-intra" else "/valid.txt"

valid_dataloader = {}
test_dataloader = {}
for cur_domain in domain_list:
    domain_valid = os.path.join(dataset_root, cur_domain + valid_split)
    domain_test = os.path.join(dataset_root, cur_domain + "/test.txt")
    cur_task_gen = task_gen_all[domain_id[cur_domain]]
    valid_dataloader[cur_domain] = cur_task_gen.instance_a_valid_dataloader(domain_valid, 100)
    test_dataloader[cur_domain] = cur_task_gen.instance_a_valid_dataloader(domain_test, 100)

print("the user number of different domains", opt["user_max"])
print("the item number of different domains", opt["item_max"])


the evaluation data:  datasets/multi-item-intra/dataset/m1/valid.txt
the evaluation data:  datasets/multi-item-intra/dataset/m1/test.txt
the evaluation data:  datasets/multi-item-intra/dataset/m2/valid.txt
the evaluation data:  datasets/multi-item-intra/dataset/m2/test.txt
the evaluation data:  datasets/multi-item-intra/dataset/m3/valid.txt
the evaluation data:  datasets/multi-item-intra/dataset/m3/test.txt
the evaluation data:  datasets/multi-item-intra/dataset/m4/valid.txt
the evaluation data:  datasets/multi-item-intra/dataset/m4/test.txt
the evaluation data:  datasets/multi-item-intra/dataset/m5/valid.txt
the evaluation data:  datasets/multi-item-intra/dataset/m5/test.txt
the user number of different domains [7109, 2697, 3328, 5482, 6466]
the item number of different domains [2199, 2693, 2979, 4931, 12016]


In [16]:
############
## Model
############
mymodel = CrossTrainer(opt)
# NOTE(review): the epoch budget is overridden after the trainer was built, so
# CrossTrainer received the parsed value while the loop below runs 120 epochs.
opt["num_epoch"] = 120

############
## Train
############
# Per-domain history of validation NDCG@10, seeded with 0 so max() is always defined.
dev_score_history = []
for i in range(opt["num_domains"]):
    dev_score_history.append([0])


current_lr = opt['lr']
# Number of optimisation steps per epoch (each step draws one batch per domain).
iteration_num = 500

print("per batch of an epoch:", iteration_num)
global_step = 0
for epoch in range(0, opt["num_epoch"] + 1):
    start_time = time.time()
    print('Epoch {} starts !'.format(epoch))

    # Losses are collected without a dummy leading 0 so the reported means are
    # not biased; the max(len, 1) guards below keep epoch 0 (no steps) at 0.0.
    total_loss = []
    loss_list = []
    for i in range(opt["num_domains"]):
        loss_list.append([])

    for iteration in range(iteration_num):
        # Epoch 0 performs no training; it only evaluates the untrained model.
        if epoch == 0:
            continue
        # `==` instead of `is`: identity comparison with a literal is a SyntaxWarning.
        if iteration % 10 == 0:
            print(".", end="")

        mymodel.model.train()
        mymodel.optimizer.zero_grad()
        mymodel.model.item_embedding_select()
        mymodel_loss = 0

        for idx in range(opt["num_domains"]):  # get one batch from each dataloader
            global_step += 1

            cur_train_dataloader = train_dataloader.get_iterator(idx)
            try:
                batch_data = next(cur_train_dataloader)
            except StopIteration:
                # Iterator exhausted: start a fresh one. Catching only StopIteration
                # keeps KeyboardInterrupt and real data-loading errors visible.
                new_train_iterator = iter(train_dataloader[idx])
                batch_data = next(new_train_iterator)

            cur_loss = mymodel.reconstruct_graph(idx, batch_data)

            # Sum the per-domain losses so a single backward pass covers all domains.
            mymodel_loss += cur_loss
            loss_list[idx].append(cur_loss.item())
            total_loss.append(cur_loss.item())


        mymodel_loss.backward()
        mymodel.optimizer.step()

    print("Average loss:", sum(total_loss) / max(len(total_loss), 1), "time: ", (time.time() - start_time) / 60, "(min) current lr: ",
            current_lr)

    print('-' * 80)

    # Evaluate only on every 5th epoch (including epoch 0).
    if epoch % 5:
        continue

    for idx in range(opt["num_domains"]):
        print(idx, "loss is: ", sum(loss_list[idx]) / max(len(loss_list[idx]), 1))

    print('Make prediction:')
    # validation data prediction
    valid_start = time.time()

    mymodel.model.eval()
    mymodel.model.item_embedding_select()

    # Counts the domains whose validation NDCG@10 did not improve this round.
    decay_switch = 0
    for idx, cur_domain in enumerate(valid_dataloader):
        if opt["task"] == "multi-user-intra":
            metrics = mymodel.predict_full_rank(idx, valid_dataloader[cur_domain], all_domain_set[idx], task_gen_all[idx].eval_set)
        else:
            metrics = mymodel.predict(idx, valid_dataloader[cur_domain])

        print("\n-------------------" + cur_domain + "--------------------")
        print(metrics)


        if metrics["NDCG_10"] > max(dev_score_history[idx]):
            # test data prediction
            print(cur_domain, " better results!")

            if opt["save"]:
                # NOTE(review): opt["domains"] is a list here, so str() embeds
                # brackets, quotes and spaces in the file name. Kept unchanged
                # so existing checkpoints are still found.
                task = str(opt["domains"])
                model_save_path = f'model_{task}_epoch_{epoch}.pt'
                mymodel.save(model_save_path)
                print("best model saved!")

            if opt["task"] == "multi-user-intra":
                test_metrics = mymodel.predict_full_rank(idx, test_dataloader[cur_domain], all_domain_set[idx], task_gen_all[idx].eval_set)
            else:
                test_metrics = mymodel.predict(idx, test_dataloader[cur_domain])

            print(test_metrics)
            ###############################################
            # Saving the test metrics into a CSV file
            csv_filename = str(opt['task']) + "__" + str(opt['domains'])+"__test_metrics.csv"

            # Build the row with 'epoch' first. Deriving the header from the row
            # itself keeps 'epoch' from being listed twice, which happened when
            # the key was added to test_metrics before building the column list.
            # Files written before this fix carry an extra trailing 'epoch' column.
            # NOTE(review): all domains append to the same file and the row has
            # no domain column, so rows cannot be attributed to a domain.
            csv_row = {"epoch": epoch, **test_metrics}
            csv_columns = list(csv_row)

            try:
                # Append mode creates the file if it does not exist yet.
                with open(csv_filename, mode='a', newline='') as file:
                    writer = csv.DictWriter(file, fieldnames=csv_columns)

                    # Write the header only if the file is empty (appends without header otherwise)
                    if file.tell() == 0:
                        writer.writeheader()

                    writer.writerow(csv_row)

                print(f"Test metrics saved to {csv_filename}")
            except IOError:
                print("I/O error while saving test metrics")
        else:
            decay_switch += 1
        dev_score_history[idx].append(metrics["NDCG_10"])

    print("valid time:  ", (time.time() - valid_start) / 60, "(min)")


    # After the warm-up period, switch the model's warmup flag off.
    if epoch > opt['decay_epoch']:
        mymodel.model.warmup = 0

    # lr schedule: decay once more than half of the domains failed to improve.
    print("decay_switch: ", decay_switch)
    if (epoch > opt['decay_epoch']) and (decay_switch > opt["num_domains"] // 2) and (opt[
        'optim'] in ['sgd', 'adagrad', 'adadelta', 'adam']):
        current_lr *= opt['lr_decay']
        mymodel.update_lr(current_lr)

print('Experiment finished successfully!')



per batch of an epoch: 500
Epoch 0 starts !
Average loss: 0.0 time:  0.0 (min) current lr:  0.005
--------------------------------------------------------------------------------
0 loss is:  0.0
1 loss is:  0.0
2 loss is:  0.0
3 loss is:  0.0
4 loss is:  0.0
Make prediction:


  if iteration % 10 is 0:


+++++++++++++++++++++++++++++++++++

-------------------m1--------------------
{'MRR': 0.007084306559623285, 'NDCG_5': 0.002581350513115813, 'NDCG_10': 0.004096717717655426, 'HT_1': 0.0008508224617129892, 'HT_5': 0.0045377197958026095, 'HT_10': 0.00935904707884288}
m1  better results!
model saved to model_['m1', 'm2', 'm3', 'm4', 'm5']_epoch_0.pt
best model saved!
+++++++++++++++++++++++++++++++++++
{'MRR': 0.007296188555258411, 'NDCG_5': 0.002847366239020075, 'NDCG_10': 0.004786609392852117, 'HT_1': 0.0011242270938729624, 'HT_5': 0.00505902192242833, 'HT_10': 0.011242270938729624}
Test metrics saved to multi-item-intra__['m1', 'm2', 'm3', 'm4', 'm5']__test_metrics.csv
+++++++++++++

-------------------m2--------------------
{'MRR': 0.006926647876472471, 'NDCG_5': 0.002778099151977479, 'NDCG_10': 0.0032307385590624347, 'HT_1': 0.0, 'HT_5': 0.005873715124816446, 'HT_10': 0.007342143906020558}
m2  better results!
model saved to model_['m1', 'm2', 'm3', 'm4', 'm5']_epoch_0.pt
best model s