In [1]:
import os
import random
import time
import numpy as np
import json
import logging
import argparse
import torch
import torch.backends.cudnn as cudnn
from torch.nn.functional import logsigmoid
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import torch.utils.data
import torch.multiprocessing as mp
import torch.distributed as dist
# from torch.utils.tensorboard import SummaryWriter

import csv
from torch.optim import Adam
from sys import argv
import json
import pdb
from torch.nn import *
from collections import defaultdict
from tqdm import tqdm
import pandas as pd
from datetime import datetime
import shutil
import yaml

from data.utility import Dataset
# from trainer.TransMatch_pretrain import TransMatch
from trainer.TransE import TransE
from util.eval_utils import *

In [2]:
def get_logger():
    logger_name = "main-logger"
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler()
    fmt = "[%(asctime)s %(levelname)s %(filename)s line %(lineno)d %(process)d] %(message)s"
    handler.setFormatter(logging.Formatter(fmt))
    logger.addHandler(handler)
    return logger
conf = yaml.safe_load(open("./config/train_model_config.yaml"))
conf["dataset"] = "iqon_s"
conf["gpu"] = 1
conf["device"] = torch.device("cuda:%s"%conf["gpu"] if torch.cuda.is_available() else "cpu")
dataset = Dataset(conf)
global logger
logger = get_logger()

In [3]:
conf["user_num"] = len(dataset.user_map)
conf["item_num"] = len(dataset.item_map)
conf["cate_num"] = len(dataset.cate_items)
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
conf['pretrained_model'] = 'TransE'
pretrain_model_file = f"{conf['pretrained_model']}.pth.tar"
pretrain_model_dir = "model/iqon_s/pretrained_model/"
pretrain_model_path = os.path.join(pretrain_model_dir, pretrain_model_file)
if os.path.exists(pretrain_model_path):
    logger.info("=> loading model ...")
    model = torch.load(pretrain_model_path)
    print("Testing with existing model...")
    conf['use_pretrain'] = True
    model.to(conf["device"])
    logger.info(model)

[2024-01-11 00:43:44,660 INFO 2687879523.py line 10 3962714] => loading model ...


Testing with existing model...


[2024-01-11 00:43:47,148 INFO 2687879523.py line 15 3962714] TransE(
  (u_embeddings_l): Embedding(1770, 32, padding_idx=1769)
  (i_bias_l): Embedding(94250, 1, padding_idx=94249)
  (i_embeddings_i): Embedding(94250, 32, padding_idx=94249)
  (visual_nn_comp): Sequential(
    (0): Linear(in_features=2048, out_features=32, bias=True)
    (1): Sigmoid()
  )
  (visual_nn_per): Sequential(
    (0): Linear(in_features=2048, out_features=32, bias=True)
    (1): Sigmoid()
  )
  (i_bias_v): Embedding(94250, 1, padding_idx=94249)
  (u_embeddings_v): Embedding(1770, 32, padding_idx=1769)
)


In [4]:
train_df = pd.read_csv("data/iqon_s/train.csv", header=None).astype('int')
train_df.columns=["user_idx", "top_idx", "pos_bottom_idx", "neg_bottom_idx"]
test_df = pd.read_csv("data/iqon_s/test.csv", header=None).astype('int')
test_df.columns=["user_idx", "top_idx", "pos_bottom_idx", "neg_bottom_idx"]
valid_df = pd.read_csv("data/iqon_s/val.csv", header=None).astype('int')
valid_df.columns=["user_idx", "top_idx", "pos_bottom_idx", "neg_bottom_idx"]
all_bottoms_id = pd.concat([train_df["pos_bottom_idx"], test_df["pos_bottom_idx"], valid_df["pos_bottom_idx"],
    train_df["neg_bottom_idx"], test_df["neg_bottom_idx"], valid_df["neg_bottom_idx"]], ignore_index=True).unique()

In [5]:
all_user = pd.concat([train_df["user_idx"], test_df["user_idx"], valid_df["user_idx"]], ignore_index=True).unique()
all_user

array([1506, 1598, 1748, ...,  350, 1261,  772])

In [6]:
len(all_user)

1769

In [7]:
train_ij_pairs = train_df[['top_idx', 'pos_bottom_idx']].drop_duplicates().values.tolist()
train_ij_pairs

[[33369, 55036],
 [32593, 16447],
 [15739, 58334],
 [22463, 11177],
 [9183, 86612],
 [57917, 48237],
 [45135, 13483],
 [39982, 40063],
 [72132, 28499],
 [66205, 25554],
 [7442, 81704],
 [46207, 47145],
 [85516, 18247],
 [74925, 62290],
 [45707, 5728],
 [67239, 4779],
 [78981, 31783],
 [53465, 21087],
 [66282, 62507],
 [15366, 647],
 [71121, 6021],
 [87273, 49570],
 [31679, 62023],
 [31883, 73614],
 [51870, 15916],
 [83971, 40068],
 [18712, 63934],
 [41210, 66873],
 [37886, 29255],
 [78986, 88701],
 [34271, 91698],
 [41686, 38406],
 [16917, 46096],
 [49838, 90556],
 [81906, 54949],
 [40610, 75525],
 [71732, 55515],
 [85275, 61313],
 [63582, 33562],
 [61202, 8413],
 [19221, 61268],
 [10007, 2661],
 [90509, 89790],
 [6308, 69793],
 [52367, 92910],
 [67071, 85598],
 [10568, 16054],
 [28646, 30315],
 [34271, 34533],
 [13672, 82366],
 [11807, 42654],
 [4593, 9062],
 [85323, 39669],
 [34687, 24743],
 [46876, 31619],
 [11232, 43569],
 [20743, 65981],
 [26625, 71003],
 [37093, 65555],
 [71082, 

In [8]:
def to_tensor(data):
    return torch.tensor(data, dtype=torch.int64).to(conf["device"])

In [9]:
a=0
new_u_ij_dict = {}
dataset.visual_features = dataset.visual_features.to(conf['device'])

ij_pairs = to_tensor(train_ij_pairs)
Is = ij_pairs[:, 0]
Js = ij_pairs[:, 1]
i_rep = model.i_embeddings_i(Is)
j_rep = model.i_embeddings_i(Js)
j_bias = model.i_bias_l(Js)
vis_I = dataset.visual_features[Is]
vis_J = dataset.visual_features[Js]
I_visual = model.visual_nn_comp(vis_I) #bs, hidden_dim
J_visual = model.visual_nn_comp(vis_J)
J_bias_v = model.i_bias_v(Js)

# for user_idx in all_user:
for user_idx in range(len(dataset.user_map)):  
    u_idx = to_tensor(user_idx)   #key
    u_rep = model.u_embeddings_l(u_idx.expand(Is.size(0))) #Is.size(0), hd
    distances = model.transE_predict(u_rep, i_rep, j_rep, j_bias)
    u_rep_v = model.u_embeddings_v(u_idx.expand(Is.size(0))) #Is.size(0), hd
    distances_v = model.transE_predict(u_rep, I_visual, J_visual, J_bias_v)
    distances += distances_v

    topk_scores, topk_indices = torch.topk(distances.view(-1), conf['top_k_u'], dim=-1)
    topk_i_j_pairs = ij_pairs[topk_indices]
    new_u_ij_dict[int(user_idx)] = topk_i_j_pairs.cpu().numpy().tolist()
    if a < 1:
        print(new_u_ij_dict)
        a+= 1
    else:
        continue
with open('data/iqon_s/u_topk_ijs_dict.json', 'w') as json_file:
    json.dump(new_u_ij_dict, json_file)  

{0: [[52429, 67990], [74093, 67990], [14676, 67990], [64436, 67990], [40943, 67990]]}


In [10]:
a=0
new_u = {}
for key, value in new_u_ij_dict.items():
    i_values = [item[0] for item in value]  # 获取 'i' 的值
    j_values = [item[1] for item in value]  # 获取 'j' 的值

    new_u[key] = [i_values, j_values]
    if a < 1:
        print(new_u)
        print(key, value)
        a+= 1
    else:
        continue
with open('data/iqon_s/u_topk_Is_Js_dict.json', 'w') as json_file:
    json.dump(new_u, json_file) 

{0: [[52429, 74093, 14676, 64436, 40943], [67990, 67990, 67990, 67990, 67990]]}
0 [[52429, 67990], [74093, 67990], [14676, 67990], [64436, 67990], [40943, 67990]]


In [11]:
tensor_list = []
for key, value in new_u.items():
    tensor = torch.tensor(value, dtype=torch.int32)  # 将列表转换为张量
    tensor_list.append(tensor)

stacked_tensor = torch.stack(tensor_list)
stacked_tensor[0]

tensor([[52429, 74093, 14676, 64436, 40943],
        [67990, 67990, 67990, 67990, 67990]], dtype=torch.int32)

In [12]:
!python pretrain.py -d=iqon_s -g=2

use_selfatt: 0 top_k_u: 3 context: 1 use_hard_neg: 0 use_Nor: 0 use_topk_ij_for_u: 1
data prepared, 1769 users, 94249 items, 65663 train, 8208 test, 8208 validation data
Continuing training with existing model...
2024-01-11 00:44:04 Epoch 0 Loss: 0.054628
iqon_s_TransMatch_transE_pcc_3_4_mean_0.20
tes 2024-01-11 00:44:04   AUC: 0.7271
va 2024-01-11 00:44:07   AUC: 0.7329
2024-01-11 00:44:14 Epoch 1 Loss: 0.051718
2024-01-11 00:44:18 Epoch 2 Loss: 0.043138
iqon_s_TransMatch_transE_pcc_3_4_mean_0.20
tes 2024-01-11 00:44:19   AUC: 0.7282
va 2024-01-11 00:44:19   AUC: 0.7381
2024-01-11 00:44:26 Epoch 3 Loss: 0.038228
2024-01-11 00:44:30 Epoch 4 Loss: 0.030242
iqon_s_TransMatch_transE_pcc_3_4_mean_0.20
tes 2024-01-11 00:44:30   AUC: 0.7355
va 2024-01-11 00:44:30   AUC: 0.7390
2024-01-11 00:44:37 Epoch 5 Loss: 0.025514
2024-01-11 00:44:41 Epoch 6 Loss: 0.023443
iqon_s_TransMatch_transE_pcc_3_4_mean_0.20
tes 2024-01-11 00:44:42   AUC: 0.7339
va 2024-01-11 00:44:42   AUC: 0.7346
EarlyStopping 

In [14]:
!python pretrain.py -d=iqon_s -g=2

use_selfatt: 1 top_k_u: 3 context: 1 use_hard_neg: 1 use_Nor: 1 use_topk_ij_for_u: 1
data prepared, 1769 users, 94249 items, 65663 train, 8208 test, 8208 validation data
Continuing training with existing model...
2024-01-11 15:00:11 Epoch 0 Loss: 16.452194
iqon_s_TransMatch_transE_pcc_3_4_mean_0.20
tes 2024-01-11 15:00:12   AUC: 0.6363
va 2024-01-11 15:00:16   AUC: 0.6383
2024-01-11 15:01:03 Epoch 1 Loss: 7.642584
2024-01-11 15:01:48 Epoch 2 Loss: 4.021948
iqon_s_TransMatch_transE_pcc_3_4_mean_0.20
tes 2024-01-11 15:01:49   AUC: 0.6055
va 2024-01-11 15:01:50   AUC: 0.6106
EarlyStopping counter: 1 out of 20
2024-01-11 15:02:35 Epoch 3 Loss: 2.262677
EarlyStopping counter: 2 out of 20
2024-01-11 15:03:20 Epoch 4 Loss: 1.337057
iqon_s_TransMatch_transE_pcc_3_4_mean_0.20
tes 2024-01-11 15:03:21   AUC: 0.5715
va 2024-01-11 15:03:22   AUC: 0.5730
EarlyStopping counter: 3 out of 20
2024-01-11 15:04:06 Epoch 5 Loss: 0.821841
EarlyStopping counter: 4 out of 20
2024-01-11 15:04:51 Epoch 6 Loss: 

In [14]:
# grouped_data = test_df.groupby('user_idx').apply(lambda x: x[['top_idx', 'pos_bottom_idx']].values.tolist()).to_dict()
# grouped_data

{0: [[12488, 24175]],
 2: [[87355, 10675]],
 3: [[32765, 29474],
  [64177, 32765],
  [58141, 86916],
  [46656, 25506],
  [3434, 2348]],
 4: [[92867, 30715], [32252, 71869]],
 5: [[34469, 86663], [31216, 51200], [84037, 7158]],
 6: [[90515, 22913]],
 7: [[75490, 85603],
  [84052, 4041],
  [53778, 85675],
  [17608, 56355],
  [42000, 55747]],
 8: [[20655, 23640]],
 10: [[62119, 62023],
  [76290, 42199],
  [52481, 62023],
  [62023, 63632],
  [62023, 84401],
  [44065, 62023]],
 11: [[76420, 40197],
  [21609, 73750],
  [78115, 79788],
  [2478, 53214],
  [70601, 67223],
  [52958, 15809]],
 13: [[23792, 57777], [40473, 31431], [88418, 38232], [32747, 11030]],
 14: [[7213, 53076],
  [76489, 94045],
  [62171, 6633],
  [42243, 26293],
  [90733, 63893],
  [13448, 69150],
  [68909, 68280],
  [33188, 62613],
  [60102, 10286],
  [83478, 64660],
  [71802, 84610],
  [30443, 49859]],
 15: [[60225, 81584],
  [46595, 61788],
  [47401, 17395],
  [80148, 13563],
  [65328, 13435],
  [42430, 70360],
  [73711,

In [11]:
# a=0
# new_u_ij_dict = {}
# for user_idx, i_j_pairs in u_ij_dict.items(): # key, value
#     Is = to_tensor([item[0] for item in i_j_pairs])  # 获取所有 'i' 的值
#     Js = to_tensor([item[1] for item in i_j_pairs]) 
#     ijs = to_tensor(i_j_pairs)
#     u_idx = to_tensor(user_idx)   #key

#     u_rep = model.u_embeddings_l(u_idx.expand(Is.size(0))) #Is.size(0), hd
#     i_rep = model.i_embeddings_i(Is)
#     j_rep = model.i_embeddings_i(Js)
#     j_bias = model.i_bias_l(Js)

#     distances = model.transE_predict(u_rep, i_rep, j_rep, j_bias)

#     u_rep_v = model.u_embeddings_v(u_idx.expand(Is.size(0))) #Is.size(0), hd
#     vis_I = dataset.visual_features[Is]
#     vis_J = dataset.visual_features[Js]
#     I_visual = model.visual_nn_comp(vis_I) #bs, hidden_dim
#     J_visual = model.visual_nn_comp(vis_J)
#     J_bias_v = model.i_bias_v(Js)

#     distances_v = model.transE_predict(u_rep, I_visual, J_visual, J_bias_v)
#     distances += distances_v

#     topk_scores, topk_indices = torch.topk(distances.view(-1), conf['top_k_u'], dim=-1)
#     topk_i_j_pairs = ijs[topk_indices]
#     new_u_ij_dict[int(user_idx)] = topk_i_j_pairs.cpu().numpy().tolist()
#     if a < 1:
#         print(new_u_ij_dict)
#         a+= 1
#     else:
#         continue
# with open('data/iqon_s/u_exist_topk_ijs_dict.json', 'w') as json_file:
#     json.dump(new_u_ij_dict, json_file)  

{0: [[28604, 14755], [57744, 28585], [9500, 811], [61961, 18657], [19133, 93990]]}
