In [1]:
import torch
from torch import tanh
from torch import nn
import pandas as pd
import numpy as np
from numpy import multiply
from sklearn.utils import shuffle
# Absolute, machine-specific locations of the pickled inputs (train pairs,
# test pairs, and the node -> embedding table). Adjust these when running
# the notebook on another host.
train_file = '/home/duyongkang/PaperRec/Data/Train_2f_unrandom.pkl'
test_file = '/home/duyongkang/PaperRec/Data/Test_2feature.pkl'
feature = '/home/duyongkang/PaperRec/Data/Vector/Word2vec.pkl'

In [2]:
# Load the train/test tables and build the node -> embedding lookup.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df_train, df_test = (pd.read_pickle(path) for path in (train_file, test_file))
dic = pd.read_pickle(feature)[['node', 'vector']].set_index('node')['vector'].to_dict()

In [3]:
# Feature for each (author, paper) training pair: element-wise product of the
# two embeddings looked up in `dic`.
# Rows are walked positionally (zip over the columns) rather than via
# `df_train.author[i]`, which is a *label* lookup: on a frame whose index is
# not 0..n-1 it raises KeyError or pairs the wrong rows, while the column
# assignment below is always positional.
mul = [multiply(dic[a], dic[p]) for a, p in zip(df_train.author, df_train.paper)]
df_train['vector'] = mul

In [4]:
# Feature for each (author, paper) test pair: element-wise product of the two
# embeddings looked up in `dic`.
# Rows are walked positionally (zip over the columns) rather than via
# `df_test.author[i]`, which is a *label* lookup: on a frame whose index is
# not 0..n-1 it raises KeyError or pairs the wrong rows, while the column
# assignment below is always positional.
mul = [multiply(dic[a], dic[p]) for a, p in zip(df_test.author, df_test.paper)]
df_test['vector'] = mul

In [5]:
# Convert a tensor to a list
def to_list(y):
    """Return the first-column values of a 2-D tensor as a list of Python floats.

    Args:
        y: torch tensor indexed as ``y[i][0]`` (e.g. shape ``(N, 1)``); it may
            live on any device and may or may not require grad.

    Returns:
        ``[float(y[0][0]), float(y[1][0]), ...]``.
    """
    # detach() is a no-op for tensors without grad, so one path covers both
    # cases and no exception has to be swallowed.
    rows = y.detach().cpu().numpy()
    # The `np.float` alias was removed in NumPy 1.24; the builtin `float`
    # is the same dtype (float64).
    rows = rows.astype(float).tolist()
    return [row[0] for row in rows]
def HR_score(y_true, y_pred, k, truncate=False):
    """Hit ratio at k: positives found in the top-k predictions / all positives.

    Args:
        y_true: NumPy array of labels; entries equal to 1 count as positives.
        y_pred: scores, same length as ``y_true``; higher means ranked earlier.
        k: cut-off, must be > 0.
        truncate: when True, the cut-off is capped at the number of positives.

    Raises:
        AssertionError: on length mismatch, ``k <= 0`` or no positive label.
    """
    positives = y_true.tolist().count(1)
    # both inputs must describe the same candidates
    assert len(y_true) == len(y_pred)
    assert k > 0 and positives > 0

    # optionally cap the cut-off at the number of positives
    cutoff = min(k, y_true.tolist().count(1)) if truncate else k

    # indices of the `cutoff` highest scores, best first
    ranking = np.argsort(y_pred, kind="stable")
    top_idx = ranking[-cutoff:][::-1]
    hits = (y_true[top_idx] > 0).sum()

    return hits / positives

def DCG_score(y_true, y_pred, k, truncate=False):
    """Discounted cumulative gain at k with exponential gain ``2**rel - 1``.

    Args:
        y_true: NumPy array of relevance labels; at least one must equal 1.
        y_pred: scores, same length as ``y_true``; higher means ranked earlier.
        k: cut-off, must be > 0.
        truncate: when True, the cut-off is capped at the number of positives.

    Raises:
        AssertionError: on length mismatch, ``k <= 0`` or no positive label.
    """
    # both inputs must describe the same candidates
    assert len(y_true) == len(y_pred)
    assert k > 0 and y_true.tolist().count(1) > 0

    # optionally cap the cut-off at the number of positives
    cutoff = min(k, y_true.tolist().count(1)) if truncate else k

    # relevance of the `cutoff` highest-scored items, best first
    top_idx = np.argsort(y_pred, kind="stable")[-cutoff:][::-1]
    ranked_rel = y_true[top_idx]

    # exponential gain (position-sensitive); a linear gain would be ranked_rel
    gains = 2 ** ranked_rel - 1

    # ranks are 1-based, so position p is discounted by log2(p + 2)
    discounts = np.log2(np.arange(len(ranked_rel)) + 2)

    # DCG@k
    return np.sum(gains / discounts)

def nDCG_score(y_true, y_pred, k):
    """Normalised DCG at k: DCG of the predicted ranking over the ideal DCG.

    The ideal DCG is obtained by ranking the items by their own labels.
    Returns 0 when the ideal DCG is 0.
    """
    ideal = DCG_score(y_true, y_true, k)
    actual = DCG_score(y_true, y_pred, k)
    # nDCG@k, guarding against a zero denominator
    if ideal != 0:
        return actual / ideal
    return 0

# %%
def evaluate(y_true, y_pred, k, length=50):
    """Average nDCG@k and HR@k over consecutive groups of ``length`` items.

    The inputs are cut into slices ``[0:length]``, ``[length:2*length]``, ...
    (the final slice may be shorter) and each slice is scored independently.

    Args:
        y_true: NumPy array of labels, laid out group after group.
        y_pred: NumPy array of scores aligned with ``y_true``.
        k: ranking cut-off passed to ``nDCG_score`` and ``HR_score``.
        length: number of candidates per group.

    Returns:
        ``(mean_ndcg, mean_hr)``; ``(0.0, 0.0)`` when there is nothing to
        evaluate.

    Raises:
        AssertionError: propagated from the metric functions, e.g. when a
            group contains no positive label.
    """
    NDCG, HR, total = 0.0, 0.0, 0

    for i in range(0, len(y_true), length):
        total += 1

        cur_y_true = y_true[i : i+length]
        cur_y_pred = y_pred[i : i+length]

        NDCG += nDCG_score(cur_y_true, cur_y_pred, k)
        HR   += HR_score  (cur_y_true, cur_y_pred, k)

    # Empty input produces no groups; avoid dividing by zero.
    if total == 0:
        return 0.0, 0.0

    return NDCG / total, HR / total

In [None]:
## Convert the data to tensors
# df_train = pd.read_pickle('/home/duyongkang/PaperRec/Data/Glove_train.pkl')
# df_test = pd.read_pickle('/home/duyongkang/PaperRec/Data/Glove_test.pkl')

# Use the second CUDA device when CUDA is available, otherwise the CPU.
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
# df_train = df_train.sort_index(ascending=True)  # sort by index
# NOTE(review): shuffle() is called without random_state and rebinds df_train,
# so every run of this cell yields a different row order.
df_train = shuffle(df_train)
x_train = torch.Tensor(df_train.vector)
# Labels become a float32 column vector of shape (N, 1).
y_train = torch.from_numpy(df_train.label.values.reshape(-1,1).astype(np.float32))
# Move the tensors to the selected device (GPU when available)

x_train = x_train.to(device)
y_train = y_train.to(device)

x_test = torch.Tensor(df_test.vector)
y_test = torch.from_numpy(df_test.label.values.reshape(-1,1).astype(np.float32))
# Move the tensors to the selected device (GPU when available)
x_test = x_test.to(device)
y_test = y_test.to(device)
    # model = model.to(device)
## Define the model

In [8]:
import tensorflow as tf
class Net(nn.Module):
    """Two-layer perceptron that maps a feature vector to a single logit.

    The input is squashed with tanh, passed through a hidden linear layer
    followed by tanh, and projected to one output unit. No sigmoid is applied,
    so the output is a raw logit.

    Args:
        in_features: size of each input vector (default 100).
        hidden_features: width of the hidden layer (default 1024).
    """

    def __init__(self, in_features=100, hidden_features=1024):
        super().__init__()
        self.lin_1 = nn.Linear(in_features, hidden_features)
        self.lin_2 = nn.Linear(hidden_features, 1)
        self.activate = nn.Tanh()

    def forward(self, input):
        """Return logits of shape ``(..., 1)`` for ``input`` of shape ``(..., in_features)``."""
        x = self.activate(input)  # tanh is applied to the raw features first
        x = self.lin_1(x)
        x = self.activate(x)
        x = self.lin_2(x)
        return x
lr = 0.03  # learning rate handed to Adam


def get_model():
    """Create a fresh ``Net`` and an Adam optimizer over its parameters.

    Returns:
        ``(model, optimizer)``; the optimizer uses the module-level ``lr``.
    """
    net = Net()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    return net, optimizer


model, opt = get_model()
# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss).
loss_fn = nn.BCEWithLogitsLoss()
model = model.to(device)

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
batch = 256*2*2                          # samples per training batch
# Integer division: rows beyond the last full batch are not counted here.
no_of_batches = len(df_train) // batch
epochs = 50                          # number of epochs
# Wrap features and labels so that slicing returns an aligned (x, y) pair.
Traindataset = TensorDataset(x_train, y_train)
# Train_dl = DataLoader(Traindataset, batch_size=batch)

In [9]:
# The reference labels never change, so convert them once instead of per epoch.
y_true = np.array(to_list(y_test))

for epoch in range(epochs):
    # One pass over the full batches of the training set.
    for i in range(no_of_batches):
        x, y = Traindataset[i * batch: i * batch + batch]
        x = x.to(device)
        y = y.to(device)
        y_pred = model(x)                 # forward pass
        loss = loss_fn(y_pred, y)         # loss for this batch
        opt.zero_grad()                   # clear old gradients
        loss.backward()                   # back-propagate
        opt.step()                        # update the parameters

    # Monitoring only: disable autograd so no graph is built over the whole
    # training and test sets.
    with torch.no_grad():
        train_loss = loss_fn(model(x_train), y_train)
        y_pred = model(x_test)
    print('epoch:', epoch, '   ', 'loss:', train_loss)

    y_p = np.array(to_list(y_pred))
    ndcg, hr = evaluate(y_true,y_p,5)
    print('NDCG: ',ndcg, 'HR: ', hr)

epoch: 0     loss: tensor(0.5979, device='cuda:1', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
NDCG:  0.44946384686018487 HR:  0.545431667181144
epoch: 1     loss: tensor(0.5719, device='cuda:1', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
NDCG:  0.45337148602762656 HR:  0.5466626942700253
epoch: 2     loss: tensor(0.5645, device='cuda:1', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
NDCG:  0.44162489994372095 HR:  0.5327455233680327
epoch: 3     loss: tensor(0.5660, device='cuda:1', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
NDCG:  0.43270603089402465 HR:  0.5274473327559226
epoch: 4     loss: tensor(0.5538, device='cuda:1', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
NDCG:  0.4180445477188773 HR:  0.5086909379839534
epoch: 5     loss: tensor(0.5335, device='cuda:1', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
NDCG:  0.39533966536965787 HR:  0.4818528810665133
epoch: 6     loss: tensor(0.5618, device='cuda:1', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
NDC

KeyboardInterrupt: 

In [None]:
# x_test = torch.Tensor(df_test.vector)
# y_test = torch.from_numpy(df_test.label.values.reshape(-1,1).astype(np.float32))
# # Move the tensors to the selected device (GPU when available)

# x_test = x_test.to(device)
# y_test = y_test.to(device)
# # model = model.to(device)

# y_pred = model(x_test)
# y_true = np.array(to_list(y_test))
# y_p = np.array(to_list(y_pred))

# Report NDCG@k and HR@k for k = 5..10.
# NOTE: y_true and y_p are not assigned in this cell (the lines above are
# commented out), so they must already exist from the training cell.
for k in range(5,11):
    ndcg, hr = evaluate(y_true,y_p,k)
    print('k: ', k, '  ','NDCG: ',ndcg, 'HR: ', hr)