In [1]:
from conf import *
from graph import *
from sampler import *
from train import train_test_split
from collections import Counter
import pickle

Using backend: pytorch


In [45]:
data = pd.read_csv(file_path, header=None, sep=" ")
data.columns = ["uid", "iid", "duration"]
data.shape

(6191505, 3)

In [46]:
data = data[data['uid']!=data['iid']]
data['label'] = data['duration'].map(lambda x:"neg" if x < duration_min else "pos")
data['uid_pos_cnt'] = data['uid'].map(dict(data[data['label']=='pos']['uid'].value_counts()))
data = data[data['uid_pos_cnt']>=5]

In [48]:
data['uid'] = label_encoder(data['uid'])
data['iid'], iid_map_reverse = label_encoder(data['iid'], return_dict=True)

In [60]:
data[:2]

Unnamed: 0,uid,iid,duration,label,uid_pos_cnt
4,0,0,24,pos,29.0
5,0,1,4,neg,29.0


In [71]:
pos_data, neg_data = data[data['label']=='pos'], data[data['label']=='neg']


In [72]:
len(pos_data), len(neg_data)

(2654722, 2715028)

In [100]:
graph_data = {
    ('user', 'neg', 'item'): (neg_data['uid'].values, neg_data['iid'].values),
       ('user', 'clicked', 'item'): (pos_data['uid'].values, pos_data['iid'].values),
       ('item', 'clicked-by', 'user'): (pos_data['iid'].values, pos_data['uid'].values),
        
    }

g = dgl.heterograph(graph_data, idtype=torch.int64)

In [101]:
g

Graph(num_nodes={'item': 81937, 'user': 184994},
      num_edges={('item', 'clicked-by', 'user'): 2654722, ('user', 'clicked', 'item'): 2654722, ('user', 'neg', 'item'): 2715028},
      metagraph=[('item', 'user', 'clicked-by'), ('user', 'item', 'clicked'), ('user', 'item', 'neg')])

In [65]:
len(pos_data), len(neg_data)

(2654722, 2715028)

In [98]:
list(g.metagraph()['user']['item'])[0]

'clicked'

In [107]:
  
g.nodes['item'].data['iid'] = torch.LongTensor(list(iid_map_reverse.keys()))
g.nodes['item'].data['neg_weight'] = torch.FloatTensor(data['iid'].value_counts().sort_index().map(lambda x:pow(x, 0.75)).values)



In [70]:
g.nodes['item']

NodeSpace(data={'iid': tensor([    0,     1,     2,  ..., 81934, 81935, 81936]), 'neg_weight': tensor([477.3852, 299.2940, 401.0963,  ...,   1.0000,   1.0000,   1.0000])})

In [78]:
import random

In [93]:
random.random()<0.1

True

In [79]:
random.random()

0.2926896895081862

In [106]:
g.nodes['item']

NodeSpace(data={})

In [117]:
class ItemToItemBatchSampler(IterableDataset):
    def __init__(self, g, user_type, item_type, batch_size, hard_neg_ratio):
        self.g = g
        self.user_type = user_type
        self.item_type = item_type
        self.user_to_item_etype = 'clicked'
        self.item_to_user_etype = 'clicked-by'
        self.batch_size = batch_size
        self.neg_weights = g.nodes[item_type].data['neg_weight']
        self.neg_etype = 'neg'
        self.hard_neg_ratio = hard_neg_ratio

    def __iter__(self):
        while True:
            heads = torch.randint(0, self.g.number_of_nodes(self.item_type), (self.batch_size,))
            tails = dgl.sampling.random_walk(self.g, heads, metapath=[self.item_to_user_etype, self.user_to_item_etype])[0][:, 2]
            if random.random() < self.hard_neg_ratio:
                print("hard neg")
                neg_tails = dgl.sampling.random_walk(self.g, heads, metapath=[self.item_to_user_etype, self.neg_etype])[0][:, 2]
            else:
                neg_tails = torch.multinomial(self.neg_weights, self.batch_size, replacement=True)
            mask = (heads != tails) & (tails != -1)
            yield heads[mask], tails[mask], neg_tails[mask]

In [118]:
batch_sampler = ItemToItemBatchSampler(g, 'user', 'item', 128, 0.5)

In [164]:
sum(next(batch_sampler.__iter__())[2]==-1)

tensor(0)

In [140]:
neighbor_sampler = NeighborSampler(g, 'user', 'item', random_walk_length, random_walk_restart_prob, num_random_walks, num_neighbors, num_layers)

In [141]:
neighbor_sampler

<sampler.NeighborSampler at 0x7fd3bf765a50>

In [137]:
from train import *

In [138]:
g, (test_user, test_item) = train_test_split_by_item(g)

test cnt 62144


In [142]:
g.number_of_nodes()

266931

In [148]:
g.number_of_nodes('item')

81937

In [130]:
data[:10]

Unnamed: 0,uid,iid,duration,label,uid_pos_cnt
4,0,0,24,pos,29.0
5,0,1,4,neg,29.0
6,0,2,14,pos,29.0
7,0,3,2,neg,29.0
8,0,4,21,pos,29.0
9,0,5,61,pos,29.0
10,0,6,27,pos,29.0
11,0,7,67,pos,29.0
12,0,8,72,pos,29.0
13,0,9,1,neg,29.0


In [29]:
data['uid'].value_counts()

7318349397582997     3451
3940649677164499     3278
4222124655780413     3217
2251799820626731     2930
12947848931714286    2903
                     ... 
10977524092685113       1
10977524096785852       1
10977524099571995       1
11540474046014201       1
281474982757977         1
Name: uid, Length: 454760, dtype: int64

In [27]:
data[data['label']=='pos']['uid'].value_counts()

9288674256213040     1679
3940649677164499     1335
6473924134852098     1174
4222124655780413     1073
2251799814823112     1071
                     ... 
10133099170413243       1
1970324839533492        1
10133099168529527       1
4785074602873866        1
3377699727372538        1
Name: uid, Length: 412037, dtype: int64

In [25]:
data[:20]

Unnamed: 0,uid,iid,duration,label
0,3635342465,5629499488129327,3,neg
1,3635506408,3940649677547010,15,pos
2,3635506408,10414574139635972,12,pos
3,3635576499,4222124652261817,258,pos
4,3635655035,8725724281777249,24,pos
5,3635655035,10414574147537818,4,neg
6,3635655035,5066549353021961,14,pos
7,3635655035,7599824377346822,2,neg
8,3635655035,3096224746984975,21,pos
9,3635655035,1688849864380705,61,pos


In [17]:
"neg" if 9 < duration_min else "pos"

'neg'

In [6]:
data.shape

(6145146, 3)

In [7]:
neg_data = data[data['duration']<duration_min]

In [8]:
neg_data.shape

(3064780, 3)

In [9]:
neg_data[:10]

Unnamed: 0,uid,iid,duration
0,3635342465,5629499488129327,3
5,3635655035,10414574147537818,4
7,3635655035,7599824377346822,2
13,3635655035,8725724280117360,1
15,3635655035,4785074600865389,1
16,3635655035,6755399377703027,4
18,3635712991,562949960913106,4
24,3636711893,6473924130514075,9
25,3636711893,7318349398643511,3
30,3638859695,10414574139301724,7


In [13]:
item_cnt_df = data['iid'].value_counts().reset_index()
item_cnt_df.columns = ['iid', 'cnt']

In [14]:
item_cnt_df

Unnamed: 0,iid,cnt
0,7318349398643511,36747
1,6192448702685734,30472
2,8725724280117360,24214
3,10977524093405040,23489
4,3635964080,17707
...,...,...
92100,3096224749838618,1
92101,10977524099798665,1
92102,11821949024565139,1
92103,281474983437146,1


In [43]:
hot_threshold = data.shape[0] / 1000


In [44]:
hot_threshold

6083.838

In [23]:
hot_threshold = data.shape[0] / 100
news_count_df['is_hot_news'] = news_count_df['count'] > hot_threshold
hot_news = news_count_df[news_count_df['is_hot_news']==True]['news_id'].values
data = data[~data['news_id'].isin(hot_news)].reset_index(drop=True)

NameError: name 'news_count_df' is not defined

In [17]:
hot_threshold

6083.838

In [26]:
hot_threshold = data.shape[0] / 1000

In [28]:
data.shape

(6083838, 3)

In [31]:
36747/data.shape[0]

0.006040101659511644

In [35]:
hot_news = data['iid'].value_counts()[:50].index

In [37]:
data.shape

(6083838, 3)

In [36]:
data[~data['iid'].isin(hot_news)].reset_index(drop=True).shape

(5393026, 3)

In [39]:
data['iid'].value_counts()[:50].values.sum()

690812

In [42]:
hot_threshold

6083.838

In [48]:
data['iid'].value_counts()[:50]

7318349398643511     36747
6192448702685734     30472
8725724280117360     24214
10977524093405040    23489
3635964080           17707
2814749770130811     17455
6473924130491529     17153
6473924129646648     16358
12384898978899310    16102
1970324840193949     15789
1970324840633684     15771
844424936046810      15426
5348024340327152     15107
12384898982274759    14393
3639492602           14261
10696049117715488    14014
2533274792172350     14006
10414574139366286    13910
3640717553           13855
6755399375270475     13843
2533274793492979     13520
3634548695           13478
844424933697312      12679
6755399377494813     12579
1688849863371969     12360
10414574139317308    12342
7036874418203443     12172
10696049120069277    11693
3377699727977625     11564
562949960913106      11478
1125899910760045     11268
5629499487053945     11101
3634851206           10994
2533274794800435     10912
4785074601112798     10904
6755399374132337     10752
10977524092959949    10522
8

In [54]:
g.ndata

defaultdict(<class 'dict'>, {'iid': {'item': tensor([    0,     1,     2,  ..., 83710, 83711, 83712])}, 'neg_weight': {'item': tensor([2.4700e+03, 6.8999e+01, 9.3325e+02,  ..., 1.0000e+00, 1.0000e+00,
        1.0000e+00])}})

In [50]:
(data['iid'].value_counts()<5).sum()

46153

In [52]:
83713-46153

37560

In [53]:
data.shape[0]/83713

72.67494893266279

In [51]:
46153

46153

In [46]:
(data['iid'].value_counts() > hot_threshold).sum()

147

In [55]:
item2user_g = g.edge_type_subgraph(['clicked-by'])
item2user_g.edata['random_weight'] = torch.rand(item2user_g.number_of_edges())
item2user_test_g = dgl.sampling.select_topk(item2user_g, 1, 'random_weight', edge_dir='out')
test_eids = item2user_test_g.edata['_ID']
test_item, test_user = g.find_edges(test_eids, etype='clicked-by')
print("test cnt", len(test_item))

# g = dgl.remove_edges(g, test_eids, etype='clicked')
# g = dgl.remove_edges(g, test_eids, etype='clicked-by')
# return g

test cnt 64218


In [60]:
train_g = g
train_g = dgl.remove_edges(train_g, test_eids, etype='clicked')
train_g = dgl.remove_edges(train_g, test_eids, etype='clicked-by')
train_g

Graph(num_nodes={'item': 64218, 'user': 247312},
      num_edges={('item', 'clicked-by', 'user'): 2792213, ('user', 'clicked', 'item'): 2792213},
      metagraph=[('item', 'user', 'clicked-by'), ('user', 'item', 'clicked')])

In [39]:
g = dgl.remove_edges(g, test_eids, etype='clicked')
g = dgl.remove_edges(g, test_eids, etype='clicked-by')

In [43]:
test_user, test_item = g.find_edges(test_eids, etype='clicked')

In [2]:
# file_path = "/data/zsj/pinsage/input/sample.txt"
g, iid_map_reverse = build_graph(file_path)
print(g)
# train_g, (test_user, test_item) = train_test_split(g)
# g = train_g
# print(train_g)
# print(len(test_user))

params: file_path /data/zsj/pinsage/input/graph.txt uid_min_cnt 5 duration_min 10
origin data shape 6083838 uid cnt 467349 iid cnt 92105
data shape 5600859 uid cnt 227787 iid cnt 83713
Graph(num_nodes={'item': 83713, 'user': 227787},
      num_edges={('item', 'clicked-by', 'user'): 5600859, ('user', 'clicked', 'item'): 5600859},
      metagraph=[('item', 'user', 'clicked-by'), ('user', 'item', 'clicked')])


In [8]:
dgl.sampling.PinSAGESampler

<module 'dgl' from '/data/zsj/miniconda3/lib/python3.7/site-packages/dgl/__init__.py'>

In [6]:
g.edges['clicked-by']

EdgeSpace(data={'pos_weight': tensor([4.3438, 1.0986, 3.2189,  ..., 6.2442, 1.9459, 1.0986]), 'walk_weight': tensor([4.3438, 1.0986, 3.2189,  ..., 6.2442, 1.9459, 1.0986])})

In [15]:
g.nodes['item'].data['neg_weight']

tensor([1.9899e+03, 5.7045e+02, 3.0800e+02,  ..., 1.0000e+00, 1.0000e+00,
        1.0000e+00])

In [17]:
1/g.nodes['item'].data['neg_weight']

tensor([5.0254e-04, 1.7530e-03, 3.2468e-03,  ..., 1.0000e+00, 1.0000e+00,
        1.0000e+00])

In [16]:
g.nodes['item'].data['pos_weight'] = 1/g.nodes['item'].data['neg_weight']

{'iid': tensor([    0,     1,     2,  ..., 63011, 63012, 63013]), 'neg_weight': tensor([1.9899e+03, 5.7045e+02, 3.0800e+02,  ..., 1.0000e+00, 1.0000e+00,
        1.0000e+00])}

In [None]:
g.nodes['item'].data['neg_weight'] = torch.FloatTensor(data['iid'].value_counts().sort_index().map(lambda x:pow(x, 0.75)).values)


In [59]:
batch_sampler = ItemToItemBatchSampler(g, 'user', 'item', batch_size)
neighbor_sampler = NeighborSampler(g, 'user', 'item', random_walk_length, random_walk_restart_prob, num_random_walks, num_neighbors, num_layers)

collator = PinSAGECollator(neighbor_sampler, g, 'item')
dataloader = DataLoader(batch_sampler, collate_fn=collator.collate_train, num_workers=num_workers)
dataloader_test = DataLoader(torch.arange(g.number_of_nodes('item')), batch_size=batch_size, collate_fn=collator.collate_test, num_workers=num_workers)
dataloader_it = iter(dataloader)


In [60]:
pos_graph, neg_graph, blocks = collator.collate_test_auc(test_user, test_item, 'clicked')

NameError: name 'test_user' is not defined

In [61]:
with open(output_path, 'rb') as f:
    item_embeddings, iid_map_reverse = pickle.load(f)
    print("item:", item_embeddings.shape, len(iid_map_reverse))

item_embeddings = torch.Tensor(item_embeddings)

item: (117285, 64) 117285


In [10]:
from evaluation import cal_hr_k
hit_rate = cal_hr_k(g, item_embeddings, test_user, test_item)

In [9]:
hit_rate


0.12900151432356202

In [70]:
batches = torch.arange(len(test_item)).split(batch_size)
batch = batches[0]

users = test_user[batch]
items = test_item[batch]

for i in range(len(batch)):
    user = users[i]
    item = items[i]
    clicked_ids = g.successors(user, etype='clicked')
    item_emb = item_embeddings[clicked_ids]
    dist = item_emb @ item_embeddings.t()
    top_sim_ids = dist.topk(top_k, 1)[1]
    
    item_hit = 0
    for sim_ids in top_sim_ids:
        if item in sim_ids:
            item_hit+=1
    print(user, len(clicked_ids), item_emb.shape, dist.shape)
    break

tensor(59864) 10 torch.Size([10, 64]) torch.Size([10, 117285])


In [None]:
for sim_ids 

In [76]:
for sim_ids in top_sim_ids:
    print(sim_ids)

tensor([ 25857,  40456,  27870,  14772,   4469,  19496,  29321,  62182,  42305,
         21000,  91983,  11825,  21423,  11875,  15722,  10354,  13160,  80938,
         13764,  25535,  14271,  87058,    104,  31955,  20538,  44762,  41000,
         74003,   8211,   7368,   7385,  96266,  35752,  10467,  34342,  89379,
         12434,  80181,  26402,  33308,  35951,  29163, 107764,  22619,  38093,
         73891,  36803,  16942,  63079,  20494])
tensor([ 26472,  71660,  25475,   4910,  44599,  18394,   4124,  32953,  64122,
         48218,  42869,  33962,  61442,  26284,  46345,  83854,    689,  13443,
         22974,  22061,   7032,  34827,   1509,  46709,  66566,  37922,  69824,
          8065,  36869,  51146,  96289, 113498,  79896,  11880,  29909,  65574,
         67938,  74330,  27232,   5649,    702,  54866,  57737,  10682,  84946,
         51652, 111680,  23675,  40108,   9541])
tensor([  8508,  65085,  28942,  67975,  20323,  45348,  35908,   9833,  12674,
         61895,  13051

In [86]:
batches = torch.arange(len(test_item)).split(batch_size)

hit = 0
for batch in batches[:2]:
    users = test_user[batch]
    items = test_item[batch]
#     item_emb = item_embeddings[items]

#     dist = item_emb @ item_embeddings.t()
#     top_sim_ids = dist.topk(top_k, 1)[1]
    for i in range(len(batch)):
        user = users[i]
        item = items[i]
#         sim_ids = top_sim_ids[i]
        clicked_ids = g.successors(user, etype='clicked')
        item_emb = item_embeddings[clicked_ids]
        dist = item_emb @ item_embeddings.t()
        top_sim_ids = dist.topk(top_k, 1)[1]
        
        item_hit = 0
        for sim_ids in top_sim_ids:
            if item in sim_ids:
                item_hit+=1
                
        hit += item_hit/len(clicked_ids)

# hit_rate = hit/len(test_item)
hit_rate = hit/100
print("hr@", top_k, hit_rate)

hr@ 50 0.39583622184838707


In [28]:
batches = torch.arange(len(test_item)).split(batch_size)

hit = 0
for batch in batches:
    users = test_user[batch]
    items = test_item[batch]
    item_emb = item_embeddings[items]
    
    dist = item_emb @ item_embeddings.t()
    top_sim_ids = dist.topk(50, 1)[1]
    for i in range(len(batch)):
        user = users[i]
        sim_ids = top_sim_ids[i]
        clicked_ids = g.successors(user, etype='clicked')
        hit_ids_cnt = len(set(sim_ids.numpy()) & set(clicked_ids.numpy()))
        hit += hit_ids_cnt/len(clicked_ids)
        
print("hr@50", hit/len(test_item))

hr@50 0.040673906152980104


In [31]:
len(Counter(test_item.numpy()))

17211

In [56]:
from model import *

model = PinSAGEModel(g, 'item', hidden_dims_dict, num_layers)

node feat neg_weight not exist


In [None]:
item_batches = torch.arange(g.number_of_nodes('item')).split(batch_size)
        h_item_batches = []
        for blocks in dataloader_test:
            h_item_batches.append(model.get_repr(blocks))

In [5]:
h_item_batches = []
for blocks in dataloader_test:
    h_item_batches.append(model.get_repr(blocks))

KeyboardInterrupt: 

In [10]:
%%time
blocks = next(dataloader_test.__iter__())

CPU times: user 58.6 s, sys: 1.08 s, total: 59.6 s
Wall time: 2.92 s


In [57]:
model.get_repr(blocks)

NameError: name 'blocks' is not defined

In [61]:
pos_graph, neg_graph, blocks = next(dataloader_it)

pos_score, neg_score = model(pos_graph, neg_graph, blocks)

In [62]:
loss = (neg_score - pos_score + 1).clamp(min=0).mean()
loss

tensor(1.0042, grad_fn=<MeanBackward0>)

In [150]:
sampler = dgl.sampling.PinSAGESampler(g, 'item', 'user', 2, 0.5, 1000, 30)

In [151]:
frontier = sampler([2])
frontier.all_edges()

(tensor([   2,    0,   33,   12,    7,  759,  145,   39,  114,  391,  175,  536,
         1179,  180,  323,   41,   67,  129,  124, 1246, 1119,    1,   81,  206,
         2279,  243,  281,  282, 1680,  687]),
 tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2]))

In [152]:
frontier = sampler([2])
frontier.all_edges()

(tensor([   2,    0,    7,   12, 1119,  282,   33, 1246,  114,  981,  145,  448,
          297,  983, 1304,  831,   40,   41,   47,  467,  699,  562,  193,  658,
          323, 1179,   88, 1111,  124, 1070]),
 tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2]))

In [153]:
frontier = sampler([2])
frontier.all_edges()

(tensor([   2,    0,    7,   12,  145,  114,   33,  297,  176,   41, 3524,  719,
         1203,  161,  823,  281,  661,    6, 1103,  574,  433, 1246, 1089,  266,
         1111,  206,   81, 4892,  764, 1308]),
 tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2]))

In [117]:
frontier = sampler([6])
frontier.all_edges()

(tensor([   6, 3482, 2693, 4539, 4717,   55, 7313, 4515,   79,   80]),
 tensor([6, 6, 6, 6, 6, 6, 6, 6, 6, 6]))

In [118]:
frontier.edata

{'weights': tensor([1364,  265,  248,  202,  192,  179,  174,  161,  149,  145]), '_ID': tensor([   5,  814,  660,  987, 1002,   29, 1367,  981,   35,   36])}

In [119]:
frontier.edata

{'weights': tensor([1364,  265,  248,  202,  192,  179,  174,  161,  149,  145]), '_ID': tensor([   5,  814,  660,  987, 1002,   29, 1367,  981,   35,   36])}

In [12]:
pos_label = torch.ones_like(pos_score)
neg_label = torch.zeros_like(neg_score)
score = torch.cat([pos_score, neg_score])
labels = torch.cat([pos_label, neg_label])
loss = F.binary_cross_entropy_with_logits(score, labels)
loss

tensor(0.7279, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [46]:
data = pd.read_csv(file_path, header=None, sep=" ")
data.columns = ["uid", "iid", "duration"]
print(data.shape, data.nunique())

data = data[data['uid']!=data['iid']]
data = data[data['duration']>=30]
data['uid_cnt'] = data['uid'].map(dict(data['uid'].value_counts()))
data = data[data['uid_cnt']>=5]
print(data.shape, data.nunique())

(16280820, 3) uid         1478955
iid          167329
duration      32142
dtype: int64
(4829884, 4) uid         457923
iid         117285
duration     15787
uid_cnt        358
dtype: int64


In [47]:
data = pd.read_csv(file_path, header=None, sep=" ")
data.columns = ["uid", "iid", "duration"]
print(data.shape, data.nunique())

data = data[data['uid']!=data['iid']]
data = data[data['duration']>=10]
data['uid_cnt'] = data['uid'].map(dict(data['uid'].value_counts()))
data = data[data['uid_cnt']>=5]
print(data.shape, data.nunique())

(16280820, 3) uid         1478955
iid          167329
duration      32142
dtype: int64
(7247959, 4) uid         555007
iid         129899
duration     16103
uid_cnt        459
dtype: int64


In [53]:
h

tensor([[ 0.0420, -0.0065, -0.0060,  ...,  0.0984,  0.0282,  0.1073],
        [ 0.0118, -0.0069,  0.0025,  ...,  0.4265,  0.0041, -0.0065],
        [ 0.0035,  0.0093,  0.0010,  ...,  0.2216,  0.0032, -0.0034],
        ...,
        [ 0.0878, -0.0071, -0.0010,  ...,  0.2556,  0.0091, -0.0032],
        [ 0.0763, -0.0056,  0.0035,  ...,  0.0022,  0.0460, -0.0017],
        [ 0.3204,  0.0024,  0.0910,  ...,  0.0565, -0.0008,  0.0034]],
       grad_fn=<AddBackward0>)

In [63]:
h = model.get_repr(blocks)

In [97]:
pos_graph, neg_graph

(Graph(num_nodes=663, num_edges=228,
       ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}
       edata_schemes={}),
 Graph(num_nodes=663, num_edges=228,
       ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)}
       edata_schemes={}))

In [101]:
n_nodes = g.number_of_nodes('item')

In [102]:
n_nodes

27381

In [103]:
bias = torch.nn.Parameter(torch.zeros(n_nodes))

In [104]:
bias

Parameter containing:
tensor([0., 0., 0.,  ..., 0., 0., 0.], requires_grad=True)

In [64]:
pos_graph.ndata['h'] = h

In [65]:
pos_graph.ndata['h']

tensor([[ 0.1900, -0.0058, -0.0063,  ..., -0.0010, -0.0037,  0.0608],
        [ 0.0665,  0.0641,  0.2258,  ...,  0.0032, -0.0023,  0.1422],
        [ 0.0465,  0.1795,  0.0619,  ...,  0.1856, -0.0073,  0.0369],
        ...,
        [-0.0033,  0.0018,  0.0653,  ..., -0.0038,  0.0322,  0.0031],
        [ 0.0051,  0.0773,  0.0352,  ...,  0.0027,  0.0076,  0.1767],
        [-0.0003,  0.0287,  0.0401,  ..., -0.0076,  0.0065,  0.0067]],
       grad_fn=<AddBackward0>)

In [87]:
pos_graph.apply_edges(fn.u_dot_v('h', 'h', 's'))

In [90]:
pos_graph.edata['s'] = pos_graph.edata['s'].view(-1)

In [76]:
pos_graph.apply_edges(_add_bias)

In [73]:
bias = nn.Parameter(torch.zeros(g.number_of_nodes('item')))

In [82]:
def _add_bias(edges):
    bias_src = bias[edges.src[dgl.NID]]
    bias_dst = bias[edges.dst[dgl.NID]]
    return {'s': edges.data['s'] + bias_src + bias_dst}

In [93]:
pos_graph.apply_edges(_add_bias)

In [83]:
pos_graph.apply_edges(_add_bias)

In [95]:
pos_graph.edata['s'].mean()

tensor(0.5002, grad_fn=<MeanBackward0>)

In [110]:
item_cnt_df['walk_weight'] = 1/item_cnt_df['cnt'].map(lambda x:pow(x, 0.75))

In [112]:
1/0.000377

2652.5198938992044

In [115]:
data['duration'].map(math.log1p).min()

0.0

In [119]:
math.log1p(1)

0.6931471805599453

In [123]:
item_cnt_df[item_cnt_df['iid']==7881299350955178]

Unnamed: 0,iid,cnt,walk_weight
23537,7881299350955178,15,0.131199


In [96]:
bias

Parameter containing:
tensor([0., 0., 0.,  ..., 0., 0., 0.], requires_grad=True)

In [86]:
(pos_graph.edata['s'] - pos_graph.edata['s']+1).mean()

tensor(1., grad_fn=<MeanBackward0>)

In [80]:
pos_graph.edata['s'] = pos_graph.edata['s'].view(-1)

In [126]:
pos_graph.edges

<dgl.view.HeteroEdgeView at 0x7fca12cf6510>

In [122]:
edges.src[dgl.NID]

NameError: name 'edges' is not defined

In [141]:
F.sigmoid(neg_graph.edata['s'])

KeyError: 's'

In [139]:
pos_graph.edata['s']

tensor([[0.3109],
        [0.5944],
        [0.2907],
        [0.4834],
        [0.5212],
        [0.1901],
        [0.3533],
        [0.4460],
        [0.4893],
        [0.2998],
        [0.3184],
        [0.4980],
        [0.5220],
        [0.4429],
        [0.2053],
        [0.5119],
        [0.5731],
        [0.3242],
        [0.2386],
        [0.4776],
        [0.2827],
        [0.4197],
        [0.3305],
        [0.5345],
        [0.5259],
        [0.5659],
        [0.3151],
        [0.3576],
        [0.3436],
        [0.3791],
        [0.3862],
        [0.6288],
        [0.4490],
        [0.4226],
        [0.2450],
        [0.4893],
        [0.3655],
        [0.5076],
        [0.5852],
        [0.4007],
        [0.3001],
        [0.6301],
        [0.2537],
        [0.4710],
        [0.3276],
        [0.3844],
        [0.2430],
        [0.5160],
        [0.4471],
        [0.4815],
        [0.5184],
        [0.3033],
        [0.5297],
        [0.5968],
        [0.3274],
        [0

In [136]:
pos_graph.apply_edges(_add_bias)

In [49]:
pos_graph.edata['s']

KeyError: 's'

In [None]:
def inference(self, g, x, batch_size, device):
        # inference 用于评估测试，针对的是完全图
        # 目前会出现重复计算的问题，优化方案还在 to do list 上
        nodes = th.arange(g.number_of_nodes())
        for l, layer in enumerate(self.layers):
            y = th.zeros(g.number_of_nodes(),
                         self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
            for start in trange(0, len(nodes), batch_size):
                end = start + batch_size
                batch_nodes = nodes[start:end]
                block = dgl.to_block(dgl.in_subgraph(g, batch_nodes), batch_nodes)
                input_nodes = block.srcdata[dgl.NID]
                h = th.Tensor(x[input_nodes]).to(device)
                h_dst = h[:block.number_of_dst_nodes()]
                h = layer(block, (h, h_dst))
                if l != len(self.layers) - 1:
                    h = self.activation(h)
                    h = self.dropout(h)
                y[start:end] = h.cpu()
            x = y
        return y

In [9]:
aaa = "[10510364 10683295 12324075 12751059 12824074 3036238 6663520 7054128 8122430 8313001 9755885 9875552 10461616 10970660 11847996 12633224 12652155 12811847 12961523 12991779 13148093 13252032 1929817 6703929 10103217 10257620 11168769 11709600 11943315 1219460 12888389 12905923 13034053 6617552 7193357 9109540 10823778 11481036 11693452 13258460 1384121 344112 3702627 7365852 8158483 9294903 9374388 9599382 10258772 10964543 12221549 12353800 12409283 12924238 13211957 13269124 2773999 4764361 7380230 7550438 10006703 10178340 11138365 13200619 13223161 13258853 6681383 7437465 7849623 7978467 8634905 8643650 10838587 12259280 13186562 13204223 13214053 3183824 320657 51088 6720436 6829412 7604383 8168494 10649580 11541705 11676443 12078797 12568396 12636786 13163307 6525561 7169176 7849524 8428988 8905661 10400225 10759384 10766228 10909283 11444823 12323401 12618467 12902167 3307927 6390569 8068041 873704 10949771 11200796 11444374 12421854 13065872 13267721 2152 7603107 8875682 9096343 9636988 9691375 10697127 11130229 11474470 11703339 2872637 6440919 6971122 7596915 8175916 8666386 9326892 9652983]"

In [10]:
len(aaa.split())

132

In [11]:
132/12

11.0