In [31]:
import pyforest
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
from random import shuffle
import random
import math
import os

In [29]:
column_names = ['user', 'item', 'rating','timestamp']
data = pd.read_csv('u.data', sep='\t', names=column_names)
data = data[['user', 'item']].copy()
data.user = data.user - 1 # user id starts from 0
data.item = data.item - 1 # item id starts from 0
print(data.shape)
data.head()

(100000, 2)


Unnamed: 0,user,item
0,195,241
1,185,301
2,21,376
3,243,50
4,165,345


In [30]:
n_users = data.user.nunique() 
n_items = data.item.nunique()

n_users, n_items

(943, 1682)

In [32]:
data.item.max()

1681

In [33]:
# min 20 movies per user
data.user.value_counts()

404    737
654    685
12     636
449    540
275    518
      ... 
440     20
35      20
811     20
894     20
92      20
Name: user, Length: 943, dtype: int64

In [34]:
# no such restriction on movies
data.item.value_counts()

49      583
257     509
99      508
180     507
293     485
       ... 
851       1
1504      1
1652      1
1451      1
1640      1
Name: item, Length: 1682, dtype: int64

In [35]:
random.seed(0)

pos_edges = dict()
neg_edges = dict()

for u in range(n_users):
    pos_edges[u] = set()
    neg_edges[u] = set()

# create sets of positive edges for each user
for _, row in data.iterrows():
    u = row['user']
    i = row['item']
    pos_edges[u].add(i)

# create sets of negative edges for each user
for u in tqdm(range(n_users)):
    for i in range(n_items):
        if i not in pos_edges[u]:
            neg_edges[u].add(i)

for u in range(n_users):
    pos_edges[u] = list(pos_edges[u])
    neg_edges[u] = list(neg_edges[u])

# sample negative edges for each user as neg_edges_sampled 
neg_edges_sampled = dict()
for u in tqdm(range(n_users)):
    neg_edges_all = neg_edges[u]
    # sample same number of negative edges as positive edges for each user 
    n_sampled_edges = len(pos_edges[u])
    neg_edges_sampled[u] = np.random.choice(neg_edges_all, size = n_sampled_edges, replace = False)

# divide pos_edges[u] into pos_train[u], pos_test[u] for each user, and similar for neg_edges_sampled
fraction = 0.3

pos_train = dict()
pos_test = dict()
neg_train = dict()
neg_test = dict()

for u in tqdm(range(n_users)):
    shuffle(pos_edges[u])
    shuffle(neg_edges_sampled[u])
    assert len(pos_edges[u]) == len(neg_edges_sampled[u])
    
    n_pos = len(pos_edges[u])
    n_test = math.ceil(n_pos * fraction)
    n_train = n_pos - n_test

    pos_train[u] = pos_edges[u][:n_train]
    pos_test[u] = pos_edges[u][n_train:]
    neg_train[u] = neg_edges_sampled[u][:n_train]
    neg_test[u] = neg_edges_sampled[u][n_train:]

100%|██████████| 943/943 [00:00<00:00, 2445.04it/s]
100%|██████████| 943/943 [00:00<00:00, 6388.60it/s]
100%|██████████| 943/943 [00:00<00:00, 5558.79it/s]


In [37]:
print(pos_edges[92])
print(neg_edges_sampled[92])
print()
print(pos_train[92])
print(pos_test[92])
print()
print(neg_train[92])
print(neg_test[92])

[275, 14, 234, 865, 411, 819, 150, 120, 117, 475, 814, 844, 124, 221, 282, 274, 933, 0, 13, 476]
[1268  901  744  538  390 1134 1236  336  505  214   74 1378  983 1530
  822  126 1215  808  407  810]

[275, 14, 234, 865, 411, 819, 150, 120, 117, 475, 814, 844, 124, 221]
[282, 274, 933, 0, 13, 476]

[1268  901  744  538  390 1134 1236  336  505  214   74 1378  983 1530]
[ 822  126 1215  808  407  810]


In [38]:
# merge positive and negative edges as train_edges and test_edges, which are dict()
train_edges = []
test_edges = []

for u in tqdm(range(n_users)):
    for i in pos_train[u]:
        train_edges.append((u, i, 1))
    for i in neg_train[u]:
        train_edges.append((u, i, 0))
        
    for i in pos_test[u]:
        test_edges.append((u, i, 1))
    for i in neg_test[u]:
        test_edges.append((u, i, 0))

# shuffle train_edges to mix +ve and -ve edges
np.random.shuffle(train_edges)
np.random.shuffle(test_edges)

100%|██████████| 943/943 [00:00<00:00, 18043.27it/s]


In [39]:
print(len(train_edges))
train_edges

139150


[(659, 1463, 0),
 (642, 49, 1),
 (649, 551, 1),
 (445, 287, 1),
 (531, 451, 1),
 (150, 226, 1),
 (893, 1071, 0),
 (621, 941, 0),
 (914, 332, 1),
 (296, 210, 1),
 (63, 1000, 0),
 (736, 57, 1),
 (127, 1186, 0),
 (177, 236, 1),
 (400, 791, 0),
 (591, 292, 1),
 (485, 1244, 0),
 (638, 785, 1),
 (786, 323, 1),
 (377, 105, 1),
 (183, 72, 0),
 (845, 183, 1),
 (503, 1108, 0),
 (384, 430, 0),
 (398, 763, 0),
 (180, 532, 0),
 (147, 404, 0),
 (129, 1536, 0),
 (617, 1598, 0),
 (707, 239, 0),
 (457, 188, 1),
 (535, 266, 0),
 (473, 1051, 0),
 (234, 0, 1),
 (710, 188, 1),
 (617, 92, 1),
 (536, 922, 1),
 (292, 759, 0),
 (762, 237, 1),
 (111, 1273, 0),
 (434, 1227, 1),
 (215, 21, 1),
 (86, 1039, 0),
 (795, 198, 1),
 (455, 1236, 0),
 (760, 863, 1),
 (620, 61, 1),
 (475, 324, 1),
 (341, 285, 1),
 (69, 138, 1),
 (560, 91, 1),
 (755, 419, 1),
 (455, 451, 1),
 (888, 152, 1),
 (472, 136, 1),
 (805, 606, 0),
 (304, 1057, 0),
 (58, 608, 1),
 (798, 257, 1),
 (898, 981, 0),
 (550, 1620, 1),
 (406, 1284, 0),
 (101

In [40]:
print(len(test_edges))
test_edges

60850


[(23, 317, 1),
 (753, 306, 1),
 (197, 424, 0),
 (93, 1201, 0),
 (449, 5, 0),
 (314, 565, 0),
 (503, 730, 1),
 (547, 729, 0),
 (262, 96, 1),
 (681, 833, 1),
 (531, 1010, 1),
 (273, 404, 1),
 (709, 155, 1),
 (663, 146, 0),
 (188, 1121, 0),
 (526, 594, 0),
 (538, 1487, 0),
 (721, 299, 1),
 (392, 1039, 1),
 (772, 1210, 0),
 (415, 931, 0),
 (775, 167, 1),
 (434, 312, 1),
 (180, 915, 0),
 (59, 769, 0),
 (180, 483, 0),
 (338, 757, 0),
 (611, 885, 0),
 (200, 439, 1),
 (605, 984, 0),
 (842, 186, 0),
 (289, 161, 1),
 (270, 479, 1),
 (349, 514, 1),
 (795, 274, 1),
 (256, 287, 1),
 (59, 828, 0),
 (200, 1438, 0),
 (184, 320, 1),
 (647, 234, 1),
 (471, 593, 0),
 (457, 432, 1),
 (248, 524, 0),
 (416, 707, 1),
 (41, 1000, 0),
 (267, 1034, 1),
 (645, 970, 0),
 (59, 209, 1),
 (390, 944, 0),
 (895, 1671, 1),
 (223, 707, 1),
 (868, 24, 1),
 (172, 304, 1),
 (649, 662, 1),
 (560, 529, 1),
 (200, 506, 0),
 (612, 533, 0),
 (641, 774, 1),
 (654, 1233, 0),
 (830, 143, 1),
 (879, 82, 0),
 (292, 1196, 0),
 (485, 

In [41]:
# make train edges as 2d-array from list of tuples
train_edges_np = np.zeros(shape = (len(train_edges), 3), dtype = np.int32)
for i in range(len(train_edges)):
    user = train_edges[i][0]
    item = train_edges[i][1]
    rating = train_edges[i][2]
    train_edges_np[i] = np.array([user, item, rating])

# save the training edges
np.save('train_edges.npy', train_edges_np)

# make test edges as 2d-array from list of tuples
test_edges_np = np.zeros(shape = (len(test_edges), 3), dtype = np.int32)
for i in range(len(test_edges)):
    user = test_edges[i][0]
    item = test_edges[i][1]
    rating = test_edges[i][2]
    test_edges_np[i] = np.array([user, item, rating])

np.save('test_edges.npy', test_edges_np)

In [42]:
!ls

adj.npy   all_edges.npy    test_edges.npy   u.data
adjr.npy  movielens.ipynb  train_edges.npy
