In [31]:
import pyforest
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
from random import shuffle
import random
import math
import os

In [43]:
column_names = ['user', 'item', 'rating','timestamp']
data = pd.read_csv('u.data', sep='\t', names=column_names)
data = data[['user', 'item']].copy()
data.user = data.user - 1 # user id starts from 0
data.item = data.item - 1 # item id starts from 0
print(data.shape)
data.head()

(100000, 2)


Unnamed: 0,user,item
0,195,241
1,185,301
2,21,376
3,243,50
4,165,345


In [44]:
n_users = data.user.nunique() 
n_items = data.item.nunique()

n_users, n_items

(943, 1682)

In [45]:
data.item.max()

1681

In [46]:
# min 20 movies per user
data.user.value_counts()

404    737
654    685
12     636
449    540
275    518
      ... 
440     20
35      20
811     20
894     20
92      20
Name: user, Length: 943, dtype: int64

In [34]:
# no such restriction on movies
data.item.value_counts()

49      583
257     509
99      508
180     507
293     485
       ... 
851       1
1504      1
1652      1
1451      1
1640      1
Name: item, Length: 1682, dtype: int64

In [47]:
random.seed(0)

pos_edges = dict()
neg_edges = dict()

for u in range(n_users):
    pos_edges[u] = set()
    neg_edges[u] = set()

# create sets of positive edges for each user
for _, row in data.iterrows():
    u = row['user']
    i = row['item']
    pos_edges[u].add(i)

# create sets of negative edges for each user
for u in tqdm(range(n_users)):
    for i in range(n_items):
        if i not in pos_edges[u]:
            neg_edges[u].add(i)

for u in range(n_users):
    pos_edges[u] = list(pos_edges[u])
    neg_edges[u] = list(neg_edges[u])

# sample negative edges for each user as neg_edges_sampled 
neg_edges_sampled = dict()
for u in tqdm(range(n_users)):
    neg_edges_all = neg_edges[u]
    # sample same number of negative edges as positive edges for each user 
    n_sampled_edges = len(pos_edges[u])
    neg_edges_sampled[u] = np.random.choice(neg_edges_all, size = n_sampled_edges, replace = False)

# divide pos_edges[u] into pos_train[u], pos_test[u] for each user, and similar for neg_edges_sampled
fraction = 0.15

pos_train = dict()
pos_test = dict()
neg_train = dict()
neg_test = dict()

for u in tqdm(range(n_users)):
    shuffle(pos_edges[u])
    shuffle(neg_edges_sampled[u])
    assert len(pos_edges[u]) == len(neg_edges_sampled[u])
    
    n_pos = len(pos_edges[u])
    n_test = math.ceil(n_pos * fraction)
    n_train = n_pos - n_test

    pos_train[u] = pos_edges[u][:n_train]
    pos_test[u] = pos_edges[u][n_train:]
    neg_train[u] = neg_edges_sampled[u][:n_train]
    neg_test[u] = neg_edges_sampled[u][n_train:]

100%|██████████| 943/943 [00:00<00:00, 2446.69it/s]
100%|██████████| 943/943 [00:00<00:00, 7376.05it/s]
100%|██████████| 943/943 [00:00<00:00, 6150.86it/s]


In [48]:
print(pos_edges[92])
print(neg_edges_sampled[92])
print()
print(pos_train[92])
print(pos_test[92])
print()
print(neg_train[92])
print(neg_test[92])

[275, 14, 234, 865, 411, 819, 150, 120, 117, 475, 814, 844, 124, 221, 282, 274, 933, 0, 13, 476]
[ 783  634  874 1255 1417  385  106 1342 1021  914 1679  341 1396  855
 1241  583  892  441  401  898]

[275, 14, 234, 865, 411, 819, 150, 120, 117, 475, 814, 844, 124, 221, 282, 274, 933]
[0, 13, 476]

[ 783  634  874 1255 1417  385  106 1342 1021  914 1679  341 1396  855
 1241  583  892]
[441 401 898]


In [49]:
# merge positive and negative edges as train_edges and test_edges, which are dict()
train_edges = []
test_edges = []

for u in tqdm(range(n_users)):
    for i in pos_train[u]:
        train_edges.append((u, i, 1))
    for i in neg_train[u]:
        train_edges.append((u, i, 0))
        
    for i in pos_test[u]:
        test_edges.append((u, i, 1))
    for i in neg_test[u]:
        test_edges.append((u, i, 0))

# shuffle train_edges to mix +ve and -ve edges
np.random.shuffle(train_edges)
np.random.shuffle(test_edges)

100%|██████████| 943/943 [00:00<00:00, 15866.67it/s]


In [50]:
print(len(train_edges))
train_edges

169090


[(233, 610, 1),
 (444, 281, 0),
 (342, 1259, 0),
 (61, 839, 0),
 (93, 385, 1),
 (57, 173, 1),
 (424, 540, 0),
 (926, 759, 0),
 (814, 1517, 0),
 (296, 1288, 0),
 (118, 1595, 0),
 (929, 522, 1),
 (713, 368, 1),
 (434, 230, 0),
 (215, 407, 1),
 (188, 1376, 0),
 (638, 470, 1),
 (220, 12, 0),
 (81, 1209, 0),
 (449, 1558, 0),
 (845, 767, 1),
 (426, 262, 1),
 (581, 3, 0),
 (407, 311, 1),
 (942, 1303, 0),
 (553, 844, 1),
 (714, 264, 1),
 (416, 531, 0),
 (150, 398, 0),
 (342, 1630, 0),
 (457, 844, 1),
 (592, 1039, 0),
 (384, 851, 0),
 (29, 434, 1),
 (12, 1633, 0),
 (55, 153, 1),
 (245, 228, 0),
 (307, 294, 1),
 (639, 1442, 0),
 (503, 1092, 1),
 (369, 180, 1),
 (264, 147, 0),
 (157, 215, 1),
 (563, 1008, 0),
 (881, 701, 0),
 (397, 180, 1),
 (398, 267, 1),
 (231, 1128, 0),
 (657, 1129, 0),
 (654, 69, 1),
 (746, 1332, 0),
 (824, 1086, 1),
 (30, 305, 1),
 (365, 183, 1),
 (615, 241, 0),
 (485, 288, 1),
 (54, 558, 0),
 (101, 942, 0),
 (837, 133, 1),
 (320, 1330, 1),
 (620, 150, 1),
 (464, 79, 0),
 (1

In [51]:
print(len(test_edges))
test_edges

30910


[(724, 285, 1),
 (776, 876, 0),
 (566, 635, 1),
 (652, 207, 1),
 (314, 568, 0),
 (777, 41, 1),
 (879, 190, 1),
 (27, 759, 1),
 (584, 1136, 0),
 (483, 577, 1),
 (150, 1578, 0),
 (270, 243, 1),
 (27, 799, 1),
 (285, 19, 1),
 (498, 86, 1),
 (197, 372, 0),
 (853, 19, 1),
 (906, 931, 0),
 (221, 570, 1),
 (415, 1164, 0),
 (873, 149, 1),
 (279, 321, 1),
 (238, 487, 1),
 (245, 1221, 1),
 (760, 1672, 0),
 (447, 302, 1),
 (691, 1203, 0),
 (525, 1207, 0),
 (5, 1666, 0),
 (658, 453, 0),
 (199, 276, 0),
 (233, 488, 1),
 (406, 0, 1),
 (896, 116, 1),
 (184, 160, 0),
 (827, 465, 0),
 (578, 1650, 0),
 (869, 1453, 0),
 (396, 117, 0),
 (757, 652, 1),
 (621, 14, 1),
 (204, 442, 0),
 (757, 1059, 0),
 (696, 455, 1),
 (226, 593, 0),
 (869, 704, 0),
 (558, 1183, 0),
 (607, 728, 1),
 (61, 498, 0),
 (312, 448, 1),
 (55, 909, 0),
 (565, 367, 0),
 (495, 284, 0),
 (428, 510, 1),
 (115, 457, 0),
 (91, 422, 1),
 (533, 566, 0),
 (660, 30, 1),
 (483, 1412, 0),
 (591, 145, 0),
 (145, 413, 0),
 (534, 29, 1),
 (307, 1617

In [52]:
# make train edges as 2d-array from list of tuples
train_edges_np = np.zeros(shape = (len(train_edges), 3), dtype = np.int32)
for i in range(len(train_edges)):
    user = train_edges[i][0]
    item = train_edges[i][1]
    rating = train_edges[i][2]
    train_edges_np[i] = np.array([user, item, rating])

# save the training edges
np.save('train_edges.npy', train_edges_np)

# make test edges as 2d-array from list of tuples
test_edges_np = np.zeros(shape = (len(test_edges), 3), dtype = np.int32)
for i in range(len(test_edges)):
    user = test_edges[i][0]
    item = test_edges[i][1]
    rating = test_edges[i][2]
    test_edges_np[i] = np.array([user, item, rating])

np.save('test_edges.npy', test_edges_np)

In [53]:
!ls

adj.npy   all_edges.npy    test_edges.npy   u.data
adjr.npy  movielens.ipynb  train_edges.npy
