In [1]:
import pyforest
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
from random import shuffle
import random
import math
import os

In [2]:
column_names = ['user', 'item', 'rating','timestamp']
data = pd.read_csv('u.data', sep='\t', names=column_names)
data = data[['user', 'item']].copy()
data.user = data.user - 1 # user id starts from 0
print(data.shape)
data.head()

(100000, 2)


Unnamed: 0,user,item
0,195,242
1,185,302
2,21,377
3,243,51
4,165,346


In [3]:
n_users = data.user.nunique() 
n_items = data.item.nunique()

n_users, n_items

(943, 1682)

In [4]:
# min 20 movies per user
data.user.value_counts()

404    737
654    685
12     636
449    540
275    518
      ... 
440     20
35      20
811     20
894     20
92      20
Name: user, Length: 943, dtype: int64

In [5]:
# no such restriction on movies
data.item.value_counts()

50      583
258     509
100     508
181     507
294     485
       ... 
852       1
1505      1
1653      1
1452      1
1641      1
Name: item, Length: 1682, dtype: int64

In [6]:
random.seed(0)

pos_edges = dict()
neg_edges = dict()

for u in range(n_users):
    pos_edges[u] = set()
    neg_edges[u] = set()

# create sets of positive edges for each user
for _, row in data.iterrows():
    u = row['user']
    i = row['item']
    pos_edges[u].add(i)

# create sets of negative edges for each user
for u in tqdm(range(n_users)):
    for i in range(n_items):
        if i not in pos_edges[u]:
            neg_edges[u].add(i)

for u in range(n_users):
    pos_edges[u] = list(pos_edges[u])
    neg_edges[u] = list(neg_edges[u])

# sample negative edges for each user as neg_edges_sampled 
neg_edges_sampled = dict()
for u in tqdm(range(n_users)):
    neg_edges_all = neg_edges[u]
    # sample same number of negative edges as positive edges for each user 
    n_sampled_edges = len(pos_edges[u])
    neg_edges_sampled[u] = np.random.choice(neg_edges_all, size = n_sampled_edges, replace = False)

# divide pos_edges[u] into pos_train[u], pos_test[u] for each user, and similar for neg_edges_sampled
fraction = 0.3

pos_train = dict()
pos_test = dict()
neg_train = dict()
neg_test = dict()

for u in tqdm(range(n_users)):
    shuffle(pos_edges[u])
    shuffle(neg_edges_sampled[u])
    assert len(pos_edges[u]) == len(neg_edges_sampled[u])
    
    n_pos = len(pos_edges[u])
    n_test = math.ceil(n_pos * fraction)
    n_train = n_pos - n_test

    pos_train[u] = pos_edges[u][:n_train]
    pos_test[u] = pos_edges[u][n_train:]
    neg_train[u] = neg_edges_sampled[u][:n_train]
    neg_test[u] = neg_edges_sampled[u][n_train:]

100%|██████████| 943/943 [00:00<00:00, 2077.75it/s]
100%|██████████| 943/943 [00:00<00:00, 5506.47it/s]
100%|██████████| 943/943 [00:00<00:00, 5030.68it/s]


In [7]:
print(pos_edges[92])
print(neg_edges_sampled[92])
print()
print(pos_train[92])
print(pos_test[92])
print()
print(neg_train[92])
print(neg_test[92])

[276, 15, 235, 866, 412, 820, 151, 121, 118, 476, 815, 845, 125, 222, 283, 275, 934, 1, 14, 477]
[ 841 1222 1389  196 1068 1387  921  505 1404  553  534  710  397  281
 1025  378  149  463 1663  994]

[276, 15, 235, 866, 412, 820, 151, 121, 118, 476, 815, 845, 125, 222]
[283, 275, 934, 1, 14, 477]

[ 841 1222 1389  196 1068 1387  921  505 1404  553  534  710  397  281]
[1025  378  149  463 1663  994]


In [17]:
# merge positive and negative edges as train_edges and test_edges, which are dict()
train_edges = []
test_edges = []

for u in tqdm(range(n_users)):
    for i in pos_train[u]:
        train_edges.append((u, i, 1))
    for i in neg_train[u]:
        train_edges.append((u, i, 0))
        
    for i in pos_test[u]:
        test_edges.append((u, i, 1))
    for i in neg_test[u]:
        test_edges.append((u, i, 0))

# shuffle train_edges to mix +ve and -ve edges
np.random.shuffle(train_edges)
np.random.shuffle(test_edges)

100%|██████████| 943/943 [00:00<00:00, 20668.83it/s]


In [21]:
print(len(train_edges))
train_edges

139150


[(233, 56, 1),
 (279, 781, 1),
 (150, 203, 1),
 (93, 58, 1),
 (638, 905, 0),
 (192, 159, 1),
 (341, 614, 0),
 (853, 1091, 0),
 (217, 1609, 0),
 (48, 16, 0),
 (494, 581, 1),
 (339, 143, 1),
 (283, 300, 1),
 (46, 292, 1),
 (327, 176, 1),
 (553, 1462, 0),
 (312, 932, 0),
 (654, 562, 0),
 (200, 658, 1),
 (388, 610, 1),
 (405, 491, 1),
 (129, 689, 1),
 (168, 1259, 0),
 (72, 475, 1),
 (660, 118, 1),
 (766, 187, 1),
 (241, 237, 1),
 (59, 723, 0),
 (12, 291, 0),
 (896, 215, 1),
 (245, 1044, 1),
 (177, 35, 0),
 (42, 302, 1),
 (824, 620, 1),
 (229, 1113, 0),
 (64, 125, 1),
 (89, 509, 1),
 (895, 720, 1),
 (271, 1221, 0),
 (654, 885, 0),
 (306, 443, 0),
 (96, 183, 1),
 (882, 237, 1),
 (22, 1672, 0),
 (508, 258, 1),
 (180, 614, 0),
 (499, 1366, 0),
 (275, 237, 1),
 (719, 242, 1),
 (527, 440, 0),
 (652, 964, 0),
 (572, 895, 0),
 (115, 298, 1),
 (206, 194, 1),
 (471, 318, 1),
 (428, 235, 1),
 (373, 120, 1),
 (507, 151, 1),
 (377, 1229, 0),
 (7, 847, 0),
 (20, 1546, 0),
 (404, 1442, 1),
 (400, 482, 1)

In [20]:
print(len(test_edges))
test_edges

60850


[(803, 678, 1),
 (361, 689, 1),
 (879, 346, 1),
 (693, 199, 1),
 (298, 1474, 0),
 (396, 327, 1),
 (839, 99, 1),
 (626, 22, 1),
 (809, 848, 0),
 (697, 195, 1),
 (300, 118, 1),
 (845, 1479, 1),
 (749, 306, 1),
 (663, 1057, 0),
 (729, 582, 0),
 (294, 1050, 1),
 (216, 919, 0),
 (901, 1340, 0),
 (829, 1405, 0),
 (451, 371, 1),
 (294, 1037, 0),
 (13, 1213, 0),
 (806, 1661, 0),
 (654, 1010, 1),
 (268, 76, 1),
 (641, 1039, 1),
 (93, 1423, 0),
 (795, 1509, 0),
 (709, 483, 1),
 (523, 228, 1),
 (58, 231, 0),
 (393, 88, 1),
 (143, 732, 0),
 (28, 1434, 0),
 (526, 23, 1),
 (157, 175, 1),
 (686, 1067, 0),
 (803, 1091, 0),
 (94, 361, 0),
 (869, 817, 0),
 (789, 1343, 0),
 (647, 21, 1),
 (106, 381, 0),
 (326, 949, 1),
 (804, 1344, 0),
 (10, 281, 0),
 (864, 71, 1),
 (932, 273, 1),
 (320, 41, 0),
 (311, 1537, 0),
 (172, 338, 0),
 (665, 428, 1),
 (378, 208, 1),
 (298, 47, 1),
 (850, 1140, 0),
 (263, 676, 1),
 (416, 883, 0),
 (930, 300, 1),
 (654, 47, 1),
 (832, 511, 1),
 (343, 620, 0),
 (396, 210, 1),
 (12

In [22]:
# make train edges as 2d-array from list of tuples
train_edges_np = np.zeros(shape = (len(train_edges), 3), dtype = np.int32)
for i in range(len(train_edges)):
    user = train_edges[i][0]
    item = train_edges[i][1]
    rating = train_edges[i][2]
    train_edges_np[i] = np.array([user, item, rating])

# save the training edges
np.save('train_edges.npy', train_edges_np)

# make test edges as 2d-array from list of tuples
test_edges_np = np.zeros(shape = (len(test_edges), 3), dtype = np.int32)
for i in range(len(test_edges)):
    user = test_edges[i][0]
    item = test_edges[i][1]
    rating = test_edges[i][2]
    test_edges_np[i] = np.array([user, item, rating])

np.save('test_edges.npy', test_edges_np)