#### **Library imports**

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
from random import shuffle
import random
import math

#### **Explore the data files**

In [12]:
# Column labels applied to every tab-separated data file read below
# (the files are read with `names=column_names`, i.e. they carry no header row).
column_names = [
    'user',
    'item',
    'rating',
    'timestamp',
]

In [24]:
# Load the complete data file and report, one value per line:
# its (rows, columns) shape, the number of distinct users and the number of
# distinct items. The final expression previews the first rows.
data = pd.read_csv(
    'files/u.data',
    sep='\t',
    names=column_names,
)
print(
    data.shape,
    data['user'].nunique(),
    data['item'].nunique(),
    sep='\n',
)
data.head()

(100000, 4)
943
1682


Unnamed: 0,user,item,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [14]:
# Same inspection for the `.base` file of split #1: shape, then a preview.
data = pd.read_csv(
    'files/u1.base',
    sep='\t',
    names=column_names,
)
print(data.shape)
data.head()

(80000, 4)


Unnamed: 0,user,item,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [15]:
# Same inspection for the `.test` file of split #1: shape, then a preview.
data = pd.read_csv(
    'files/u1.test',
    sep='\t',
    names=column_names,
)
print(data.shape)
data.head()

(20000, 4)


Unnamed: 0,user,item,rating,timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


In [26]:
# For each of the five predefined splits, report how many distinct users and
# items appear in the train (`.base`) and test (`.test`) files.
for index in range(1, 6):
    data_train = pd.read_csv('files/u{}.base'.format(index), sep='\t', names=column_names)
    data_test = pd.read_csv('files/u{}.test'.format(index), sep='\t', names=column_names)

    print('file #{}'.format(index))
    for heading, frame in (('Train data:', data_train), ('Test data:', data_test)):
        print(heading)
        print('Unique users: {}'.format(frame['user'].nunique()))
        print('Unique items: {}'.format(frame['item'].nunique()))
    print()
# Going by the counts printed above, the test file of split #5 covers the
# most users, which is why file #5 is treated as the most complete.

file #1
Train data:
Unique users: 943
Unique items: 1650
Test data:
Unique users: 459
Unique items: 1410

file #2
Train data:
Unique users: 943
Unique items: 1648
Test data:
Unique users: 653
Unique items: 1420

file #3
Train data:
Unique users: 943
Unique items: 1650
Test data:
Unique users: 869
Unique items: 1423

file #4
Train data:
Unique users: 943
Unique items: 1660
Test data:
Unique users: 923
Unique items: 1394

file #5
Train data:
Unique users: 943
Unique items: 1650
Test data:
Unique users: 927
Unique items: 1407



#### **Generate train and test edges from file #5**

In [64]:
def _load_zero_indexed_edges(path):
    """Read one tab-separated data file and return its (user, item) pairs.

    Parameters
    ----------
    path : str
        File to read; it is parsed with the column labels in ``column_names``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the ``user`` and ``item`` columns, each
        reduced by 1 (later cells use ``range(n_users)`` / ``range(n_items)``
        as the id space, so ids need to start at 0).
    """
    frame = pd.read_csv(path, sep='\t', names=column_names)
    # The subtraction produces a brand-new frame, so nothing is ever assigned
    # into a slice of another frame (the pattern that triggers pandas'
    # SettingWithCopyWarning).
    return frame[['user', 'item']] - 1


# Train / test edges come from split #5; the full file is used for the
# overall user and item counts and for negative sampling.
data_train = _load_zero_indexed_edges('files/u5.base')
data_test = _load_zero_indexed_edges('files/u5.test')
data_full = _load_zero_indexed_edges('files/u.data')

In [65]:
# Distinct users, distinct items and number of rows for the full data ...
n_users = data_full['user'].nunique()
n_items = data_full['item'].nunique()
n_samples = len(data_full)

# ... for the training split ...
n_users_train = data_train['user'].nunique()
n_items_train = data_train['item'].nunique()
n_samples_train = len(data_train)

# ... and for the test split.
n_users_test = data_test['user'].nunique()
n_items_test = data_test['item'].nunique()
n_samples_test = len(data_test)

print('n_users: {}, n_items: {}, n_samples: {}'.format(n_users, n_items, n_samples))
print('n_users_train: {}, n_items_train: {}, n_samples_train: {}'.format(n_users_train, n_items_train, n_samples_train))
print('n_users_test: {}, n_items_test: {}, n_samples_test: {}'.format(n_users_test, n_items_test, n_samples_test))

n_users: 943, n_items: 1682, n_samples: 100000
n_users_train: 943, n_items_train: 1650, n_samples_train: 80000
n_users_test: 927, n_items_test: 1407, n_samples_test: 20000


##### **Generate negative edges using full data**

In [66]:
# Seed BOTH random generators used in this cell. `shuffle` (imported from the
# `random` module) follows `random.seed`, but `np.random.choice` draws from
# NumPy's own global state, which `random.seed` does not touch. Without the
# second call the sampled negatives change on every run.
random.seed(0)
np.random.seed(0)

# pos_edges[u]: items user u is linked to anywhere in the full data
pos_edges = {u: set() for u in range(n_users)}
neg_edges = dict()

# Walk the two columns directly instead of DataFrame.iterrows(), which builds
# a Series object for every one of the rows.
for u, i in zip(data_full['user'].to_numpy(), data_full['item'].to_numpy()):
    pos_edges[u].add(i)

# neg_edges[u]: every item id in range(n_items) that user u is NOT linked to.
# Sorting fixes the candidate order, so the seeded sample below does not
# depend on set iteration order.
all_items = set(range(n_items))
for u in tqdm(range(n_users)):
    neg_edges[u] = sorted(all_items - pos_edges[u])
    pos_edges[u] = list(pos_edges[u])

# sample negative edges for each user as neg_edges_sampled
neg_edges_sampled = dict()
for u in tqdm(range(n_users)):
    # Draw as many negatives as the user has positives, without repeats.
    # NOTE: np.random.choice raises ValueError if a user has more positives
    # than remaining negative candidates.
    n_sampled_edges = len(pos_edges[u])
    neg_edges_sampled[u] = np.random.choice(neg_edges[u], size = n_sampled_edges, replace = False)

# divide neg_edges_sampled[u] into neg_train[u], neg_test[u] for each user
fraction = 0.2  # share of each user's sampled negatives that goes to the test set

neg_train = dict()
neg_test = dict()

for u in tqdm(range(n_users)):
    assert len(pos_edges[u]) == len(neg_edges_sampled[u])
    shuffle(neg_edges_sampled[u])

    n_neg = len(neg_edges_sampled[u])
    # ceil: the test share is rounded up, the train part gets the remainder
    n_test = math.ceil(n_neg * fraction)
    n_train = n_neg - n_test

    neg_train[u] = neg_edges_sampled[u][:n_train]
    neg_test[u] = neg_edges_sampled[u][n_train:]

100%|██████████| 943/943 [00:00<00:00, 2511.78it/s]
100%|██████████| 943/943 [00:00<00:00, 6430.93it/s]
100%|██████████| 943/943 [00:00<00:00, 12995.74it/s]


##### **Generate positive edges from data_train and data_test**

In [69]:
def _items_by_user(frame, user_count):
    """Group the item ids of ``frame`` by user id.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with ``user`` and ``item`` columns.
    user_count : int
        Every user id in ``range(user_count)`` gets an entry, even when the
        user has no rows in ``frame``.

    Returns
    -------
    dict
        Maps each user id to a NumPy array of the distinct items found for
        that user (an empty array when there are none).

    Raises
    ------
    KeyError
        If ``frame`` contains a user id outside ``range(user_count)``.
    """
    grouped = {u: set() for u in range(user_count)}
    # Zip the two columns instead of DataFrame.iterrows(): same pairs in the
    # same order, without creating a Series per row.
    for u, i in zip(frame['user'].to_numpy(), frame['item'].to_numpy()):
        grouped[u].add(i)
    return {u: np.array(list(items)) for u, items in grouped.items()}


pos_train = _items_by_user(data_train, n_users)
pos_test = _items_by_user(data_test, n_users)

#### **Merge positive and negative edges to form train and test sets**

In [80]:
# Merge positive and negative edges into train_edges and test_edges.
# Both are flat lists of (user, item, label) tuples, with label 1 for a
# positive edge and 0 for a sampled negative edge.
train_edges = []
test_edges = []

for u in tqdm(range(n_users)):
    train_edges.extend((u, i, 1) for i in pos_train[u])
    train_edges.extend((u, i, 0) for i in neg_train[u])

    test_edges.extend((u, i, 1) for i in pos_test[u])
    test_edges.extend((u, i, 0) for i in neg_test[u])

# Shuffle both lists to mix +ve and -ve edges. A dedicated, seeded generator
# is used so the resulting order is the same on every run; the global NumPy
# generator is not necessarily seeded at this point.
edge_shuffler = np.random.RandomState(0)
edge_shuffler.shuffle(train_edges)
edge_shuffler.shuffle(test_edges)

100%|██████████| 943/943 [00:00<00:00, 13268.88it/s]


In [81]:
# Total number of (user, item, label) tuples in each merged list.
for label, edges in (('Training edges: ', train_edges), ('Unseen edges: ', test_edges)):
    print(label, len(edges))

Training edges:  159619
Unseen edges:  40381


In [82]:
# Break the totals down by label: add up the per-user edge counts held in
# each of the four dictionaries.
n_pos_train = sum(map(len, pos_train.values()))
n_neg_train = sum(map(len, neg_train.values()))
n_pos_test = sum(map(len, pos_test.values()))
n_neg_test = sum(map(len, neg_test.values()))

print('Positive training edges: ', n_pos_train)
print('Negative training edges: ', n_neg_train)
print('Positive unseen edges: ', n_pos_test)
print('Negative unseen edges: ', n_neg_test)

Positive training edges:  80000
Negative training edges:  79619
Positive unseen edges:  20000
Negative unseen edges:  20381


In [85]:
def _edges_to_array(edges):
    """Turn a list of (user, item, label) tuples into a 2-D int32 array.

    Parameters
    ----------
    edges : list of tuple
        Triples of integer-valued entries.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(edges), 3)`` and dtype ``int32``. The reshape
        keeps the result two-dimensional even when ``edges`` is empty.
    """
    return np.array(edges, dtype = np.int32).reshape(-1, 3)


# make train edges as 2d-array from list of tuples and save them
train_edges_np = _edges_to_array(train_edges)
np.save('train_edges.npy', train_edges_np)

# make test edges as 2d-array from list of tuples and save them
test_edges_np = _edges_to_array(test_edges)
np.save('test_edges.npy', test_edges_np)

In [86]:
# Shell escape: list the current working directory, e.g. to check that the
# .npy files written by the previous cell are there.
!ls

adj.npy   all_edges.npy  files		  test_edges.npy   u.data
adjr.npy  explore.ipynb  movielens.ipynb  train_edges.npy
