In [119]:
# Library imports
import pyforest
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
from tqdm import tqdm

from turtle import forward
import torch.nn as nn
import torch.nn.functional as F
import math
import torch
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module

from cf import CollaborativeFiltering

In [8]:
# Read the tab-separated ratings file. Column names are supplied explicitly via
# `names=`, so every line of the file is parsed as data.
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = (
    pd.read_csv('u.data', sep='\t', names=column_names)
    [['user_id', 'item_id', 'rating']]  # keep everything except the timestamp
    .assign(
        user_id=lambda frame: frame.user_id - 1,  # for 0-indexing
        item_id=lambda frame: frame.item_id - 1,  # for 0-indexing
    )
)
df.head()

Unnamed: 0,user_id,item_id,rating
0,195,241,3
1,185,301,3
2,21,376,1
3,243,50,2
4,165,345,1


In [9]:
# Second frame with binarized ratings: 0 stays 0, every rating from 1 to 5 becomes 1.
# A rating outside 0..5 raises KeyError from the lookup table.
binary_rating = {0: 0, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1}
df_binary = df.assign(rating=[binary_rating[stars] for stars in df.rating])
df_binary

Unnamed: 0,user_id,item_id,rating
0,195,241,1
1,185,301,1
2,21,376,1
3,243,50,1
4,165,345,1
...,...,...,...
99995,879,475,1
99996,715,203,1
99997,275,1089,1
99998,12,224,1


In [24]:
# Number of user / item slots required by everything that indexes by ID.
# The IDs are 0-indexed integers, and later cells use range(n_users) /
# range(n_items) as the ID space and pass these counts to the model as table
# sizes. The correct size is therefore (largest ID + 1): nunique() gives the
# same answer only when the IDs have no gaps, and undercounts otherwise.
# An empty frame yields 0, as nunique() would.
n_users = int(df.user_id.max()) + 1 if len(df) else 0
n_items = int(df.item_id.max()) + 1 if len(df) else 0

In [25]:
# create sets of positive and negative edges for each user
#   pos_edges[u] -> set of item ids that user u has rated
#   neg_edges[u] -> set of item ids in range(n_items) that user u has NOT rated

# every user in range(n_users) gets an entry, even if it stays empty
pos_edges = {u: set() for u in range(n_users)}

# Walk the two ID columns in lockstep rather than using DataFrame.iterrows():
# iterrows() constructs a Series object for every row, which is needlessly slow
# when only two integer fields are read. .tolist() yields plain Python ints.
for u, i in zip(df.user_id.tolist(), df.item_id.tolist()):
    pos_edges[u].add(i)

# The negatives of a user are the complement of their positives within the
# item ID space; one set difference per user replaces the nested
# n_users x n_items membership loop and produces the same sets.
all_items = set(range(n_items))
neg_edges = {u: all_items - pos_edges[u] for u in range(n_users)}

In [48]:
# sample negative edges for each user, as neg_edges_sampled
#   neg_edges_sampled[u] -> set of item ids drawn without replacement from neg_edges[u]
# NOTE(review): this draws from NumPy's global RNG and no seed is set in the
# visible cells, so the sample presumably differs between runs - confirm
# whether that is intended.
neg_edges_sampled = dict()
for u in tqdm(range(n_users)):
    # candidate pool: every item user u has not rated
    neg_edges_all = list(neg_edges[u])
    # Aim for as many negatives as positives, but never request more than the
    # pool holds: np.random.choice(..., replace=False) raises ValueError when
    # size exceeds the population (a user who rated more than half the items).
    n_sampled_edges = min(len(pos_edges[u]), len(neg_edges_all))
    sampled = np.random.choice(neg_edges_all, size = n_sampled_edges, replace = False)
    # .tolist() turns NumPy integer scalars into plain Python ints before storing
    neg_edges_sampled[u] = set(sampled.tolist())

100%|██████████| 943/943 [00:00<00:00, 5141.44it/s]


In [57]:
# Merge pos_edges and neg_edges_sampled into train_edges: a flat list of
# (user, item, label) tuples, with label 1 for a positive edge and 0 for a
# sampled negative edge.
train_edges = []
for u in tqdm(range(n_users)):
    train_edges.extend((u, i, 1) for i in pos_edges[u])
    train_edges.extend((u, i, 0) for i in neg_edges_sampled[u])
# shuffle in place so +ve and -ve edges are interleaved
np.random.shuffle(train_edges)

100%|██████████| 943/943 [00:00<00:00, 14041.02it/s]


In [58]:
# Display the shuffled list of (user, item, label) training triples.
train_edges

[(541, 731, 1),
 (845, 174, 1),
 (229, 818, 0),
 (822, 502, 1),
 (166, 654, 1),
 (313, 716, 1),
 (278, 860, 0),
 (404, 925, 0),
 (536, 1481, 0),
 (268, 1043, 0),
 (491, 126, 1),
 (86, 1437, 0),
 (17, 167, 1),
 (112, 302, 1),
 (6, 1191, 0),
 (180, 827, 1),
 (392, 1627, 0),
 (266, 792, 0),
 (709, 1536, 0),
 (220, 1169, 0),
 (914, 1638, 0),
 (115, 1213, 1),
 (536, 1328, 0),
 (642, 903, 0),
 (585, 1158, 0),
 (314, 486, 0),
 (654, 1590, 0),
 (123, 766, 0),
 (541, 1162, 0),
 (336, 14, 1),
 (44, 277, 1),
 (517, 411, 1),
 (504, 415, 0),
 (15, 1292, 0),
 (501, 257, 1),
 (842, 1436, 0),
 (149, 150, 1),
 (115, 895, 1),
 (72, 483, 0),
 (738, 968, 1),
 (486, 275, 1),
 (536, 1644, 0),
 (366, 159, 0),
 (430, 987, 1),
 (342, 1510, 0),
 (297, 345, 0),
 (159, 1396, 0),
 (708, 1082, 0),
 (27, 743, 0),
 (245, 293, 1),
 (95, 186, 1),
 (215, 199, 1),
 (453, 955, 1),
 (846, 199, 1),
 (243, 844, 1),
 (888, 592, 0),
 (821, 165, 0),
 (882, 1286, 0),
 (294, 15, 0),
 (547, 236, 1),
 (325, 477, 1),
 (906, 1648, 0)

In [107]:
# create sparse matrix for storing the train_edges
#   adj is an (n_users, n_items) sparse COO tensor whose stored entries are the
#   labels of train_edges (1.0 for positive edges, 0.0 for sampled negatives).
n_ratings = len(df)

# Convert the list of (user, item, label) tuples in a single call. Sizing by
# the list itself (rather than assuming exactly 2 * n_ratings entries) avoids
# an IndexError or zero-padded tail if the edge count ever differs; the
# reshape keeps the array 2-D even when the list is empty.
edge_array = np.asarray(train_edges, dtype=np.int64).reshape(-1, 3)

# Coordinates as an int64 tensor of shape (2, n_edges): row 0 holds user ids,
# row 1 holds item ids. Building them as integers avoids routing indices
# through float32, which cannot represent every large integer exactly.
indices = torch.from_numpy(edge_array[:, :2].T.copy())
# float32 labels, matching the dtype torch.zeros() produced before
values = torch.from_numpy(edge_array[:, 2].astype(np.float32))
adj = torch.sparse_coo_tensor(indices, values, (n_users, n_items))

In [114]:
# Sparse matrices do not seem to be sufficient here, so dense arrays loaded
# from disk are used for now.
# NOTE: this rebinds `adj` to whatever array is stored in 'adj.npy'.
adj = np.load(file='adj.npy')
adjr = np.load(file='adjr.npy')

In [118]:
# algorithm -
# -> for now, use fixed A- for all inner-loop and outer-loop iterations
# -> train inner-loop for T complete epochs
# -> after T epochs, compute meta-grad w.r.t all entries of adj once
# -> since meta-grad is computed only once every T epochs, the overhead should not be too huge

1682

In [204]:
# Build the collaborative-filtering model with 32 latent factors and an Adam
# optimizer (learning rate 0.01) over all of its parameters.
test_model = CollaborativeFiltering(n_users, n_items, n_factors=32)
optimizer = torch.optim.Adam(params=test_model.parameters(), lr=0.01)
# Switch to training mode; as the last expression of the cell, the value
# returned by train() is what the notebook displays.
test_model.train()

CollaborativeFiltering(
  (user_emb): Embedding(943, 32)
  (item_emb): Embedding(1682, 32)
)

In [208]:
# Split the (user, item, label) triples of train_edges into three parallel
# tensors: integer ids for users and items, float labels for the ratings.
users = torch.LongTensor(np.array([user for user, _, _ in train_edges]))
items = torch.LongTensor(np.array([item for _, item, _ in train_edges]))
ratings = torch.FloatTensor(np.array([label for _, _, label in train_edges]))

In [212]:
# One optimisation step over the full set of training edges.
optimizer.zero_grad()                            # clear gradients from any earlier step
y_hat = test_model(users, items)                 # predicted score per (user, item) pair
loss = F.mse_loss(input=y_hat, target=ratings)   # squared error against the 0/1 labels
loss.backward()
optimizer.step()
print(loss.item())
# note: switching from mse_loss to nll_loss would require
#   - ratings as a torch.LongTensor, and
#   - a softmax at the end of the cf model's output
#     (categorical outputs instead of a single summed-up value)

1.4699302911758423


In [220]:
# make train_edges as 2d-array from list of tuples
#   result: int32 array of shape (len(train_edges), 3), columns = user, item, rating
# Converting in one call sizes the array from the list itself. Looping over a
# fixed 2 * n_ratings rows would raise IndexError if the list were shorter and
# silently drop the tail if it were longer. The reshape keeps the array 2-D
# even when train_edges is empty.
train_edges_np = np.array(train_edges, dtype=np.int32).reshape(-1, 3)

In [222]:
# Write the full train_edges_np array to 'train_edges.npy'.
np.save(file='train_edges.npy', arr=train_edges_np)