In [2]:
import sys, os
import warnings
sys.path.append('../build/')

import kuzu
import torch
import tqdm
import pandas as pd
from torch.nn import Linear
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero

from torch_geometric.data import HeteroData
from torch_geometric.transforms import ToUndirected, RandomLinkSplit

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [4]:
db_path = './ml-small'
if os.path.exists(db_path):
    os.system('rm -rf ' + db_path)
    
def load_data(connection):
    print('loading data...')
    connection.execute('CREATE NODE TABLE movie (movieId INT64, title STRING, genres STRING, PRIMARY KEY (movieId))')
    connection.execute('CREATE NODE TABLE user (userId INT64, PRIMARY KEY (userId))')
    connection.execute('CREATE REL TABLE rating (FROM user TO movie, rating DOUBLE, timestamp INT64)')

    connection.execute('COPY movie FROM "./movieLen-small/movies.csv" (HEADER=TRUE)')
    connection.execute('COPY user FROM "./movieLen-small/users.csv" (HEADER=TRUE)')
    connection.execute('COPY rating FROM "./movieLen-small/ratings.csv" (HEADER=TRUE)')

db = kuzu.Database(db_path)
conn = kuzu.Connection(db)

In [5]:
load_data(conn)

loading data...


In [6]:
res = conn.execute('MATCH (u:user)-[r:rating]->(m:movie) RETURN u, r, m')
torch_geometric_data, pos_to_idx, unconverted_properties, edge_properties = res.get_as_torch_geometric()



In [7]:
torch_geometric_data

HeteroData(
  [1muser[0m={ userId=[610] },
  [1mmovie[0m={ movieId=[9724] },
  [1m(user, to, movie)[0m={ edge_index=[2, 100836] }
)

In [8]:
torch_geometric_data['user', 'to', 'movie'].edge_label = torch.LongTensor(edge_properties['user', 'movie']['rating'])

  torch_geometric_data['user', 'to', 'movie'].edge_label = torch.LongTensor(edge_properties['user', 'movie']['rating'])


In [9]:
torch_geometric_data['user'].x = torch.eye(len(torch_geometric_data['user'].userId))

In [10]:
torch_geometric_data

HeteroData(
  [1muser[0m={
    userId=[610],
    x=[610, 610]
  },
  [1mmovie[0m={ movieId=[9724] },
  [1m(user, to, movie)[0m={
    edge_index=[2, 100836],
    edge_label=[100836]
  }
)

In [11]:
s = unconverted_properties['movie']['title'][0]

In [12]:
def extract_movie_year_from_title(s):
    import re
    m = re.search(r'\((\d{4})\)', s)
    if m:
        return int(m.group(1))
    else:
        return 0

In [13]:
years = []
for i, s in enumerate(unconverted_properties['movie']['title']):
    years.append(extract_movie_year_from_title(s))
years = torch.FloatTensor(years)
years -= years.min()
years /= years.max()

In [14]:
model = SentenceTransformer('all-MiniLM-L6-v2')
with torch.no_grad():
    title_embs = model.encode(unconverted_properties['movie']['title'], show_progress_bar=True, convert_to_tensor=True, device=device).cpu()

Batches:   0%|          | 0/304 [00:00<?, ?it/s]

In [15]:
df = pd.DataFrame.from_dict(unconverted_properties['movie'])
genres_encoded = df['genres'].str.get_dummies('|').values
genres_encoded = torch.from_numpy(genres_encoded).to(torch.float)

In [16]:
movie_x = torch.cat([years.unsqueeze(1), title_embs, genres_encoded], dim=1)

In [17]:
torch_geometric_data['movie'].x = movie_x   

In [18]:
del torch_geometric_data['user'].userId
del torch_geometric_data['movie'].movieId

In [19]:
torch_geometric_data

HeteroData(
  [1muser[0m={ x=[610, 610] },
  [1mmovie[0m={ x=[9724, 405] },
  [1m(user, to, movie)[0m={
    edge_index=[2, 100836],
    edge_label=[100836]
  }
)

In [20]:
data = ToUndirected()(torch_geometric_data)
del data['movie', 'rev_to', 'user'].edge_label  # Remove "reverse" label.
data.to(device, non_blocking=True)

# 2. Perform a link-level split into training, validation, and test edges.
transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'to', 'movie')],
    rev_edge_types=[('movie', 'rev_to', 'user')],
)
train_data, val_data, test_data = transform(data)

In [21]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

In [22]:
weight = torch.bincount(train_data['user', 'to', 'movie'].edge_label)
weight = weight.max() / weight

def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

In [23]:
model = Model(hidden_channels=64).to(device)

In [25]:
# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred:
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [31]:
def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['user', 'to', 'movie'].edge_label_index)
    target = train_data['user', 'to', 'movie'].edge_label
    loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss)

In [32]:
@torch.no_grad()
def test(data):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['user', 'to', 'movie'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = data['user', 'to', 'movie'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [34]:
for epoch in range(1,500):
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    test_rmse = test(test_data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

Epoch: 001, Loss: 2.4052, Train: 1.0241, Val: 1.1259, Test: 1.1357
Epoch: 002, Loss: 2.3947, Train: 1.0261, Val: 1.1284, Test: 1.1383
Epoch: 003, Loss: 2.3921, Train: 1.0536, Val: 1.1509, Test: 1.1611
Epoch: 004, Loss: 2.3974, Train: 1.0272, Val: 1.1307, Test: 1.1408
Epoch: 005, Loss: 2.3887, Train: 1.0273, Val: 1.1314, Test: 1.1416
Epoch: 006, Loss: 2.3871, Train: 1.0510, Val: 1.1506, Test: 1.1610
Epoch: 007, Loss: 2.3909, Train: 1.0264, Val: 1.1316, Test: 1.1417
Epoch: 008, Loss: 2.3843, Train: 1.0291, Val: 1.1340, Test: 1.1443
Epoch: 009, Loss: 2.3815, Train: 1.0468, Val: 1.1482, Test: 1.1587
Epoch: 010, Loss: 2.3844, Train: 1.0237, Val: 1.1305, Test: 1.1408
Epoch: 011, Loss: 2.3809, Train: 1.0321, Val: 1.1374, Test: 1.1477
Epoch: 012, Loss: 2.3766, Train: 1.0411, Val: 1.1450, Test: 1.1552
Epoch: 013, Loss: 2.3777, Train: 1.0221, Val: 1.1307, Test: 1.1405
Epoch: 014, Loss: 2.3775, Train: 1.0380, Val: 1.1436, Test: 1.1535
Epoch: 015, Loss: 2.3737, Train: 1.0338, Val: 1.1406, Test: 1.

In [36]:
num_movies = len(pos_to_idx['movie'])
num_users = len(pos_to_idx['user'])

results = []

for user_id in range(0,num_users): 

    row = torch.tensor([user_id] * num_movies)
    col = torch.arange(num_movies)
    edge_label_index = torch.stack([row, col], dim=0)

    pred = model(data.x_dict, data.edge_index_dict,
                 edge_label_index)
    pred = pred.clamp(min=0, max=5)

    user_db_id = pos_to_idx['user'][user_id]

    mask = (pred == 5).nonzero(as_tuple=True)

    ten_predictions = [pos_to_idx['movie'][el] for el in  mask[0].tolist()[:10]]
    results.append({'user': user_db_id, 'movies': ten_predictions})

In [37]:
results

[{'user': 1,
  'movies': [260, 648, 733, 943, 1089, 1127, 1136, 1196, 1197, 1208]},
 {'user': 2,
  'movies': [80906, 131724, 4518, 5181, 5746, 5764, 5919, 6835, 7899, 7991]},
 {'user': 3,
  'movies': [5746, 5764, 7899, 171, 113394, 31184, 69712, 92494, 31522, 6402]},
 {'user': 4,
  'movies': [4518, 5181, 5746, 5764, 6835, 7899, 7991, 171, 2203, 3508]},
 {'user': 5,
  'movies': [943, 1089, 1136, 1927, 318, 80906, 131724, 4518, 5181, 5746]},
 {'user': 6,
  'movies': [260, 943, 1089, 1136, 1213, 1927, 3147, 318, 48516, 80906]},
 {'user': 7,
  'movies': [5181, 5746, 5764, 6835, 7899, 171, 51705, 72330, 113394, 137595]},
 {'user': 8,
  'movies': [131724, 5181, 5746, 5764, 5919, 6835, 7899, 7991, 171, 1046]},
 {'user': 9,
  'movies': [943, 1089, 1136, 1927, 318, 80906, 131724, 4518, 5181, 5746]},
 {'user': 10,
  'movies': [5746,
   5764,
   7899,
   69712,
   92494,
   31522,
   25887,
   31925,
   44777,
   44851]},
 {'user': 11,
  'movies': [260, 943, 1089, 1136, 1196, 1197, 1208, 1213, 19