In [1]:
!cd ../../../ && make release NUM_THREADS=20

cd external && \
	mkdir -p build && \
	cd build && \
	cmake    -DCMAKE_BUILD_TYPE=Release ../ && \
	cmake --build . --config Release -- -j 20
-- Configuring done
-- Generating done
-- Build files have been written to: /Users/lc/Developer/kuzu/external/build
[100%] Built target apache_arrow
mkdir -p build/release && \
	cd build/release && \
	cmake    -DCMAKE_BUILD_TYPE=Release ../.. && \
	cmake --build . --config Release -- -j 20
-- pybind11 v2.11.0 dev1
-- Configuring done
-- Generating done
-- Build files have been written to: /Users/lc/Developer/kuzu/build/release
[  1%] Built target kuzu_transaction
[  2%] Built target utf8proc
[  2%] Built target kuzu_common_csv_reader
[  2%] Built target kuzu_common_data_chunk
[  3%] Built target kuzu_common_task_system
[  4%] Built target kuzu_catalog
[  4%] Built target kuzu_common_vector
[  5%] Built target kuzu_common_arrow
[  6%] Built target kuzu_binder_expression
[  6%] Built target kuzu_binder
[  7%] Built target kuzu_optimizer
[  8%] Buil

In [1]:
import sys, os
import warnings
sys.path.append('../build/')

import kuzu
import torch
import tqdm
import pandas as pd
from torch.nn import Linear
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero

from torch_geometric.data import HeteroData
from torch_geometric.transforms import ToUndirected, RandomLinkSplit

In [2]:
db_path = './ml-small'
if os.path.exists(db_path):
    os.system('rm -rf ' + db_path)
    
def load_data(connection):
    print('loading data...')
    connection.execute('CREATE NODE TABLE movie (movieId INT64, title STRING, genres STRING, PRIMARY KEY (movieId))')
    connection.execute('CREATE NODE TABLE user (userId INT64, PRIMARY KEY (userId))')
    connection.execute('CREATE REL TABLE rating (FROM user TO movie, rating DOUBLE, timestamp INT64)')

    connection.execute('COPY movie FROM "./movieLen-small/movies.csv" (HEADER=TRUE)')
    connection.execute('COPY user FROM "./movieLen-small/users.csv" (HEADER=TRUE)')
    connection.execute('COPY rating FROM "./movieLen-small/ratings.csv" (HEADER=TRUE)')

db = kuzu.Database(db_path)
conn = kuzu.Connection(db)

In [3]:
load_data(conn)

loading data...


In [4]:
res = conn.execute('MATCH (u:user)-[r:rating]->(m:movie) RETURN u, r, m')
torch_geometric_data, pos_to_idx, unconverted_properties, edge_properties = res.get_as_torch_geometric()



In [5]:
torch_geometric_data

HeteroData(
  [1muser[0m={ userId=[610] },
  [1mmovie[0m={ movieId=[9724] },
  [1m(user, to, movie)[0m={ edge_index=[2, 100836] }
)

In [6]:
torch_geometric_data['user', 'movie'].edge_label = torch.FloatTensor(edge_properties['user', 'movie']['rating'])

In [7]:
torch_geometric_data['user'].x = torch.eye(len(torch_geometric_data['user'].userId))

In [8]:
torch_geometric_data

HeteroData(
  [1muser[0m={
    userId=[610],
    x=[610, 610]
  },
  [1mmovie[0m={ movieId=[9724] },
  [1m(user, to, movie)[0m={
    edge_index=[2, 100836],
    edge_label=[100836]
  }
)

In [9]:
s = unconverted_properties['movie']['title'][0]

In [10]:
def extract_movie_year_from_title(s):
    import re
    m = re.search(r'\((\d{4})\)', s)
    if m:
        return int(m.group(1))
    else:
        return 0

In [11]:
years = []
for i, s in enumerate(unconverted_properties['movie']['title']):
    years.append(extract_movie_year_from_title(s))
years = torch.FloatTensor(years)
years -= years.min()
years /= years.max()

In [12]:
model = SentenceTransformer('all-MiniLM-L6-v2')
with torch.no_grad():
    title_embs = model.encode(unconverted_properties['movie']['title'], show_progress_bar=True, convert_to_tensor=True).cpu()

Batches:   0%|          | 0/304 [00:00<?, ?it/s]

In [13]:
df = pd.DataFrame.from_dict(unconverted_properties['movie'])
genres_encoded = df['genres'].str.get_dummies('|').values
genres_encoded = torch.from_numpy(genres_encoded).to(torch.float)

In [14]:
movie_x = torch.cat([years.unsqueeze(1), title_embs, genres_encoded], dim=1)

In [15]:
torch_geometric_data['movie'].x = movie_x   

In [16]:
del torch_geometric_data['user'].userId
del torch_geometric_data['movie'].movieId

In [17]:
torch_geometric_data

HeteroData(
  [1muser[0m={ x=[610, 610] },
  [1mmovie[0m={ x=[9724, 405] },
  [1m(user, to, movie)[0m={
    edge_index=[2, 100836],
    edge_label=[100836]
  }
)

In [18]:
data = ToUndirected()(torch_geometric_data)
del data['movie', 'rev_to', 'user'].edge_label  # Remove "reverse" label.

# 2. Perform a link-level split into training, validation, and test edges.
transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'to', 'movie')],
    rev_edge_types=[('movie', 'rev_to', 'user')],
)
train_data, val_data, test_data = transform(data)

In [19]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

In [20]:
weight = torch.bincount(train_data['user', 'movie'].edge_label)
weight = weight.max() / weight

def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

RuntimeError: "bincount_cpu" not implemented for 'Float'