In [1]:
import torch
import pandas as pd
import numpy as np
from torch.nn import Linear
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero

from torch_geometric.data import HeteroData
from torch_geometric.transforms import ToUndirected, RandomLinkSplit

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)



cpu


In [2]:
from neo4j import GraphDatabase
import pandas as pd

import os
from getpass import getpass

# Connection settings are read from the environment so that no secret has to
# be stored in the notebook. The URL and user name fall back to the values this
# notebook was written against; the password has no default and is asked for
# interactively when NEO4J_PASSWORD is not set.
url = os.environ.get('NEO4J_URL', 'bolt://18.212.58.119:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD') or getpass('Neo4j password: ')

# Single driver shared by every query helper in this notebook.
driver = GraphDatabase.driver(url, auth=(user, password))

def fetch_data(query, params=None):
    """Run a Cypher query through the shared driver and return a DataFrame.

    Args:
        query: Cypher statement to execute.
        params: Optional mapping of query parameters (the `$name`
            placeholders in `query`). Defaults to None, so existing
            single-argument calls keep working.

    Returns:
        A pandas DataFrame with one row per returned record and one column
        per returned key. A statement that returns no records yields an
        empty frame.
    """
    with driver.session() as session:
        result = session.run(query, params)
        # Build the frame before leaving the `with` block, i.e. while the
        # session is still open.
        rows = [record.values() for record in result]
        return pd.DataFrame(rows, columns=result.keys())

In [3]:
driver

<neo4j._sync.driver.BoltDriver at 0x179a22fd0>

In [4]:
def load_node(cypher, index_col, encoders=None, **kwargs):
    """Fetch one node type from Neo4j and prepare it for the graph object.

    Args:
        cypher: Cypher query handed to `fetch_data`.
        index_col: Result column that identifies each node; it becomes the
            DataFrame index.
        encoders: Optional dict mapping a column name to a callable that
            turns that column into a tensor.
        **kwargs: Accepted but not used.

    Returns:
        A tuple `(x, mapping)`. `x` is the per-column encoder outputs
        concatenated along the last dimension, or None when no encoders are
        given. `mapping` maps each distinct index value to a consecutive
        integer, following the order returned by `Index.unique()`.
    """
    nodes = fetch_data(cypher).set_index(index_col)

    # Consecutive integer position for every distinct node key.
    unique_keys = nodes.index.unique()
    mapping = dict(zip(unique_keys, range(len(unique_keys))))

    if encoders is None:
        return None, mapping

    encoded_columns = []
    for column, encode in encoders.items():
        encoded_columns.append(encode(nodes[column]))
    return torch.cat(encoded_columns, dim=-1), mapping

In [5]:
def load_edge(cypher, src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None, **kwargs):
    """Fetch one relationship type from Neo4j as an edge index plus attributes.

    Args:
        cypher: Cypher query handed to `fetch_data`.
        src_index_col: Result column holding the source node key.
        src_mapping: Dict from source node key to integer position.
        dst_index_col: Result column holding the destination node key.
        dst_mapping: Dict from destination node key to integer position.
        encoders: Optional dict mapping a column name to a callable that
            turns that column into a tensor.
        **kwargs: Accepted but not used.

    Returns:
        A tuple `(edge_index, edge_attr)`. `edge_index` is a long tensor with
        two rows (source positions, destination positions). `edge_attr` is
        the encoder outputs concatenated along the last dimension, or None
        when no encoders are given.

    Raises:
        KeyError: If a returned key is missing from the matching mapping.
    """
    # Execute the cypher query and retrieve data from Neo4j
    df = fetch_data(cypher)
    # Translate database keys into integer node positions
    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    # Pin the dtype: with an empty result the nested lists are empty and the
    # dtype would otherwise be inferred as float instead of integer.
    edge_index = torch.tensor([src, dst], dtype=torch.long)
    # Define edge features
    edge_attr = None
    if encoders is not None:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim=-1)

    return edge_index, edge_attr

In [10]:
class SequenceEncoder:
    """Encode a column of raw strings into embeddings with a sentence model."""

    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    def __call__(self, df):
        """Embed every value of `df` and return the resulting tensor on the CPU."""
        # Gradient tracking is switched off for the whole encode call.
        with torch.no_grad():
            embeddings = self.model.encode(
                df.values,
                show_progress_bar=True,
                convert_to_tensor=True,
                device=self.device,
            )
            return embeddings.cpu()

In [11]:
class GenresEncoder(object):
    """Multi-hot encode a column of `sep`-joined category strings.

    Each value is split on `sep`; every distinct element found in the column
    gets its own output column.
    """

    def __init__(self, sep='|'):
        self.sep = sep

    def __call__(self, df):
        """Return a float tensor of shape (len(df), number of distinct genres).

        Columns follow the sorted order of the genre names so that the layout
        is the same on every run; iterating a plain `set` of strings does not
        give a stable order across interpreter sessions.
        """
        split_rows = [value.split(self.sep) for value in df.values]
        genres = sorted({genre for row in split_rows for genre in row})
        mapping = {genre: i for i, genre in enumerate(genres)}

        x = torch.zeros(len(df), len(mapping))
        for i, row in enumerate(split_rows):
            for genre in row:
                x[i, mapping[genre]] = 1
        return x
        
class IdentityEncoder(object):
    """Turn raw column values into a PyTorch tensor without transforming them.

    Args:
        dtype: Optional torch dtype the result is cast to. When None the
            dtype produced by the conversion is kept.
        is_list: Set to True when every cell holds a sequence of numbers;
            each cell is then converted on its own and the results are
            stacked along a new leading dimension.
    """

    def __init__(self, dtype=None, is_list=False):
        self.dtype = dtype
        self.is_list = is_list

    def __call__(self, df):
        """Convert the column `df` to a tensor, cast to `dtype` if one was given."""
        if self.is_list:
            x = torch.stack([torch.tensor(el) for el in df.values])
        else:
            x = torch.from_numpy(df.values)
        # Apply the requested dtype on both paths; it used to be ignored for
        # list-valued columns.
        if self.dtype is None:
            return x
        return x.to(self.dtype)

In [6]:
# Users are loaded without encoders, so load_node returns None for the
# feature tensor; only the userId -> row position mapping carries information.
user_query = """
MATCH (u:User) RETURN u.userId AS userId
"""

user_x, user_mapping = load_node(user_query, index_col='userId')

In [7]:
user_x

In [8]:
user_mapping

{'1': 0,
 '2': 1,
 '3': 2,
 '4': 3,
 '5': 4,
 '6': 5,
 '7': 6,
 '8': 7,
 '9': 8,
 '10': 9,
 '11': 10,
 '12': 11,
 '13': 12,
 '14': 13,
 '15': 14,
 '16': 15,
 '17': 16,
 '18': 17,
 '19': 18,
 '20': 19,
 '21': 20,
 '22': 21,
 '23': 22,
 '24': 23,
 '25': 24,
 '26': 25,
 '27': 26,
 '28': 27,
 '29': 28,
 '30': 29,
 '31': 30,
 '32': 31,
 '33': 32,
 '34': 33,
 '35': 34,
 '36': 35,
 '37': 36,
 '38': 37,
 '39': 38,
 '40': 39,
 '41': 40,
 '42': 41,
 '43': 42,
 '44': 43,
 '45': 44,
 '46': 45,
 '47': 46,
 '48': 47,
 '49': 48,
 '50': 49,
 '51': 50,
 '52': 51,
 '53': 52,
 '54': 53,
 '55': 54,
 '56': 55,
 '57': 56,
 '58': 57,
 '59': 58,
 '60': 59,
 '61': 60,
 '62': 61,
 '63': 62,
 '64': 63,
 '65': 64,
 '66': 65,
 '67': 66,
 '68': 67,
 '69': 68,
 '70': 69,
 '71': 70,
 '72': 71,
 '73': 72,
 '74': 73,
 '75': 74,
 '76': 75,
 '77': 76,
 '78': 77,
 '79': 78,
 '80': 79,
 '81': 80,
 '82': 81,
 '83': 82,
 '84': 83,
 '85': 84,
 '86': 85,
 '87': 86,
 '88': 87,
 '89': 88,
 '90': 89,
 '91': 90,
 '92': 91,
 '93': 

In [12]:
# One row per movie: id, title, '|'-joined genre names and the stored
# `fastrp` property (presumably a precomputed embedding - verify in the DB).
movie_query = """
MATCH (m:Movie)-[:IN_GENRE]->(genre:Genre)
WITH m, collect(genre.name) AS genres_list
RETURN m.movieId AS movieId, m.title AS title, apoc.text.join(genres_list, '|') AS genres, m.fastrp AS fastrp
"""

# One encoder per returned column; load_node concatenates their outputs in
# this order to build the movie feature matrix.
movie_encoders = {}
movie_encoders['title'] = SequenceEncoder()
movie_encoders['genres'] = GenresEncoder()
movie_encoders['fastrp'] = IdentityEncoder(is_list=True)

movie_x, movie_mapping = load_node(
    movie_query, index_col='movieId', encoders=movie_encoders)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


Batches:   0%|          | 0/286 [00:00<?, ?it/s]

In [13]:
# Every RATED relationship becomes one user -> movie edge whose label is the
# rating cast to an integer tensor.
rating_query = """
MATCH (u:User)-[r:RATED]->(m:Movie) 
RETURN u.userId AS userId, m.movieId AS movieId, r.rating AS rating
"""

rating_encoders = {'rating': IdentityEncoder(dtype=torch.long)}
rating_endpoints = dict(
    src_index_col='userId',
    src_mapping=user_mapping,
    dst_index_col='movieId',
    dst_mapping=movie_mapping,
)

edge_index, edge_label = load_edge(
    rating_query, encoders=rating_encoders, **rating_endpoints)

In [14]:
data = HeteroData()

# The user query returns no attributes, so every user gets a one-hot row of
# an identity matrix as its feature vector for message passing.
data['user'].x = torch.eye(len(user_mapping), device=device)

# Movie features come straight from the encoders applied in load_node.
data['movie'].x = movie_x

# Ratings: the edge list and the per-edge rating label share one edge store.
rates = data['user', 'rates', 'movie']
rates.edge_index = edge_index
rates.edge_label = edge_label

# Last expression of the cell, so the moved graph is also displayed.
data.to(device, non_blocking=True)

HeteroData(
  user={ x=[671, 671] },
  movie={ x=[9125, 460] },
  (user, rates, movie)={
    edge_index=[2, 100004],
    edge_label=[100004],
  }
)

In [15]:
rates_type = ('user', 'rates', 'movie')
rev_rates_type = ('movie', 'rev_rates', 'user')

# 1. Add the reverse relation so messages can flow in both directions, then
#    drop the label copied onto the reverse edges.
data = ToUndirected()(data)
del data[rev_rates_type].edge_label

# 2. Perform a link-level split into training, validation, and test edges.
#    No negative edges are sampled (neg_sampling_ratio=0.0).
transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[rates_type],
    rev_edge_types=[rev_rates_type],
)
train_data, val_data, test_data = transform(data)

In [17]:
class GNNEncoder(torch.nn.Module):
    """Two stacked SAGEConv layers with a ReLU in between."""

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # (-1, -1) leaves the input sizes unspecified at construction time.
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        """Apply conv1, ReLU, then conv2 using the same `edge_index`."""
        hidden = self.conv1(x, edge_index)
        hidden = hidden.relu()
        return self.conv2(hidden, edge_index)


class EdgeDecoder(torch.nn.Module):
    """Score user-movie pairs from their embeddings with a two-layer MLP."""

    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        """Return one score per column of `edge_label_index` as a flat tensor.

        Row 0 of `edge_label_index` indexes `z_dict['user']`, row 1 indexes
        `z_dict['movie']`.
        """
        user_idx, movie_idx = edge_label_index
        user_z = z_dict['user'][user_idx]
        movie_z = z_dict['movie'][movie_idx]

        # Each pair is represented by its two embeddings side by side.
        pair = torch.cat([user_z, movie_z], dim=-1)
        hidden = self.lin1(pair).relu()
        return self.lin2(hidden).view(-1)

class Model(torch.nn.Module):
    """Heterogeneous encoder followed by the edge decoder.

    The encoder is a GNNEncoder converted with `to_hetero`, using the
    metadata of the module-level `data` graph and 'sum' aggregation.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        homogeneous_encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(homogeneous_encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        """Encode all nodes, then score the pairs in `edge_label_index`."""
        node_embeddings = self.encoder(x_dict, edge_index_dict)
        return self.decoder(node_embeddings, edge_label_index)
# Per-rating weights: the most frequent rating gets weight 1 and rarer
# ratings get proportionally larger weights.
rating_counts = torch.bincount(train_data['user','rates', 'movie'].edge_label)
weight = rating_counts.max() / rating_counts

def weighted_mse_loss(pred, target, weight=None):
    """Mean squared error where each sample is scaled by `weight[target]`.

    Args:
        pred: Predicted values.
        target: Integer targets; also used to index into `weight`.
        weight: Optional tensor of per-target-value weights. When None every
            sample has weight 1.

    Returns:
        The mean of the (weighted) squared errors as a scalar tensor.
    """
    squared_error = (pred - target.to(pred.dtype)).pow(2)
    if weight is None:
        scale = 1.
    else:
        scale = weight[target].to(pred.dtype)
    return (scale * squared_error).mean()
# Build the model with 64 hidden channels and move it to the chosen device.
model = Model(hidden_channels=64).to(device)
# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred:
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

# Created only after the forward pass above, so that model.parameters() is
# read once the lazily-sized layers have been initialised.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
def train():
    """Run one optimisation step on the whole training split.

    Uses the module-level `model`, `optimizer`, `train_data` and `weight`.

    Returns:
        The weighted MSE loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    rates = train_data['user', 'rates', 'movie']
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 rates.edge_label_index)
    loss = weighted_mse_loss(pred, rates.edge_label, weight)

    loss.backward()
    optimizer.step()
    return float(loss)
def test(data):
    """Return the RMSE of the module-level `model` on the given split.

    Predictions are clamped to the range [0, 5] before the error is computed.
    """
    with torch.no_grad():
        model.eval()
        rates = data['user', 'rates', 'movie']
        pred = model(data.x_dict, data.edge_index_dict,
                     rates.edge_label_index)
        clamped = pred.clamp(min=0, max=5)
        target = rates.edge_label.float()
        return float(F.mse_loss(clamped, target).sqrt())

In [18]:
# range(1, 300) covers epochs 1 through 299.
for epoch in range(1, 300):
    loss = train()
    # Evaluate the splits in the same order as the columns of the log line.
    train_rmse, val_rmse, test_rmse = [
        test(split) for split in (train_data, val_data, test_data)
    ]
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

Epoch: 001, Loss: 19.2698, Train: 3.1679, Val: 3.1740, Test: 3.1728
Epoch: 002, Loss: 15.6286, Train: 2.2449, Val: 2.2568, Test: 2.2530
Epoch: 003, Loss: 8.7431, Train: 1.2806, Val: 1.2798, Test: 1.2674
Epoch: 004, Loss: 11.4858, Train: 1.2121, Val: 1.2286, Test: 1.2174
Epoch: 005, Loss: 6.6172, Train: 1.8334, Val: 1.8470, Test: 1.8413
Epoch: 006, Loss: 6.9604, Train: 2.1434, Val: 2.1545, Test: 2.1503
Epoch: 007, Loss: 8.1775, Train: 2.1393, Val: 2.1503, Test: 2.1461
Epoch: 008, Loss: 8.1368, Train: 1.9196, Val: 1.9320, Test: 1.9268
Epoch: 009, Loss: 7.1642, Train: 1.5417, Val: 1.5570, Test: 1.5491
Epoch: 010, Loss: 6.1067, Train: 1.1701, Val: 1.1877, Test: 1.1751
Epoch: 011, Loss: 6.1622, Train: 1.0571, Val: 1.0736, Test: 1.0581
Epoch: 012, Loss: 7.0014, Train: 1.0737, Val: 1.0917, Test: 1.0766
Epoch: 013, Loss: 6.4787, Train: 1.2454, Val: 1.2636, Test: 1.2518
Epoch: 014, Loss: 5.6958, Train: 1.5059, Val: 1.5218, Test: 1.5131
Epoch: 015, Loss: 5.7471, Train: 1.6786, Val: 1.6928, Test:

In [19]:
num_movies = len(movie_mapping)
num_users = len(user_mapping)

# Invert the key -> position mappings so that positions can be turned back
# into the Neo4j ids.
reverse_movie_mapping = dict(zip(movie_mapping.values(),movie_mapping.keys()))
reverse_user_mapping = dict(zip(user_mapping.values(),user_mapping.keys()))

results = []

# Inference only: evaluation mode and no autograd bookkeeping.
model.eval()
with torch.no_grad():
    # The node embeddings do not depend on which user is being scored, so
    # they are computed once here instead of once per user.
    z_dict = model.encoder(data.x_dict, data.edge_index_dict)
    # Index tensors are created on the same device as the graph.
    col = torch.arange(num_movies, device=device)

    for user_id in range(num_users):
        # Pair this user with every movie.
        row = torch.full((num_movies,), user_id, dtype=torch.long, device=device)
        edge_label_index = torch.stack([row, col], dim=0)

        pred = model.decoder(z_dict, edge_label_index)
        pred = pred.clamp(min=0, max=5)

        user_neo4j_id = reverse_user_mapping[user_id]

        # Keep movies whose clamped prediction equals the maximum rating 5,
        # taking at most the first ten in movie-position order.
        mask = (pred == 5).nonzero(as_tuple=True)

        ten_predictions = [reverse_movie_mapping[el] for el in mask[0].tolist()[:10]]
        results.append({'user': user_neo4j_id, 'movies': ten_predictions})

In [20]:
results

[{'user': '1',
  'movies': ['26513', '107559', '32469', '5768', '51471', '114', '6920']},
 {'user': '2',
  'movies': ['110352',
   '106471',
   '106473',
   '80839',
   '62383',
   '55498',
   '26513',
   '8341',
   '107559',
   '116939']},
 {'user': '3',
  'movies': ['110352',
   '106471',
   '106473',
   '62383',
   '26513',
   '8341',
   '107559',
   '116939',
   '84304',
   '98000']},
 {'user': '4',
  'movies': ['115664',
   '110352',
   '110873',
   '106471',
   '106473',
   '103659',
   '103210',
   '102666',
   '102252',
   '162672']},
 {'user': '5',
  'movies': ['110352',
   '110873',
   '106471',
   '106473',
   '80839',
   '62383',
   '55498',
   '26513',
   '8341',
   '8537']},
 {'user': '6',
  'movies': ['26513',
   '8341',
   '107559',
   '32469',
   '5768',
   '1067',
   '149606',
   '51471',
   '5071',
   '389']},
 {'user': '7',
  'movies': ['110352',
   '110873',
   '106471',
   '106473',
   '80839',
   '62383',
   '55498',
   '26513',
   '8341',
   '8537']},
 {'user': 

In [None]:
# For every {user, movies} entry in $data: match the user and each listed
# movie, skip pairs that already have a RATED relationship, and MERGE a
# RECOMMEND relationship for the rest.
import_predictions_query = """
UNWIND $data AS row
MATCH (u:User {userId: row.user})
WITH u, row
UNWIND row.movies AS movieId
MATCH (m:Movie {movieId: movieId})
WITH u,m
// filter out existing links
WHERE NOT (u)-[:RATED]->(m)
MERGE (u)-[:RECOMMEND]->(m)
"""

# Exercise for the user: update the fetch_data function to take the results
# data as an input and run the above query.
# NOTE(review): this call passes a second argument (the query parameters); it
# only runs if fetch_data accepts one and forwards it to session.run.
fetch_data(import_predictions_query, {'data': results})