In [1]:
from settings import * 

import torch 
import pandas as pd 
import dgl 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_path = os.path.join(YELP_DIR, 'train.csv')
valid_path = os.path.join(YELP_DIR, 'valid.csv')
test_path = os.path.join(YELP_DIR, 'test.csv')

d_train = pd.read_csv(train_path, encoding='utf-8-sig')
d_valid = pd.read_csv(valid_path, encoding='utf-8-sig')
d_test = pd.read_csv(test_path, encoding='utf-8-sig')

In [3]:
train_uids = torch.LongTensor(d_train.loc[:, 'user_id'])
train_iids = torch.LongTensor(d_train.loc[:, 'business_id'])
train_ratings = torch.LongTensor(d_train.loc[:, 'stars'])

valid_uids = torch.LongTensor(d_valid.loc[:, 'user_id'])
valid_iids = torch.LongTensor(d_valid.loc[:, 'business_id'])
valid_ratings = torch.LongTensor(d_valid.loc[:, 'stars'])

test_uids = torch.LongTensor(d_test.loc[:, 'user_id'])
test_iids = torch.LongTensor(d_test.loc[:, 'business_id'])
test_ratings = torch.LongTensor(d_test.loc[:, 'stars'])

In [4]:
graph = dgl.heterograph({
    ('user', 'preference', 'item'): (train_uids, train_iids), 
    ('item', 'preference-by', 'user') : (train_iids, train_uids)
})

graph.edges['preference'].data['rating'] = train_ratings 
graph.edges['preference-by'].data['rating'] = train_ratings 

In [5]:
from torch.utils.data import TensorDataset, DataLoader 

train_set = TensorDataset(train_uids, train_iids, train_ratings)
valid_set = TensorDataset(valid_uids, valid_iids, valid_ratings)
test_set = TensorDataset(test_uids, test_iids, test_ratings)

In [13]:
class MinibatchSampler:
    def __init__(self, graph, num_layers):
        self.graph = graph 
        self.num_layers = num_layers 

    def sample(self, batch):
        users, items, ratings = zip(*batch)
        users = torch.stack(users)
        items = torch.stack(items)
        ratings = torch.stack(ratings)

        pair_graph = dgl.heterograph(
            {('user', 'preference', 'item') : (users, items)}, 
            num_nodes_dict = {'user' : self.graph.number_of_nodes('user'), 'item': self.graph.number_of_nodes('item')}
        )

        pair_graph = dgl.compact_graphs(pair_graph)

        pair_graph.edata['rating'] = ratings

        seeds = {'user': pair_graph.nodes['user'].data[dgl.NID], 
                 'item': pair_graph.nodes['item'].data[dgl.NID]}
        blocks = self.construct_blocks(seeds, (users, items))

        for feature_name in self.graph.nodes['user'].data.keys():
            blocks[0].srcnodes['user'].data[feature_name] = self.graph.nodes['user'].data[feature_name][blocks[0].sronodes['user'].data[dgl.NID]]

        for feature_name in self.graph.nodes['item'].data.keys():
            blocks[0].srcnodes['item'].data[feature_name] = self.graph.nodes['item'].data[feature_name][blocks[0].srcnodes['item'].data[dgl.NID]]

        return pair_graph, blocks

    def construct_blocks(self, seeds, user_item_pairs_to_remove):
        blocks = []
        users, items = user_item_pairs_to_remove
        for i in range(self.num_layers):

            sampled_graph = dgl.in_subgraph(self.graph, seeds)

            sampled_eids = sampled_graph.edges['preference'].data[dgl.EID]
            sampled_eids_rev = sampled_graph.edges['preference-by'].data[dgl.EID]
            
            # rating을 예측하는 것은 edge를 예측하는 것과 같으며, 
            # sub graph의 edge를 예측할 때 모델이 연결되어 있다는 정보를 알지 못하도록 remove 합니다.
            # 모델이 연결되어 있다는 정보를 알고 있다면, 예측의 의미가 없기 때문입니다.
            _, _, edges_to_remove = sampled_graph.edge_ids(users, items, etype='preference', return_uv=True)
            _, _, edges_to_remove_rev = sampled_graph.edge_ids(items, users, etype='preference-by', return_uv=True)
            
            sampled_with_edges_removed = sampled_graph
            if len(edges_to_remove) > 0:
                sampled_with_edges_removed = dgl.remove_edges(
                    sampled_with_edges_removed, edges_to_remove, 'preference')
                sampled_eids = sampled_with_edges_removed.edges['preference'].data[dgl.EID]
            if len(edges_to_remove_rev) > 0:
                sampled_with_edges_removed = dgl.remove_edges(
                    sampled_with_edges_removed, edges_to_remove_rev, 'preference-by')
                sampled_eids_rev = sampled_with_edges_removed.edges['preference-by'].data[dgl.EID]
            
            # Create a block from the sampled graph.
            block = dgl.to_block(sampled_with_edges_removed, seeds)
            blocks.insert(0, block)
            seeds = {'user': block.srcnodes['user'].data[dgl.NID],
                     'item': block.srcnodes['item'].data[dgl.NID]}
            
            # Copy the ratings to the edges of the sampled block
            block.edges['preference'].data['rating'] = \
                self.graph.edges['preference'].data['rating'][sampled_eids]
            block.edges['preference-by'].data['rating'] = \
                self.graph.edges['preference-by'].data['rating'][sampled_eids_rev]
            
        return blocks

In [14]:
import torch.nn as nn 
import torch.nn.functional as F 
import dgl.function as fn 
import dgl.nn as dglnn 

In [15]:
class GCMCConv(nn.Module):
    def __init__(self, hidden_dims, num_ratings):
        super().__init__()
        
        # The ratings are ranged from 1 to num_ratings, so I add 1 to the number of parameters.
        self.W_r = nn.Parameter(torch.randn(num_ratings + 1, hidden_dims, hidden_dims))
        self.W = nn.Linear(hidden_dims * 2, hidden_dims)
        
    def compute_message(self, W, edges):
        W_r = W[edges.data['rating']]
        h = edges.src['h']
        m = (W_r @ h.unsqueeze(-1)).squeeze(2)
        return m
    
    def forward(self, graph, node_features):
        with graph.local_scope():
            src_features, dst_features = node_features
            graph.srcdata['h'] = src_features
            graph.dstdata['h'] = dst_features
            # Compute messages
            graph.apply_edges(lambda edges: {'m': self.compute_message(self.W_r, edges)})
            # Aggregate messages
            graph.update_all(fn.copy_e('m', 'm'), fn.mean('m', 'h_neigh'))
            # Updates the representations of output users and items
            result = F.relu(self.W(torch.cat([graph.dstdata['h'], graph.dstdata['h_neigh']], 1)))
            return result

In [16]:
class GCMCLayer(nn.Module):
    def __init__(self, hidden_dims, num_ratings):
        super().__init__()
        
        self.heteroconv = dglnn.HeteroGraphConv(
            {'preference': GCMCConv(hidden_dims, num_ratings), 'preference-by': GCMCConv(hidden_dims, num_ratings)}, 
            aggregate='sum'
        )
    
    def forward(self, block, input_user_features, input_item_features):
        with block.local_scope():
            h_user = input_user_features 
            h_item = input_item_features 
            
            src_features = {'user':h_user, 'item':h_item}
            
            dst_features = {'user':h_user[:block.number_of_dst_nodes('user')], 'item': h_item[:block.number_of_dst_nodes('item')]}
            
            result = self.heteroconv(block, (src_features, dst_features))
            return result['user'], result['item']

In [17]:
class GCMCRating(nn.Module):
    def __init__(self, num_users, num_items, hidden_dims, num_ratings, num_layers):
        super().__init__()
        
        # Node-specific learnable embeddings
        self.user_embeddings = nn.Embedding(num_users, hidden_dims)
        self.item_embeddings = nn.Embedding(num_items, hidden_dims)
        
        self.layers = nn.ModuleList([
            GCMCLayer(hidden_dims, num_ratings) for _ in range(num_layers)])
        
        self.W = nn.Linear(hidden_dims, hidden_dims)
        self.V = nn.Linear(hidden_dims, hidden_dims)
        
    def forward(self, blocks):
        # Propagate messages top-down (Step 4)
        # We start with a learnable embedding for each user and item...
        user_embeddings = self.user_embeddings(blocks[0].srcnodes['user'].data[dgl.NID])
        item_embeddings = self.item_embeddings(blocks[0].srcnodes['item'].data[dgl.NID])
        
        
        # Then perform a heterogeneous GCMC convolution
        for block, layer in zip(blocks, self.layers):
            user_embeddings, item_embeddings = layer(block, user_embeddings, item_embeddings)
        
        # Compute predicted preference (Step 5)
        user_embeddings = self.W(user_embeddings)
        item_embeddings = self.V(item_embeddings)
        
        return user_embeddings, item_embeddings
        
    def compute_score(self, pair_graph, user_embeddings, item_embeddings):
        with pair_graph.local_scope():
            pair_graph.nodes['user'].data['h'] = user_embeddings
            pair_graph.nodes['item'].data['h'] = item_embeddings
            pair_graph.apply_edges(fn.u_dot_v('h', 'h', 'r'))
            
            return pair_graph.edata['r']

In [37]:
def rmse(pred, label):
    pred = pred.flatten()
    return ((pred - label)**2).mean().sqrt()

In [40]:
import tqdm 

NUM_LAYERS = 1 
BATCH_SIZE = 500 
NUM_EPOCHS = 50 
HIDDEN_DIMS = 8

sampler = MinibatchSampler(graph, NUM_LAYERS)
train_dataloader = DataLoader(train_set, batch_size=BATCH_SIZE, collate_fn=sampler.sample, shuffle=True)
test_dataloader = DataLoader(valid_set, batch_size=BATCH_SIZE, collate_fn=sampler.sample, shuffle=False)

model = GCMCRating(graph.number_of_nodes('user'), graph.number_of_nodes('item'), HIDDEN_DIMS, 5, NUM_LAYERS)
optimizer = torch.optim.Adam(model.parameters())

best_loss = float('inf')
for _ in range(NUM_EPOCHS):
    model.train()
    with tqdm.tqdm(train_dataloader) as t:
        for pair_graph, blocks in t: 
           
            user_emb, item_emb = model(blocks)
            prediction = model.compute_score(pair_graph, user_emb, item_emb)
            loss = ((prediction - pair_graph.edata['rating']) ** 2).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            t.set_postfix({'loss': '%.4f' % loss.item()}, refresh=False)
    
    model.eval()
    with tqdm.tqdm(test_dataloader) as t:
        with torch.no_grad():
            predictions = [] 
            ratings = []
            for pair_graph, blocks in t:
                user_emb, item_emb = model(blocks)
                prediction = model.compute_score(pair_graph, user_emb, item_emb)
                predictions.append(prediction)
                ratings.append(pair_graph.edata['rating'])
            
            predictions = torch.cat(predictions, dim=0)
            ratings = torch.cat(ratings, dim=0)
        RMSE_loss = rmse(predictions, ratings)
        print(f'RMSE: {RMSE_loss.item():.4f}', )

        if best_loss > RMSE_loss:
            best_loss = RMSE_loss 
            path = os.path.join(SAVE_PATH)
            torch.save(model.state_dict(), os.path.join(path, 'GCMC-paramters.pt'))

100%|██████████| 1448/1448 [01:04<00:00, 22.35it/s, loss=1.4018]
100%|██████████| 480/480 [00:07<00:00, 60.60it/s]


RMSE: 1.1500


 11%|█▏        | 164/1448 [00:07<00:58, 22.03it/s, loss=1.3607]