In [1]:
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics.cluster import (
    completeness_score,
    homogeneity_score,
    v_measure_score,
)
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
from torch_geometric.data import HeteroData
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from torch_geometric.data import InMemoryDataset
import torch_geometric.transforms as T
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, SAGEConv
import torch.nn.functional as F
from torch import nn

In [2]:
def load_node_csv(path, index_col, item_col, encoders=None, **kwargs):
    """Read a node table from CSV and build an id mapping plus a feature matrix.

    Args:
        path: File path or buffer forwarded to ``pd.read_csv``.
        index_col: Column whose distinct values become node ids. Each distinct
            value is assigned a consecutive integer, in the order returned by
            ``Series.unique()``.
        item_col: Column whose number of distinct values sets the length of the
            default feature column (only read when no encoders are supplied).
        encoders: Optional ``{column name: callable}`` mapping. Each callable
            receives the column as a ``pd.Series`` and must return a tensor;
            the results are concatenated along the last dimension. ``None`` or
            an empty mapping selects the default features.
        **kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        tuple: ``(x, mapping)`` where ``mapping`` maps raw ``index_col`` values
        to consecutive integers and ``x`` is either a NumPy array of shape
        ``(n_unique_items, 1)`` holding ``0..n-1`` (no encoders) or the
        ``torch.Tensor`` produced by concatenating the encoder outputs.
    """
    df = pd.read_csv(path, **kwargs)
    mapping = {index: i for i, index in enumerate(df[index_col].unique())}

    if encoders:
        xs = [encoder(df[col]) for col, encoder in encoders.items()]
        x = torch.cat(xs, dim=-1)
    else:
        # NOTE(review): the mapping is keyed on `index_col` while the default
        # features are sized by `item_col`; kept as in the original.
        x = np.arange(df[item_col].nunique()).reshape(-1, 1)

    return x, mapping


def load_edge_csv(path, src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None, **kwargs):
    """Read an edge table from CSV and turn it into an edge index (+ attributes).

    Args:
        path: File path or buffer forwarded to ``pd.read_csv``.
        src_index_col: Column holding the raw source-node identifiers.
        src_mapping: Dict translating raw source identifiers to node indices.
        dst_index_col: Column holding the raw destination-node identifiers.
        dst_mapping: Dict translating raw destination identifiers to node
            indices.
        encoders: Optional ``{column name: callable}`` mapping. Each callable
            receives the column as a ``pd.Series`` and must return a tensor;
            the results are concatenated along the last dimension. ``None`` or
            an empty mapping yields no edge attributes.
        **kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        tuple: ``(edge_index, edge_attr)`` where ``edge_index`` is a
        ``torch.long`` tensor of shape ``[2, num_rows]`` and ``edge_attr`` is
        the concatenated encoder output, or ``None`` without encoders.

    Raises:
        KeyError: If a raw identifier is missing from its mapping.
    """
    df = pd.read_csv(path, **kwargs)

    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    # Pin the dtype: with an empty table torch would otherwise infer a float
    # tensor, which is not usable as an index.
    edge_index = torch.tensor([src, dst], dtype=torch.long)

    edge_attr = None
    if encoders:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim=-1)

    return edge_index, edge_attr


class IdentityEncoder:
    """Column encoder that emits a constant 1 for every entry.

    Despite the name, the column's values are not passed through: only the
    shape and NumPy dtype of ``df.values`` are used to allocate an array of
    ones, which is then reshaped into a single column and cast to ``dtype``.

    Args:
        dtype: Target torch dtype handed to ``Tensor.to`` (default ``None``).
    """

    def __init__(self, dtype=None):
        self.dtype = dtype

    def __call__(self, df):
        # Same shape/dtype as the raw values, but filled with ones.
        ones = np.ones_like(df.values)
        column = torch.from_numpy(ones).view(-1, 1)
        return column.to(self.dtype)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): absolute, environment-specific path; the same CSV is parsed
# three times below (load_node_csv, pd.read_csv, load_edge_csv).
rating_path = '/opt/ml/input/data/train/train_ratings.csv'

# `user_mapping` assigns a consecutive index to every distinct 'user' value.
# `movie_x` is the default feature column returned when no encoders are given
# (one row per distinct 'item'); it is not referenced again in the visible code.
movie_x, user_mapping = load_node_csv(rating_path, index_col='user', item_col='item')

# Build the item -> index mapping directly from the raw table.
df = pd.read_csv(rating_path)
movie_mapping = {index: i for i, index in enumerate(df.item.unique())}
# One edge per CSV row. IdentityEncoder returns ones, so `edge_label` is a
# column of 1s (every observed interaction is a positive example).
edge_index, edge_label = load_edge_csv(
    rating_path,
    src_index_col='user',
    src_mapping=user_mapping,
    dst_index_col='item',
    dst_mapping=movie_mapping,
    encoders={'item': IdentityEncoder(dtype=torch.long)},
)

# Assemble the heterogeneous user/movie graph.
data = HeteroData()

data['user'].num_nodes = len(user_mapping)  # Users do not have any features.
data['movie'].num_nodes = len(movie_mapping)
data['user', 'rates', 'movie'].edge_index = edge_index
data['user', 'rates', 'movie'].edge_label = edge_label

# data = data.pin_memory()
# data = data.to('cuda:0', non_blocking=True)

# First print: the graph before any node features are attached.
print(data)
# Give every user a constant 1-dimensional feature, then drop the explicit
# node count (presumably it is inferred from `x` afterwards; confirm).
data['user'].x = torch.ones(data['user'].num_nodes, 1, dtype=torch.float32)
del data['user'].num_nodes

# Same constant feature for movies.
data['movie'].x = torch.ones(data['movie'].num_nodes, 1, dtype=torch.float32)
del data['movie'].num_nodes

# Move all tensors of the graph to the selected device.
data = data.to(device)
# We can now convert `data` into an appropriate format for training a
# graph-based machine learning model:

# 1. Add a reverse ('movie', 'rev_rates', 'user') relation for message passing.
data = ToUndirected()(data)
del data['movie', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.

# Second print: the graph with features and the reverse relation.
print(data)

HeteroData(
  [1muser[0m={ num_nodes=31360 },
  [1mmovie[0m={ num_nodes=6807 },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 5154471],
    edge_label=[5154471, 1]
  }
)
HeteroData(
  [1muser[0m={ x=[31360, 1] },
  [1mmovie[0m={ x=[6807, 1] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 5154471],
    edge_label=[5154471, 1]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 5154471] }
)


In [3]:
# data = InMemoryDataset.collate([data])[0].to(device)
# data = data.to_homogeneous()

In [4]:
# Display the assembled heterogeneous graph.
data

HeteroData(
  [1muser[0m={ x=[31360, 1] },
  [1mmovie[0m={ x=[6807, 1] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 5154471],
    edge_label=[5154471, 1]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 5154471] }
)

In [6]:
# Split the ('user', 'rates', 'movie') edges into train/validation/test sets,
# holding out 10% for validation and 10% for testing. The reverse relation is
# declared via `rev_edge_types` so that it is split along with the forward one.
# NOTE(review): every edge label is 1 and `neg_sampling_ratio=0.0` adds no
# negative edges, so the supervision signal contains positives only; confirm
# that this is intended.
train_data, val_data, test_data = RandomLinkSplit(
    edge_types=[('user', 'rates', 'movie')],
    rev_edge_types=[('movie', 'rev_rates', 'user')],
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
)(data)

In [8]:
# Compare the full graph with the training split, one summary per line.
print(data, train_data, sep='\n')

HeteroData(
  [1muser[0m={ x=[31360, 1] },
  [1mmovie[0m={ x=[6807, 1] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 5154471],
    edge_label=[5154471, 1]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 5154471] }
)
HeteroData(
  [1muser[0m={ x=[31360, 1] },
  [1mmovie[0m={ x=[6807, 1] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 4123577],
    edge_label=[4123577, 1],
    edge_label_index=[2, 4123577]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 4123577] }
)


In [13]:
# Inspect the supervision edges (`edge_label_index`) that the link split
# attached to the training split's ('user', 'rates', 'movie') relation.
train_data[('user', 'rates', 'movie')].edge_label_index

tensor([[30479, 19339, 15912,  ..., 20114, 28161, 11963],
        [ 1280,  3804,   301,  ...,  3822,   922,  2516]], device='cuda:0')

In [None]:
# Each split is a single graph object, while the PyG `DataLoader` expects a
# dataset, i.e. a sequence of graphs. Wrapping each split in a one-element
# list makes every loader yield that graph as one batch.
# NOTE(review): this is full-batch loading; mini-batching over edges would
# need a sampling loader instead (TODO confirm which is intended).
train_loader = DataLoader([train_data], batch_size=32, shuffle=True)
val_loader = DataLoader([val_data], batch_size=32)
test_loader = DataLoader([test_data], batch_size=32)

In [None]:
# Pull a single batch from the training loader to inspect its contents.
next(iter(train_loader))

In [None]:
class GNNEncoder(torch.nn.Module):
    """Two stacked GraphSAGE convolutions producing node embeddings.

    Args:
        hidden_channels: Output width of the first convolution.
        out_channels: Output width of the second convolution, i.e. the size of
            the returned embeddings.
    """

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # Input sizes are given as (-1, -1) rather than fixed numbers.
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        """Apply conv1 -> ReLU -> conv2 over the given edges and return the result."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)


class EdgeDecoder(torch.nn.Module):
    """Two-layer MLP that maps per-edge feature vectors to one score each.

    Args:
        hidden_channels: Width of the hidden layer. The input is expected to be
            ``2 * hidden_channels`` wide.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = nn.Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = nn.Linear(hidden_channels, 1)

    def forward(self, z):
        """Return a flat tensor holding one score per row of ``z``."""
        hidden = F.relu(self.lin1(z))
        scores = self.lin2(hidden)
        # Collapse the trailing singleton dimension: [N, 1] -> [N].
        return scores.view(-1)


class Model(torch.nn.Module):
    """Link-prediction model: GNN encoder followed by an edge-level decoder.

    The encoder produces one ``hidden_channels``-wide embedding per node, while
    the decoder consumes ``2 * hidden_channels``-wide rows. ``forward``
    therefore pairs up the embeddings of each edge's two endpoints before
    decoding; feeding the node embeddings to the decoder directly fails with a
    shape mismatch.

    Args:
        hidden_channels: Embedding width used by both encoder and decoder.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x, edge_index, edge_label_index=None):
        """Score edges.

        Args:
            x: Node features passed to the encoder.
            edge_index: ``[2, E]`` edges used for message passing.
            edge_label_index: Optional ``[2, K]`` edges to score. Defaults to
                ``edge_index`` so existing two-argument calls keep working.

        Returns:
            Flat tensor with one logit per scored edge.
        """
        z = self.encoder(x, edge_index)
        if edge_label_index is None:
            edge_label_index = edge_index
        # NOTE(review): assumes a homogeneous graph in which both endpoints
        # index the same embedding matrix `z`; TODO confirm against the data
        # actually fed in.
        src, dst = edge_label_index
        pair_features = torch.cat([z[src], z[dst]], dim=-1)
        return self.decoder(pair_features)


# Instantiate the model with 32 hidden channels and move it to `device`.
model = Model(hidden_channels=32).to(device)

# NOTE(review): the warm-up forward pass below is commented out. The encoder's
# convolutions are built with (-1, -1) input sizes; if they are lazily
# initialised as the comment says, the optimizer is created before the
# parameter shapes are known. TODO confirm this is intended.
# # Due to lazy initialization, we need to run one model step so the number
# # of parameters can be inferred:
# with torch.no_grad():
#     model.encoder(train_data.x_dict, train_data.edge_index_dict)

# Adam over all model parameters with a learning rate of 0.01.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def train():
    """Run one optimisation epoch over ``train_loader``.

    Uses the module-level ``model``, ``optimizer`` and ``train_loader``.

    Returns:
        float: The per-batch loss averaged over all graphs seen in the epoch
        (each batch is weighted by its ``num_graphs``); ``0.0`` if the loader
        yields nothing.
    """
    model.train()
    total_loss = 0.0
    total_graphs = 0
    for data in train_loader:
        optimizer.zero_grad()
        # NOTE(review): `.x` / `.edge_index` are homogeneous-graph attributes;
        # confirm the batches are not heterogeneous (x_dict / edge_index_dict).
        pred = model(data.x, data.edge_index)
        # The decoder returns a flat [N] tensor while `edge_label` is stored as
        # [N, 1]; the loss requires identical shapes, so flatten the target.
        target = data.edge_label.float().reshape(-1)
        # NOTE(review): the square root of a BCE loss is unusual and its
        # gradient is unbounded as the loss approaches 0; kept as in the
        # original, confirm it is intended.
        loss = F.binary_cross_entropy_with_logits(pred, target).sqrt()
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * data.num_graphs
        total_graphs += data.num_graphs
    # Normalise by the number of graphs actually processed, matching the
    # weighting used while accumulating.
    return total_loss / max(total_graphs, 1)

@torch.no_grad()
def test(dataloader):
    """Evaluate the module-level ``model`` on every batch of ``dataloader``.

    Args:
        dataloader: Iterable of batches exposing ``x``, ``edge_index``,
            ``edge_label`` and ``num_graphs``.

    Returns:
        float: Square root of the BCE-with-logits loss, averaged over all
        graphs seen (the same quantity and weighting that ``train`` reports);
        ``0.0`` if the loader yields nothing. Callers name this value "rmse",
        but it is not a root-mean-squared error.
    """
    model.eval()
    total_loss = 0.0
    total_graphs = 0
    for data in dataloader:
        # Raw logits go straight into the logits-based loss; clamping them to
        # [0, 1] first would restrict the implied probability to about
        # [0.5, 0.73] and distort the metric.
        pred = model(data.x, data.edge_index)
        # Flatten the [N, 1] labels to match the flat prediction tensor.
        target = data.edge_label.float().reshape(-1)
        loss = F.binary_cross_entropy_with_logits(pred, target).sqrt()
        total_loss += float(loss) * data.num_graphs
        total_graphs += data.num_graphs
    return total_loss / max(total_graphs, 1)



In [None]:
# Train for 10 epochs and report the metric on every split after each one.
# `test()` iterates over a dataloader, so it is given the loaders rather than
# the raw data splits.
for epoch in range(1, 11):
    loss = train()
    train_rmse = test(train_loader)
    val_rmse = test(val_loader)
    test_rmse = test(test_loader)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')