In [1]:
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics.cluster import (
    completeness_score,
    homogeneity_score,
    v_measure_score,
)
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
from torch_geometric.data import HeteroData
from torch_geometric.data import InMemoryDataset
import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.utils import negative_sampling
import torch.nn.functional as F
import torch.nn as nn
from tqdm import tqdm
from torch_geometric.loader import HGTLoader

import os
import random

In [2]:
class MovieLensDataset(InMemoryDataset):

    def __init__(self, root, transform = None, pre_transform = None):
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return [f'/opt/ml/input/data/train/train_ratings.csv']

    @property
    def processed_file_names(self):
        return [f'data_0.pt']

    def download(self):
        pass

    def process(self):
        data = HeteroData()

        df = pd.read_csv(self.raw_file_names[0])

        item_mapping = {idx: i for i, idx in enumerate(df.item.unique())}
        user_mapping = {idx: i for i, idx in enumerate(df.user.unique())}
        data['item'].num_nodes = len(item_mapping)
        data['user'].num_nodes = len(user_mapping)

        src = [user_mapping[idx] for idx in df['user']]
        dst = [item_mapping[idx] for idx in df['item']]
        edge_index = torch.tensor([src, dst])

        rating = torch.from_numpy(np.ones_like(df.item)).to(torch.long)
        data['user', 'rates', 'item'].edge_index = edge_index
        data['user', 'rates', 'item'].edge_label = rating

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        torch.save(self.collate([data]), self.processed_paths[0])

In [3]:
dataset = MovieLensDataset(root='/opt/ml/input/melon')

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = dataset[0].to(device)

idx = np.arange(data['user'].num_nodes)
random.shuffle(idx)
train_idx, val_idx = idx[len(idx)//10:], idx[:len(idx)//10]

train_mask = torch.ones(data['user'].num_nodes, dtype=bool)
train_mask[train_idx] = True

val_mask = torch.zeros(data['user'].num_nodes, dtype=bool)
val_mask[val_idx] = True

data['user'].train_mask = train_mask
data['user'].val_mask = val_mask

data['user'].x = torch.ones(data['user'].num_nodes, 1, dtype=torch.float32)
del data['user'].num_nodes

data['item'].x = torch.ones(data['item'].num_nodes, 1, dtype=torch.float32)
del data['item'].num_nodes

data = T.ToUndirected()(data)
del data['item', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.

In [5]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [6]:
train_input_nodes = ('user', data['user'].train_mask)
val_input_nodes = ('user', data['user'].val_mask)

kwargs = {'batch_size': 1024, 'num_workers': 6, 'persistent_workers': True}

train_loader = HGTLoader(data, num_samples=[1024] * 4, shuffle=True, input_nodes = train_input_nodes, **kwargs)
val_loader = HGTLoader(data, num_samples=[1024] * 4, input_nodes = train_input_nodes, **kwargs)

In [7]:
batch = next(iter(train_loader))
print(batch)

HeteroData(
  [1mitem[0m={ x=[4096, 1] },
  [1muser[0m={
    train_mask=[4096],
    val_mask=[4096],
    x=[4096, 1],
    batch_size=1024
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 73302],
    edge_label=[73302]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 191435] }
)


In [None]:
neg_edge_index = negative_sampling(
            edge_index=batch['user', 'item'].edge_index, #positive edges
            # num_nodes=len(set(torch.unique(batch['user', 'item'].edge_index[0].detach().cpu()))),  # throws error
            num_neg_samples=batch['user', 'item'].edge_index.size()[1],
            force_undirected=True
        )

In [None]:
torch.where(batch['user'].train_mask == True)[0].size()[0]

In [13]:
batch.edge_index_dict

{('user',
  'rates',
  'item'): tensor([[2153,  820, 1442,  ..., 2351, 1648, 4053],
         [   0,    0,    0,  ..., 4095, 4095, 4095]], device='cuda:0'),
 ('item',
  'rev_rates',
  'user'): tensor([[1141,    3,  124,  ..., 1038, 3540,  174],
         [   0,    0,    0,  ..., 4095, 4095, 4095]], device='cuda:0')}

In [None]:
batch['user', 'item'].edge_index.size()[1]

In [None]:
neg_edge_index

In [None]:
len(set(torch.unique(batch['user', 'item'].edge_index[0].detach().cpu())))

In [None]:
torch.cat((batch['user', 'item'].edge_label, torch.zeros_like(batch['user', 'item'].edge_label)))

In [9]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = nn.Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = nn.Linear(hidden_channels, 1)

    def forward(self, z_dict, pos_edge_index, neg_edge_index):
        print(pos_edge_index.size())
        print(neg_edge_index.size())
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1)
        row, col = edge_index
        z = torch.cat([z_dict['user'][row], z_dict['item'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1,1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_index, neg_edge_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_index, neg_edge_index)


model = Model(hidden_channels=32).to(device)

# Due to lazy initialization, we need to run a single batch so the number
# of parameters can be inferred:
with torch.no_grad():
    batch = next(iter(train_loader))
    batch = batch.to(device)
    model.encoder(batch.x_dict, batch.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def train(train_loader):
    model.train()
    total_loss = 0
    for data in train_loader:
        neg_edge_index = negative_sampling(
            edge_index=data['user', 'item'].edge_index, #positive edges
            num_neg_samples=data['user', 'item'].edge_index.size()[1],
            force_undirected=True
        )

        data = data.to(device)
        optimizer.zero_grad()
        pred = model(data.x_dict, data.edge_index_dict,
                    data['user', 'item'].edge_index, neg_edge_index)
        print('>>>>>>',batch['user', 'item'].edge_label.size())
        print(batch['user', 'item'].edge_index.size())
        print(neg_edge_index[0].size())
        target = torch.cat((batch['user', 'item'].edge_label, torch.zeros_like(neg_edge_index[0]))).view(-1,1).float()
        
        loss = F.binary_cross_entropy_with_logits(pred, target) # .sqrt()
        # print(f"pred: {pred}, target: {target}\nloss: {loss}")
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * 1024
    return float(total_loss)


@torch.no_grad()
def test(data):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['user', 'item'].edge_index)
    pred = pred.clamp(min=0, max=1)
    target = data['user', 'item'].edge_label.view(-1,1).float()
    rmse = F.binary_cross_entropy_with_logits(pred, target) # .sqrt()
    return float(rmse)

In [10]:
for epoch in range(1, 21):
    loss = train(train_loader)
    train_bce = 0
    for train_data in train_loader:
        train_data = train_data.to(device)
        tmp_train_bce = test(train_data)
        train_bce += tmp_train_bce
    val_bce = 0
    for val_data in val_loader:
        val_data = val_data.to(device)
        tmp_val_bce = test(val_data)
        val_bce += tmp_val_bce
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_bce:.4f}, '
          f'Val: {val_bce:.4f}')

torch.Size([2, 71760])
torch.Size([2, 71760])
>>>>>> torch.Size([73673])
torch.Size([2, 73673])
torch.Size([71760])


ValueError: Target size (torch.Size([145433, 1])) must be the same as input size (torch.Size([143520, 1]))

### TO ARGVA!

In [None]:
from torch.nn import Linear
import torch
from torch_geometric.nn import ARGVA, GCNConv, HeteroConv

In [None]:
class Encoder(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = HeteroConv({
                        ('user', 'rates', 'movie'): GCNConv(in_channels, hidden_channels),
                    }, aggr='sum')
        self.conv_mu = HeteroConv({
                        ('user', 'rates', 'movie'): GCNConv(hidden_channels, out_channels),
                    }, aggr='sum')
        self.conv_logstd = HeteroConv({
                        ('user', 'rates', 'movie'): GCNConv(hidden_channels, out_channels),
                    }, aggr='sum')

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index)


class Discriminator(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.lin1 = Linear(in_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, hidden_channels)
        self.lin3 = Linear(hidden_channels, out_channels)

    def forward(self, x):
        x = self.lin1(x).relu()
        x = self.lin2(x).relu()
        return self.lin3(x)


encoder = Encoder(in_channels=1, hidden_channels=32, out_channels=32)
discriminator = Discriminator(in_channels=32, hidden_channels=64,
                              out_channels=32)
model = ARGVA(encoder, discriminator).to(device)

encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.005)
discriminator_optimizer = torch.optim.Adam(discriminator.parameters(),
                                           lr=0.001)

In [None]:
def train():
    model.train()
    encoder_optimizer.zero_grad()
    z = model.encode(train_data.x_dict, train_data.edge_index_dict)

    # We optimize the discriminator more frequently than the encoder.
    for i in range(5):
        discriminator_optimizer.zero_grad()
        discriminator_loss = model.discriminator_loss(z)
        discriminator_loss.backward()
        discriminator_optimizer.step()

    loss = model.recon_loss(z, train_data.pos_edge_label_index_dict)
    loss = loss + model.reg_loss(z)
    loss = loss + (1 / train_data.num_nodes) * model.kl_loss()
    loss.backward()
    encoder_optimizer.step()
    return float(loss)


@torch.no_grad()
def test(data):
    model.eval()
    z = model.encode(data.x_dict, data.edge_index_dict)

    # Cluster embedded values using k-means.
    kmeans_input = z.cpu().numpy()
    kmeans = KMeans(n_clusters=7, random_state=0).fit(kmeans_input)
    pred = kmeans.predict(kmeans_input)

    labels = data.y.cpu().numpy()
    completeness = completeness_score(labels, pred)
    hm = homogeneity_score(labels, pred)
    nmi = v_measure_score(labels, pred)

    auc, ap = model.test(z, data.pos_edge_label_index,
                         data.neg_edge_label_index)

    return auc, ap, completeness, hm, nmi

In [None]:
train_data.x_dict['user']

In [None]:
for epoch in range(1, 11):
    loss = train()
    auc, ap, completeness, hm, nmi = test(test_data)
    print((f'Epoch: {epoch:03d}, Loss: {loss:.3f}, AUC: {auc:.3f}, '
           f'AP: {ap:.3f}, Completeness: {completeness:.3f}, '
           f'Homogeneity: {hm:.3f}, NMI: {nmi:.3f}'))