In [2]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from itertools import combinations
import numpy as np
from tqdm import tqdm
import itertools
from numpy.typing import NDArray, ArrayLike
from typing import Optional
from builtins import len
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def generate_neg_edges(pos_edges: NDArray, neg_size, max_iterations: Optional[int] = 1000000) -> NDArray:
        pos_edges_set = set([tuple(edge) for edge in pos_edges])
        neg_edges = set()
        min_edge, max_edge = pos_edges.flatten().min(axis=0), pos_edges.flatten().max(axis=0)
        iterations = 0
        while len(neg_edges) < neg_size and iterations < max_iterations:
                start_edge = np.random.randint(min_edge, max_edge)
                end_edge = np.random.randint(min_edge, max_edge)
                if start_edge != end_edge and (start_edge, end_edge) not in pos_edges_set:
                        neg_edges.add((start_edge, end_edge))
                iterations += 1
        return np.array(list(neg_edges))

def generate_candidate_edges(pos_edges: NDArray, neg_edges: NDArray, user_playlist: ArrayLike, n_candidates: Optional[int]=1000, max_iterations: Optional[int]=1000000):
        start_candidates = set(user_playlist)
        start_candidates.discard(0)
        end_candidates = set([track for track in set(pos_edges.flatten()) if track not in start_candidates])
        end_candidates.discard(0)
        candidates = set()
        iterations = 0

        pos_edges_set = set(pos_edges.flatten())
        neg_edges_set = set(neg_edges.flatten())

        
        while len(candidates) < n_candidates and iterations < max_iterations:
                candidate = (np.random.choice(list(start_candidates), size=1)[0], np.random.choice(list(end_candidates), size=1)[0])
                if candidate not in candidates.union(pos_edges_set).union(neg_edges_set):
                        candidates.add(candidate)
                iterations += 1
        return np.array(list(candidates))

In [4]:
features_filepath = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/data/predictions/ranking/features.csv"
playlists_filepath = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/data/predictions/ranking/playlists.csv"
user_playlist_filepath = "/home/adamgorski/Desktop/inzynierka/conseillify/conseillify-research/data/predictions/ranking/user_playlist.csv"

In [5]:

features = pd.read_csv(features_filepath, index_col=False).to_numpy()[:, 2:13]
features = torch.from_numpy(np.array(features, dtype=np.float64))
playlists = pd.read_csv(playlists_filepath).to_numpy()
SAMPLE_SIZE = 3
sampled_playlists = playlists[np.random.choice(playlists.shape[0], size=SAMPLE_SIZE, replace=False), :]
pos_edges = np.array([list(combinations(np.delete(playlist, 0), 2)) for playlist in tqdm(sampled_playlists)]).reshape(-1, 2)
pos_edges = np.unique(pos_edges, axis=0)
user_playlist = pd.read_csv(user_playlist_filepath).to_numpy()[:, 1]
print(pos_edges.shape)
print(features.shape)
print(sampled_playlists.shape)
print(user_playlist.shape)

100%|██████████| 3/3 [00:00<00:00, 218.25it/s]


(34004, 2)
torch.Size([27927, 11])
(3, 376)
(375,)


In [6]:
neg_edges = generate_neg_edges(pos_edges, pos_edges.shape[0])
candidates = generate_candidate_edges(pos_edges, neg_edges, user_playlist)

In [7]:
train_pos_edges, test_pos_edges, train_neg_edges, test_neg_edges = train_test_split(pos_edges, neg_edges, test_size=0.2)
train_pos_edges = np.array(train_pos_edges)
test_pos_edges = np.array(test_pos_edges)
train_neg_edges = np.array(train_neg_edges)
test_neg_edges = np.array(test_neg_edges)

print(train_pos_edges.size, train_neg_edges.size, test_pos_edges.size, test_neg_edges.size) 

54406 54406 13602 13602


In [8]:
train_pos_graph = dgl.graph((train_pos_edges[:, 0].flatten(), train_neg_edges[:, 1].flatten()), num_nodes=features.shape[0])
train_neg_graph = dgl.graph((train_neg_edges[:, 0].flatten(), train_neg_edges[:, 1].flatten()), num_nodes=features.shape[0])
test_pos_graph = dgl.graph((test_pos_edges[:, 0].flatten(), test_pos_edges[:, 1].flatten()), num_nodes=features.shape[0])
test_neg_graph = dgl.graph((test_neg_edges[:, 0].flatten(), test_neg_edges[:, 1].flatten()), num_nodes=features.shape[0])
train_start_edges = np.concatenate((train_pos_edges[:, 0], train_neg_edges[:, 0]), axis=0).flatten()
train_end_edges = np.concatenate((train_pos_edges[:, 1], train_neg_edges[:, 1]), axis=0).flatten()
test_start_edges = np.concatenate((test_pos_edges[:, 0], test_neg_edges[:, 0]), axis=0).flatten()
test_end_edges = np.concatenate((test_pos_edges[:, 1], test_pos_edges[:, 1]), axis=0).flatten()
train_g = dgl.graph((train_start_edges, train_end_edges), num_nodes=features.shape[0])
test_g = dgl.graph((test_start_edges, test_end_edges), num_nodes=features.shape[0])
train_pos_graph.ndata['feat']=features
train_neg_graph.ndata['feat']=features
test_pos_graph.ndata['feat']=features
test_neg_graph.ndata['feat']=features
train_g.ndata['feat']=features
test_g.ndata['feat']=features

In [9]:
from dgl.nn import SAGEConv

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat.float())
        h = F.relu(h)
        h = self.conv2(g, h)
        return h
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [10]:
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return g.edata['score'][:, 0]
class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        """
        Computes a scalar score for each edge of the given graph.

        Parameters
        ----------
        edges :
            Has three members ``src``, ``dst`` and ``data``, each of
            which is a dictionary representing the features of the
            source nodes, the destination nodes, and the edges
            themselves.

        Returns
        -------
        dict
            A dictionary of new edge features.
        """
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

In [11]:


def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [12]:
model = GraphSAGE(train_pos_graph.ndata['feat'].shape[1], 16)
pred = MLPPredictor(16)
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)
for epoch in range(200):
    h = model(train_g, train_g.ndata['feat'])
    pos_score = pred(train_pos_graph, h)
    neg_score = pred(train_neg_graph, h)
    loss = compute_loss(pos_score, neg_score)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 5 == 0:
        print('In epoch {}, loss: {}'.format(epoch, loss))        



In epoch 0, loss: 3.723393440246582
In epoch 5, loss: 1.30510675907135
In epoch 10, loss: 1.4460320472717285
In epoch 15, loss: 1.086094617843628
In epoch 20, loss: 0.7561049461364746
In epoch 25, loss: 0.6178773641586304
In epoch 30, loss: 0.5126681327819824
In epoch 35, loss: 0.44930770993232727
In epoch 40, loss: 0.4253521263599396
In epoch 45, loss: 0.41442540287971497
In epoch 50, loss: 0.4095926284790039
In epoch 55, loss: 0.4062982201576233
In epoch 60, loss: 0.3995200991630554
In epoch 65, loss: 0.3938545286655426
In epoch 70, loss: 0.38772648572921753
In epoch 75, loss: 0.3825594186782837
In epoch 80, loss: 0.37719762325286865
In epoch 85, loss: 0.37193235754966736
In epoch 90, loss: 0.3662301301956177
In epoch 95, loss: 0.36018574237823486
In epoch 100, loss: 0.3537357449531555
In epoch 105, loss: 0.3457789421081543
In epoch 110, loss: 0.33734244108200073
In epoch 115, loss: 0.33015814423561096
In epoch 120, loss: 0.3210482895374298
In epoch 125, loss: 0.31232595443725586
In 

In [13]:
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_graph, h)
    neg_score = pred(test_neg_graph, h)
    print('AUC', compute_auc(pos_score, neg_score))


AUC 0.9007852383212283


In [14]:
candidates_graph = dgl.graph((candidates[:, 0].flatten(), candidates[:, 1].flatten()), num_nodes=features.shape[0])
candidates_graph.ndata['feat']=features
candidates_preds = pred(candidates_graph, h)
preds = candidates_preds.detach()
preds = np.sort(preds)[::-1]
print(len([p for p in preds if p > 0]), len([p for p in preds if p < 0]))

1 999


In [15]:
print(type(h))

<class 'torch.Tensor'>
