In [None]:
import itertools
import os
import sys

import numpy as np
import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
import yaml
from arango import ArangoClient
from arango import ArangoClient
# import oasis
from sentence_transformers import SentenceTransformer
from torch.nn import Linear
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from tqdm import tqdm

In [None]:
# Folder holding the MovieLens CSV files. Set the MOVIELENS_DIR environment
# variable to run the notebook on another machine; the default is the original location.
metadata_path = os.path.join(
    os.environ.get('MOVIELENS_DIR', '/Users/ikram.ali/workplace/projects/experiments/downloads/movielens'),
    'movies_metadata.csv',
)
df = pd.read_csv(metadata_path)

In [None]:
df.head()

In [None]:
df.columns

In [None]:
# on these rows metadata information is missing
# errors='ignore' keeps the cell re-runnable: once the rows are gone a second
# execution is a no-op instead of raising KeyError.
df = df.drop([19730, 29503, 35587], errors='ignore')

In [None]:
# sampled from links.csv file
# The data folder can be overridden with MOVIELENS_DIR; the default is the original location.
links_small = pd.read_csv(
    os.path.join(
        os.environ.get('MOVIELENS_DIR', '/Users/ikram.ali/workplace/projects/experiments/downloads/movielens'),
        'links_small.csv',
    )
)

In [None]:
links_small.head()

In [None]:
# Reduce links_small to its tmdbId column: rows without a TMDB id are
# discarded and the remaining ids are cast to integers.
links_small = links_small['tmdbId'].dropna().astype('int')

In [None]:
df['id'] = df['id'].astype('int')

In [None]:
# Keep only the movies whose TMDB id appears in links_small.
# .copy() gives an independent frame, so the column assignments made on
# sampled_md further down do not trigger SettingWithCopyWarning.
sampled_md = df[df['id'].isin(links_small)].copy()
sampled_md.shape

In [None]:
# Build a free-text description from overview + tagline.
# Both parts are NaN-filled *before* concatenating: previously a missing
# overview turned the whole sum into NaN and the tagline was lost. A space
# separates the two parts so their words do not run together.
# assign() returns a new frame, which avoids chained-assignment warnings.
sampled_md = sampled_md.assign(
    tagline=sampled_md['tagline'].fillna(''),
    description=lambda md: (md['overview'].fillna('') + ' ' + md['tagline']).str.strip(),
)

In [None]:
sampled_md = sampled_md.reset_index()

In [None]:
sampled_md.head()

In [None]:
indices = pd.Series(sampled_md.index, index=sampled_md['title'])

In [None]:
ind_gen = pd.Series(sampled_md.index, index=sampled_md['genres'])

## Let's Load Ratings File

We are going to use the ratings file to construct a bipartite graph. This file contains the ratings that different users gave to movies on a scale of 1-5, where a rating of 1 means a very bad movie and 5 means a very good movie.

In [None]:
# The data folder can be overridden with MOVIELENS_DIR; the default is the original location.
ratings_path = os.path.join(
    os.environ.get('MOVIELENS_DIR', '/Users/ikram.ali/workplace/projects/experiments/downloads/movielens'),
    'ratings.csv',
)

In [None]:
ratings_df = pd.read_csv(ratings_path)
ratings_df.head()

In [None]:
# performs user and movie mappings
def node_mappings(path, index_col):
    """Map every distinct value of one CSV column to a consecutive integer.

    Args:
        path: CSV file to read.
        index_col: Column whose values become the keys of the mapping.

    Returns:
        dict: ``{original_value: position}`` where positions start at 0 and
        follow the order in which each value first appears in the file.
    """
    unique_ids = pd.read_csv(path, index_col=index_col).index.unique()
    return dict(zip(unique_ids, range(len(unique_ids))))

In [None]:
user_mapping = node_mappings(ratings_path, index_col='userId')

In [None]:
movie_mapping = node_mappings(ratings_path, index_col='movieId')

In [None]:
m_id = ratings_df['movieId'].tolist()

In [None]:
# all unique movie_ids present inside ratings file
# dict.fromkeys() drops duplicates while keeping the order of first appearance
# (a plain set() would not keep that order).
m_id = list(dict.fromkeys(m_id))
len(m_id)

In [None]:
def convert_int(x):
    """Convert ``x`` to ``int``, returning ``np.nan`` when it is not convertible.

    Only genuine conversion failures are absorbed (wrong type, unparsable
    text, NaN, infinity). Anything else - e.g. ``KeyboardInterrupt`` - now
    propagates instead of being silently swallowed by a bare ``except``.

    Args:
        x: Value to convert (number, numeric string, ``None``, ...).

    Returns:
        ``int(x)`` on success, otherwise ``np.nan``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported type; ValueError: bad text or NaN;
        # OverflowError: +/- infinity.
        return np.nan

In [None]:
# movieId -> tmdbId lookup table.
# The data folder can be overridden with MOVIELENS_DIR; the default is the original location.
id_map = pd.read_csv(
    os.path.join(
        os.environ.get('MOVIELENS_DIR', '/Users/ikram.ali/workplace/projects/experiments/downloads/movielens'),
        'links_small.csv',
    )
)[['movieId', 'tmdbId']]

In [None]:
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)

In [None]:
id_map.columns = ['movieId', 'id']


In [None]:
id_map.head()

In [None]:
id_map = id_map.merge(sampled_md[['title', 'id']], on='id').set_index('title')

In [None]:
indices_map = id_map.set_index('id')

In [None]:
# print 5 mappings of movieIds
list(movie_mapping.items())[:5]

In [None]:
print("%d number of unique movie ids" %len(m_id))

In [None]:
# remove ids which dont have meta data information

def remove_movies(m_id):
    """Return the movie ids that have no row in ``id_map``.

    Despite its name the function removes nothing; it only reports which
    ids are missing so that the caller can drop them.

    Args:
        m_id: Iterable of MovieLens movie ids.

    Returns:
        list: The ids from ``m_id`` (original order kept) that do not occur
        in ``id_map['movieId']``.
    """
    # Build the lookup once; the previous version filtered the whole
    # DataFrame again for every single id.
    known_movie_ids = set(id_map['movieId'])
    return [movie for movie in m_id if movie not in known_movie_ids]

In [None]:
no_metadata = remove_movies(m_id)

In [None]:
## remove ids which dont have meta data information
# Set lookups replace the repeated list scans (`in` + `remove`), which were
# quadratic in the number of movies. m_id is still updated in place.
remaining = set(m_id)
for element in no_metadata:
    if element in remaining:
        print("ids with no metadata information:",element)
        remaining.discard(element)
m_id[:] = [movie for movie in m_id if movie in remaining]

In [None]:
print("Number of movies with metadata information:", len(m_id))

In [None]:
# Re-number the movies that still have metadata: each id in m_id is mapped
# to its position in the list.
movie_mappings = dict(zip(m_id, range(len(m_id))))

In [None]:
# Connection settings are read from the environment when present, so
# credentials do not have to live in the notebook. The defaults are the
# values that were previously hard-coded here.
client = ArangoClient(hosts=os.environ.get("ARANGO_HOST", "http://localhost:8529"))
sys_db = client.db(
    "_system",
    username=os.environ.get("ARANGO_USERNAME", "root"),
    password=os.environ.get("ARANGO_PASSWORD", "toor"),
)
sys_db.databases()

In [None]:
# Create the working database on first use so that a fresh
# "Restart & Run All" does not fail on a missing database.
if not sys_db.has_database("deets"):
    sys_db.create_database("deets")

# Credentials come from the environment when set; the defaults are the
# values that were previously hard-coded here.
db = client.db(
    "deets",
    username=os.environ.get("ARANGO_USERNAME", "root"),
    password=os.environ.get("ARANGO_PASSWORD", "toor"),
)

In [None]:
# Create the "Movie" collection only when it is missing, so the cell can be
# re-run (same guard as used for the "Users" and "Ratings" collections).
if not db.has_collection("Movie"):
    db.create_collection(name="Movie")

In [None]:
batch = []
BATCH_SIZE = 128
batch_idx = 1
index = 0
movie_collection = db["Movie"]

In [None]:
# loading movies metadata information into ArangoDB's Movie collection
# Start from an empty buffer so re-running this cell never re-sends
# documents left over from an earlier (possibly interrupted) run.
batch = []
batch_idx = 1
index = 0
for idx in tqdm(range(len(m_id))):
    current_id = m_id[idx]
    matches = id_map.loc[id_map['movieId'] == current_id]

    if matches.size == 0:
        print('No Meta data information at:', current_id)
        continue

    # id_map is indexed by title, so take the first match by position;
    # `[0]` would be a label lookup that only works through a deprecated
    # positional fallback.
    tmdb_id = int(matches['id'].iloc[0])
    m_meta = sampled_md.loc[sampled_md['id'] == tmdb_id].iloc[0]

    # 'genres' is a stringified list of mappings with a 'name' key.
    # BaseLoader builds only plain lists/dicts/strings, so no arbitrary
    # objects are constructed from the file content.
    m_genre = yaml.load(m_meta['genres'], Loader=yaml.BaseLoader)
    genres = [g['name'] for g in m_genre]
    m_poster = m_meta['poster_path']

    insert_doc = {
        "_id": "Movie/" + str(movie_mappings[current_id]),
        # adding movie metadata information
        "movieId": current_id,
        "mapped_movieId": movie_mappings[current_id],
        "tmdbId": tmdb_id,
        'movie_title': m_meta['title'],
        'description': m_meta['description'],
        'genres': genres,
        'language': m_meta['original_language'],
        # A missing poster is read from the CSV as NaN.
        'poster_path': "No poster path available" if pd.isna(m_poster) else m_poster,
    }

    batch.append(insert_doc)
    index += 1
    if index % BATCH_SIZE == 0:
        batch_idx += 1
        movie_collection.import_bulk(batch)
        batch = []

# Flush whatever is left after the loop; this no longer depends on the last
# movie id having metadata.
if len(batch) > 0:
    print("Inserting batch the last batch!")
    movie_collection.import_bulk(batch)
    batch = []

In [None]:
# create a new collection named "Users" if it does not exist.
# This returns an API wrapper for "Users" collection.
if not db.has_collection("Users"):
    db.create_collection("Users", replication_factor=3)

In [None]:
# Users carry no side information, so only their count is needed:
# the number of distinct userId values in the ratings table.
total_users = len(np.unique(ratings_df['userId'].to_numpy()))
print("Total number of Users:", total_users)

In [None]:
def populate_user_collection(total_users):
    """Insert one document per user into the "Users" collection.

    The first ``total_users`` keys of ``user_mapping`` are written in chunks
    of 50 via ``import_bulk``. Each document has ``_id`` set to
    ``"Users/<mapped index>"`` and ``original_id`` set to the original user
    id as a string.

    Args:
        total_users: Number of users to insert.
    """
    chunk_size = 50
    pending = []
    raw_ids = list(user_mapping.keys())
    target = db["Users"]

    for position in tqdm(range(total_users)):
        raw_id = raw_ids[position]
        pending.append({
            "_id": "Users/" + str(user_mapping[raw_id]),
            "original_id": str(raw_id),
        })
        # A full chunk is sent as soon as it is complete.
        if (position + 1) % chunk_size == 0:
            target.import_bulk(pending)
            pending = []

    # Send the final, partially filled chunk (if any).
    if pending:
        print("Inserting batch the last batch!")
        target.import_bulk(pending)

In [None]:
populate_user_collection(total_users)

# Creating Ratings (Edge) Collection

Here, we first create a Ratings (edge) collection in ArangoDB and then populate it with the edges of a bipartite graph. Each edge document in this collection contains the _from (user) and _to (movie) nodes along with the rating given by that user to that particular movie. Once this collection has been populated, a bipartite graph (user and movie nodes) is formed in ArangoDB, which can be viewed in the ArangoDB Web UI under Graphs -> movie_rating_graph.

In [None]:
# Create a new collection named "Ratings" if it does not exist.
# edge=True makes it an edge collection (its documents carry _from / _to).
if not db.has_collection("Ratings"):
    db.create_collection("Ratings", edge=True, replication_factor=3)



# defining graph schema

# Create a new graph called movie_rating_graph in the database behind `db` if it does not already exist.
# NOTE(review): smart=True requests a SmartGraph - presumably an Enterprise/cluster feature; confirm it is intended here.
if not db.has_graph("movie_rating_graph"):
    db.create_graph('movie_rating_graph', smart=True)

# This returns an API wrapper for the graph created above
movie_rating_graph = db.graph("movie_rating_graph")

In [None]:
# Register "Users" as a vertex collection of the graph if it is not one yet.
# create_vertex_collection() is the call that adds it to the graph;
# vertex_collection() only returns an API wrapper and registers nothing.
if not movie_rating_graph.has_vertex_collection("Users"):
    movie_rating_graph.create_vertex_collection("Users")

In [None]:
# Register "Movie" as a vertex collection of the graph if it is not one yet.
# create_vertex_collection() is the call that adds it to the graph;
# vertex_collection() only returns an API wrapper and registers nothing.
if not movie_rating_graph.has_vertex_collection("Movie"):
    movie_rating_graph.create_vertex_collection("Movie")

In [None]:
# Create the edge definition "Ratings" (Users -> Movie) if the graph does
# not have it yet. Either way `Ratings` ends up bound to the API wrapper of
# the edge collection, so the name is defined on re-runs as well.
if not movie_rating_graph.has_edge_definition("Ratings"):
    Ratings = movie_rating_graph.create_edge_definition(
        edge_collection='Ratings',
        from_vertex_collections=['Users'],
        to_vertex_collections=['Movie']
    )
else:
    Ratings = movie_rating_graph.edge_collection('Ratings')

In [None]:
# Pull the three rating columns out as independent 1-D NumPy arrays.
user_id = ratings_df['userId'].to_numpy(copy=True)
movie_id = ratings_df['movieId'].to_numpy(copy=True)
ratings = ratings_df['rating'].to_numpy(copy=True)

In [None]:
def create_ratings_graph(user_id, movie_id, ratings):
    """Insert one edge per rating into the "Ratings" edge collection.

    Ratings whose movie is listed in ``no_metadata`` are skipped. Every
    other rating becomes an edge from ``Users/<mapped user>`` to
    ``Movie/<mapped movie>`` carrying the rating as ``_rating``. Edges are
    sent in batches of 100 via ``import_bulk``.

    Args:
        user_id: 1-D array of original user ids, one per rating.
        movie_id: 1-D array of original movie ids, aligned with ``user_id``.
        ratings: 1-D array of rating values, aligned with ``user_id``.
    """
    BATCH_SIZE = 100
    batch = []
    index = 0
    edge_collection = db["Ratings"]
    # Built once: constant-time membership instead of scanning the
    # no_metadata list for every rating.
    skipped_movies = set(no_metadata)

    for idx in tqdm(range(ratings.shape[0])):
        # removing edges (movies) with no metadata
        if movie_id[idx] in skipped_movies:
            print('Removing edges with no metadata', movie_id[idx])
            continue

        user_key = str(user_mapping[user_id[idx]])
        movie_key = str(movie_mappings[movie_id[idx]])
        batch.append({
            "_id":    "Ratings" + "/" + 'user-' + user_key + "-r-" + "movie-" + movie_key,
            "_from":  "Users" + "/" + user_key,
            "_to":    "Movie" + "/" + movie_key,
            "_rating": float(ratings[idx]),
        })
        index += 1

        if index % BATCH_SIZE == 0:
            edge_collection.import_bulk(batch)
            batch = []

    # Flush after the loop. Previously the final partial batch was only
    # written while processing the last row, so it was silently dropped
    # whenever that last row belonged to a skipped movie.
    if len(batch) > 0:
        print("Inserting batch the last batch!")
        edge_collection.import_bulk(batch)

In [None]:
create_ratings_graph(user_id, movie_id, ratings)

In [None]:
# Get API wrappers for collections.
users = db.collection('Users')
movies = db.collection('Movie')
ratings_graph = db.collection('Ratings')

In [None]:
len(users), len(movies), len(ratings_graph)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Load edges from Ratings collection in ArangoDB and export them to PyG data format.
Data handling of graphs in PyG: in order to construct the edges of a graph in PyG, we need to represent the graph connectivity in COO format (edge_index), i.e. with shape [2, num_edges]. The create_pyg_edges method is therefore a generic function which reads the documents from the edge collection (Ratings) and creates the edges (edge_index) in PyG using the _from (src) and _to (dst) attributes of the rating documents. Since each edge of the graph also carries rating information, create_pyg_edges additionally reads the _rating attribute from the edge collection and returns it as a second tensor (edge_attr), which is later stored in the PyG data object.

In [None]:
def create_pyg_edges(rating_docs):
    """Turn rating edge documents into PyG edge tensors.

    Args:
        rating_docs: Iterable of edge documents with ``_from`` / ``_to``
            of the form ``"<collection>/<integer>"`` and a numeric
            ``_rating``. It is consumed in a single pass.

    Returns:
        tuple: ``(edge_index, edge_attr)`` - ``edge_index`` stacks the
        source indices (row 0) and destination indices (row 1);
        ``edge_attr`` holds one rating per edge. Ratings go through
        ``int()``, so any fractional part is truncated.
    """
    endpoints = {'_from': [], '_to': []}
    labels = []

    for edge in rating_docs:
        # The node index is the part after the "/" in the document handle.
        for field, node_ids in endpoints.items():
            node_ids.append(int(edge[field].split('/')[1]))
        labels.append(int(edge['_rating']))

    edge_index = torch.tensor([endpoints['_from'], endpoints['_to']])
    edge_attr = torch.tensor(labels)
    return edge_index, edge_attr

In [None]:
edge_index, edge_label = create_pyg_edges(db.aql.execute('FOR doc IN Ratings RETURN doc'))

In [None]:
print(edge_index.shape)
print(edge_label.shape)

## Load nodes from the Movie collection in ArangoDB and export them to PyG data format.
In the section above we read the "Ratings" edge collection from ArangoDB and exported its edges into the data format PyG accepts, i.e. edge_index and edge_label. The next step is to construct the movie node features; to do so, I have written the following two methods:

Sequence Encoder: This method takes two arguments, the first one is movie_docs with the help of which we can access metadata information of each movie stored inside the "Movie" collection. The second argument is model_name which takes a pretrained NLP (based on transformers) model from the SentenceTransformers library and generates text embeddings. In this blogpost, I am generating embeddings for movie titles and representing it as a movie node feature. However, instead of movie title we can also use movie description attribute to generate embeddings for movie nodes. Curious readers can try this out and see if results get better.

Genres Encoder: In this method we perform the one-hot-encodings of the genres present inside the Movie collection.

Once the features have been generated by the sequence encoder and the genres encoder, we concatenate the two feature vectors to construct a single feature vector for each movie node.

Note: This process of feature generation for movie nodes is inspired from PyG examples.

In [None]:
def SequenceEncoder(movie_docs, model_name=None):
    """Embed the movie titles with a SentenceTransformer model.

    Args:
        movie_docs: Iterable of movie documents, each with a
            ``movie_title`` entry.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        Tensor of title embeddings, one row per document, in input order,
        computed on the notebook-level ``device``.
    """
    titles = []
    for doc in movie_docs:
        titles.append(doc['movie_title'])

    encoder = SentenceTransformer(model_name, device=device)
    return encoder.encode(
        titles,
        show_progress_bar=True,
        convert_to_tensor=True,
        device=device,
    )

In [None]:
def GenresEncoder(movie_docs):
    """Multi-hot encode the genres of every movie document.

    Args:
        movie_docs: Iterable of movie documents, each with a ``genres``
            list of genre names. It is consumed in a single pass.

    Returns:
        Float tensor of shape ``(number of movies, number of genres)`` on
        the notebook-level ``device``; entry ``[i, j]`` is 1 when movie
        ``i`` has genre ``j`` and 0 otherwise. Columns follow the
        alphabetical order of the genre names.
    """
    gen = [doc['genres'] for doc in movie_docs]

    # Sorted so that the column order is the same on every run; iterating a
    # set of strings directly gives an order that can differ between
    # interpreter sessions.
    unique_gen = sorted(set(itertools.chain.from_iterable(gen)))
    print("Number of unique genres we have:", len(unique_gen))

    mapping = {g: i for i, g in enumerate(unique_gen)}
    x = torch.zeros(len(gen), len(mapping))
    for i, m_gen in enumerate(gen):
        for genre in m_gen:
            x[i, mapping[genre]] = 1
    return x.to(device)

In [None]:
# Fetch the movies once, ordered by mapped_movieId, so that row i of both
# feature matrices describes node "Movie/<i>" - the index the Ratings edges
# point at. Without SORT the result order of the query is not guaranteed,
# and two separate queries could even disagree with each other.
movie_docs = list(db.aql.execute('FOR doc IN Movie SORT doc.mapped_movieId RETURN doc'))
title_emb = SequenceEncoder(movie_docs, model_name='all-MiniLM-L6-v2')
encoded_genres = GenresEncoder(movie_docs)
print('Title Embeddings shape:', title_emb.shape)
print("Encoded Genres shape:", encoded_genres.shape)

In [None]:
# concat title and genres features of movies
movie_x = torch.cat((title_emb, encoded_genres), dim=-1)
print("Shape of the concatenated features:", movie_x.shape)

# Creating PyG Heterogeneous Graph

Heterogeneous graphs are those graphs which have different types of nodes and edges in the graph for e.g. Knowledge Graphs. The bipartite graph which we have stored in ArangoDB is also a heterogeneous graph since it constitutes two types of nodes in it i.e. user and movie nodes. Therefore, our next step would be to export the graph present inside ArangoDB to a PyG heterogeneous data object.

Since now we have PyG edges, labels and node feature matrix, the next step would be to add these tensors to PyG HeteroData object in order to construct a heterogeneous graph.



In [None]:
data = HeteroData()

In [None]:
data['user'].num_nodes = len(users)  # Users do not have any features.
data['movie'].x = movie_x
data['user', 'rates', 'movie'].edge_index = edge_index
data['user', 'rates', 'movie'].edge_label = edge_label

In [None]:
# Add user node features for message passing:
data['user'].x = torch.eye(data['user'].num_nodes, device=device)

In [None]:
del data['user'].num_nodes

We can now convert data into an appropriate format for training a graph-based machine learning model:

Here, ToUndirected() transforms a directed graph into (the PyG representation of) an undirected graph by adding reverse edges for all edges in the graph. Thus, future message passing is performed in both directions along every edge. The function may add reverse edge types to the heterogeneous graph, if necessary.

In [None]:
# Add a reverse ('movie', 'rev_rates', 'user') relation for message passing.
data = ToUndirected()(data)
del data['movie', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.

In [None]:
data = data.to(device)