In [1]:
import csv
import numpy as np
from neo4j.v1 import GraphDatabase, basic_auth

In [2]:
host = "bolt://localhost" # replace this with your Sandbox host
password = "neo" # replace this with your Sandbox password

driver = GraphDatabase.driver(host)

Create the userid records

In [3]:
with open('ratings.csv') as f:
    reader = csv.DictReader(f, delimiter=",")
    # userids are in a sequence hence I can just get the max
    userids = range(1, max([int(x['userId']) for x in reader]))
    print(userids)

range(1, 138493)


In [4]:
with driver.session() as session:
    numbers = {"nodes": [{'userid': x} for x in userids]}
    create_userid_query = '''
    UNWIND {nodes} as node
    CREATE (n:UserID {userid: node.userid})
    '''
    result = session.run(create_userid_query, numbers)

In [5]:
with open('movies.csv') as f:
    reader = csv.DictReader(f, delimiter=",")
    genres = []
    for line in reader:
        line_genres = line['genres'].split('|')
        for g in line_genres:
            genres.append(g)
    genres = list(set(genres))
    print(genres)
    print(len(genres))

['Sci-Fi', 'Fantasy', 'Thriller', 'Film-Noir', 'Animation', 'Horror', 'Adventure', 'Action', 'Romance', 'Crime', 'Documentary', 'Musical', 'Western', '(no genres listed)', 'War', 'Children', 'IMAX', 'Comedy', 'Mystery', 'Drama']
20


In [6]:
with driver.session() as session:
    with open('movies.csv') as f:
        reader = csv.DictReader(f, delimiter=",")
        movies = {"nodes": [{'movieId': x['movieId'], 'name': x['title']} for x in reader]}
        create_userid_query = '''
        UNWIND {nodes} as node
        CREATE (n:MovieId {movieId: node.movieId, name: node.name})
        '''
        result = session.run(create_userid_query, movies)

Create the genres nodes

In [7]:
print('Create the genres nodes')
with driver.session() as session:
    with open('movies.csv') as f:
        reader = csv.DictReader(f, delimiter=",")
        genres = [x['genres'].split('|') for x in reader]
        genres = sum(genres, [])
        genres = list(set(genres))
        print(genres)
        genres = {"nodes": [{'name': x} for x in genres]}
    create_genre_nodes = '''
        UNWIND {nodes} as node
        CREATE (n:Genre {name: node.name})
    '''
    result = session.run(create_genre_nodes, genres)

Create the genres nodes
['Sci-Fi', 'Fantasy', 'Thriller', 'Film-Noir', 'Animation', 'Horror', 'Adventure', 'Action', 'Romance', 'Crime', 'Documentary', 'Musical', 'Western', '(no genres listed)', 'War', 'Children', 'IMAX', 'Comedy', 'Mystery', 'Drama']


Create the movieid-belongsto->genres relationship

In [8]:
print('Create the movieid-belongsto->genres relationship')
with driver.session() as session:
    with open('movies.csv') as f:
        reader = csv.DictReader(f, delimiter=",")
        for line in reader:
            movieid = line['movieId']
            genres = line['genres'].split('|')
            movies = {"records": [{'movieId': movieid, 'genres': genres}]}
            create_movie_genre_relationship = '''
                UNWIND {records} as record
                    MATCH (a:MovieId) where a.movieId=record.movieId
                    MATCH (b:Genre) where b.name in record.genres
                    CREATE (a)-[:BELONGSTO]->(b)
            '''
            result = session.run(create_movie_genre_relationship, movies)

Create the movieid-belongsto->genres relationship


Create the edge list to be reviewed by node2vec

In [9]:
print('Create the edge list')
with driver.session() as session, open("graph/movies.edgelist", "w") as edges_file:
    result = session.run("""\
    MATCH (m:MovieId)--(other)
    RETURN id(m) AS source, id(other) AS target
    """)

    writer = csv.writer(edges_file, delimiter=" ")

    for row in result:
        writer.writerow([row["source"], row["target"]])

Create the edge list


In [10]:
print('run node2vec')

run node2vec


In [11]:
%%bash
./node2vec -i:graph/movies.edgelist -o:emb/movies.emb -l:80 -d:100 -p:0.3 -dr -v


An algorithmic framework for representational learning on graphs. [Oct 27 2018]
Input graph path (-i:)=graph/movies.edgelist
Output graph path (-o:)=emb/movies.emb
Number of dimensions. Default is 128 (-d:)=100
Length of walk per source. Default is 80 (-l:)=80
Number of walks per source. Default is 10 (-r:)=10
Context size for optimization. Default is 10 (-k:)=10
Number of epochs in SGD. Default is 1 (-e:)=1
Return hyperparameter. Default is 1 (-p:)=0.3
Inout hyperparameter. Default is 1 (-q:)=1
Verbose output. (-v)=YES
Graph is directed. (-dr)=YES
Graph is weighted. (-w)=NO
Output random walks instead of embeddings. (-ow)=NO
Read 54406 lines from graph/movies.edgelist
Preprocessing progress: 0.00% Preprocessing progress: 0.37% Preprocessing progress: 0.73% Preprocessing progress: 1.10% Preprocessing progress: 1.47% Preprocessing progress: 1.83% Preprocessing progress: 2.20% Preprocessing progress: 2.56% Preprocessing progress: 2.93% Preprocessing progress: 3.30% Preprocess

In [12]:
with open("emb/movies.emb", "r") as movies_file, driver.session() as session:
    next(movies_file)
    reader = csv.reader(movies_file, delimiter=" ")

    params = []
    for row in reader:
        movie_id = row[0]
        params.append({
            "id": int(movie_id),
            "embedding": [float(item) for item in row[1:]]
        })

    session.run("""\
    UNWIND {params} AS param
    MATCH (m:MovieId) WHERE id(m) = param.id
    SET m.embedding = param.embedding
    """, {"params": params})

In [19]:
import pandas as pd

movies_genres_query = """\
MATCH (genre:Genre)
WITH genre ORDER BY genre.name
WITH collect(id(genre)) AS genres
MATCH (m:MovieId)-[:BELONGSTO]->(genre)
WITH genres, id(m) AS source, m.embedding AS embedding, collect(id(genre)) AS target
RETURN source, embedding, [g in genres | CASE WHEN g in target THEN 1 ELSE 0 END] AS genres
"""

with driver.session() as session:
    result = session.run(movies_genres_query)
    df = pd.DataFrame([dict(row) for row in result])

In [20]:
df.head()

Unnamed: 0,embedding,genres,source
0,"[-0.0735659, -0.0411579, -0.0325281, -0.075554...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",278142
1,"[-0.0585565, -0.0226075, -0.0425021, -0.072493...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",286345
2,"[-0.0265658, -0.00291156, -0.0213783, -0.04549...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",280707
3,"[-0.0781281, 0.0149233, -0.0317364, -0.0673305...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",282669
4,"[-0.0499904, 0.0664172, 0.00637541, -0.0164738...","[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...",282427


### Make some predictions from the results.

In [14]:
from gensim.models import KeyedVectors

In [23]:
filename = 'emb/movies.emb'

In [25]:
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [27]:
model.most_similar('260169')

  if np.issubdtype(vec.dtype, np.int):


[('252573', 0.9870771169662476),
 ('276773', 0.984420895576477),
 ('260679', 0.9837837219238281),
 ('263498', 0.983630895614624),
 ('254262', 0.9832301735877991),
 ('253096', 0.9815400242805481),
 ('274147', 0.981526255607605),
 ('263074', 0.9814952611923218),
 ('270921', 0.9811148047447205),
 ('270035', 0.9809507131576538)]

In [92]:
def neo4j_most_similar(model, key):
    with driver.session() as session:
#         movies = {"movies": [{'name': key}]}
#         find_movie_query = '''
#             UNWIND {movies} as movie
#                 MATCH (m:MovieId {name: '%s'})
#                 return id(m)
#         ''' % key
        find_movie_query = "MATCH (m:MovieId {name: '%s'}) return id(m)" % key
#         print(find_movie_query)
#         result = session.run(find_movie_query, movies)
        result = session.run(find_movie_query)
        for r in result:
#             print(r.value())
            similar_movies = model.most_similar(str(r.value()))
#             print(similar_movies)
            for s_movie in similar_movies:
                find_movie_query = "MATCH (m:MovieId) where id(m) = %s return m.name" % s_movie[0]
                similar_movie_names = session.run(find_movie_query)
                for sm in similar_movie_names:
                    print(sm.value(), s_movie[1])

In [93]:
neo4j_most_similar(model, 'Money Train (1995)')

Waco: The Rules of Engagement (1997) 0.9953966736793518
Five Wives, Three Secretaries and Me (1998) 0.9936010837554932
Dark Matter (2007) 0.9935312271118164
Old Man and the Sea, The (1958) 0.9933647513389587
Pop Redemption (2013) 0.9931492805480957
Woman Next Door, The (Femme d'à côté, La) (1981) 0.9930013418197632
Brainstorm (1965) 0.9929700493812561
Suddenly (Tan de Repente) (2002) 0.9929301142692566
Springsteen & I (2013) 0.9922378659248352
Fly Away (Bis zum Horizont, dann links!) (2012) 0.9920598864555359


  if np.issubdtype(vec.dtype, np.int):


Money Train: A revengeful New York transit cop resolves to rob a high-tech train laden with an immense amount of money. His foster brother, a fellow policeman, endeavours to safeguard him.

Waco: The Rules of Engagement: This documentary about the 1993 showdown between the FBI and the Branch Davidians in Waco, Texas, presents an alternate theory about the tragedy. The government has long contended that this fringe Christian group was a danger, and that the siege on its compound, which resulted in the death of 70 peo

Seems legit