In [2]:
from py2neo import *
import pandas as pd
import time
from tqdm import tqdm
import getpass

In [3]:
# Ask for the connection details interactively so no credentials are stored in the notebook.
username = input("Enter username: ")
password = getpass.getpass('Enter password: ')
port = input("Enter Neo4j listening port: ")

# Connect over Bolt to the Neo4j instance listening on localhost at the given port.
bolt_uri = f"bolt://localhost:{port}"
graph = Graph(bolt_uri, auth=(username, password))

__Query 1__: Ottenere i __top $\mathbf{k}$ Genres__, dato un __User__.

Esempi di esecuzione con $k=5$

In [5]:
# Load the pickled user associations; later cells index this object with [1] and iterate over .values().
# presumably a dict mapping an integer user index to the user's UUID string — TODO confirm against the export script.
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted source.
user_uuid_associations = pd.read_pickle("data/users_uuids.pkl")

__Metodo 1: media score__

In [6]:
user_id = user_uuid_associations[1]
K = 5

# Top-K genres for the user, ranked by the mean score of the user's ratings in each genre.
# Values are sent as Cypher parameters rather than interpolated into the query text,
# and every fragment ends with a space so that clauses do not run into each other.
graph.run(
    (
        "MATCH(u:User)-[r:RATES]->(m:Movie)-[h:HAS_GENRE]->(g:Genre) "
        "WHERE u.id = $user_id "
        "RETURN g.name as GENRE, AVG(r.score) as AVG_SCORE "
        "ORDER BY AVG_SCORE DESC "
        "LIMIT $k"
    ),
    {"user_id": user_id, "k": K},
).to_table()


GENRE,AVG_SCORE
Thriller,4.2
Action,4.0
Animation,4.0
Romance,3.9444444444444446
Crime,3.875


__Metodo 2: max score * count score__

In [34]:
# Top-K genres for the user, ranked by MAX(score) * COUNT(score) over the user's ratings per genre.
# Values are sent as Cypher parameters rather than interpolated into the query text.
graph.run(
    (
        "MATCH(u:User)-[r:RATES]->(m:Movie)-[h:HAS_GENRE]->(g:Genre) "
        "WHERE u.id = $user_id "
        "RETURN g.name as GENRE, MAX(r.score)*COUNT(r.score) as SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT $k"
    ),
    {"user_id": user_id, "k": K},
).to_table()


GENRE,SCORE
Drama,265
Comedy,115
Romance,90
Adventure,55
Crime,40


__Tempo di computazione (Metodo 1)__

In [37]:
def get_top_k_genres(user_id, K):
    """Return the K genres with the highest average score given by one user.

    Args:
        user_id: Value matched against the `id` property of the `User` node.
        K: Maximum number of rows to return (used as the Cypher LIMIT, so it
            must be an integer).

    Returns:
        The table produced by `.to_table()` with columns GENRE and AVG_SCORE,
        ordered by AVG_SCORE descending.
    """
    # Parameters keep the query text constant across calls and avoid building
    # Cypher from raw values; fragments end with a space so clauses stay separated.
    query = (
        "MATCH(u:User)-[r:RATES]->(m:Movie)-[h:HAS_GENRE]->(g:Genre) "
        "WHERE u.id = $user_id "
        "RETURN g.name as GENRE, AVG(r.score) as AVG_SCORE "
        "ORDER BY AVG_SCORE DESC "
        "LIMIT $k"
    )
    return graph.run(query, {"user_id": user_id, "k": K}).to_table()

In [43]:
total_time = 0

# Benchmark get_top_k_genres over every user. The loop uses its own variable so that
# the notebook-level `user_id` read by later query cells is not overwritten.
benchmark_user_ids = user_uuid_associations.values()
for benchmark_user_id in tqdm(benchmark_user_ids):
    start_time = time.perf_counter()  # monotonic clock, intended for measuring intervals
    get_top_k_genres(benchmark_user_id, K)
    total_time += (time.perf_counter() - start_time)

# Mean wall-clock seconds per call.
print(total_time/len(benchmark_user_ids))

100%|██████████| 162541/162541 [17:48<00:00, 152.10it/s] 

0.00651306757677376





__Query 2__: Ottenere le __top $\mathbf{k}$ Categories__, dato un __User__ 

__Metodo 1: media relevance * media score__

In [19]:
# Top-K categories for the user, ranked by AVG(relevance) * AVG(score).
# Values are sent as Cypher parameters rather than interpolated into the query text.
graph.run(
    (
        "MATCH(u:User)-[r:RATES]->(m:Movie)-[h:HAS_CATEGORY]->(c:Category) "
        "WHERE u.id = $user_id "
        "RETURN c.name as CATEGORY, AVG(h.relevance)*AVG(r.score) as SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT $k"
    ),
    {"user_id": user_id, "k": K},
).to_table()


CATEGORY,SCORE
jay and silent bob,4.473
math,4.399875
mathematics,4.33575
tolkien,3.993859375
short-term memory loss,3.992


__Metodo 2: media relevance * count rating * max score__

In [8]:
# Top-K categories for the user, ranked by AVG(relevance) * COUNT(score) * MAX(score).
# Values are sent as Cypher parameters rather than interpolated into the query text.
graph.run(
    (
        "MATCH(u:User)-[r:RATES]->(m:Movie)-[h:HAS_CATEGORY]->(c:Category) "
        "WHERE u.id = $user_id "
        "RETURN c.name as CATEGORY, AVG(h.relevance)*COUNT(r.score)*MAX(r.score) as SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT $k"
    ),
    {"user_id": user_id, "k": K},
).to_table()


CATEGORY,SCORE
original,250.45125
criterion,211.21375
storytelling,209.69875
melancholic,209.55875
reflective,199.2375


__Tempo di computazione (Metodo 2)__

In [20]:
def get_top_k_categories(user_id, K):
    """Return the K best-scoring categories for one user.

    Each category is scored as AVG(relevance) * COUNT(score) * MAX(score) over
    the movies the user rated that are linked to that category.

    Args:
        user_id: Value matched against the `id` property of the `User` node.
        K: Maximum number of rows to return (used as the Cypher LIMIT, so it
            must be an integer).

    Returns:
        The table produced by `.to_table()` with columns CATEGORY and SCORE,
        ordered by SCORE descending.
    """
    # HAS_CATEGORY is traversed Movie -> Category, as in the other category
    # queries of this notebook; values are passed as Cypher parameters.
    query = (
        "MATCH(u:User)-[r:RATES]->(m:Movie)-[h:HAS_CATEGORY]->(c:Category) "
        "WHERE u.id = $user_id "
        "RETURN c.name as CATEGORY, AVG(h.relevance)*COUNT(r.score)*MAX(r.score) as SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT $k"
    )
    return graph.run(query, {"user_id": user_id, "k": K}).to_table()


In [10]:
total_time = 0

# Benchmark get_top_k_categories over every user. The loop uses its own variable so that
# the notebook-level `user_id` read by other query cells is not overwritten.
benchmark_user_ids = user_uuid_associations.values()
for benchmark_user_id in tqdm(benchmark_user_ids):
    start_time = time.perf_counter()  # monotonic clock, intended for measuring intervals
    get_top_k_categories(benchmark_user_id, K)
    total_time += (time.perf_counter() - start_time)

# Mean wall-clock seconds per call.
print(total_time/len(benchmark_user_ids))

100%|██████████| 162541/162541 [1:23:21<00:00, 32.50it/s]

0.030554692825194497





__Query 3__ : Ottenere i __top $\mathbf{k}$ Movies__, dato un __Genre__.

In [6]:
# Load the pickled genre associations; later cells index this object by genre name
# (e.g. 'Action') and iterate over .values().
# presumably a dict mapping genre name to the genre's UUID string — TODO confirm.
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted source.
genre_uuid_associations = pd.read_pickle("data/genre_uuids.pkl")

__Metodo 1: media degli score__

In [14]:
genre_id = genre_uuid_associations['Action']

# Top 10 movies of the genre, ranked by the mean of all user scores.
# The genre id is sent as a Cypher parameter, and the MATCH fragment now ends
# with a space so it does not run straight into RETURN.
graph.run(
    (
        "MATCH(g:Genre{id: $genre_id})<-[h:HAS_GENRE]-(m:Movie)<-[r:RATES]-(u:User) "
        "RETURN m.title as MOVIE, avg(r.score) as SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT 10"
    ),
    {"genre_id": genre_id},
).to_table()


MOVIE,SCORE
Days of Power,5.0
White Rush,5.0
Pit Fighter,5.0
Throwback,5.0
Santosh Subramaniam,5.0
Kabir Singh,5.0
Junga,5.0
Awe!,5.0
Kaithi,5.0
Dangerous Flowers,5.0


__Metodo 2: count * max score__

In [17]:
# Top 10 movies of the genre, ranked by COUNT(score) * MAX(score).
# The genre id is sent as a Cypher parameter, and the MATCH fragment now ends
# with a space so it does not run straight into RETURN.
graph.run(
    (
        "MATCH(g:Genre{id: $genre_id})<-[h:HAS_GENRE]-(m:Movie)<-[r:RATES]-(u:User) "
        "RETURN m.title as MOVIE, count(r.score)*max(r.score) as SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT 10"
    ),
    {"genre_id": genre_id},
).to_table()


MOVIE,SCORE
"Matrix, The",363370
Star Wars: Episode IV - A New Hope,343585
Jurassic Park,320720
Braveheart,295920
Fight Club,293865
Terminator 2: Judgment Day,286895
Star Wars: Episode V - The Empire Strikes Back,286805
Star Wars: Episode VI - Return of the Jedi,274585
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark),273375
"Lord of the Rings: The Return of the King, The",253985


__Tempo di computazione (Metodo 2)__

In [11]:
def get_top_k_movies(genre_id, K):
    """Return the K best-scoring movies of one genre.

    Each movie is scored as COUNT(score) * MAX(score) over all of its ratings.

    NOTE: a later cell of this notebook defines another function with this same
    name (the category variant); once that cell runs it replaces this one.

    Args:
        genre_id: Value matched against the `id` property of the `Genre` node.
        K: Maximum number of rows to return (used as the Cypher LIMIT, so it
            must be an integer).

    Returns:
        The table produced by `.to_table()` with columns MOVIE and SCORE,
        ordered by SCORE descending.
    """
    query = (
        "MATCH(g:Genre{id: $genre_id})<-[h:HAS_GENRE]-(m:Movie)<-[r:RATES]-(u:User) "
        "RETURN m.title as MOVIE, count(r.score)*max(r.score) as SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT $k"
    )
    # Materialize the result like the other get_top_k_* helpers do, so the
    # timing cells measure the same amount of work for every query.
    return graph.run(query, {"genre_id": genre_id, "k": K}).to_table()

In [15]:
total_time = 0

# Benchmark get_top_k_movies over every genre. The loop uses its own variable so that
# the notebook-level `genre_id` used by the example cells is not overwritten.
benchmark_genre_ids = genre_uuid_associations.values()
for benchmark_genre_id in tqdm(benchmark_genre_ids):
    start_time = time.perf_counter()  # monotonic clock, intended for measuring intervals
    get_top_k_movies(benchmark_genre_id, K)
    total_time += (time.perf_counter() - start_time)

# Mean wall-clock seconds per call.
print(total_time/len(benchmark_genre_ids))

100%|██████████| 19/19 [06:01<00:00, 19.00s/it]

19.00047913350557





__Query 4__ : Ottenere i __top $\mathbf{k}$ Movies__, data una __Category__.

In [23]:
# Load the pickled category associations; a later cell iterates over .values().
# presumably a dict mapping an integer category index to the category's UUID string — TODO confirm.
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted source.
categories_uuid_associations = pd.read_pickle("data/categories_uuids.pkl")
# Example category used by the Query 4 cells below (the entry stored under key 1).
category_id = categories_uuid_associations[1]

In [21]:
# Top 5 movies of the category, ranked by the relevance stored on HAS_CATEGORY.
# The category id is sent as a Cypher parameter rather than interpolated into the query text.
graph.run(
    (
        "MATCH(c:Category{id: $category_id})<-[h:HAS_CATEGORY]-(m:Movie) "
        "RETURN m.title as MOVIE, h.relevance as RELEVANCE "
        "ORDER BY RELEVANCE DESC "
        "LIMIT 5"
    ),
    {"category_id": category_id},
).to_table()


MOVIE,RELEVANCE
Tomorrow Never Dies,0.99975
Octopussy,0.99975
You Only Live Twice,0.99975
Never Say Never Again,0.99975
"View to a Kill, A",0.99975


In [25]:
# Top 5 movies of the category, ranked by relevance * AVG(score) over all user ratings.
# The category id is sent as a Cypher parameter, and the MATCH fragment now ends
# with a space so it does not run straight into RETURN.
graph.run(
    (
        "MATCH(c:Category{id: $category_id})<-[h:HAS_CATEGORY]-(m:Movie)<-[r:RATES]-(u:User) "
        "RETURN m.title as MOVIE, h.relevance*avg(r.score) as POP_RELEVANCE "
        "ORDER BY POP_RELEVANCE DESC "
        "LIMIT 5"
    ),
    {"category_id": category_id},
).to_table()


MOVIE,POP_RELEVANCE
Casino Royale,3.616791603519913
Goldfinger,3.605932308455252
"Ipcress File, The",3.605676671214189
From Russia with Love,3.5721480889553874
Dr. No,3.5470968750000127


In [26]:
def get_top_k_movies(category_id, K):
    """Return the K best-scoring movies of one category.

    Each movie is scored as relevance * AVG(score), where relevance is read
    from the HAS_CATEGORY relationship and the average runs over all ratings.

    NOTE: this reuses the name of the genre-based helper defined in an earlier
    cell and replaces it once this cell runs.

    Args:
        category_id: Value matched against the `id` property of the `Category` node.
        K: Maximum number of rows to return (used as the Cypher LIMIT, so it
            must be an integer).

    Returns:
        The table produced by `.to_table()` with columns MOVIE and
        POP_RELEVANCE, ordered by POP_RELEVANCE descending.
    """
    query = (
        "MATCH(c:Category{id: $category_id})<-[h:HAS_CATEGORY]-(m:Movie)<-[r:RATES]-(u:User) "
        "RETURN m.title as MOVIE, h.relevance*avg(r.score) as POP_RELEVANCE "
        "ORDER BY POP_RELEVANCE DESC "
        "LIMIT $k"
    )
    # The original body dropped the result, so the function always returned None.
    return graph.run(query, {"category_id": category_id, "k": K}).to_table()

In [27]:
total_time = 0

# Benchmark get_top_k_movies over every category. The loop uses its own variable so that
# the notebook-level `category_id` used by the example cells is not overwritten.
benchmark_category_ids = categories_uuid_associations.values()
for benchmark_category_id in tqdm(benchmark_category_ids):
    start_time = time.perf_counter()  # monotonic clock, intended for measuring intervals
    get_top_k_movies(benchmark_category_id, K)
    total_time += (time.perf_counter() - start_time)

# Mean wall-clock seconds per call.
print(total_time/len(benchmark_category_ids))

  1%|          | 11/1128 [01:34<2:39:07,  8.55s/it]


KeyboardInterrupt: 

#### __USER BASED CF__

In [37]:
"""
CALL gds.graph.project(
  'movie_recommendations_cf',                                
  ["User", "Movie", "Genre", "Category"],  
  [
  {
    HAS_GENRE: {orientation: "UNDIRECTED"}
  },
  {
    RATES : { properties: "score", orientation: "UNDIRECTED"}
  },
  {
    DESCRIBES : { properties: "relevance", orientation: "UNDIRECTED"}
  } 
  ]                
)
YIELD
  graphName, nodeProjection, nodeCount AS nodes, relationshipCount AS rels
RETURN graphName, nodeProjection.Book AS bookProjection, nodes, rels
"""

'\nCALL gds.graph.project(\n  \'movies_recommendations\',                                \n  ["User", "Movie", "Genre", "Category"],  \n  ["HAS_GENRE",                  \n  {\n    RATES : { properties: "score"}\n  },\n  {\n    DESCRIBES : { properties: "relevance"}\n  }\n  ]                       \n)\nYIELD\n  graphName, nodeProjection, nodeCount AS nodes, relationshipCount AS rels\nRETURN graphName, nodeProjection.Book AS bookProjection, nodes, rels\n'

In [None]:
"""
CALL gds.fastRP.mutate('movie_recommendations_cf',
  {
    nodeLabels: ["User", "Movie"],
    relationshipTypes: ["RATES"],
    embeddingDimension: 256,
    relationshipWeightProperty: 'score',
    randomSeed: 42,
    mutateProperty: 'embedding'    
  }
)
YIELD nodePropertiesWritten
"""


In [None]:
"""
CALL gds.fastRP.stream('movie_recommendations_cf',
  {
    nodeLabels: ["User", "Movie"],
    relationshipTypes: ["RATES"],
    embeddingDimension: 256,
    relationshipWeightProperty: 'score',
    randomSeed: 42
  }
)
YIELD nodeId, embedding
"""

In [None]:
"""
CALL gds.graph.writeNodeProperties('movie_recommendations_cf', ['embedding'], ['User'])
YIELD propertiesWritten
"""

In [None]:
"""
CALL gds.graph.project(
  'cf-projection',  
  {
      User: {properties : "embedding"}
  },   
    ["*"]                      
)
YIELD
  graphName, nodeProjection, nodeCount AS nodes, relationshipCount AS rels
RETURN graphName, nodeProjection.Book AS bookProjection, nodes, rels
"""

In [None]:
"""
CALL gds.knn.stream(
  "cf-projection",
  {   
      nodeProperties: ["embedding"]
  }
) YIELD node1, node2, similarity
RETURN gds.util.asNode(node1).id AS User1, gds.util.asNode(node2).id AS User2, similarity
ORDER BY similarity DESCENDING, User1, User2
"""

In [None]:
"""
CALL gds.knn.write('cf-projection', {
    nodeProperties: ["embedding"],
    writeRelationshipType: 'SIMILAR',
    writeProperty: 'score',
    topK: 10,
    nodeProperties: ['embedding']
})
YIELD nodesCompared, relationshipsWritten
"""

Get __similar users__, given a __user__

In [35]:
user_id = "71e1599b-ab75-44c1-9667-1ca41fae6c06"
# Users linked to the given user through SIMILAR, most similar first.
# The user id is sent as a Cypher parameter rather than interpolated into the query text.
graph.run(
            (
            "MATCH (u1:User{id: $user_id})-[s:SIMILAR]->(u2:User)"
            " RETURN u2.id AS ID, s.score AS SCORE"
            " ORDER BY SCORE DESC"
            ),
            {"user_id": user_id},
        ).to_table()

ID,SCORE
6242c722-4e0e-4db3-80a3-a03b8b1cb2ba,0.9114094972610474
ac3188f8-03a7-4ab8-a906-ef4dae488f9d,0.908204197883606
a5ef5a66-8586-4ba1-90b7-72510d060630,0.903659999370575
954c1fbd-2790-4d65-9622-987511eb5c04,0.8815503120422363
4a8856f0-5d4e-4d17-b1e2-5c567b8ae139,0.8736069202423096
b8766815-c30c-416b-8ebb-eee37f22865e,0.8730874061584473
fb5bb7b3-1c75-4238-9c04-88e6ed443b3b,0.8679203987121582
2e6a8a5c-5a15-4912-a842-2daff9b99303,0.8675627708435059
da95806c-2365-4b69-b4f6-7a9b62e777b9,0.8667223453521729
bd22c54a-5139-4d4a-bab9-54f4e6855ecf,0.8662257790565491


Recommending movies from __similar users__, given a __user__ (collaborative filtering)

In [37]:
user_id = "71e1599b-ab75-44c1-9667-1ca41fae6c06"

# Step 1 collects the ids of the movies the user already rated.
# Step 2 walks SIMILAR users' ratings, drops movies from step 1, and ranks the
# remainder by rating score * user similarity.
# The user id is sent as a Cypher parameter rather than interpolated into the query text.
graph.run(
    (
        "MATCH(u:User {id: $user_id})-[:RATES]->(m:Movie) "
        "WITH collect(m.id) AS watchedMoviesIds "
        "MATCH (u1:User{id: $user_id})-[s:SIMILAR]->(u2:User)-[r:RATES]->(m:Movie) "
        "WHERE NOT m.id IN watchedMoviesIds "
        "RETURN m.id as ID, m.title as TITLE, r.score*s.score AS SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT 10"
    ),
    {"user_id": user_id},
).to_table()


ID,TITLE,SCORE
e8672bfe-d802-4580-96fa-dabe603a903d,Vergeef,4.54102098941803
5885bdd3-99f6-41c8-a9b1-c95af10b9f0f,The Cookie Carnival,4.407751560211182
a0f42a32-e313-46c4-b3ca-1015e8619eb2,The Fox in the Chicken Coop,4.368034601211548
3b60b1ee-ddd1-4b29-b1ef-9f011f7b6ea8,Extinct Pink,4.368034601211548
a184518b-d17a-4444-aa1e-2417b7975aa9,La Parmigiana,4.368034601211548
ebffde54-3815-4668-ac04-01fed3ff8861,The Unfaithfuls,4.368034601211548
8b380cee-cb7d-47af-ad85-f29dc6ee862f,The Castaways of Turtle Island,4.368034601211548
cd631d6c-4bb6-4fc4-afbf-c2d9bb771b0b,Ha! Ha! Ha!,4.368034601211548
0ae632c2-f7b2-4fe9-aa6b-e3e7afa92dbb,The Scavengers,4.368034601211548
6f555b3c-4783-48e3-bbf1-0a81e2ee0eb3,A Lustful Man,4.368034601211548


#### __CONTENT BASED FILTERING__

In [None]:
"""
CALL gds.graph.project.cypher(
  'movie_recommendations_cb',
  'MATCH (n) WHERE n:Movie OR n:Genre OR n:Category RETURN id(n) AS id, labels(n) AS labels',
  'MATCH (n)-[r:DESCRIBES|HAS_GENRE]-(m) RETURN id(n) AS source, id(m) AS target, type(r) AS type, coalesce(r.relevance, 1.0) AS relevance'    
)   
YIELD
  graphName AS graph, nodeCount AS nodes, relationshipCount AS rels 
"""


In [None]:
"""
CALL gds.fastRP.mutate('movie_recommendations_cb',
  {
    nodeLabels: ["Movie", "Genre", "Category"],
    relationshipTypes: ["HAS_GENRE", "DESCRIBES"],
    embeddingDimension: 256,
    randomSeed: 42,
    mutateProperty: 'embedding'    
    relationshipWeightProperty: 'relevance',
    
  }
)
YIELD nodePropertiesWritten
"""


In [None]:
"""
CALL gds.graph.writeNodeProperties('movie_recommendations_cb', ['embedding'], ['Movie'])
YIELD propertiesWritten
"""

In [None]:
"""
CALL gds.graph.project(
  'cb-projection',  
  {
      Movie : {properties : "embedding"}
  },   
    ["*"]                      
)
YIELD
  graphName, nodeProjection, nodeCount AS nodes, relationshipCount AS rels
RETURN graphName, nodeProjection.Book AS bookProjection, nodes, rels
"""

In [None]:
"""
CALL gds.knn.stream(
  "cb-projection",
  {   
      nodeProperties: ["embedding"]
  }
) YIELD node1, node2, similarity
RETURN gds.util.asNode(node1).id AS Movie1, gds.util.asNode(node2).id AS Movie2, similarity
ORDER BY similarity DESCENDING, Movie1, Movie2
"""

In [None]:
"""
CALL gds.knn.write('cb-projection', {
    nodeProperties: ["embedding"],
    writeRelationshipType: 'SIMILAR',
    writeProperty: 'score',
    topK: 10,
    nodeProperties: ['embedding']
})
YIELD nodesCompared, relationshipsWritten
"""

Get __similar movies__, given a movie

In [11]:
movie_id = "0b142bcb-3bcb-4ce7-943e-b73b16cf4f15"
# Movies linked to the given movie through SIMILAR, most similar first.
# The movie id is sent as a Cypher parameter rather than interpolated into the query text.
graph.run(
    (
        "MATCH (m1:Movie{id: $movie_id})-[s:SIMILAR]->(m2:Movie) "
        "RETURN m2.id AS ID, m2.title AS TITLE, s.score AS SIMILARITY "
        "ORDER BY SIMILARITY DESC "
    ),
    {"movie_id": movie_id},
).to_table()

ID,TITLE,SIMILARITY
0dafe9c9-bf3f-4a5a-9bc2-aa9cb3419471,Death Rides a Horse (Da uomo a uomo),0.9952797889709472
06054ec4-fdbd-49c4-8fbf-03acd206db3d,Parasite,0.9951488971710204
d5d4d59e-aadf-4d1b-995a-77d32b39d25f,Old Boy,0.9937421083450316
ab10a271-44a5-4352-b03d-f87b195cb60c,Ghost in the Shell Arise - Border 3: Ghost Tears,0.9936655163764954
836e7227-12b1-4215-a6e8-04d5a0e33500,Nightcrawler,0.9936624765396118
527f1a8d-fc9b-48bf-baee-81e6ab846b9c,Memento,0.9936578273773192
691551fb-650f-4b57-a114-8bd196c23352,"Band Called Death, A",0.993545651435852
9501228f-c2f2-402b-9b6c-402ff7fd1f58,Pulp Fiction,0.9934476613998412
41b6d0e2-695d-4eb6-a813-6f67ba664fc9,"Thing: Terror Takes Shape, The",0.9933952689170836
352a1319-54df-4b50-bbcd-c1a59b67a6f2,Blade Runner 2049,0.993350625038147


Recommending __similar movies__ to a __user__ (content-based filtering)

In [6]:
user_id = "85cd6e98-63a7-4445-9342-85c2a5628be6"

# Step 1 collects the ids of the movies the user already rated.
# Step 2 follows SIMILAR from each rated movie, drops movies from step 1, and ranks
# the remainder by movie similarity * the user's score for the source movie.
# The user id is sent as a Cypher parameter rather than interpolated into the query text.
graph.run(
    (
        "MATCH(u:User {id: $user_id})-[:RATES]->(m:Movie) "
        "WITH collect(m.id) AS watchedMoviesIds "
        "MATCH (u:User{id: $user_id})-[r:RATES]->(m1:Movie)-[s:SIMILAR]->(m2:Movie) "
        "WHERE NOT m2.id IN watchedMoviesIds "
        "RETURN m2.id AS ID , m2.title AS TITLE, s.score*r.score AS SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT 10"
    ),
    {"user_id": user_id},
).to_table()

ID,TITLE,SCORE
7ce39604-1646-44c1-8ac6-8ea31b9a8084,"Pianist, The",4.992698431015015
5f210d5a-f719-48b7-a388-de6e21b3d781,Blue Planet II,4.97689962387085
e79cfc9a-f881-42a3-9daf-84ef2c776f20,Misery,4.9765545129776
68b0af18-162a-4111-b282-60555570d482,"Green Mile, The",4.9761658906936646
cbf03f03-037e-4b76-8fab-74a18f36c95a,Seven (a.k.a. Se7en),4.975505769252777
81f3f3d5-8d2c-4b2c-8a4c-0b8c773750fa,12 Years a Slave,4.97529149055481
cff9c910-4b33-4461-80e0-69ec1bd3d91d,Dear Zachary: A Letter to a Son About His Father,4.9745529890060425
f8b83362-5876-46dc-9168-cac41bc65def,Cosmos: A Spacetime,4.9743252992630005
29aae105-8c06-465c-9588-f24c091e93f0,Runaway Jury,4.974263310432434
34eeffd8-d5a6-4ae6-a6d3-94064fbd5f48,Slumdog Millionaire,4.974065721035004
