In [2]:
from py2neo import *
import pandas as pd
import time
from tqdm import tqdm
import getpass
import importlib

In [3]:
username = input("Enter username: ")
password = getpass.getpass('Enter password: ')
port = input("Enter Neo4j listening port: ")
graph = Graph(f"bolt://localhost:{port}", auth=(username, password))

__Query 1__: Get __top $\mathbf{k}$ Genres__, given a __User__.

Execution examples ($k=5$)

In [29]:
user_uuid_associations = pd.read_pickle("data/users_uuids.pkl")
categories_uuid_associations = pd.read_pickle("data/categories_uuids.pkl")
genre_uuid_associations = pd.read_pickle("data/genre_uuids.pkl")
movies_uuid_associations = pd.read_pickle("data/movies_uuids.pkl")


In [33]:
K=5

__Method 1: average score__

In [6]:
user_id = user_uuid_associations[1]
K = 5

graph.run(
    (
        "MATCH(u:User)-[r:RATES]->(m:Movie)-[h:HAS_GENRE]->(g:Genre)"
        f"WHERE u.id = '{user_id}'"
        "RETURN g.name as GENRE, AVG(r.score) as AVG_SCORE "
        "ORDER BY AVG_SCORE DESC "
        f"LIMIT {K}"
    )
).to_table()


GENRE,AVG_SCORE
Thriller,4.2
Action,4.0
Animation,4.0
Romance,3.9444444444444446
Crime,3.875


__Method 2: max score * count score__

In [34]:
graph.run(
    (
        "MATCH(u:User)-[r:RATES]->(m:Movie)-[h:HAS_GENRE]->(g:Genre) "
        f"WHERE u.id = '{user_id}' "
        "RETURN g.name as GENRE, MAX(r.score)*COUNT(r.score) as SCORE "
        "ORDER BY SCORE DESC "
        f"LIMIT {K}"
    )
).to_table()


GENRE,SCORE
Drama,265
Comedy,115
Romance,90
Adventure,55
Crime,40


__Computational time (Method 1)__

In [37]:
def get_top_k_genres(user_id, K):
    return graph.run(
            (
                "MATCH(u:User)-[r:RATES]->(m:Movie)-[h:HAS_GENRE]->(g:Genre)"
                f"WHERE u.id = '{user_id}'"
                "RETURN g.name as GENRE, AVG(r.score) as AVG_SCORE "
                "ORDER BY AVG_SCORE DESC "
                f"LIMIT {K}"
            )
        ).to_table()

In [43]:
total_time = 0

for user_id in tqdm(user_uuid_associations.values()):
    start_time = time.time()
    get_top_k_genres(user_id, K)
    total_time += (time.time() - start_time)
    
print(total_time/len(user_uuid_associations.values()))

100%|██████████| 162541/162541 [17:48<00:00, 152.10it/s] 

0.00651306757677376





__Query 2__: Get __top $\mathbf{k}$ Categories__, given a __User__ 

__Method 1: average relevance * average score__

In [19]:
graph.run(
    (
        "MATCH(u:User)-[r:RATES]->(m:Movie)-[h:HAS_CATEGORY]->(c:Category) "
        f"WHERE u.id = '{user_id}' "
        "RETURN c.name as CATEGORY, AVG(h.relevance)*AVG(r.score) as SCORE "
        "ORDER BY SCORE DESC "
        f"LIMIT {K}"
    )
).to_table()


CATEGORY,SCORE
jay and silent bob,4.473
math,4.399875
mathematics,4.33575
tolkien,3.993859375
short-term memory loss,3.992


__Method 2: average relevance* rating count* max score__

In [8]:
graph.run(
    (
        "MATCH(u:User)-[r:RATES]->(m:Movie)-[h:HAS_CATEGORY]->(c:Category) "
        f"WHERE u.id = '{user_id}' "
        "RETURN c.name as CATEGORY, AVG(h.relevance)*COUNT(r.score)*MAX(r.score) as SCORE "
        "ORDER BY SCORE DESC "
        f"LIMIT {K}"
    )
).to_table()


CATEGORY,SCORE
original,250.45125
criterion,211.21375
storytelling,209.69875
melancholic,209.55875
reflective,199.2375


__Computation time (Method 1)__

In [20]:
def get_top_k_categories(user_id, K):
    return graph.run(
        (
            "MATCH(u:User)-[r:RATES]->(m:Movie)-[h:HAS_CATEGORY]-(c:Category) "
            f"WHERE u.id = '{user_id}' "
            "RETURN c.name as CATEGORY, AVG(h.relevance)*COUNT(r.score)*MAX(r.score) as SCORE "
            "ORDER BY SCORE DESC "
            f"LIMIT {K}"
        )
    ).to_table()


In [10]:
total_time = 0

for user_id in tqdm(user_uuid_associations.values()):
    start_time = time.time()
    get_top_k_categories(user_id, K)
    total_time += (time.time() - start_time)
    
print(total_time/len(user_uuid_associations.values()))

100%|██████████| 162541/162541 [1:23:21<00:00, 32.50it/s]

0.030554692825194497





__Query 3__ :Get __top $\mathbf{k}$ Movies__, given a __Genre__.

__Method 1: average score__

In [14]:
genre_id = genre_uuid_associations['Action']

graph.run(
    (
        f"MATCH(g:Genre{{id: '{genre_id}'}})<-[h:HAS_GENRE]-(m:Movie)<-[r:RATES]-(u:User)"
        "RETURN m.title as MOVIE, avg(r.score) as SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT 10"
    )
).to_table()


MOVIE,SCORE
Days of Power,5.0
White Rush,5.0
Pit Fighter,5.0
Throwback,5.0
Santosh Subramaniam,5.0
Kabir Singh,5.0
Junga,5.0
Awe!,5.0
Kaithi,5.0
Dangerous Flowers,5.0


__Method 2: count * max score__

In [17]:
graph.run(
    (
        f"MATCH(g:Genre{{id: '{genre_id}'}})<-[h:HAS_GENRE]-(m:Movie)<-[r:RATES]-(u:User)"
        "RETURN m.title as MOVIE, count(r.score)*max(r.score) as SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT 10"
    )
).to_table()


MOVIE,SCORE
"Matrix, The",363370
Star Wars: Episode IV - A New Hope,343585
Jurassic Park,320720
Braveheart,295920
Fight Club,293865
Terminator 2: Judgment Day,286895
Star Wars: Episode V - The Empire Strikes Back,286805
Star Wars: Episode VI - Return of the Jedi,274585
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark),273375
"Lord of the Rings: The Return of the King, The",253985


__Computation time (Method 1)__

In [11]:
def get_top_k_movies(genre_id, K):
    return graph.run(
        (
            f"MATCH(g:Genre{{id: '{genre_id}'}})<-[h:HAS_GENRE]-(m:Movie)<-[r:RATES]-(u:User)"
            "RETURN m.title as MOVIE, count(r.score)*max(r.score) as SCORE "
            "ORDER BY SCORE DESC "
            f"LIMIT {K}"
        )
    )

In [15]:
total_time = 0

for genre_id in tqdm(genre_uuid_associations.values()):
    start_time = time.time()
    get_top_k_movies(genre_id, K)
    total_time += (time.time() - start_time)
    
print(total_time/len(genre_uuid_associations.values()))

100%|██████████| 19/19 [06:01<00:00, 19.00s/it]

19.00047913350557





__Query 4__ : Get __top $\mathbf{k}$ Movies__, given a __Category__.

In [None]:
category_id = categories_uuid_associations[1]

In [21]:
graph.run(
    (
        f"MATCH(c:Category{{id: '{category_id}'}})<-[h:HAS_CATEGORY]-(m:Movie) "
        "RETURN m.title as MOVIE, h.relevance as RELEVANCE "
        "ORDER BY RELEVANCE DESC "
        "LIMIT 5"
    )
).to_table()


MOVIE,RELEVANCE
Tomorrow Never Dies,0.99975
Octopussy,0.99975
You Only Live Twice,0.99975
Never Say Never Again,0.99975
"View to a Kill, A",0.99975


In [25]:
graph.run(
    (
        f"MATCH(c:Category{{id: '{category_id}'}})<-[h:HAS_CATEGORY]-(m:Movie)<-[r:RATES]-(u:User)"
        "RETURN m.title as MOVIE, h.relevance*avg(r.score) as POP_RELEVANCE "
        "ORDER BY POP_RELEVANCE DESC "
        "LIMIT 5"
    )
).to_table()


MOVIE,POP_RELEVANCE
Casino Royale,3.616791603519913
Goldfinger,3.605932308455252
"Ipcress File, The",3.605676671214189
From Russia with Love,3.5721480889553874
Dr. No,3.5470968750000127


In [7]:
def get_top_k_movies(category_id, K):
    return graph.run(
    (
        f"MATCH(c:Category{{id: '{category_id}'}})<-[h:HAS_CATEGORY]-(m:Movie)<-[r:RATES]-(u:User)"
        "RETURN m.title as MOVIE, h.relevance*avg(r.score) as POP_RELEVANCE "
        "ORDER BY POP_RELEVANCE DESC "
        f"LIMIT {K}"
    )
).to_table()

In [8]:
total_time = 0

for category_id in tqdm(categories_uuid_associations.values()):
    start_time = time.time()
    get_top_k_movies(category_id, K)
    total_time += (time.time() - start_time)
    
print(total_time/len(categories_uuid_associations.values()))

100%|██████████| 1128/1128 [3:20:35<00:00, 10.67s/it] 

10.668235402157967





#### __COLLABORATIVE FILTERING__

In [4]:
import gds_recommendation
importlib.reload(gds_recommendation)

<module 'gds_recommendation' from '/home/alessia/Documenti/University/New Generation Data Models and DBMSs/gds_recommendation.py'>

In [5]:
# creating a GDS graph projection on which operate for COLLABORATIVE FILTERING
gds_recommendation.create_gds_projection(
    neo4j_graph=graph,
    gds_name="cf_projection",
    node_names=["User", "Movie"],
    orientation= "UNDIRECTED",
    rel_names = ["RATES"],
    rel_property= "score"
)


In [6]:
# projecting FastRP embeddings on the GDS graph
gds_recommendation.mutate_fastRP_embedding(
    neo4j_graph= graph,
    gds_name= "cf_projection",
    node_names= ["User", "Movie"],
    rel_names= ["RATES"],
    rel_property= "score",
    embedding_name= "cf_embedding",
)

In [7]:
# writing the embeddings in the neo4j dataset
gds_recommendation.write_fastRP_embedding(neo4j_graph=graph, gds_name="cf_projection", nodes=["User", "Movie"], embedding_name= "cf_embedding")

__USER-BASED__

In [8]:
# projecting a second GDS graph to perform KNN on
gds_recommendation.create_gds_projection(graph, "knn_user_cf", node_names=["User"], node_property= "cf_embedding")

In [9]:
#writing SIMILARITY KNN relationships back to the neo4j dataset
gds_recommendation.write_knn_sim_relationships(neo4j_graph=graph, knn_gds_name="knn_user_cf", node_name= "User", rel_name= "SIMILAR", embedding_name="cf_embedding", property="score")

__Query 5__: Get __Similar Users__, given a __User__

In [12]:
user_id = user_uuid_associations[1]
graph.run(
            (
            f"MATCH (u1:User{{id: '{user_id}'}})-[s:SIMILAR]->(u2:User)"
            " RETURN u2.id AS ID, s.score AS SCORE"
            " ORDER BY SCORE DESC"
            )
        ).to_table()

ID,SCORE
6d5cc357-ffdb-43bf-b168-735ef4f85971,0.9723402261734008
527d00bd-08a6-4702-90c2-2b516318581a,0.972097635269165
933698e2-f31a-4003-9c0e-abb8bea9561d,0.9717543125152588
1d08f028-3e2d-47d0-a08e-5be304133ad9,0.9714540243148804
3ee64c64-0060-4679-b56e-d5bf650187ca,0.9714488983154296


__Computation time__

In [33]:
def get_similar_users(user_id):
    return graph.run(
            (
            f"MATCH (u1:User{{id: '{user_id}'}})-[s:SIMILAR]->(u2:User)"
            " RETURN u2.id AS ID, s.score AS SCORE"
            " ORDER BY SCORE DESC"
            )
        ).to_table()

In [34]:
total_time = 0

for user_id in tqdm(user_uuid_associations.values()):
    start_time = time.time()
    get_similar_users(user_id)
    total_time += (time.time() - start_time)
    
print(total_time/len(user_uuid_associations.values()))

100%|██████████| 162541/162541 [24:22<00:00, 111.13it/s]  

0.008887097293124972





__Query 6__: Given a __User__, recommend __Movies__ movies based on  __Similar users__ 

In [44]:
graph.run(
    (
        f"MATCH(u:User {{id: '{user_id}'}})-[:RATES]->(m:Movie) "
        "WITH collect(m.id) AS watchedMoviesIds "
        f"MATCH (u1:User{{id: '{user_id}'}})-[s:SIMILAR]->(u2:User)-[r:RATES]->(m:Movie) "
        "WHERE NOT m.id IN watchedMoviesIds "
        "RETURN m.id as ID, m.title as TITLE, avg(r.score)*avg(s.score) AS SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT 10"
    )
).to_table()

ID,TITLE,SCORE
79c6fa49-d613-4af3-a72e-4a3739b43bd9,Shaun of the Dead,4.971629977226257
e92771cc-599d-4e90-8597-069a98ca6918,"Ideal Husband, An",4.971629977226257
97f4b2d4-a65d-47fa-a08f-416479e9fa68,Laputa: Castle in the Sky (Tenkû no shiro Rapyuta),4.971629977226257
8435881c-0656-40e2-89ea-3fc463f059c3,"Importance of Being Earnest, The",4.971629977226257
6fb08cda-074a-4420-9932-a445ad2797cf,"Maltese Falcon, The",4.971629977226257
915c7e80-c3ba-4caa-a9ab-b6a82197f0ab,Shallow Grave,4.970742166042328
30364b6a-8313-405c-a16a-22cd44ab73a9,"Good bye, Lenin!",4.970742166042328
c936dcee-e15a-4d0e-ab0a-778197297fe3,Henry V,4.970742166042328
150846fc-92ff-4cfa-b73c-31dea92f536f,True Romance,4.970742166042328
39756336-af98-40a0-9eb4-a255f666bc2d,Source Code,4.970742166042328


__Computation time__

In [5]:
def get_cf_movies(user_id, K):
    return graph.run(
        (
            f"MATCH(u:User {{id: '{user_id}'}})-[:RATES]->(m:Movie) "
            "WITH collect(m.id) AS watchedMoviesIds "
            f"MATCH (u1:User{{id: '{user_id}'}})-[s:SIMILAR]->(u2:User)-[r:RATES]->(m:Movie) "
            "WHERE NOT m.id IN watchedMoviesIds "
            "RETURN m.id as ID, m.title as TITLE, avg(r.score)*avg(s.score) AS SCORE "
            "ORDER BY SCORE DESC "
            f"LIMIT {K}"
        )
    ).to_table()   
    

In [6]:
total_time = 0

for user_id in tqdm(user_uuid_associations.values()):
    start_time = time.time()
    get_cf_movies(user_id, K)
    total_time += (time.time() - start_time)
    
print(total_time/len(user_uuid_associations.values()))

100%|██████████| 162541/162541 [41:41<00:00, 64.98it/s] 


0.015258965377337765


__ITEM-BASED__

In [14]:
# projecting a second GDS graph to perform KNN on
gds_recommendation.create_gds_projection(graph, "knn_item_cf", node_names=["Movie"], node_property= "cf_embedding")

In [15]:
#writing SIMILARITY KNN relationships back to the neo4j dataset
gds_recommendation.write_knn_sim_relationships(neo4j_graph=graph, knn_gds_name="knn_item_cf", node_name= "Movie", rel_name= "USERS_ALSO_LIKED", embedding_name="cf_embedding", property="score")

In [48]:
user_id = user_uuid_associations[1]

graph.run(
    (   f"MATCH(u:User {{id: '{user_id}'}})-[:RATES]->(m:Movie) "
        "WITH collect(m.id) AS watchedMoviesIds "
        f"MATCH (u:User{{id: '{user_id}'}})-[r:RATES]->(m1:Movie)-[s:USERS_ALSO_LIKED]->(m2:Movie) "
        "WHERE NOT m2.id IN watchedMoviesIds "
        "RETURN m2.id AS ID , m2.title AS TITLE, s.score*r.score AS SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT 5"
    )
).to_table()

m1.id,ID,TITLE,SCORE
09d946c5-61ff-457b-a78e-69dcb3b44be9,253491e7-a09c-40be-9c80-0da9b9fed6ce,Three Colors: White (Trzy kolory: Bialy),4.991997480392456
838a7128-9e2a-426a-83ef-de9cb34a880a,336851cb-2cee-4892-b91b-9be7e4d8ba0c,"Eclisse, L' (Eclipse)",4.98407244682312
bac4ba03-6d54-49dc-9591-e89b4387baea,f7990bd7-1794-4fb6-a4c0-d9f3e3f392b8,Seven (a.k.a. Se7en),4.982698559761047
bac4ba03-6d54-49dc-9591-e89b4387baea,500b440d-0626-4218-a9a7-41c4049cddc9,"Silence of the Lambs, The",4.981599450111389
72825dfb-fa4a-4204-af33-475c6e42e46e,35bf95d7-819c-4f80-b88d-dca971c1cd50,Donnie Darko,4.980944395065308


#### __CONTENT BASED CF__

In [40]:
import gds_recommendation
importlib.reload(gds_recommendation)

<module 'gds_recommendation' from '/home/alessia/Documenti/University/New Generation Data Models and DBMSs/gds_recommendation.py'>

In [17]:
gds_recommendation.create_gds_cypher_projection(
    neo4j_graph=graph,
    gds_name="cb_projection",
    node_names=["Movie", "Genre", "Category"],
    rel_names=["HAS_CATEGORY", "HAS_GENRE"],
    property_name="relevance",
)

gds_recommendation.mutate_fastRP_embedding(
    neo4j_graph=graph,
    gds_name="cb_projection",
    node_names=["Genre", "Movie", "Category"],
    rel_names=["HAS_GENRE", "HAS_CATEGORY"],
    rel_property= "relevance",
    embedding_name= "cb_embedding"
)


In [23]:
gds_recommendation.write_fastRP_embedding(
    neo4j_graph=graph,
    gds_name= "cb_projection",
    nodes= ["Movie"],
    embedding_name= "cb_embedding",
)

In [24]:
gds_recommendation.create_gds_projection(
    neo4j_graph=graph,
    gds_name="knn_cb",
    node_names= ["Movie"],
    node_property= "cb_embedding"
)


In [28]:
gds_recommendation.write_knn_sim_relationships(
    neo4j_graph=graph,
    knn_gds_name = "knn_cb",
    node_name= "Movie",
    embedding_name= "cb_embedding",
    rel_name="SIMILAR",
    property= "score"
)

__Query 7__: Given a __Movie__, find similar __Movies__

In [49]:
movie_id = movies_uuid_associations[1]

graph.run(
    (
        f"MATCH (m1:Movie{{id: '{movie_id}'}})-[s:SIMILAR]->(m2:Movie) "
        "RETURN m2.id AS ID, m2.title AS TITLE, s.score AS SIMILARITY "
        "ORDER BY SIMILARITY DESC "
    )
).to_table()

ID,TITLE,SIMILARITY
c1637da5-4a10-4705-91a3-ccd2b2dfae5d,"Monsters, Inc.",0.9974408149719238
291ee42a-318f-4f13-9663-bdfdb4563a00,Finding Nemo,0.996065616607666
e1296d83-27ef-4eb8-acb1-1c843561bfd1,Toy Story 2,0.9955590963363647
7f677070-b37b-4994-a622-4478c1199e7c,Toy Story 3,0.994315266609192
3012ff53-1a92-477e-9721-25418152ff99,Ratatouille,0.9941790103912354


In [50]:
def get_similar_movies(movie_id):
    return graph.run(
        (
            f"MATCH (m1:Movie{{id: '{movie_id}'}})-[s:SIMILAR]->(m2:Movie) "
            "RETURN m2.id AS ID, m2.title AS TITLE, s.score AS SIMILARITY "
            "ORDER BY SIMILARITY DESC "
        )
    ).to_table()

In [51]:
total_time = 0

for movie_id in tqdm(movies_uuid_associations.values()):
    start_time = time.time()
    get_similar_movies(movie_id)
    total_time += (time.time() - start_time)
    
print(total_time/len(movies_uuid_associations.values()))

100%|██████████| 62423/62423 [05:24<00:00, 192.60it/s]

0.0051384527509131485





__Query 8__: Given a __User__, recommend __Movies__ based on __similarità__ with the ones he has watched

In [31]:
user_id = user_uuid_associations[1]

graph.run(
    (   f"MATCH(u:User {{id: '{user_id}'}})-[:RATES]->(m:Movie) "
        "WITH collect(m.id) AS watchedMoviesIds "
        f"MATCH (u:User{{id: '{user_id}'}})-[r:RATES]->(m1:Movie)-[s:SIMILAR]->(m2:Movie) "
        "WHERE NOT m2.id IN watchedMoviesIds "
        "RETURN m2.id AS ID , m2.title AS TITLE, s.score*r.score AS SCORE "
        "ORDER BY SCORE DESC "
        "LIMIT 10"
    )
).to_table()

ID,TITLE,SCORE
d94f08ef-db2a-43b1-803b-687d5b6404cd,Reservoir Dogs,4.989688396453857
8f6da847-c589-4703-8acc-f09946ebfb53,Youth,4.983693361282349
7a227d5b-d4fd-48e9-b06b-8466edaa9fc5,3-Iron (Bin-jip),4.981707334518433
c392b034-a10f-4a43-b474-6e4d4d49415d,Akira Kurosawa's Dreams (Dreams),4.981599748134613
f1d11eed-5ce7-4e6a-898a-9bc9910300ed,Reconstruction,4.981414079666138
3545fcc6-35d0-4250-b6be-fa16a750132f,In Bruges,4.981272220611572
e6095f71-9c8b-4e8c-bd5c-cfea313f85ab,Last Life in the Universe (Ruang rak noi nid mahasan),4.9810075759887695
39a2f054-3a0e-40c1-9877-5d38c5121a18,Once Upon a Time in Hollywood,4.980625808238983
8cb22eaa-51df-44c1-96cd-d800376a08ce,Nostalghia,4.980106949806213
b3345260-d037-43cb-b333-8f7a41cf9476,The Hateful Eight,4.979964196681976


__Computation time__

In [34]:
def get_cb_movies(user_id, K):
    return graph.run(
        (   f"MATCH(u:User {{id: '{user_id}'}})-[:RATES]->(m:Movie) "
            "WITH collect(m.id) AS watchedMoviesIds "
            f"MATCH (u:User{{id: '{user_id}'}})-[r:RATES]->(m1:Movie)-[s:SIMILAR]->(m2:Movie) "
            "WHERE NOT m2.id IN watchedMoviesIds "
            "RETURN m2.id AS ID , m2.title AS TITLE, s.score*r.score AS SCORE "
            "ORDER BY SCORE DESC "
            f"LIMIT {K}"
        )
    ).to_table()

In [35]:
total_time = 0

for user_id in tqdm(user_uuid_associations.values()):
    start_time = time.time()
    get_cb_movies(user_id, K)
    total_time += (time.time() - start_time)
    
print(total_time/len(user_uuid_associations.values()))

100%|██████████| 162541/162541 [34:14<00:00, 79.13it/s]  


0.012507320784452268
