<h1>Step 1: Data Exploration</h1>

In [221]:
import networkx as nx
print(nx.__version__)


3.4.2


In [222]:
import matplotlib.pyplot as plt
import certifi
import ssl

context = ssl.create_default_context(cafile=certifi.where())
print("SSL context created from certifi CA bundle")


SSL context created from certifi CA bundle


In [223]:
from langchain_community.graphs import Neo4jGraph
import certifi
import ssl

# Create custom SSL context
ssl_context = ssl.create_default_context(cafile=certifi.where())

kg = Neo4jGraph(
    url="bolt://demo.neo4jlabs.com:7687",  # 🔁 Use plain scheme
    username="recommendations",
    password="recommendations",
    database="recommendations",
    driver_config={
        "encrypted": True,                # 🔒 Enable encryption manually
        "ssl_context": ssl_context        # ✅ Pass trusted CA bundle
    }
)

print(kg.query("CALL db.labels()"))


[{'label': 'Movie'}, {'label': 'Genre'}, {'label': 'User'}, {'label': 'Actor'}, {'label': 'Director'}, {'label': 'Person'}, {'label': '_Bloom_Perspective_'}, {'label': '_Bloom_Scene_'}]


In [82]:
kg.query("MATCH (n) RETURN COUNT(n) AS total_nodes")

[{'total_nodes': 28865}]

In [5]:
kg.query("MATCH ()-[r]->() RETURN COUNT(r) AS total_relationships")


[{'total_relationships': 166262}]

In [45]:
# Get all distinct node labels to understand the types of entities in the graph (Movie, Actor, Genre, etc.)
kg.query("CALL db.labels()")


[{'label': 'Movie'},
 {'label': 'Genre'},
 {'label': 'User'},
 {'label': 'Actor'},
 {'label': 'Director'},
 {'label': 'Person'},
 {'label': '_Bloom_Perspective_'},
 {'label': '_Bloom_Scene_'}]

In [46]:
kg.query("""
CALL db.labels() YIELD label
WITH label, 'MATCH (n:`' + label + '`) RETURN count(n)' AS query
CALL apoc.cypher.run(query, {}) YIELD value
RETURN label, value['count(n)'] AS count
""")


[{'label': 'Movie', 'count': 9125},
 {'label': 'Genre', 'count': 20},
 {'label': 'User', 'count': 671},
 {'label': 'Actor', 'count': 15443},
 {'label': 'Director', 'count': 4091},
 {'label': 'Person', 'count': 19047},
 {'label': '_Bloom_Perspective_', 'count': 1},
 {'label': '_Bloom_Scene_', 'count': 1}]

## View All Relationship Types

In [47]:
# View all relationship types (e.g., ACTED_IN, RATED) to understand how entities are connected
kg.query("CALL db.relationshipTypes()")


[{'relationshipType': 'IN_GENRE'},
 {'relationshipType': 'RATED'},
 {'relationshipType': 'ACTED_IN'},
 {'relationshipType': 'DIRECTED'},
 {'relationshipType': '_Bloom_HAS_SCENE_'}]

In [48]:
# List all property keys available in the graph, such as title, plot, plotEmbedding, etc.

kg.query("CALL db.propertyKeys()")


[{'propertyKey': 'movieId'},
 {'propertyKey': 'name'},
 {'propertyKey': 'userId'},
 {'propertyKey': 'imdbId'},
 {'propertyKey': 'title'},
 {'propertyKey': 'rating'},
 {'propertyKey': 'timestamp'},
 {'propertyKey': 'tmdbId'},
 {'propertyKey': 'year'},
 {'propertyKey': 'countries'},
 {'propertyKey': 'languages'},
 {'propertyKey': 'plot'},
 {'propertyKey': 'imdbRating'},
 {'propertyKey': 'imdbVotes'},
 {'propertyKey': 'released'},
 {'propertyKey': 'runtime'},
 {'propertyKey': 'poster'},
 {'propertyKey': 'revenue'},
 {'propertyKey': 'budget'},
 {'propertyKey': 'tagline'},
 {'propertyKey': 'role'},
 {'propertyKey': 'born'},
 {'propertyKey': 'died'},
 {'propertyKey': 'bornIn'},
 {'propertyKey': 'bio'},
 {'propertyKey': 'url'},
 {'propertyKey': 'plotEmbedding'},
 {'propertyKey': 'posterEmbedding'},
 {'propertyKey': 'id'},
 {'propertyKey': 'data'},
 {'propertyKey': 'nodes'},
 {'propertyKey': 'relationships'},
 {'propertyKey': 'style'},
 {'propertyKey': 'visualisation'},
 {'propertyKey': 'versi

In [49]:
# Visualize the graph schema — shows how node types and relationships are structured

kg.query("CALL db.schema.visualization()")


[{'nodes': [{'name': '_Bloom_Perspective_',
    'indexes': [],
    'constraints': ["Constraint( id=3, name='constraint_f7832722', type='UNIQUENESS', schema=(:_Bloom_Perspective_ {id}), ownedIndex=1 )"]},
   {'name': 'Movie',
    'indexes': ['year',
     'imdbRating',
     'released',
     'imdbId',
     'title',
     'tagline',
     'title,plot',
     'plotEmbedding',
     'posterEmbedding'],
    'constraints': ["Constraint( id=77, name='constraint_737d9c1d', type='UNIQUENESS', schema=(:Movie {tmdbId}), ownedIndex=61 )",
     "Constraint( id=75, name='constraint_3d5fcb7f', type='UNIQUENESS', schema=(:Movie {movieId}), ownedIndex=59 )"]},
   {'name': 'User',
    'indexes': ['name'],
    'constraints': ["Constraint( id=76, name='constraint_3b27b0', type='UNIQUENESS', schema=(:User {userId}), ownedIndex=64 )"]},
   {'name': 'Actor', 'indexes': [], 'constraints': []},
   {'name': 'Director', 'indexes': [], 'constraints': []},
   {'name': 'Genre',
    'indexes': [],
    'constraints': ["Con

In [50]:
# Return a few Movie nodes with all their properties to preview the movie data (title, plot, embedding, etc.)
#plotEmbedding → a numerical vector representing the movie's plot
#posterEmbedding → a numerical vector representing the movie's poster image
kg.query("MATCH (m:Movie) RETURN m LIMIT 5")


[{'m': {'languages': ['English'],
   'plotEmbedding': [-0.026989128440618515,
    -0.024155009537935257,
    0.006058253347873688,
    -0.024324016645550728,
    -0.022516941651701927,
    -0.0050864629447460175,
    -0.013442561961710453,
    -0.004462436772882938,
    0.001889954088255763,
    -0.017147717997431755,
    0.00504421116784215,
    -0.007975833490490913,
    0.03221534565091133,
    -0.012272513471543789,
    0.01178499311208725,
    0.02133389189839363,
    0.028627198189496994,
    -0.0005025522550567985,
    0.014040587469935417,
    -0.014157592318952084,
    0.0014495606301352382,
    0.008027835749089718,
    -0.0222049281001091,
    -0.025013046339154243,
    0.004394183866679668,
    -0.00825534574687481,
    0.023660989478230476,
    -0.025416063144803047,
    0.037181556224823,
    0.00314450659789145,
    0.008619360625743866,
    -0.012064504437148571,
    0.006025752052664757,
    -0.006955291144549847,
    -0.03286537528038025,
    -0.024545026943087578,
  

In [51]:
# Return sample Actor nodes to explore properties like name, birth info, and bio
kg.query("MATCH (a:Actor) RETURN a LIMIT 5")


[{'a': {'bornIn': 'France',
   'tmdbId': '1271225',
   'imdbId': '2083046',
   'born': neo4j.time.Date(1877, 2, 4),
   'name': 'François Lallement',
   'died': neo4j.time.Date(1954, 1, 1),
   'url': 'https://themoviedb.org/person/1271225'}},
 {'a': {'tmdbId': '1602569',
   'imdbId': '6170115',
   'born': neo4j.time.Date(1862, 1, 1),
   'name': 'Jules-Eugène Legris',
   'died': neo4j.time.Date(1926, 1, 1),
   'url': 'https://themoviedb.org/person/1602569'}},
 {'a': {'bornIn': 'Springfield, Ohio, USA',
   'tmdbId': '8828',
   'imdbId': '0001273',
   'born': neo4j.time.Date(1893, 10, 14),
   'name': 'Lillian Gish',
   'bio': "\u200bFrom Wikipedia, the free encyclopedia\n\nLillian Diana Gish (October 14, 1893 – February 27, 1993) was an American stage, screen and television actress whose film acting career spanned 75 years, from 1912 to 1987. \n\nShe was a prominent film star of the 1910s and 1920s, particularly associated with the films of director D. W.  Griffith, including her leading r

## Visualisation

In [65]:
from pyvis.network import Network
import networkx as nx
from IPython.display import IFrame

query = """
MATCH (m:Movie)-[r:IN_GENRE]->(g:Genre)
RETURN elementId(m) AS source, elementId(g) AS target,
       type(r) AS rel, r,
       COALESCE(m.title, 'Movie') AS source_name,
       COALESCE(g.name, 'Genre') AS target_name
LIMIT 100
"""
results = kg.query(query)

G = nx.DiGraph()
for row in results:
    G.add_node(row['source'], label=row['source_name'])
    G.add_node(row['target'], label=row['target_name'])
    G.add_edge(row['source'], row['target'], label=row['rel'])

net = Network(notebook=True, directed=True, cdn_resources="in_line")
for node in G.nodes(data=True):
    net.add_node(node[0], label=node[1]['label'])
for edge in G.edges(data=True):
    net.add_edge(edge[0], edge[1], title=edge[2]['label'])

net.show("knowledge_graph_IN_GENRE.html")
IFrame("knowledge_graph_IN_GENRE.html", width=1000, height=1000)


knowledge_graph_IN_GENRE.html


In [66]:
query = """
MATCH (u:User)-[r:RATED]->(m:Movie)
RETURN elementId(u) AS source, elementId(m) AS target,
       type(r) AS rel, r,
       COALESCE(u.name, 'User') AS source_name,
       COALESCE(m.title, 'Movie') AS target_name
LIMIT 800
"""
results = kg.query(query)

G = nx.DiGraph()
for row in results:
    G.add_node(row['source'], label=row['source_name'])
    G.add_node(row['target'], label=row['target_name'])

    edge_label = row['rel']
    if isinstance(row['r'], dict) and 'rating' in row['r']:
        edge_label += f" ({row['r']['rating']})"
    G.add_edge(row['source'], row['target'], label=edge_label)

net = Network(notebook=True, directed=True, cdn_resources="in_line")
for node in G.nodes(data=True):
    net.add_node(node[0], label=node[1]['label'])
for edge in G.edges(data=True):
    net.add_edge(edge[0], edge[1], title=edge[2]['label'])

net.show("knowledge_graph_RATED.html")
IFrame("knowledge_graph_RATED.html", width=1000, height=1000)


knowledge_graph_RATED.html


In [67]:
query = """
MATCH (a:Actor)-[r:ACTED_IN]->(m:Movie)
RETURN elementId(a) AS source, elementId(m) AS target,
       type(r) AS rel, r,
       COALESCE(a.name, 'Actor') AS source_name,
       COALESCE(m.title, 'Movie') AS target_name
LIMIT 800
"""
results = kg.query(query)

G = nx.DiGraph()
for row in results:
    G.add_node(row['source'], label=row['source_name'])
    G.add_node(row['target'], label=row['target_name'])
    G.add_edge(row['source'], row['target'], label=row['rel'])

net = Network(notebook=True, directed=True, cdn_resources="in_line")
for node in G.nodes(data=True):
    net.add_node(node[0], label=node[1]['label'])
for edge in G.edges(data=True):
    net.add_edge(edge[0], edge[1], title=edge[2]['label'])

net.show("knowledge_graph_ACTED_IN.html")
IFrame("knowledge_graph_ACTED_IN.html", width=1000, height=1000)


knowledge_graph_ACTED_IN.html


In [68]:
query = """
MATCH (d:Director)-[r:DIRECTED]->(m:Movie)
RETURN elementId(d) AS source, elementId(m) AS target,
       type(r) AS rel, r,
       COALESCE(d.name, 'Director') AS source_name,
       COALESCE(m.title, 'Movie') AS target_name
LIMIT 800
"""
results = kg.query(query)

G = nx.DiGraph()
for row in results:
    G.add_node(row['source'], label=row['source_name'])
    G.add_node(row['target'], label=row['target_name'])
    G.add_edge(row['source'], row['target'], label=row['rel'])

net = Network(notebook=True, directed=True, cdn_resources="in_line")
for node in G.nodes(data=True):
    net.add_node(node[0], label=node[1]['label'])
for edge in G.edges(data=True):
    net.add_edge(edge[0], edge[1], title=edge[2]['label'])

net.show("knowledge_graph_DIRECTED.html")
IFrame("knowledge_graph_DIRECTED.html", width=1000, height=1000)


knowledge_graph_DIRECTED.html


In [69]:
query = """
MATCH (p:Person)-[r:ACTED_IN|DIRECTED]->(m:Movie)
RETURN elementId(p) AS source, elementId(m) AS target,
       type(r) AS rel, r,
       COALESCE(p.name, 'Person') AS source_name,
       COALESCE(m.title, 'Movie') AS target_name
LIMIT 800
"""
results = kg.query(query)

G = nx.DiGraph()
for row in results:
    G.add_node(row['source'], label=row['source_name'])
    G.add_node(row['target'], label=row['target_name'])
    G.add_edge(row['source'], row['target'], label=row['rel'])

net = Network(notebook=True, directed=True, cdn_resources="in_line")
for node in G.nodes(data=True):
    net.add_node(node[0], label=node[1]['label'])
for edge in G.edges(data=True):
    net.add_edge(edge[0], edge[1], title=edge[2]['label'])

net.show("knowledge_graph_PERSON_COMBINED.html")
IFrame("knowledge_graph_PERSON_COMBINED.html", width=1000, height=1000)


knowledge_graph_PERSON_COMBINED.html


In [73]:
!pip install pandas numpy scikit-learn matplotlib


Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: tzdata, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [pandas]2m1/2[0m [pandas]
[1A[2KSuccessfully installed pandas-2.2.3 tzdata-2025.2


#### Checking embedding from online neo4j and created embedding

In [83]:
kg.query("""
MATCH (m:Movie {title: "Toy Story"})
RETURN m.plotEmbedding AS plotEmbedding
""")


[{'plotEmbedding': [-0.026989128440618515,
   -0.024155009537935257,
   0.006058253347873688,
   -0.024324016645550728,
   -0.022516941651701927,
   -0.0050864629447460175,
   -0.013442561961710453,
   -0.004462436772882938,
   0.001889954088255763,
   -0.017147717997431755,
   0.00504421116784215,
   -0.007975833490490913,
   0.03221534565091133,
   -0.012272513471543789,
   0.01178499311208725,
   0.02133389189839363,
   0.028627198189496994,
   -0.0005025522550567985,
   0.014040587469935417,
   -0.014157592318952084,
   0.0014495606301352382,
   0.008027835749089718,
   -0.0222049281001091,
   -0.025013046339154243,
   0.004394183866679668,
   -0.00825534574687481,
   0.023660989478230476,
   -0.025416063144803047,
   0.037181556224823,
   0.00314450659789145,
   0.008619360625743866,
   -0.012064504437148571,
   0.006025752052664757,
   -0.006955291144549847,
   -0.03286537528038025,
   -0.024545026943087578,
   0.004871953744441271,
   -0.028627198189496994,
   0.0121035063639283

In [84]:
import pandas as pd
import ast

# Load your movie embedding CSV
df = pd.read_csv('movieembed.csv')

# Check the first few rows
print(df.head())


                         title  \
0                    Toy Story   
1                      Jumanji   
2             Grumpier Old Men   
3            Waiting to Exhale   
4  Father of the Bride Part II   

                                           embedding  
0  [-0.026989128440618515, -0.024155009537935257,...  
1  [-0.0016367682255804539, -0.02242148295044899,...  
2  [0.008853926323354244, -0.02395768091082573, 0...  
3  [-0.024737104773521423, -0.03457356244325638, ...  
4  [-0.004050840623676777, -0.024880992248654366,...  


In [85]:
# Just check the type of the first embedding
print(type(df['embedding'].iloc[0]))


<class 'str'>


In [86]:
print(df['embedding'].iloc[0])


[-0.026989128440618515, -0.024155009537935257, 0.006058253347873688, -0.024324016645550728, -0.022516941651701927, -0.0050864629447460175, -0.013442561961710453, -0.004462436772882938, 0.001889954088255763, -0.017147717997431755, 0.00504421116784215, -0.007975833490490913, 0.03221534565091133, -0.012272513471543789, 0.01178499311208725, 0.02133389189839363, 0.028627198189496994, -0.0005025522550567985, 0.014040587469935417, -0.014157592318952084, 0.0014495606301352382, 0.008027835749089718, -0.0222049281001091, -0.025013046339154243, 0.004394183866679668, -0.00825534574687481, 0.023660989478230476, -0.025416063144803047, 0.037181556224823, 0.00314450659789145, 0.008619360625743866, -0.012064504437148571, 0.006025752052664757, -0.006955291144549847, -0.03286537528038025, -0.024545026943087578, 0.004871953744441271, -0.028627198189496994, 0.012103506363928318, -0.005560982506722212, 0.011414477601647377, 0.03746756538748741, -0.029381228610873222, 0.004104921594262123, -0.006922789383679

In [103]:
print(df.columns)


Index(['title', 'embedding'], dtype='object')


# New Implementation after loading all data

In [2]:
import pandas as pd

# List your CSV file names
csv_files = [
    'movies.csv', 'movieembed.csv', 'ratings.csv', 'users.csv',
    'genre.csv', 'in_genre.csv', 'actor.csv', 'acted_in.csv',
    'directors.csv', 'directed.csv'
]

# Load and inspect each one
for file in csv_files:
    print(f"\n==== {file} ====")
    df = pd.read_csv(file)
    print(df.columns.tolist())   # Show column names
    print(df.head(2))             # Show first 2 rows as sample



==== movies.csv ====
['title', 'plot', 'year', 'runtime', 'budget', 'revenue', 'imdbRating', 'imdbVotes', 'imdbId', 'tmdbId', 'poster', 'countries', 'languages', 'url', 'embedding', 'movieId']
       title                                               plot    year  \
0  Toy Story  A cowboy doll is profoundly threatened and jea...  1995.0   
1    Jumanji  When two kids find and play a magical board ga...  1995.0   

   runtime      budget      revenue  imdbRating  imdbVotes  imdbId  tmdbId  \
0     81.0  30000000.0  373554033.0         8.3   591836.0  114709   862.0   
1    104.0  65000000.0  262797249.0         6.9   198355.0  113497  8844.0   

                                              poster countries  \
0  https://image.tmdb.org/t/p/w440_and_h660_face/...     [USA]   
1  https://image.tmdb.org/t/p/w440_and_h660_face/...     [USA]   

            languages                                url  \
0           [English]   https://themoviedb.org/movie/862   
1  [English,  French]  htt

## Step 1: Load all CSV files into Pandas DataFrames

In [3]:
import pandas as pd

# Load nodes
movies_df = pd.read_csv('movies.csv')
users_df = pd.read_csv('users.csv')
genres_df = pd.read_csv('genre.csv')
actors_df = pd.read_csv('actor.csv')
directors_df = pd.read_csv('directors.csv')

# Load relationships
ratings_df = pd.read_csv('ratings.csv')
in_genre_df = pd.read_csv('in_genre.csv')
acted_in_df = pd.read_csv('acted_in.csv')
directed_df = pd.read_csv('directed.csv')

print("✅ All CSV files loaded!")


✅ All CSV files loaded!


## Step 2: Check basic info about each DataFrame

In [43]:
print("Movies:", movies_df.shape, movies_df.columns.tolist())
print("Users:", users_df.shape, users_df.columns.tolist())
print("Genres:", genres_df.shape, genres_df.columns.tolist())
print("Actors:", actors_df.shape, actors_df.columns.tolist())
print("Directors:", directors_df.shape, directors_df.columns.tolist())

print("Ratings:", ratings_df.shape, ratings_df.columns.tolist())
print("In Genre:", in_genre_df.shape, in_genre_df.columns.tolist())
print("Acted In:", acted_in_df.shape, acted_in_df.columns.tolist())
print("Directed:", directed_df.shape, directed_df.columns.tolist())


Movies: (9083, 16) ['title', 'plot', 'year', 'runtime', 'budget', 'revenue', 'imdbRating', 'imdbVotes', 'imdbId', 'tmdbId', 'poster', 'countries', 'languages', 'url', 'embedding', 'movieId']
Users: (671, 1) ['userId']
Genres: (20, 1) ['name']
Actors: (15443, 1) ['name']
Directors: (4091, 1) ['name']
Ratings: (100004, 4) ['userId', 'movieTitle', 'rating', 'timestamp']
In Genre: (20340, 2) ['movie_title', 'genre_name']
Acted In: (35910, 2) ['actor_name', 'movie_title']
Directed: (10007, 2) ['director_name', 'movie_title']


# NEW TRIPLETS WITH COMPLETE USE OF DATA

In [45]:
import pandas as pd
import ast

# 1. Load all CSVs
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
in_genre_df = pd.read_csv('in_genre.csv')
acted_in_df = pd.read_csv('acted_in.csv')
directed_df = pd.read_csv('directed.csv')

# 2. Create triplets separately
triplets = []

## Movie properties
for idx, row in movies_df.iterrows():
    title = row['title']
    if pd.notnull(row['year']):
        triplets.append((title, "was released in year", str(int(row['year']))))
    if pd.notnull(row['runtime']):
        triplets.append((title, "has a runtime of", f"{int(row['runtime'])} minutes"))
    if pd.notnull(row['budget']) and row['budget'] > 0:
        triplets.append((title, "had a budget of", f"${int(row['budget'])}"))
    if pd.notnull(row['revenue']) and row['revenue'] > 0:
        triplets.append((title, "earned revenue of", f"${int(row['revenue'])}"))
    if pd.notnull(row['imdbRating']):
        triplets.append((title, "has IMDb rating", str(row['imdbRating'])))
    if pd.notnull(row['countries']):
        try:
            countries = ast.literal_eval(row['countries']) if isinstance(row['countries'], str) else row['countries']
            for country in countries:
                triplets.append((title, "was produced in", country))
        except:
            pass
    if pd.notnull(row['languages']):
        try:
            languages = ast.literal_eval(row['languages']) if isinstance(row['languages'], str) else row['languages']
            for language in languages:
                triplets.append((title, "is in language", language))
        except:
            pass

## Ratings (User rated Movie)
for idx, row in ratings_df.iterrows():
    triplets.append((f"User {row['userId']}", f"rated {row['movieTitle']}", f"{row['rating']}"))

## In Genre (Movie belongs to Genre)
for idx, row in in_genre_df.iterrows():
    triplets.append((row['movie_title'], "is in genre", row['genre_name']))

## Acted In (Actor acted in Movie)
for idx, row in acted_in_df.iterrows():
    triplets.append((row['actor_name'], "acted in", row['movie_title']))

## Directed (Director directed Movie)
for idx, row in directed_df.iterrows():
    triplets.append((row['director_name'], "directed", row['movie_title']))

# 3. Confirm total
print(f"✅ Total Triplets Created: {len(triplets)}")

# 4. Peek a few
for t in triplets[:10]:
    print(t)


✅ Total Triplets Created: 203572
('Toy Story', 'was released in year', '1995')
('Toy Story', 'has a runtime of', '81 minutes')
('Toy Story', 'had a budget of', '$30000000')
('Toy Story', 'earned revenue of', '$373554033')
('Toy Story', 'has IMDb rating', '8.3')
('Jumanji', 'was released in year', '1995')
('Jumanji', 'has a runtime of', '104 minutes')
('Jumanji', 'had a budget of', '$65000000')
('Jumanji', 'earned revenue of', '$262797249')
('Jumanji', 'has IMDb rating', '6.9')


## Turn Triplets into Natural Language Sentences

In [None]:
triplet_sentences = []

for subj, rel, obj in triplets:
    sentence = f"{subj} {rel} {obj}"
    triplet_sentences.append(sentence)

print(f" Total Sentences: {len(triplet_sentences)}")
for s in triplet_sentences[:5]:
    print(s)


✅ Total Sentences: 203572
Toy Story was released in year 1995
Toy Story has a runtime of 81 minutes
Toy Story had a budget of $30000000
Toy Story earned revenue of $373554033
Toy Story has IMDb rating 8.3


## Encode these Sentences into Embeddings

In [48]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all sentences
triplet_embeddings = model.encode(triplet_sentences, batch_size=64, show_progress_bar=True)

print(triplet_embeddings.shape)  # Should be (203572, 384)


Batches: 100%|██████████| 3181/3181 [01:00<00:00, 52.33it/s]


(203572, 384)


## Store embeddings in FAISS Index

In [49]:
import faiss
import numpy as np

dimension = triplet_embeddings.shape[1]  # 384
index = faiss.IndexFlatL2(dimension)
index.add(np.array(triplet_embeddings).astype('float32'))

print(f"✅ FAISS Index Created. Total vectors: {index.ntotal}")


✅ FAISS Index Created. Total vectors: 203572


## Build Retriever Function

In [134]:
def retrieve_top_triplets(query, model, index, sentences, top_k=5000):
    query_embedding = model.encode([query])
    D, I = index.search(np.array(query_embedding).astype('float32'), top_k)
    results = []
    for idx, dist in zip(I[0], D[0]):
        if idx < len(sentences):
            results.append((sentences[idx], dist))
    return results


### Test Retriever

In [135]:
sample_queries = [
    "Find adventure movies released after 2000",
    "Movies acted by Tim Allen",
    "Movies directed by Steven Spielberg",
    "Movies produced in USA",
    "Find English language movies",
    "Find movies rated more than 4 by User 1"
]

for query in sample_queries:
    results = retrieve_top_triplets(query, model, index, triplet_sentences, top_k=5)
    print(f"\nQuery: {query}")
    for sentence, dist in results:
        print(f" - {sentence} (Distance: {dist:.4f})")



Query: Find adventure movies released after 2000
 - Adventureland was released in year 2009 (Distance: 0.8231)
 - Back to the Future Part II is in genre Adventure (Distance: 0.8522)
 - Adventures of Rocky and Bullwinkle, The was released in year 2000 (Distance: 0.8528)
 - Back to the Future is in genre Adventure (Distance: 0.8568)
 - Year One is in genre Adventure (Distance: 0.8693)

Query: Movies acted by Tim Allen
 - Tim Allen acted in Toy Story (Distance: 0.4307)
 - Tim Allen acted in Zoom (Distance: 0.4566)
 - Tim Allen acted in Toy Story 3 (Distance: 0.4921)
 - Tim Allen acted in Big Trouble (Distance: 0.4972)
 - Tim Allen acted in Toy Story 2 (Distance: 0.5048)

Query: Movies directed by Steven Spielberg
 - Steven Spielberg directed 1941 (Distance: 0.2879)
 - Steven Spielberg directed Munich (Distance: 0.4188)
 - Steven Spielberg directed Always (Distance: 0.4402)
 - Steven Spielberg directed Duel (Distance: 0.4550)
 - Steven Spielberg directed Jurassic Park (Distance: 0.4676)



In [58]:
pip install langchain-groq


Collecting langchain-groq
  Using cached langchain_groq-0.3.2-py3-none-any.whl.metadata (2.6 kB)
Collecting groq<1,>=0.4.1 (from langchain-groq)
  Using cached groq-0.23.1-py3-none-any.whl.metadata (15 kB)
Collecting distro<2,>=1.7.0 (from groq<1,>=0.4.1->langchain-groq)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Using cached langchain_groq-0.3.2-py3-none-any.whl (15 kB)
Using cached groq-0.23.1-py3-none-any.whl (127 kB)
Using cached distro-1.9.0-py3-none-any.whl (20 kB)
Installing collected packages: distro, groq, langchain-groq
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [langchain-groq]
[1A[2KSuccessfully installed distro-1.9.0 groq-0.23.1 langchain-groq-0.3.2
Note: you may need to restart the kernel to use updated packages.


In [59]:
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain_groq import ChatGroq	

## Build a Simple Retriever Class

In [136]:
from langchain.schema import Document
from langchain.docstore import InMemoryDocstore
from langchain.vectorstores import FAISS

llm = ChatGroq(
    temperature=0,
    model_name="llama3-70b-8192",
    groq_api_key="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"
)


# Fix mapping: Make sure keys are Python int not np.int64
index_to_docstore_id = {int(i): str(i) for i in range(len(triplet_texts))}

# Rebuild docstore correctly
docstore = InMemoryDocstore({str(i): Document(page_content=triplet_texts[i]) for i in range(len(triplet_texts))})

# Rebuild FAISS vectorstore
vectorstore = FAISS(
    embedding_function=model.encode,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id
)

# Rebuild retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 500})


# 4. Build QA chain
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff"
)


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [137]:
#Find adventure movies released after 2000.
query = "Find adventure movies released after 2000."
result = qa_chain.invoke(query)

print("Answer:", result['result'])


kg.query("""
MATCH (m:Movie)-[:IN_GENRE]->(g:Genre)
WHERE g.name = "Adventure" AND m.year > 2000
RETURN m.title, m.year
ORDER BY m.year ASC
""")



Answer: Based on the provided data, here are the adventure movies released after 2000:

1. Hero (Ying xiong) (2002) - genre: Adventure
2. Indiana Jones and the Last Crusade (2008) - genre: Action (note: this is a re-release, the original release was in 1989)
3. 3:10 to Yuma (2007) - genre: Adventure
4. Yes, Madam (a.k.a. Police Assassins) (a.k.a. In the Line of Duty 2) (Huang gu shi jie) (2000) - genre: Action
5. All Is Lost (2013) - genre: Action
6. The Raid 2: Berandal (2014) - genre: Action
7. Jet Li's Fearless (Huo Yuan Jia) (2006) - genre: Action
8. Knight's Tale, A (2001) - genre: Action

Please note that the list might not be exhaustive, as the provided data is limited.


[{'m.title': 'Spy Kids', 'm.year': 2001},
 {'m.title': 'Joe Dirt', 'm.year': 2001},
 {'m.title': 'Mummy Returns, The', 'm.year': 2001},
 {'m.title': 'Shrek', 'm.year': 2001},
 {'m.title': 'Atlantis: The Lost Empire', 'm.year': 2001},
 {'m.title': 'Lara Croft: Tomb Raider', 'm.year': 2001},
 {'m.title': 'A.I. Artificial Intelligence', 'm.year': 2001},
 {'m.title': 'Final Fantasy: The Spirits Within', 'm.year': 2001},
 {'m.title': 'Jurassic Park III', 'm.year': 2001},
 {'m.title': 'Planet of the Apes', 'm.year': 2001},
 {'m.title': 'Jay and Silent Bob Strike Back', 'm.year': 2001},
 {'m.title': 'Musketeer, The', 'm.year': 2001},
 {'m.title': 'Extreme Days', 'm.year': 2001},
 {'m.title': 'Joy Ride', 'm.year': 2001},
 {'m.title': 'Monsters, Inc.', 'm.year': 2001},
 {'m.title': "Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone)",
  'm.year': 2001},
 {'m.title': 'Black Knight', 'm.year': 2001},
 {'m.title': 'Baran', 'm.year': 2001},
 {'m.title': 'Jimmy N

## Recall@K Checking

In [147]:
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.docstore import InMemoryDocstore

# 1. Prepare documents
documents = [Document(page_content=text) for text in triplet_texts]

# 2. Precompute embeddings using model.encode()
embeddings = model.encode([doc.page_content for doc in documents], normalize_embeddings=True)

# 3. Build FAISS manually
import faiss
import numpy as np

dimension = embeddings.shape[1]  # Example: 384
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings).astype('float32'))

# 4. Create docstore
docstore = InMemoryDocstore({str(i): documents[i] for i in range(len(documents))})
index_to_docstore_id = {i: str(i) for i in range(len(documents))}

# 5. Final FAISS vectorstore
vectorstore = FAISS(
    embedding_function=model.encode,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id
)

# 6. Create retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [148]:
# Ground truth queries and expected answer substrings
ground_truth = [
    {
        "query": "Find adventure movies released after 2000",
        "expected": ["Adventure"]   # Look for word 'Adventure' in the answer triplets
    },
    {
        "query": "Who directed Jurassic Park?",
        "expected": ["Steven Spielberg directed Jurassic Park"]
    },
    {
        "query": "Find movies acted by Tim Allen",
        "expected": ["Tim Allen acted in"]
    },
    {
        "query": "Movies produced in USA",
        "expected": ["was produced in USA"]
    },
    {
        "query": "Find movies rated greater than 4 by any user",
        "expected": ["rated", "4.0"]
    }
]


In [149]:
def evaluate_recall_at_k(ground_truth, retriever, model, k=5):
    total = len(ground_truth)
    hits = 0

    for example in ground_truth:
        query = example["query"]
        expected_answers = example["expected"]

        # Use retriever properly in new LangChain
        results = retriever.get_relevant_documents(query)
        retrieved_sentences = [doc.page_content for doc in results]

        # Check if any expected answer is present
        found = False
        for expected in expected_answers:
            for sentence in retrieved_sentences[:k]:  # manually take top-k
                if expected.lower() in sentence.lower():
                    found = True
                    break
            if found:
                break

        if found:
            hits += 1

    recall_at_k = hits / total
    print(f"✅ Recall@{k}: {recall_at_k:.2f} ({hits}/{total} queries matched)")
    return recall_at_k


In [150]:
evaluate_recall_at_k(
    ground_truth=ground_truth,
    retriever=retriever,    # correct retriever object
    model=model,
    k=5
)


✅ Recall@5: 0.80 (4/5 queries matched)


0.8

# FINAL APPROACH

In [224]:
kg.query("CALL db.labels()")


[{'label': 'Movie'},
 {'label': 'Genre'},
 {'label': 'User'},
 {'label': 'Actor'},
 {'label': 'Director'},
 {'label': 'Person'},
 {'label': '_Bloom_Perspective_'},
 {'label': '_Bloom_Scene_'}]

In [225]:
kg.query("CALL db.relationshipTypes()")


[{'relationshipType': 'IN_GENRE'},
 {'relationshipType': 'RATED'},
 {'relationshipType': 'ACTED_IN'},
 {'relationshipType': 'DIRECTED'},
 {'relationshipType': '_Bloom_HAS_SCENE_'}]

In [226]:
kg.query("CALL db.propertyKeys()")


[{'propertyKey': 'movieId'},
 {'propertyKey': 'name'},
 {'propertyKey': 'userId'},
 {'propertyKey': 'imdbId'},
 {'propertyKey': 'title'},
 {'propertyKey': 'rating'},
 {'propertyKey': 'timestamp'},
 {'propertyKey': 'tmdbId'},
 {'propertyKey': 'year'},
 {'propertyKey': 'countries'},
 {'propertyKey': 'languages'},
 {'propertyKey': 'plot'},
 {'propertyKey': 'imdbRating'},
 {'propertyKey': 'imdbVotes'},
 {'propertyKey': 'released'},
 {'propertyKey': 'runtime'},
 {'propertyKey': 'poster'},
 {'propertyKey': 'revenue'},
 {'propertyKey': 'budget'},
 {'propertyKey': 'tagline'},
 {'propertyKey': 'role'},
 {'propertyKey': 'born'},
 {'propertyKey': 'died'},
 {'propertyKey': 'bornIn'},
 {'propertyKey': 'bio'},
 {'propertyKey': 'url'},
 {'propertyKey': 'plotEmbedding'},
 {'propertyKey': 'posterEmbedding'},
 {'propertyKey': 'id'},
 {'propertyKey': 'data'},
 {'propertyKey': 'nodes'},
 {'propertyKey': 'relationships'},
 {'propertyKey': 'style'},
 {'propertyKey': 'visualisation'},
 {'propertyKey': 'versi

In [227]:
kg.query("""
MATCH (a:Actor)-[r:ACTED_IN]->(m:Movie)
RETURN a.name AS actor, m.title AS movie
LIMIT 10
""")


[{'actor': 'Jim Varney', 'movie': 'Toy Story'},
 {'actor': 'Tim Allen', 'movie': 'Toy Story'},
 {'actor': 'Tom Hanks', 'movie': 'Toy Story'},
 {'actor': 'Don Rickles', 'movie': 'Toy Story'},
 {'actor': 'Robin Williams', 'movie': 'Jumanji'},
 {'actor': 'Bradley Pierce', 'movie': 'Jumanji'},
 {'actor': 'Kirsten Dunst', 'movie': 'Jumanji'},
 {'actor': 'Jonathan Hyde', 'movie': 'Jumanji'},
 {'actor': 'Walter Matthau', 'movie': 'Grumpier Old Men'},
 {'actor': 'Ann-Margret', 'movie': 'Grumpier Old Men'}]

In [228]:
kg.query("""
MATCH (m:Movie)-[r:IN_GENRE]->(g:Genre)
RETURN m.title AS movie, g.name AS genre
LIMIT 10
""")


[{'movie': 'Boxtrolls, The', 'genre': 'Adventure'},
 {'movie': 'The Book of Life', 'genre': 'Adventure'},
 {'movie': 'Teenage Mutant Ninja Turtles', 'genre': 'Adventure'},
 {'movie': 'Jupiter Ascending', 'genre': 'Adventure'},
 {'movie': 'Planes: Fire & Rescue', 'genre': 'Adventure'},
 {'movie': 'Transformers: Age of Extinction', 'genre': 'Adventure'},
 {'movie': 'Hercules', 'genre': 'Adventure'},
 {'movie': 'The Expendables 3', 'genre': 'Adventure'},
 {'movie': 'Guardians of the Galaxy', 'genre': 'Adventure'},
 {'movie': 'Maleficent', 'genre': 'Adventure'}]

In [229]:
kg.query("""
MATCH (u:User)-[r:RATED]->(m:Movie)
RETURN u.userId AS user, m.title AS movie, r.rating AS rating
LIMIT 10
""")


[{'user': '1', 'movie': 'Antz', 'rating': 2.0},
 {'user': '1', 'movie': 'Fly, The', 'rating': 2.5},
 {'user': '1', 'movie': 'Time Bandits', 'rating': 1.0},
 {'user': '1', 'movie': 'Blazing Saddles', 'rating': 3.0},
 {'user': '1', 'movie': 'French Connection, The', 'rating': 4.0},
 {'user': '1', 'movie': 'Tron', 'rating': 4.0},
 {'user': '1', 'movie': 'Gods Must Be Crazy, The', 'rating': 3.0},
 {'user': '1', 'movie': 'Willow', 'rating': 2.0},
 {'user': '1', 'movie': 'Sleepers', 'rating': 3.0},
 {'user': '1', 'movie': 'Escape from New York', 'rating': 2.0}]

In [230]:
kg.query("""
MATCH (d:Director)-[r:DIRECTED]->(m:Movie)
RETURN d.name AS director, m.title AS movie
LIMIT 10
""")


[{'director': 'Harold Lloyd', 'movie': 'Kid Brother, The'},
 {'director': 'Paul Wegener',
  'movie': 'Golem, The (Golem, wie er in die Welt kam, Der)'},
 {'director': 'Buster Keaton', 'movie': 'Cameraman, The'},
 {'director': 'Buster Keaton', 'movie': 'Boat, The'},
 {'director': 'Buster Keaton', 'movie': 'Play House, The'},
 {'director': 'Buster Keaton', 'movie': 'Haunted House, The'},
 {'director': 'Buster Keaton', 'movie': 'College'},
 {'director': 'Buster Keaton', 'movie': 'Steamboat Bill, Jr.'},
 {'director': 'Buster Keaton', 'movie': 'Cops'},
 {'director': 'Buster Keaton', 'movie': 'Navigator, The'}]

#### Pull movies, genres, actors, directors, users

In [231]:
# 1. Movies Basic Info
movies = kg.query("""
MATCH (m:Movie)
RETURN m.title AS title, m.year AS year, m.runtime AS runtime, m.budget AS budget,
       m.revenue AS revenue, m.imdbRating AS imdbRating, m.countries AS countries,
       m.languages AS languages
""")

# 2. Movies and Genres
movie_genres = kg.query("""
MATCH (m:Movie)-[:IN_GENRE]->(g:Genre)
RETURN m.title AS movie, g.name AS genre
""")

# 3. Actors and Movies
acted_in = kg.query("""
MATCH (a:Actor)-[:ACTED_IN]->(m:Movie)
RETURN a.name AS actor, m.title AS movie
""")

# 4. Directors and Movies
directed = kg.query("""
MATCH (d:Director)-[:DIRECTED]->(m:Movie)
RETURN d.name AS director, m.title AS movie
""")

# 5. Users and Ratings
user_ratings = kg.query("""
MATCH (u:User)-[r:RATED]->(m:Movie)
RETURN u.userId AS userId, m.title AS movie, r.rating AS rating
""")

print("All data retrieved!")


All data retrieved!


## Build Triplets from the Retrieved Data

In [232]:
triplets = []

# 1. Movie properties
for row in movies:
    title = row['title']
    if title is None:
        continue
    if row['year'] is not None:
        triplets.append((title, "was released in year", str(int(row['year']))))
    if row['runtime'] is not None:
        triplets.append((title, "has a runtime of", f"{int(row['runtime'])} minutes"))
    if row['budget'] is not None and row['budget'] > 0:
        triplets.append((title, "had a budget of", f"${int(row['budget'])}"))
    if row['revenue'] is not None and row['revenue'] > 0:
        triplets.append((title, "earned revenue of", f"${int(row['revenue'])}"))
    if row['imdbRating'] is not None:
        triplets.append((title, "has IMDb rating", str(row['imdbRating'])))
    if row['countries'] is not None:
        for country in row['countries']:
            triplets.append((title, "was produced in", country))
    if row['languages'] is not None:
        for language in row['languages']:
            triplets.append((title, "is in language", language))

# 2. Genres
for row in movie_genres:
    movie = row['movie']
    genre = row['genre']
    if movie and genre:
        triplets.append((movie, "is in genre", genre))

# 3. Actors
for row in acted_in:
    actor = row['actor']
    movie = row['movie']
    if actor and movie:
        triplets.append((actor, "acted in", movie))

# 4. Directors
for row in directed:
    director = row['director']
    movie = row['movie']
    if director and movie:
        triplets.append((director, "directed", movie))

# 5. User Ratings
for row in user_ratings:
    user = row['userId']
    movie = row['movie']
    rating = row['rating']
    if user and movie and rating is not None:
        triplets.append((f"User {user}", f"rated {movie}", str(rating)))

# Confirm
print(f"Total Triplets Created: {len(triplets)}")
for t in triplets[:10]:
    print(t)


Total Triplets Created: 229894
('Toy Story', 'was released in year', '1995')
('Toy Story', 'has a runtime of', '81 minutes')
('Toy Story', 'had a budget of', '$30000000')
('Toy Story', 'earned revenue of', '$373554033')
('Toy Story', 'has IMDb rating', '8.3')
('Toy Story', 'was produced in', 'USA')
('Toy Story', 'is in language', 'English')
('Jumanji', 'was released in year', '1995')
('Jumanji', 'has a runtime of', '104 minutes')
('Jumanji', 'had a budget of', '$65000000')


#### Turn these triplets into plain English sentences

In [233]:
triplet_sentences = []
for subj, rel, obj in triplets:
    sentence = f"{subj} {rel} {obj}"
    triplet_sentences.append(sentence)

print(f"Total Sentences: {len(triplet_sentences)}")
for s in triplet_sentences[:10]:
    print(s)


Total Sentences: 229894
Toy Story was released in year 1995
Toy Story has a runtime of 81 minutes
Toy Story had a budget of $30000000
Toy Story earned revenue of $373554033
Toy Story has IMDb rating 8.3
Toy Story was produced in USA
Toy Story is in language English
Jumanji was released in year 1995
Jumanji has a runtime of 104 minutes
Jumanji had a budget of $65000000


## Encode Sentences into Embeddings

In [234]:
from sentence_transformers import SentenceTransformer

# Load local sentence embedding model
encoder_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all sentences
triplet_embeddings = encoder_model.encode(triplet_sentences, batch_size=64, show_progress_bar=True)

print(f"Shape of embeddings: {triplet_embeddings.shape}")


Batches: 100%|██████████| 3593/3593 [01:08<00:00, 52.76it/s]


Shape of embeddings: (229894, 384)


## Create the FAISS index

In [235]:
import faiss
import numpy as np

dimension = triplet_embeddings.shape[1]  # 384
index = faiss.IndexFlatL2(dimension)
index.add(np.array(triplet_embeddings).astype('float32'))

print(f"FAISS Index Created. Total vectors: {index.ntotal}")


FAISS Index Created. Total vectors: 229894


## Build a simple Retriever function to search inside FAISS

In [236]:
# Define retrieval function
def retrieve_triplets(query, model, index, sentences, top_k=20):
    query_embedding = model.encode([query])
    D, I = index.search(np.array(query_embedding).astype('float32'), top_k)
    results = []
    for idx, dist in zip(I[0], D[0]):
        if idx < len(sentences):
            results.append((sentences[idx], dist))
    return results


#### Updated retrieval function with re-ranking

In [237]:
from sentence_transformers import util  # Make sure you import this!

model = SentenceTransformer('all-MiniLM-L6-v2')

def retrieve_triplets(query, model, index, sentences, top_k=20):
    # Step 1: Encode the query
    query_embedding = model.encode([query])

    # Step 2: Search FAISS index
    D, I = index.search(np.array(query_embedding).astype('float32'), top_k)

    # Step 3: Collect initial results
    initial_results = []
    for idx, dist in zip(I[0], D[0]):
        if idx < len(sentences):
            initial_results.append((sentences[idx], dist))
    
    # Step 4: Re-rank based on cosine similarity (optional but improves quality)
    if initial_results:
        triplet_texts = [r[0] for r in initial_results]
        triplet_embeddings = model.encode(triplet_texts)
        similarities = util.cos_sim(query_embedding, triplet_embeddings)[0]  # shape: (top_k,)
        
        # Pair triplets with similarity and sort
        triplet_similarity_pairs = list(zip(triplet_texts, similarities.tolist()))
        triplet_similarity_pairs.sort(key=lambda x: x[1], reverse=True)  # Higher similarity first
        
        reranked_results = [(t, s) for t, s in triplet_similarity_pairs]
        return reranked_results

    return initial_results


#### test the retriever

In [238]:
# Sample queries to test
sample_queries = [
    "Movies released after 2010",
    "Movies directed by Steven Spielberg",
    "Movies in English language",
    "Movies acted by Tom Hanks",
    "Adventure movies produced in USA",
    "Movies rated 5.0 by User 1"
]

# Test retrieval
for query in sample_queries:
    print(f"\nQuery: {query}")
    results = retrieve_triplets(query, model, index, triplet_sentences, top_k=10)
    for sentence, dist in results:
        print(f" - {sentence} (Distance: {dist:.4f})")



Query: Movies released after 2010
 - Another Year was released in year 2010 (Distance: 0.7014)
 - 2012 was released in year 2009 (Distance: 0.6826)
 - Other Guys, The was released in year 2010 (Distance: 0.6793)
 - Role/Play was released in year 2010 (Distance: 0.6672)
 - Next Three Days, The was released in year 2010 (Distance: 0.6546)
 - 10 Years was released in year 2011 (Distance: 0.6462)
 - Tourist, The was released in year 2010 (Distance: 0.6450)
 - Trip, The was released in year 2010 (Distance: 0.6315)
 - American, The was released in year 2010 (Distance: 0.6252)
 - Losers, The was released in year 2010 (Distance: 0.6178)

Query: Movies directed by Steven Spielberg
 - Steven Spielberg directed 1941 (Distance: 0.8560)
 - Steven Spielberg directed Munich (Distance: 0.7906)
 - Steven Spielberg directed Always (Distance: 0.7799)
 - Steven Spielberg directed Duel (Distance: 0.7725)
 - Steven Spielberg directed Jurassic Park (Distance: 0.7662)
 - Steven Spielberg directed Lincoln (Di

In [282]:
# Sample queries to test
sample_queries = [
    "Movies released after 2010",
    "Movies directed by Steven Spielberg",
    "Movies in English language",
    "Movies acted by Tom Hanks",
    "Adventure movies produced in USA",
    "Movies rated 5.0 by User 1"
]

# Test retrieval
for query in sample_queries:
    print(f"\nQuery: {query}")
    results = retrieve_triplets(query, model, index, triplet_sentences, top_k=5)
    for sentence, dist in results:
        print(f" - {sentence} (Distance: {dist:.4f})")



Query: Movies released after 2010
 - Another Year was released in year 2010 (Distance: 0.7014)
 - 2012 was released in year 2009 (Distance: 0.6826)
 - Other Guys, The was released in year 2010 (Distance: 0.6793)
 - Role/Play was released in year 2010 (Distance: 0.6672)
 - Next Three Days, The was released in year 2010 (Distance: 0.6546)

Query: Movies directed by Steven Spielberg
 - Steven Spielberg directed 1941 (Distance: 0.8560)
 - Steven Spielberg directed Munich (Distance: 0.7906)
 - Steven Spielberg directed Always (Distance: 0.7799)
 - Steven Spielberg directed Duel (Distance: 0.7725)
 - Steven Spielberg directed Jurassic Park (Distance: 0.7662)

Query: Movies in English language
 - American Movie is in language English (Distance: 0.7894)
 - Date Movie is in language English (Distance: 0.7840)
 - Left Behind: The Movie is in language English (Distance: 0.7616)
 - Inbetweeners Movie, The is in language English (Distance: 0.7587)
 - Hello Ladies: The Movie is in language English 

#### RAG CHAIN

test new method

In [239]:
from langchain_groq import ChatGroq
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

import faiss
import numpy as np

# 1. Setup LLM
llm = ChatGroq(
    temperature=0,
    model_name="llama3-70b-8192",
    groq_api_key="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"
)

# 2. Build FAISS
dimension = triplet_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(triplet_embeddings).astype('float32'))

index_to_docstore_id = {i: str(i) for i in range(len(triplet_sentences))}
docstore = InMemoryDocstore({str(i): Document(page_content=triplet_sentences[i]) for i in range(len(triplet_sentences))})
vectorstore = FAISS(embedding_function=model.encode, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id)

retriever = vectorstore.as_retriever(search_kwargs={"k": 500})

# 3. Build Prompt
prompt_template = PromptTemplate.from_template("""
You are a movie recommendation expert.

Facts:
{context}

Question:
{question}

Answer:
""")

# 4. Build RetrievalQA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template}
)



`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [111]:
# 5. Query Example
query = "List adventure movies released after 2000"
result = qa_chain.invoke({"query": query})
print("Answer:", result['result'])


Answer: Here is the list of adventure movies released after 2000:

1. Jurassic Park III (2001)
2. The Lord of the Rings: The Two Towers (2002)
3. The Lord of the Rings: The Return of the King (2003)
4. Pirates of the Caribbean: The Curse of the Black Pearl (2003)
5. Shrek 2 (2004)
6. The Incredibles (2004)
7. Batman Begins (2005)
8. Superman Returns (2006)
9. Pirates of the Caribbean: Dead Man's Chest (2006)
10. Casino Royale (2006)
11. The Bourne Ultimatum (2007)
12. National Treasure: Book of Secrets (2007)
13. Indiana Jones and the Kingdom of the Crystal Skull (2008)
14. The Dark Knight (2008)
15. Kung Fu Panda (2008)
16. Madagascar: Escape 2 Africa (2008)
17. Avatar (2009)
18. The A-Team (2010)
19. The Last Airbender (2010)
20. Pirates of the Caribbean: On Stranger Tides (2011)
21. The Adventures of Tintin (2011)
22. The Avengers (2012)
23. The Dark Knight Rises (2012)
24. The Hobbit: An Unexpected Journey (2012)
25. Iron Man 3 (2013)
26. The Hunger Games: Catching Fire (2013)
27. 

In [112]:
result = kg.query("""
MATCH (m:Movie)-[:IN_GENRE]->(g:Genre)
WHERE g.name = "Adventure" AND m.year > 2000
RETURN m.title AS title, m.year AS year
ORDER BY m.year ASC
""")
result

[{'title': 'Spy Kids', 'year': 2001},
 {'title': 'Joe Dirt', 'year': 2001},
 {'title': 'Mummy Returns, The', 'year': 2001},
 {'title': 'Shrek', 'year': 2001},
 {'title': 'Atlantis: The Lost Empire', 'year': 2001},
 {'title': 'Lara Croft: Tomb Raider', 'year': 2001},
 {'title': 'A.I. Artificial Intelligence', 'year': 2001},
 {'title': 'Final Fantasy: The Spirits Within', 'year': 2001},
 {'title': 'Jurassic Park III', 'year': 2001},
 {'title': 'Planet of the Apes', 'year': 2001},
 {'title': 'Jay and Silent Bob Strike Back', 'year': 2001},
 {'title': 'Musketeer, The', 'year': 2001},
 {'title': 'Extreme Days', 'year': 2001},
 {'title': 'Joy Ride', 'year': 2001},
 {'title': 'Monsters, Inc.', 'year': 2001},
 {'title': "Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone)",
  'year': 2001},
 {'title': 'Black Knight', 'year': 2001},
 {'title': 'Baran', 'year': 2001},
 {'title': 'Jimmy Neutron: Boy Genius', 'year': 2001},
 {'title': 'Lord of the Rings: The Fel

In [115]:
# 5. Query Example
query = "Find movies acted by Leonardo DiCaprio released after 2000."
result = qa_chain.invoke({"query": query})
print("Answer:", result['result'])


Answer: Based on the provided facts, here are the movies acted by Leonardo DiCaprio released after 2000:

1. Gangs of New York (2002)
2. Catch Me If You Can (2002)
3. The Aviator (2004)
4. Blood Diamond (2006)
5. The Departed (2006)
6. Revolutionary Road (2008)
7. Inception (2010)
8. Django Unchained (2012)
9. The Wolf of Wall Street (2013)
10. The Revenant (2015)

Note: The list only includes movies released after 2000, as per the question.


In [116]:
result = kg.query("""
MATCH (p:Person)-[:ACTED_IN]->(m:Movie)
WHERE p.name = "Leonardo DiCaprio" AND m.year > 2000
RETURN m.title AS title, m.year AS year
ORDER BY m.year ASC


""")
result

[{'title': 'Gangs of New York', 'year': 2002},
 {'title': 'Catch Me If You Can', 'year': 2002},
 {'title': 'Aviator, The', 'year': 2004},
 {'title': 'Departed, The', 'year': 2006},
 {'title': 'Blood Diamond', 'year': 2006},
 {'title': 'Body of Lies', 'year': 2008},
 {'title': 'Revolutionary Road', 'year': 2008},
 {'title': 'Shutter Island', 'year': 2010},
 {'title': 'Inception', 'year': 2010},
 {'title': 'J. Edgar', 'year': 2011},
 {'title': 'Django Unchained', 'year': 2012},
 {'title': 'Great Gatsby, The', 'year': 2013},
 {'title': 'Wolf of Wall Street, The', 'year': 2013},
 {'title': 'The Revenant', 'year': 2015}]

In [117]:
# 5. Query Example
query = "Find movies rated 5.0 by User 4."
result = qa_chain.invoke({"query": query})
print("Answer:", result['result'])


Answer: Based on the provided data, User 4 has rated the following movies as 5.0:

1. The Godfather
2. Aliens
3. Big
4. Pulp Fiction
5. Only You
6. Goodfellas
7. Die Hard
8. Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark)
9. Aladdin
10. Say Anything...
11. The NeverEnding Story
12. Psycho
13. Star Wars: Episode I - The Phantom Menace


In [118]:
result = kg.query("""
MATCH (u:User)-[r:RATED]->(m:Movie)
WHERE u.userId = "4" AND r.rating = 5.0
RETURN m.title AS title, r.rating AS rating
ORDER BY m.title



""")
result

[{'title': '101 Dalmatians (One Hundred and One Dalmatians)', 'rating': 5.0},
 {'title': 'Abyss, The', 'rating': 5.0},
 {'title': 'Agnes of God', 'rating': 5.0},
 {'title': 'Airplane!', 'rating': 5.0},
 {'title': 'Aladdin', 'rating': 5.0},
 {'title': 'Alice in Wonderland', 'rating': 5.0},
 {'title': 'Alien', 'rating': 5.0},
 {'title': 'Aliens', 'rating': 5.0},
 {'title': 'Amadeus', 'rating': 5.0},
 {'title': 'Annie Hall', 'rating': 5.0},
 {'title': 'Apocalypse Now', 'rating': 5.0},
 {'title': 'Aristocats, The', 'rating': 5.0},
 {'title': 'Babe', 'rating': 5.0},
 {'title': 'Back to the Future', 'rating': 5.0},
 {'title': 'Bambi', 'rating': 5.0},
 {'title': 'Bedknobs and Broomsticks', 'rating': 5.0},
 {'title': 'Beetlejuice', 'rating': 5.0},
 {'title': 'Better Off Dead...', 'rating': 5.0},
 {'title': 'Big', 'rating': 5.0},
 {'title': 'Birdcage, The', 'rating': 5.0},
 {'title': 'Blade Runner', 'rating': 5.0},
 {'title': 'Blob, The', 'rating': 5.0},
 {'title': 'Blues Brothers, The', 'ratin

In [44]:
# 5. Query Example
query = "Can you recommend me a recent movie with Adventure genre and rating greater than 5"
result = qa_chain.invoke({"query": query})
print("Answer:", result['result'])


Answer: Based on the data, I recommend "Wild Tales" (2014) which is an Adventure movie with a rating of 8.1.


In [52]:
result = kg.query("""
MATCH (m:Movie)-[:IN_GENRE]->(g:Genre)
MATCH (u:User)-[r:RATED]->(m)
WHERE g.name = "Adventure" AND r.rating >= 5.0
RETURN m.title AS title, m.year AS year, r.rating AS rating
ORDER BY m.year DESC, r.rating DESC


""")
result

[{'title': 'Cowboy Bebop', 'year': None, 'rating': 5.0},
 {'title': 'Cowboy Bebop', 'year': None, 'rating': 5.0},
 {'title': 'Day of the Doctor, The', 'year': None, 'rating': 5.0},
 {'title': 'Into the Woods', 'year': None, 'rating': 5.0},
 {'title': 'Clockstoppers', 'year': None, 'rating': 5.0},
 {'title': 'Ice Age: The Great Egg-Scapade', 'year': 2016, 'rating': 5.0},
 {'title': 'Zootopia', 'year': 2016, 'rating': 5.0},
 {'title': 'Zootopia', 'year': 2016, 'rating': 5.0},
 {'title': 'Zootopia', 'year': 2016, 'rating': 5.0},
 {'title': 'Finding Dory', 'year': 2016, 'rating': 5.0},
 {'title': 'Kung Fu Panda 3', 'year': 2016, 'rating': 5.0},
 {'title': 'Warcraft', 'year': 2016, 'rating': 5.0},
 {'title': 'Deadpool', 'year': 2016, 'rating': 5.0},
 {'title': 'Deadpool', 'year': 2016, 'rating': 5.0},
 {'title': 'Deadpool', 'year': 2016, 'rating': 5.0},
 {'title': 'Deadpool', 'year': 2016, 'rating': 5.0},
 {'title': 'Peanuts Movie, The', 'year': 2015, 'rating': 5.0},
 {'title': 'The Man fro

In [120]:
# 5. Query Example
query = "Can you recommend me a recent movie with Adventure genre and rating greater than 3 with actor leonardo dicaprio"
result = qa_chain.invoke({"query": query})
print("Answer:", result['result'])


Answer: Based on the provided data, I can recommend a recent movie with the Adventure genre and a rating greater than 3 that features Leonardo DiCaprio.

The movie is "The Revenant" (2015), which has an IMDb rating of 8.0 and belongs to the Adventure genre. Leonardo DiCaprio stars in the film, which was directed by Alejandro G. Iñárritu.

Please note that this recommendation is based on the provided data and might not reflect your personal preferences or opinions.


In [122]:
result = kg.query("""
MATCH (a:Actor)-[:ACTED_IN]->(m:Movie)-[:IN_GENRE]->(g:Genre)
MATCH (u:User)-[r:RATED]->(m)
WHERE g.name = "Adventure"
  AND toLower(a.name) CONTAINS "leonardo dicaprio"
  AND r.rating > 3
RETURN DISTINCT m.title AS title, m.year AS year
ORDER BY m.year DESC
""")
result


[{'title': 'The Revenant', 'year': 2015},
 {'title': 'Blood Diamond', 'year': 2006},
 {'title': 'Beach, The', 'year': 2000},
 {'title': 'Man in the Iron Mask, The', 'year': 1998}]

In [132]:
result = kg.query("""
MATCH (m:Movie)-[:IN_GENRE]->(g:Genre)
        WHERE g.name = "Comedy"
        RETURN m.title AS title
                  """)
result

[{'title': 'Two Night Stand'},
 {'title': 'Stretch'},
 {'title': 'Boxtrolls, The'},
 {'title': 'This Is Where I Leave You'},
 {'title': 'Tusk'},
 {'title': 'St. Vincent'},
 {'title': 'Rewrite, The'},
 {'title': 'Big Hero 6'},
 {'title': 'What We Do in the Shadows'},
 {'title': "Let's Be Cops"},
 {'title': 'Inbetweeners 2, The'},
 {'title': 'Housebound'},
 {'title': 'Magic in the Moonlight'},
 {'title': 'Teenage Mutant Ninja Turtles'},
 {'title': 'The Hundred-Foot Journey'},
 {'title': 'Dim Sum: A Little Bit of Heart'},
 {'title': 'Bambi Meets Godzilla'},
 {'title': 'Laggies'},
 {'title': 'One I Love, The'},
 {'title': 'Pride'},
 {'title': 'Willie & Phil'},
 {'title': 'Planes: Fire & Rescue'},
 {'title': 'Tammy'},
 {'title': 'Frank'},
 {'title': 'They Came Together'},
 {'title': 'Think Like a Man Too'},
 {'title': 'Trip to Italy, The'},
 {'title': 'Words and Pictures'},
 {'title': 'Sex Tape'},
 {'title': 'Premature'},
 {'title': 'And So It Goes'},
 {'title': 'Zombeavers'},
 {'title': 'A

In [133]:
# 5. Query Example
query = "Find comedy movies produced in USA"
result = qa_chain.invoke({"query": query})
print("Answer:", result['result'])


Answer: Here are the comedy movies produced in the USA:

1. Airplane!
2. American Pie
3. American Pie 2
4. American Reunion (American Pie 4)
5. American Splendor
6. American Wedding (American Pie 3)
7. Analyze That
8. Another Stakeout
9. Are We There Yet?
10. Around the World in 80 Days
11. Associate, The
12. Author! Author!
13. Bad Company
14. Barbershop
15. Beavis and Butt-Head Do America
16. Beerfest
17. Benchwarmers, The
18. Bewitched
19. Big Hit, The
20. Big Lebowski, The
21. Big Picture, The
22. Black Dynamite
23. Blue Collar Comedy Tour: The Movie
24. Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan
25. Broadcast News
26. Bucket List, The
27. Bull Durham
28. Bunny and the Bull
29. Butch Cassidy and the Sundance Kid
30. Cage aux Folles, La
31. California Split
32. Campaign, The
33. Career Opportunities
34. Cars
35. Cast Away
36. Catch and Release
37. Cedar Rapids
38. Celebrity
39. Chances Are
40. Chicago
41. Cops and Robbersons
42. Crash
43. Cro

# VALIDATION AND ACCURACY OF MODEL

#### Sample 5 queries

In [271]:
sample_queries = [
    "List adventure movies released after 2000",
    "Find movies acted by Leonardo DiCaprio released after 2000"
    ]


#### RAG Predictions (LLM output)

In [272]:
rag_results = {}

for query in sample_queries:
    print(f"\nQuery: {query}")
    result = qa_chain.invoke({"query": query})
    print("RAG Answer:", result['result'])
    rag_results[query] = result['result']



Query: List adventure movies released after 2000
RAG Answer: Here is the list of adventure movies released after 2000:

1. Jurassic Park III (2001)
2. The Lord of the Rings: The Two Towers (2002)
3. The Lord of the Rings: The Return of the King (2003)
4. Pirates of the Caribbean: The Curse of the Black Pearl (2003)
5. Shrek 2 (2004)
6. The Incredibles (2004)
7. Batman Begins (2005)
8. Superman Returns (2006)
9. Pirates of the Caribbean: Dead Man's Chest (2006)
10. Casino Royale (2006)
11. The Bourne Ultimatum (2007)
12. National Treasure: Book of Secrets (2007)
13. Indiana Jones and the Kingdom of the Crystal Skull (2008)
14. The Dark Knight (2008)
15. Kung Fu Panda (2008)
16. Madagascar: Escape 2 Africa (2008)
17. Avatar (2009)
18. The A-Team (2010)
19. The Last Airbender (2010)
20. Pirates of the Caribbean: On Stranger Tides (2011)
21. The Adventures of Tintin (2011)
22. The Avengers (2012)
23. The Dark Knight Rises (2012)
24. The Hobbit: An Unexpected Journey (2012)
25. Iron Man 3 

#### Manual KG Queries

In [275]:
kg_queries = {
    "List adventure movies released after 2000": """
        MATCH (m:Movie)-[:IN_GENRE]->(g:Genre)
        WHERE g.name = "Adventure" AND m.year > 2000
        RETURN m.title AS title
    """,
    "Find movies acted by Leonardo DiCaprio released after 2000": """
        MATCH (p:Person)-[:ACTED_IN]->(m:Movie)
        WHERE p.name = "Leonardo DiCaprio" AND m.year > 2000
        RETURN m.title AS title, m.year AS year
        ORDER BY m.year ASC
    """
}



ground_truth_results = {}

for query, cypher in kg_queries.items():
    output = kg.query(cypher)
    titles = [item['title'] for item in output]
    ground_truth_results[query] = titles


#### Normalize function

In [276]:
import re

def normalize_titles(title_list):
    normalized = []
    for title in title_list:
        title = title.lower()
        title = re.sub(r'[^a-z0-9 ]', '', title)  # Remove punctuations
        title = title.strip()
        normalized.append(title)
    return normalized


#### Extraction helper (parse LLM text)

In [277]:
def extract_titles_from_rag_output(text):
    lines = text.split('\n')
    titles = []
    for line in lines:
        match = re.search(r"\d+\.\s*(.+?)\s*(\(|$)", line.strip())
        if match:
            titles.append(match.group(1))
    return titles


#### Evaluation Metrics

In [278]:
# Update your metric function first
def compute_precision_recall_f1(predictions, ground_truth, k=10):
    pred_set = set(predictions[:k])
    gt_set = set(ground_truth)

    intersection = pred_set.intersection(gt_set)
    
    precision = len(intersection) / k if k > 0 else 0
    recall = len(intersection) / len(gt_set) if len(gt_set) > 0 else 0

    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)

    return precision, recall, f1


In [193]:
for query in sample_queries:
    print(f"\nEvaluating Query: {query}")

    rag_titles = extract_titles_from_rag_output(rag_results[query])
    rag_titles_norm = normalize_titles(rag_titles)

    # Check if ground truth available
    if query not in ground_truth_results:
        print("Ground truth not available for this query. Skipping...")
        continue

    ground_truth_titles_norm = normalize_titles(ground_truth_results[query])

    precision, recall, f1 = compute_precision_recall_f1(
        rag_titles_norm, ground_truth_titles_norm, k=10
    )

    print(f"Precision@10: {precision:.2f}")
    print(f"Recall@10: {recall:.2f}")
    print(f"F1-score@10: {f1:.2f}")



Evaluating Query: List adventure movies released after 2000
Precision@10: 0.60
Recall@10: 0.01
F1-score@10: 0.02

Evaluating Query: Find movies acted by Leonardo DiCaprio released after 2000
Precision@10: 0.70
Recall@10: 0.50
F1-score@10: 0.58


In [194]:
# Initialize accumulators
total_precision = 0
total_recall = 0
total_f1 = 0
count = 0  # Number of queries evaluated

for query in sample_queries:
    print(f"\nEvaluating Query: {query}")

    rag_titles = extract_titles_from_rag_output(rag_results[query])
    rag_titles_norm = normalize_titles(rag_titles)

    if query not in ground_truth_results:
        print("Ground truth not available for this query. Skipping...")
        continue

    ground_truth_titles_norm = normalize_titles(ground_truth_results[query])

    precision, recall, f1 = compute_precision_recall_f1(
        rag_titles_norm, ground_truth_titles_norm, k=10
    )

    print(f"Precision@10: {precision:.2f}")
    print(f"Recall@10: {recall:.2f}")
    print(f"F1-score@10: {f1:.2f}")

    total_precision += precision
    total_recall += recall
    total_f1 += f1
    count += 1

# Compute overall averages
if count > 0:
    avg_precision = total_precision / count
    avg_recall = total_recall / count
    avg_f1 = total_f1 / count

    print("\n===========================")
    print(f"Overall RAG System Performance ")
    print(f"Average Precision@10: {avg_precision:.2f}")
    print(f"Average Recall@10: {avg_recall:.2f}")
    print(f"Average F1-score@10: {avg_f1:.2f}")
    print("===========================")
else:
    print("\nNo queries evaluated!")



Evaluating Query: List adventure movies released after 2000
Precision@10: 0.60
Recall@10: 0.01
F1-score@10: 0.02

Evaluating Query: Find movies acted by Leonardo DiCaprio released after 2000
Precision@10: 0.70
Recall@10: 0.50
F1-score@10: 0.58

Overall RAG System Performance 
Average Precision@10: 0.65
Average Recall@10: 0.26
Average F1-score@10: 0.30
