In [12]:
import os
import neo4j
from langchain_community.graphs import Neo4jGraph
import pandas as pd

from dotenv import load_dotenv

load_dotenv(override=True)

True

In [13]:
#########################################################################
#       Neo4j instance 
#########################################################################
print(f' ###  neo4j initializing ### ')

NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_URL = NEO4J_URI
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE')

embed_dim=384
embedding_dimension=embed_dim

print(f"Connecting to Neo4j DBMS at {NEO4J_URL} as {NEO4J_USERNAME}")
graph = Neo4jGraph(url=NEO4J_URL, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

graph.refresh_schema()

graph.get_schema


 ###  neo4j initializing ### 
Connecting to Neo4j DBMS at neo4j://localhost:7687 as neo4j


'Node properties:\nMovie {id: INTEGER, released: DATE, title: STRING, tagline: STRING, imdbRating: FLOAT}\nPerson {name: STRING}\nGenre {name: STRING}\nLocation {name: STRING}\nSimilarMovie {name: STRING}\nRelationship properties:\n\nThe relationships:\n(:Movie)-[:IN_GENRE]->(:Genre)\n(:Movie)-[:WAS_TAKEN_IN]->(:Location)\n(:Movie)-[:IS_SIMILAR_TO]->(:SimilarMovie)\n(:Person)-[:DIRECTED]->(:Movie)\n(:Person)-[:ACTED_IN]->(:Movie)'

In [14]:
#########################################################################
#       Cleanup and start from scratch
#########################################################################
# Delete everything in a database
cypher = """
MATCH (n) DETACH DELETE n
"""
graph.query(cypher)

print("## Existing graph schema...")
print(graph.schema)

print("Deleting all nodes...")
# Match all nodes in the graph
cypher = """
  MATCH (n)
  RETURN count (n)
  """
result = graph.query(cypher)

print("Dropping all constraints...")
for constraint in graph.query('SHOW CONSTRAINTS'):
    graph.query(f"DROP CONSTRAINT {constraint['name']}")

print("Dropping all indexes...")
for index in graph.query('SHOW INDEXES'):
    print(f"Removing index {index['name']}:")
    graph.query(f"""
        DROP INDEX `{index['name']}`
    """)

print()
print("## Blank schema...")
graph.refresh_schema()
print(graph.schema)

## Existing graph schema...
Node properties:
Movie {id: INTEGER, released: DATE, title: STRING, tagline: STRING, imdbRating: FLOAT}
Person {name: STRING}
Genre {name: STRING}
Location {name: STRING}
SimilarMovie {name: STRING}
Relationship properties:

The relationships:
(:Movie)-[:IN_GENRE]->(:Genre)
(:Movie)-[:WAS_TAKEN_IN]->(:Location)
(:Movie)-[:IS_SIMILAR_TO]->(:SimilarMovie)
(:Person)-[:DIRECTED]->(:Movie)
(:Person)-[:ACTED_IN]->(:Movie)
Deleting all nodes...
Dropping all constraints...
Dropping all indexes...
Removing index movie_tagline_embeddings:

## Blank schema...
Node properties:

Relationship properties:

The relationships:



In [15]:
#########################################################################
#   Import graph
#########################################################################


# Import a test movie database csv file only 20 rows for now 
df = pd.read_csv("https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/movies/movies_small.csv", nrows=20)
print(df.shape)
print(df.columns)


# the movie database has only movie details.  Will add 
# Description 
# Location movies were made 
# Similar movies 
# Generate some taglines using chatgot:

taglines = ["The adventure life of toys takes off!",
"Roll the dice and unleash the excitement!",
"Still Yelling. Still Fighting. Still Ready for Love.",
"Friends are the people who let you be yourself... and never let you forget it.",
"Just When His World Is Back To Normal... He's In For The Surprise Of His Life!",
"A Los Angeles crime saga",
"You are cordially invited to the most surprising merger of the year.",
"The Original Bad Boys.",
"Terror goes into overtime.",
"No limits. No fears. No substitutes.",
"Why can't the most powerful man in the world have the one thing he wants most?",
"Give blood...a whole new meaning.",
"Part Dog. Part Wolf. All Hero.",
"He had greatness within his grasp.",
"The Course Has Been Set. There Is No Turning Back. Prepare Your Weapons. Summon Your Courage. Discover the Adventure of a Lifetime!",
"No one stays at the top forever.",
"Lose your heart and come to your senses.",
"Twelve outrageous guests. Four scandalous requests. And one lone bellhop, in his first day on the job, who's in for the wildest New year's Eve of his life.",
"New animals. New adventures. Same hair.",
"Get on, or GET OUT THE WAY!"]

location = ["United States", "United States", "United States", "United States", "United States",
           "United States", "United States", "United States", "United States", "United Kingdom",
           "United States", "United States", "United States", "United States", "Malta",
           "United States", "United Kingdom", "United States", "United States", "United States"]

similar_movie = ["Finding Nemo", "Jumanji: Welcome to the Jungle", "The Bucket List", "The Best Man Holiday", "Cheaper by the Dozen",
                 "The Departed", "Notting Hill", "The Adventures of Huck Finn", "Die Hard", "Mission Impossible",
                 "Dave", "Dead and Loving It: Young Frankenstein", "Spirit: Stallion of the Cimarron", "JFK", "Pirates of the Caribbean: The Curse of the Black Pearl",
                 "Goodfellas", "Pride and Prejudice", "Pulp Fiction", "The Mask", "Speed"
                 ]

# Add this to df
df["similar_movie"] = similar_movie
df["tagline"]       = taglines
df["location"]      = location
# df.to_csv('/home/saraghava/moviebot/sample_data/movie.csv')
print(f' ########################33 stage3 ')

(20, 7)
Index(['movieId', 'released', 'title', 'actors', 'director', 'genres',
       'imdbRating'],
      dtype='object')
 ########################33 stage3 


In [16]:
df.to_dict(orient='records')

[{'movieId': 1,
  'released': '1995-11-22',
  'title': 'Toy Story',
  'actors': 'Jim Varney|Tim Allen|Tom Hanks|Don Rickles',
  'director': 'John Lasseter',
  'genres': 'Adventure|Animation|Children|Comedy|Fantasy',
  'imdbRating': 8.3,
  'similar_movie': 'Finding Nemo',
  'tagline': 'The adventure life of toys takes off!',
  'location': 'United States'},
 {'movieId': 2,
  'released': '1995-12-15',
  'title': 'Jumanji',
  'actors': 'Robin Williams|Bradley Pierce|Kirsten Dunst|Jonathan Hyde',
  'director': 'Joe Johnston',
  'genres': 'Adventure|Children|Fantasy',
  'imdbRating': 6.9,
  'similar_movie': 'Jumanji: Welcome to the Jungle',
  'tagline': 'Roll the dice and unleash the excitement!',
  'location': 'United States'},
 {'movieId': 3,
  'released': '1995-12-22',
  'title': 'Grumpier Old Men',
  'actors': 'Walter Matthau|Ann-Margret|Jack Lemmon|Sophia Loren',
  'director': 'Howard Deutch',
  'genres': 'Comedy|Romance',
  'imdbRating': 6.6,
  'similar_movie': 'The Bucket List',
  'ta

### Use a query parameter

Here we pass in the dataframe converted into a list of records as a query parameter named 'movies'.

The Cypher `UNWIND` will turn break the array into a series of rows named `row`.
We return the `row.actors` to illustrate.

In [17]:
graph.query(f"""
            UNWIND $movies AS row
            RETURN row.actors
            """, 
            params={'movies':df.to_dict(orient='records')})

[{'row.actors': 'Jim Varney|Tim Allen|Tom Hanks|Don Rickles'},
 {'row.actors': 'Robin Williams|Bradley Pierce|Kirsten Dunst|Jonathan Hyde'},
 {'row.actors': 'Walter Matthau|Ann-Margret|Jack Lemmon|Sophia Loren'},
 {'row.actors': 'Whitney Houston|Lela Rochon|Angela Bassett|Loretta Devine'},
 {'row.actors': 'Steve Martin|Kimberly Williams-Paisley|Diane Keaton|Martin Short'},
 {'row.actors': 'Al Pacino|Robert De Niro|Val Kilmer|Jon Voight'},
 {'row.actors': 'Julia Ormond|Harrison Ford|Nancy Marchand|Greg Kinnear'},
 {'row.actors': 'Jonathan Taylor Thomas|Brad Renfro|Eric Schweig|Charles Rocket'},
 {'row.actors': 'Jean-Claude Van Damme|Powers Boothe|Raymond J. Barry|Whittni Wright'},
 {'row.actors': 'Pierce Brosnan|Famke Janssen|Sean Bean|Izabella Scorupco'},
 {'row.actors': 'Martin Sheen|Michael J. Fox|Michael Douglas|Annette Bening'},
 {'row.actors': 'Peter MacNicol|Leslie Nielsen|Steven Weber|Amy Yasbeck'},
 {'row.actors': 'Kevin Bacon|Bob Hoskins|Jim Cummings|Bridget Fonda'},
 {'row.ac

In [18]:
######################################################################################################
##the above pd frame needs to be copied into the correct path for neo4j to import
##### ORiginal query local pointer did not work

# graph.query("""
# LOAD CSV WITH HEADERS FROM  'file:///opt/neo4j/import/movie.csv'   // Load CSV data from a file specified by $movie_directory
# AS row                                                      // Each row in the CSV will be represented as 'row'

## ABK - instead, the dataframe is passed as a query parameter and directly loaded to Neo4j...
graph.query("""
UNWIND $movies AS row                                       // Each row in the CSV will be represented as 'row'

MERGE (m:Movie {id:row.movieId})                            // Merge a Movie node with the id from the row
SET m.released = date(row.released),                        // Set the 'released' property of the Movie node to the date from the row
    m.title = row.title,                                    // Set the 'title' property of the Movie node to the title from the row
    m.tagline = row.tagline,                                // Set the 'tagline' property of the Movie node to the tagline from the row
    m.imdbRating = toFloat(row.imdbRating)                  // Convert the 'imdbRating' from string to float and set it as the property

FOREACH (director in split(row.director, '|') |             // For each director in the list of directors from the row (split by '|')
    MERGE (p:Person {name:trim(director)})                  // Merge a Person node with the director's name from the row, trimming any extra spaces
    MERGE (p)-[:DIRECTED]->(m))                             // Create a DIRECTED relationship from the director to the Movie

FOREACH (actor in split(row.actors, '|') |                  // For each actor in the list of actors from the row (split by '|')
    MERGE (p:Person {name:trim(actor)})                     // Merge a Person node with the actor's name from the row, trimming any extra spaces
    MERGE (p)-[:ACTED_IN]->(m))                             // Create an ACTED_IN relationship from the actor to the Movie

FOREACH (genre in split(row.genres, '|') |                  // For each genre in the list of genres from the row (split by '|')
    MERGE (g:Genre {name:trim(genre)})                      // Merge a Genre node with the genre's name from the row, trimming any extra spaces
    MERGE (m)-[:IN_GENRE]->(g))                             // Create an IN_GENRE relationship from the Movie to the Genre

MERGE (l:Location {name:trim(row.location)})
MERGE (m)-[:WAS_TAKEN_IN]->(l)

MERGE (s:SimilarMovie {name:trim(row.similar_movie)})
MERGE (m)-[:IS_SIMILAR_TO]->(s)
""",
# params={"movie_directory": str( movie_csv_path )}   )         # Pass the parameter movie_directory which contains the path to the CSV file
params={"movies": df.to_dict(orient='records') }   )         # Pass the parameter movies which contains prepared dataframe
print(f' #############   ALL INSERTED TO NEO4J   ###########33 stage3 ')


 #############   ALL INSERTED TO NEO4J   ###########33 stage3 


In [19]:
#########################################################
# Embeddings 
#########################################################
from sentence_transformers import SentenceTransformer, util
#model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
model = SentenceTransformer('all-MiniLM-L6-v2')
from typing import List

def embed_text(text:str)->List:
    """
    Embeds the given text using the specified model.
    Parameters:
        text (str): The text to be embedded.
    Returns:
        List: A list containing the embedding of the text.
    """
    response = model.encode(text)
    return response

######################################################################
# For all taglines generate semantic meaning 
######################################################################
print(f' creating embeddings for all taglines')
print(f'{df["tagline"]}')
embedding_list = [embed_text(i)  for i in df["tagline"]]
df["taglineEmbedding"] = embedding_list 

print("Number of vectors:", len(embedding_list))
print("Embedding dimension:", len(embedding_list[0]))
#print(f'{embedding_list[0][:5]}')
#print(f'{embedding_list[19][:5]}')
#print(f'{df.head(5)}')
############################################


 creating embeddings for all taglines
0                 The adventure life of toys takes off!
1             Roll the dice and unleash the excitement!
2     Still Yelling. Still Fighting. Still Ready for...
3     Friends are the people who let you be yourself...
4     Just When His World Is Back To Normal... He's ...
5                              A Los Angeles crime saga
6     You are cordially invited to the most surprisi...
7                                The Original Bad Boys.
8                            Terror goes into overtime.
9                  No limits. No fears. No substitutes.
10    Why can't the most powerful man in the world h...
11                    Give blood...a whole new meaning.
12                       Part Dog. Part Wolf. All Hero.
13                   He had greatness within his grasp.
14    The Course Has Been Set. There Is No Turning B...
15                     No one stays at the top forever.
16             Lose your heart and come to your senses.
17    Twel

In [20]:
from langchain.vectorstores import Neo4jVector

# Delete the existing vector index
#Neo4jVector.delete_index("taglineEmbedding")
# Create a new vector index with the correct dimension
#Neo4jVector.create_new_index("taglineEmbedding", dimension=384)
#vec_size = graph.retrieve_existing_index()
#print(f'{vec_size}')
########################################
## Create vector index
########################################
graph.query("""
  CREATE VECTOR INDEX movie_tagline_embeddings IF NOT EXISTS      // Create a vector index named 'movie_tagline_embeddings' if it doesn't already exist  
  FOR (m:Movie) ON (m.taglineEmbedding)                           // Index the 'taglineEmbedding' property of Movie nodes 
  OPTIONS { indexConfig: {                                        // Set options for the index
    `vector.dimensions`: 384,                                    // Specify the dimensionality of the vector space (384 dimensions)
    `vector.similarity_function`: 'cosine'                        // Specify the similarity function to be cosine similarity
  }}"""
)


[]

In [21]:
graph.query("""
  SHOW VECTOR INDEXES     // Retrieves information about all vector indexes in the database
  """
)


[{'id': 1,
  'name': 'movie_tagline_embeddings',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Movie'],
  'properties': ['taglineEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0}]

In [23]:
#####################################################################
#    Query and write to the neo4j database 
#####################################################################
# for index, row in df.iterrows():
#     movie_id = row['movieId']
#     embedding = row['taglineEmbedding']
#     graph.query(f"MATCH (m:Movie {{id: '{movie_id}'}}) SET m.taglineEmbedding = '{embedding}'")

graph.query("""
UNWIND $movies AS row                                       // Each row in the CSV will be represented as 'row'

MATCH (m:Movie {id:row.movieId})                            // MATCH a Movie node with the id from the row
SET m.taglineEmbedding = row['taglineEmbedding']           // Set the 'taglineEmbedding' property of the Movie node to the taglineEmbedding from the row
""",
    params={"movies": df.to_dict(orient='records') }   )
graph.refresh_schema()
print(graph.schema)
##########################################################################################


Node properties:
Movie {id: INTEGER, released: DATE, title: STRING, tagline: STRING, imdbRating: FLOAT, taglineEmbedding: LIST}
Person {name: STRING}
Genre {name: STRING}
Location {name: STRING}
SimilarMovie {name: STRING}
Relationship properties:

The relationships:
(:Movie)-[:IN_GENRE]->(:Genre)
(:Movie)-[:WAS_TAKEN_IN]->(:Location)
(:Movie)-[:IS_SIMILAR_TO]->(:SimilarMovie)
(:Person)-[:DIRECTED]->(:Movie)
(:Person)-[:ACTED_IN]->(:Movie)


In [25]:
# Find one movie with a non-null taglineEmbedding, return the title, vector dimensions, and the embedding
graph.query("""
  MATCH (n) WHERE (n.taglineEmbedding) IS NOT NULL 
  RETURN n.title, size(n.taglineEmbedding) as vectorDimenstions, n.taglineEmbedding AS taglineEmbedding LIMIT 1
"""
)



[{'n.title': 'Toy Story',
  'vectorDimenstions': 384,
  'taglineEmbedding': [-0.014858802780508995,
   0.05397352948784828,
   0.08536024391651154,
   0.027792304754257202,
   0.00335156568326056,
   0.037838175892829895,
   0.05251464620232582,
   -0.05486423149704933,
   -0.016065603122115135,
   0.06950561702251434,
   0.0002916154626291245,
   0.1087937131524086,
   0.024859245866537094,
   0.049804672598838806,
   0.01531610544770956,
   0.03190014138817787,
   -0.039590977132320404,
   0.01300706434994936,
   0.043832141906023026,
   0.014914039522409439,
   0.02285548858344555,
   -0.0337604321539402,
   0.007378506939858198,
   0.00672050379216671,
   -0.06292126327753067,
   0.09283105283975601,
   -0.02255891263484955,
   -0.007455799728631973,
   -0.034123145043849945,
   -0.028378935530781746,
   0.04302135482430458,
   0.06763318926095963,
   -0.0017028233269229531,
   -0.027964133769273758,
   0.05187506228685379,
   0.07441570609807968,
   -0.012373019941151142,
   -0.13