In [1]:
import os
from operator import itemgetter

import chromadb
from dotenv import load_dotenv
from langchain.chains import create_sql_query_chain
from langchain_community.agent_toolkits import create_sql_agent
from langchain_community.agent_toolkits.sql.prompt import SQL_PREFIX
from langchain_community.tools.sql_database.tool import QuerySQLDataBaseTool
from langchain_community.utilities import SQLDatabase
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.tracers import ConsoleCallbackHandler
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from pyprojroot import here
# Populate the process environment from a .env file (python-dotenv); OPENAI_API_KEY is
# read from the environment further down in this notebook.
load_dotenv()

True

### Comparing RAG and SQL Agent for different questions

In [None]:
# The four benchmark questions. Each one is later sent to the simple SQL chain,
# the LangChain SQL agent and the RAG chain so the approaches can be compared.
question1, question2, question3, question4 = (
    "What is the lowest rated movies with more than 1 million votes?",
    "How many movies have Tom Hanks played in?",
    "Name two actors that have played together in more than 4 movies.",
    "Name two popular movies about chess.",
)

### SQL

In [None]:
# load_dotenv() has already exported any .env entries into os.environ, so the key only
# needs validating. The previous `os.environ[...] = os.getenv(...)` was a no-op when the
# key existed and failed with an opaque "TypeError: str expected, not NoneType" otherwise.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

# Chat model shared by the SQL chain, the SQL agent and the RAG chain below.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)


In [None]:
# Locate the IMDb SQLite file via pyprojroot's here() and wrap it in LangChain's
# SQLDatabase. Despite its name, `sqldb_directory` holds the path of the .db file itself.
sqldb_directory = here("data/db/imdb.db")
db = SQLDatabase.from_uri("sqlite:///" + str(sqldb_directory))

### Simple SQL query writer

In [None]:
# Prompt for the last step of the pipeline: the model is shown the question, the SQL
# that was generated for it and the raw query result, and must phrase the final answer.
system_role = """Given the following user question, corresponding SQL query, and SQL result, answer the user question.\n
    Question: {question}\n
    SQL Query: {query}\n
    SQL Result: {result}\n
    Answer:
    """

# Building blocks: question -> SQL text, SQL text -> query result,
# and (question, query, result) -> natural-language answer.
write_query = create_sql_query_chain(llm, db)
execute_query = QuerySQLDataBaseTool(db=db)
answer_prompt = PromptTemplate.from_template(system_role)
answer = answer_prompt | llm | StrOutputParser()

# Every .assign() adds one key to the running dict, so the final output carries
# "question", "query", "result" and "answer".
chain = RunnablePassthrough.assign(query=write_query)
chain = chain.assign(result=itemgetter("query") | execute_query)
chain = chain.assign(answer=answer)



In [None]:
# NOTE(review): this cell rebuilds exactly the same objects as the cell directly above
# it (only the line wrapping differed); one of the two cells is redundant.

# Answer-synthesis prompt: question + generated SQL + SQL result -> final answer.
system_role = """Given the following user question, corresponding SQL query, and SQL result, answer the user question.\n
    Question: {question}\n
    SQL Query: {query}\n
    SQL Result: {result}\n
    Answer:
    """

# Tool that runs a SQL string against `db`, and the chain that writes that SQL.
execute_query = QuerySQLDataBaseTool(db=db)
write_query = create_sql_query_chain(llm, db)

# prompt -> llm -> plain string
answer_prompt = PromptTemplate.from_template(system_role)
answer = answer_prompt | llm
answer = answer | StrOutputParser()

# Enrich the input dict step by step: "query", then "result", then "answer".
chain = (
    RunnablePassthrough
    .assign(query=write_query)
    .assign(result=itemgetter("query") | execute_query)
    .assign(answer=answer)
)

In [32]:
# Run the SQL pipeline (write query -> execute -> phrase answer) for question1.
response1 = chain.invoke({"question": question1})


In [33]:
# Display the whole output dict: the question plus the generated query, its raw result and the answer.
response1

{'question': 'What is the lowest rated movies with more than 1 million votes?',
 'query': 'SELECT tb."primaryTitle", tr."averageRating", tr."numVotes"\nFROM "title.basics" tb\nJOIN "title.ratings" tr ON tb.tconst = tr.tconst\nWHERE tr."numVotes" > 1000000\nORDER BY tr."averageRating" ASC\nLIMIT 5;',
 'result': "[('The Hunger Games', 7.2, 1031437), ('Titanic', 7.9, 1325403), ('Iron Man', 7.9, 1159536), ('Avatar', 7.9, 1419190), ('The Avengers', 8.0, 1494463)]",
 'answer': 'The lowest rated movies with more than 1 million votes are:\n1. The Hunger Games with an average rating of 7.2 and 1,031,437 votes\n2. Titanic with an average rating of 7.9 and 1,325,403 votes\n3. Iron Man with an average rating of 7.9 and 1,159,536 votes\n4. Avatar with an average rating of 7.9 and 1,419,190 votes\n5. The Avengers with an average rating of 8.0 and 1,494,463 votes'}

In [None]:
# Same SQL pipeline for question2; the output dict is shown as the cell result.
# NOTE(review): `response` is reused by the following cells, so it only ever holds the latest run.
response = chain.invoke({"question": question2})
response

In [None]:
# Same SQL pipeline for question3; overwrites `response` from the previous cell.
response = chain.invoke({"question": question3})
response

In [None]:
# Same SQL pipeline for question4, this time with a step-by-step trace.
# "verbose" is not a RunnableConfig field, so config={"verbose": True} had no effect;
# attaching a ConsoleCallbackHandler is what actually prints each step of the chain.
response = chain.invoke(
    {"question": question4},
    config={"callbacks": [ConsoleCallbackHandler()]},
)
response

### LangChain SQL Agent

In [None]:
# Tool-calling SQL agent over the same database and model as the simple chain.
# verbose=True makes the executor print its intermediate tool calls.
#
# `system_message` is not a create_sql_agent parameter: it was swallowed by **kwargs and
# never reached the prompt. The system message is built from `prefix`, so the custom
# guidance is appended to LangChain's stock SQL_PREFIX (keeping its default safety and
# formatting rules). create_sql_agent runs str.format(dialect=..., top_k=...) over the
# prefix, so the appended text must not contain literal "{" or "}".
agent_executor = create_sql_agent(
    llm,
    db=db,
    agent_type="openai-tools",
    verbose=True,
    prefix=(
        SQL_PREFIX
        + "\n\n"
        + "You are an advanced SQL agent. Your task is to generate accurate and efficient SQL queries to answer user questions based on the database schema and data. The database is the IMDB database with multiple tables and information about movies and tv shows. Do not use the id of actors and movies in the result (e.g. tt00001 and nm000001), instead find the real names."
    ),
)

In [None]:
# Ask the SQL agent question1; the agent expects its input under the "input" key.
response1_sql = agent_executor.invoke({"input": question1})

In [None]:
# Ask the SQL agent question2.
response2_sql = agent_executor.invoke({"input": question2})

In [None]:
# Ask the SQL agent question3.
response3_sql = agent_executor.invoke({"input": question3})

In [None]:
# Ask the SQL agent question4.
response4_sql = agent_executor.invoke({"input": question4})

### RAG

In [None]:
# Open the on-disk Chroma store under data/db/chroma and fetch the existing "imdb"
# collection. get_collection does not create it, so the collection must already exist.
collection_name = "imdb"
chroma_client = chromadb.PersistentClient(path=here("data/db/chroma"))
collection = chroma_client.get_collection(name=collection_name)

In [38]:
# OpenAI embedding client used to embed the questions for the vector search.
# The text-embedding-3 family can return shorter vectors when a `dimensions=` argument
# is given (e.g. dimensions=1024); it is left unset here, so the default size is used.
# NOTE(review): presumably the same model that built the "imdb" collection - confirm.
embedding_client = OpenAIEmbeddings(model="text-embedding-3-small")

In [50]:
# Manual walk-through of retrieval for question1: embed the question, fetch the
# 3 nearest documents (top_k = 3) and keep the string form of the "documents" field.
question1_embedding = embedding_client.embed_query(question1)
question1_vector_search = collection.query(query_embeddings=question1_embedding, n_results=3)
vector_db_results = str(question1_vector_search["documents"])

In [51]:
# Inspect the stringified documents returned by the vector search for question1.
vector_db_results

'[[\'\\nTitle: 537 Votes (2020)\\nType: movie\\nRuntime: 109.0 minutes\\nGenres: Documentary\\nIMDb Rating: 7.2 (Votes: 930)\\n\\nCast & Crew:\\nName:Ben Affleck 1972, characters:["Self"], job:archive_footage\\nName:Fernand Amandi None, characters:["Self"], job:self\\nName:Ann Louise Bardach None, characters:["Self"], job:self\\nName:Mitchell Berger None, characters:["Self"], job:self\\nName:Brad Blakeman None, characters:["Self"], job:self\\nName:Jon Bon Jovi 1962, characters:["Self"], job:archive_footage\\nName:George W. Bush 1946, characters:["Self"], job:archive_footage\\nName:Fidel Castro 1926, characters:["Self"], job:archive_footage\\nName:Dave Chappelle 1973, characters:["Self"], job:archive_footage\\nName:Bill Clinton 1946, characters:["Self"], job:archive_footage\\nName:Billy Corben 1978, characters:None, job:director\\nName:Billy Corben 1978, characters:None, job:producer\\nName:Alfred Spellman 1978, characters:None, job:producer\\nName:Brian Robertson None, characters:None,

In [68]:


# RAG answer prompt. The text is sent to the model verbatim, so it is kept exactly as
# written (including the "databse" spelling).
system_role = """Given the following user question, and corresponding results from the vector databse, answer the user question.\n
If the vector search results are not related to the question, say "I don't know".\n

Question: {question}\n
Vector search results: {result}\n
Answer:
"""

# prompt -> llm -> plain string
answer_prompt = PromptTemplate.from_template(system_role)
answer = answer_prompt | llm | StrOutputParser()

# NOTE(review): `system_role`, `answer_prompt`, `answer` and `chain` rebind the names
# used by the SQL chain earlier in the notebook; from here on `chain` is the RAG chain.
# Steps: embed the question -> query Chroma for the 3 nearest documents ->
# stringify the "documents" field -> let the model answer from it.
chain = RunnablePassthrough.assign(
    embedding=lambda inputs: embedding_client.embed_query(inputs["question"])
)
chain = chain.assign(
    vector_search=lambda inputs: collection.query(query_embeddings=inputs["embedding"], n_results=3)
)
chain = chain.assign(result=lambda inputs: str(inputs["vector_search"]["documents"]))
chain = chain.assign(answer=answer)

In [74]:
# RAG run for question1: print the retrieved context, then the model's answer.
# NOTE(review): rebinds `response1`, which earlier held the SQL chain's output for the same question.
response1 = chain.invoke({"question": question1})
print(response1["result"], response1["answer"], sep="\n")

[['\nTitle: 537 Votes (2020)\nType: movie\nRuntime: 109.0 minutes\nGenres: Documentary\nIMDb Rating: 7.2 (Votes: 930)\n\nCast & Crew:\nName:Ben Affleck 1972, characters:["Self"], job:archive_footage\nName:Fernand Amandi None, characters:["Self"], job:self\nName:Ann Louise Bardach None, characters:["Self"], job:self\nName:Mitchell Berger None, characters:["Self"], job:self\nName:Brad Blakeman None, characters:["Self"], job:self\nName:Jon Bon Jovi 1962, characters:["Self"], job:archive_footage\nName:George W. Bush 1946, characters:["Self"], job:archive_footage\nName:Fidel Castro 1926, characters:["Self"], job:archive_footage\nName:Dave Chappelle 1973, characters:["Self"], job:archive_footage\nName:Bill Clinton 1946, characters:["Self"], job:archive_footage\nName:Billy Corben 1978, characters:None, job:director\nName:Billy Corben 1978, characters:None, job:producer\nName:Alfred Spellman 1978, characters:None, job:producer\nName:Brian Robertson None, characters:None, job:composer\nName:Jon

In [75]:
# RAG run for question2: print the retrieved context, then the model's answer.
# The variable name `reseponse2` (sic) is kept as-is in case other cells refer to it.
reseponse2 = chain.invoke({"question": question2})
print(reseponse2["result"], reseponse2["answer"], sep="\n")

[['\nTitle: Cast Away (2000)\nType: movie\nRuntime: 143.0 minutes\nGenres: Adventure,Drama,Romance\nIMDb Rating: 7.8 (Votes: 658740)\n\nCast & Crew:\nName:Tom Hanks 1956, characters:["Chuck Noland"], job:actor\nName:Helen Hunt 1963, characters:["Kelly Frears"], job:actress\nName:Paul Sanchez None, characters:["Ramon"], job:actor\nName:Lari White 1965, characters:["Bettina Peterson"], job:actress\nName:Leonid Citer 1960, characters:["Fyodor"], job:actor\nName:David Allen Brooks 1947, characters:["Dick Peterson"], job:actor\nName:Yelena Popovic None, characters:["Beautiful Russian Woman"], job:actress\nName:Valentina Ananina 1933, characters:["Russian Babushka"], job:actress\nName:Semion Sudarikov None, characters:["Nicolai"], job:actor\nName:Peter Von Berg 1947, characters:["Yuri"], job:actor\nName:Robert Zemeckis 1952, characters:None, job:director\nName:William Broyles Jr. 1944, characters:None, job:writer\nName:Tom Hanks 1956, characters:None, job:producer\nName:Jack Rapke None, char

In [76]:
# RAG run for question3: print the retrieved context, then the model's answer.
# The variable name `reseponse3` (sic) is kept as-is in case other cells refer to it.
reseponse3 = chain.invoke({"question": question3})
print(reseponse3["result"], reseponse3["answer"], sep="\n")

[['\nTitle: Four Friends (1981)\nType: movie\nRuntime: 114.0 minutes\nGenres: Comedy,Drama\nIMDb Rating: 7.0 (Votes: 2082)\n\nCast & Crew:\nName:Craig Wasson 1954, characters:["Danilo"], job:actor\nName:Jodi Thelen 1962, characters:["Georgia"], job:actress\nName:Michael Huddleston 1952, characters:["David"], job:actor\nName:Jim Metzler 1951, characters:["Tom"], job:actor\nName:Scott Hardt None, characters:["Young Danilo"], job:actor\nName:Elizabeth Lawrence 1922, characters:["Mrs. Prozor"], job:actress\nName:Miklos Simon None, characters:["Mr. Prozor"], job:actor\nName:Michael Kovacs None, characters:["Prozor Neighbor"], job:actor\nName:Beatrice Fredman 1918, characters:["Mrs. Zoldos"], job:actress\nName:Pier Calabria None, characters:["Conductor"], job:actress\nName:Arthur Penn 1922, characters:None, job:director\nName:Steve Tesich 1942, characters:None, job:writer\nName:Gene Lasko 1929, characters:None, job:producer\nName:Arthur Penn 1922, characters:None, job:producer\nName:Elizabet

In [77]:
# RAG run for question4: print the retrieved context, then the model's answer.
# The variable name `reseponse4` (sic) is kept as-is in case other cells refer to it.
reseponse4 = chain.invoke({"question": question4})
print(reseponse4["result"], reseponse4["answer"], sep="\n")

[['\nTitle: Computer Chess (2013)\nType: movie\nRuntime: 92.0 minutes\nGenres: Comedy\nIMDb Rating: 6.2 (Votes: 4906)\n\nCast & Crew:\nName:Kriss Schludermann None, characters:["ADVANTAGE Member"], job:actor\nName:Tom Fletcher None, characters:["DEEP SPEED Member"], job:actor\nName:Wiley Wiggins 1976, characters:["Beuscher"], job:actor\nName:Patrick Riester None, characters:["Bishton"], job:actor\nName:Kevin Bewersdorf None, characters:["Cameraman"], job:actor\nName:Gene Williams None, characters:["MONSIEUR D\'ECHECS member"], job:actor\nName:Jim Lewis None, characters:["John"], job:actor\nName:Cole Noppenberg None, characters:["CAPA X Member"], job:actor\nName:Myles Paige None, characters:["Papageorge"], job:actor\nName:Gerald Peary None, characters:["Henderson"], job:actor\nName:Andrew Bujalski 1977, characters:None, job:director\nName:Andrew Bujalski 1977, characters:None, job:writer\nName:Houston King None, characters:None, job:producer\nName:Alex Lipschultz None, characters:None, 