# Imports

In [133]:
# Pinecone
from pinecone import Pinecone, PodSpec

# Langchain
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
    AttributeInfo
)
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.retrievers.self_query.pinecone import PineconeTranslator
from langchain_openai import (
    ChatOpenAI, 
    OpenAIEmbeddings
)
from langchain_pinecone import PineconeVectorStore
from langchain.indexes import SQLRecordManager, index
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# General
import os
from dotenv import load_dotenv
load_dotenv()

True

# Convert CSV files to Docs

In [120]:
# Loading in data from all csv files
loader = DirectoryLoader(
    path="./data",
    glob="*.csv",
    loader_cls=CSVLoader,
    show_progress=True)

docs = loader.load()

metadata_field_info = [
    AttributeInfo(
        name="Title", description="The title of the movie", type="string"),
    AttributeInfo(name="Runtime (minutes)",
                  description="The runtime of the movie in minutes", type="integer"),
    AttributeInfo(name="Language",
                  description="The language of the movie", type="string"),
    AttributeInfo(name="Release Year",
                  description="The release year of the movie as an integer", type="integer"),
    AttributeInfo(name="Genre", description="The genre of the movie",
                  type="string or list[string]"),
    AttributeInfo(name="Actors", description="The actors in the movie",
                  type="string or list[string]"),
    AttributeInfo(name="Directors", description="The directors of the movie",
                  type="string or list[string]"),
    AttributeInfo(name="Stream", description="The streaming platforms for the movie",
                  type="string or list[string]"),
    AttributeInfo(name="Buy", description="The platforms where the movie can be bought",
                  type="string or list[string]"),
    AttributeInfo(name="Rent", description="The platforms where the movie can be rented",
                  type="string or list[string]"),
    AttributeInfo(name="Production Companies",
                  description="The production companies of the movie", type="string or list[string]"),
]

def convert_to_list(doc, field):
    if field in doc.metadata and doc.metadata[field] is not None:
        doc.metadata[field] = [item.strip()
                               for item in doc.metadata[field].split(',')]
        
def convert_to_int(doc, field):
    if field in doc.metadata and doc.metadata[field] is not None:
        doc.metadata[field] = int(
            doc.metadata[field])

fields_to_convert_list = ['Genre', 'Actors', 'Directors',
                          'Production Companies', 'Stream', 'Buy', 'Rent']
fields_to_convert_int = ['Runtime (minutes)', 'Release Year']

# Set 'overview' and 'keywords' as 'page_content' and other fields as 'metadata'
for doc in docs:
    # Parse the page_content string into a dictionary
    page_content_dict = dict(line.split(": ", 1)
                             for line in doc.page_content.split("\n") if ": " in line)
    
    doc.page_content = 'Overview: ' + page_content_dict.get(
        'Overview') + '. Keywords: ' + page_content_dict.get('Keywords')
    doc.metadata = {field.name: page_content_dict.get(
        field.name) for field in metadata_field_info}
        
    # Convert fields from string to list of strings
    for field in fields_to_convert_list:
        convert_to_list(doc, field)      

    # Convert fields from string to integers
    for field in fields_to_convert_int:
        convert_to_int(doc, field)

100%|██████████| 104/104 [00:00<00:00, 174.54it/s]


In [121]:
print(docs[5])

page_content='Overview: For a book project, photographer Timothy Greenfield-Sanders took photographs of 30 stars of adult movies, each pair of photographs in the same pose, clothed and nude. This film records the photo shoots and includes interviews with the performers and commentary from eight writers (and John Waters). The actors and writers discuss economics, nudity and exhibitionism, careers, and private lives.. Keywords: pornography, interview, photo shoot, voyeur' metadata={'Title': 'Thinking XXX', 'Runtime (minutes)': 58, 'Language': 'English', 'Release Year': 2004, 'Genre': ['Documentary'], 'Actors': ['Tera Patrick', 'Sunrise Adams', 'Jenna Jameson', 'Ron Jeremy', 'Belladonna'], 'Directors': ['Timothy Greenfield-Sanders'], 'Stream': [''], 'Buy': [''], 'Rent': [''], 'Production Companies': ['Perfect Day Films'], 'Website': 'Unknown'}


# Create Pinecone Index and Upload Docs

In [122]:
# Create empty index
PINECONE_KEY, PINECONE_INDEX_NAME = os.getenv(
    'PINECONE_API_KEY'), os.getenv('PINECONE_INDEX_NAME')

pc = Pinecone(api_key=PINECONE_KEY)

# Uncomment if index is not created already
# pc.create_index(
#     name=PINECONE_INDEX_NAME,
#     dimension=1536,
#     metric="cosine",
#     spec=PodSpec(
#         environment="gcp-starter"
#     )
# )

# Target index and check status
pc_index = pc.Index(PINECONE_INDEX_NAME)
print(pc_index.describe_index_stats())

embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

vectorstore = PineconeVectorStore(
    pc_index, embeddings
)

# Create record manager
namespace = f"pinecone/{PINECONE_INDEX_NAME}"
record_manager = SQLRecordManager(
    namespace, db_url="sqlite:///record_manager_cache.sql"
)

record_manager.create_schema()

{'dimension': 1536,
 'index_fullness': 0.04021,
 'namespaces': {'': {'vector_count': 4021}},
 'total_vector_count': 4021}


In [123]:
def _clear():
    """
    Hacky helper method to clear content.
    """
    index([], record_manager, vectorstore,
          cleanup="full", source_id_key="Website")

# Uncomment this line if you want to clear the Pinecone vectorstore
_clear()

# Upload documents to pinecome
index(docs, record_manager, vectorstore,
      cleanup="full", source_id_key="Website")

{'num_added': 9622, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

# Creating Self-Querying Retriever

In [124]:
document_content_description = "Brief overview of a movie, along with keywords"

# Define allowed comparators list
allowed_comparators = [
    "$eq",  # Equal to (number, string, boolean)
    "$ne",  # Not equal to (number, string, boolean)
    "$gt",  # Greater than (number)
    "$gte",  # Greater than or equal to (number)
    "$lt",  # Less than (number)
    "$lte",  # Less than or equal to (number)
    "$in",  # In array (string or number)
    "$nin",  # Not in array (string or number)
]

examples = [
    (
        "I'm looking for a sci-fi comedy released after 2021.",
        {
            "query": "sci-fi comedy",
            "filter": "and(eq('Genre', 'Science'), eq('Genre', 'Comedy'), gt('Release Year', 2021))",
        },
    ),
    (
        "Show me critically acclaimed dramas without Tom Hanks.",
        {
            "query": "critically acclaimed drama",
            "filter": "and(eq('Genre', 'Drama'), nin('Actors', ['Tom Hanks']))",
        },
    ),
    (
        "Recommend some films by Yorgos Lanthimos.",
        {
            "query": "Yorgos Lanthimos",
            "filter": 'in("Directors", ["Yorgos Lanthimos]")',
        },
    ),
    (
        "Films similar to Yorgos Lanthmios movies.",
        {
            "query": "Dark comedy, absurd, Greek Weird Wave",
            "filter": 'NO_FILTER',
        },
    ),
    (
        "Find me thrillers with a strong female lead released between 2015 and 2020.",
        {
            "query": "thriller strong female lead",
            "filter": "and(eq('Genre', 'Thriller'), gt('Release Year', 2015), lt('Release Year', 2021))",
        },
    ),
    (
        "Find me highly rated drama movies in English that are less than 2 hours long",
        {
            "query": "Highly rated drama English under 2 hours",
            "filter": 'and(eq("Genre", "Drama"), eq("Language", "English"), lt("Runtime (minutes)", 120))',
        },
    ),
]

constructor_prompt = get_query_constructor_prompt(
    document_content_description,
    metadata_field_info,
    allowed_comparators=allowed_comparators,
    examples=examples,
)

query_model = ChatOpenAI(
    # model='gpt-3.5-turbo-0125',
    model='gpt-4-0125-preview',
    temperature=0,
    streaming=True,
)

output_parser = StructuredQueryOutputParser.from_components()
query_constructor = constructor_prompt | query_model | output_parser

In [130]:
question = "Comedy films"
# question = "Find me thrillers with a strong female lead released between 2015 and 2020."
# print(constructor_prompt.format(query=question))
# print(type(constructor_prompt))

In [131]:
query_constructor.invoke(
    {
        "query": question
    }
)

StructuredQuery(query='Comedy', filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='Genre', value='Comedy'), limit=None)

In [132]:
retriever = SelfQueryRetriever(
    query_constructor=query_constructor,
    vectorstore=vectorstore,
    structured_query_translator=PineconeTranslator(),
    search_kwargs={'k': 10}
)

retriever.invoke(question)

[Document(page_content='Overview: Silent comedy about a poor country bumpkin who goes to Hollywood to make good.. Keywords: big city, motion pictures, country bumpkin, short film', metadata={'Actors': ['Marcella Daly', 'Frank Jonasson', 'Arthur Thalasso', 'Glen Cavender', 'Lloyd Hamilton'], 'Buy': [''], 'Directors': ['Roscoe Arbuckle'], 'Genre': ['Comedy'], 'Language': 'English', 'Production Companies': ['Lloyd Hamilton Corporation'], 'Release Year': 1925.0, 'Rent': [''], 'Runtime (minutes)': 19.0, 'Stream': [''], 'Title': 'The Movies', 'Website': 'Unknown'}),
 Document(page_content='Overview: A series of loosely connected skits that spoof news programs, commercials, porno films, kung-fu films, disaster films, blaxploitation films, spy films, mafia films, and the fear that somebody is watching you on the other side of the TV.. Keywords: commercial, journalism, manipulation of the media, satire, tv ratings, television producer, sketch comedy', metadata={'Actors': ['Bong Soo Han', 'Saul 

# Create RAG Chain

In [128]:
def format_docs(docs):
    return "\n\n".join(f"{doc.page_content}\n\nMetadata: {doc.metadata}" for doc in docs)

chat_model = ChatOpenAI(
    model='gpt-3.5-turbo-0125',
    # model='gpt-4-0125-preview',
    temperature=0,
    streaming=True,
)

prompt = ChatPromptTemplate.from_messages(
    [
        (
            'system',
            """
            Your goal is to recommend films to users based on their 
            query and the retrieved context. If a retrieved film doesn't seem 
            relevant, omit it from your response. Never refer to films that
            are not in your context. If you cannot recommend any 
            films, suggest better queries to the user. You cannot 
            recommend more than five films. Your recommendation should 
            be relevant, original, and at least two to three sentences 
            long.
            
            YOU CANNOT RECOMMEND A FILM IF IT DOES NOT APPEAR IN YOUR 
            CONTEXT.

            # TEMPLATE FOR OUTPUT
            - [Title of Film](source link):
                - Runtime:
                - Release Year:
                - (Your reasoning for recommending this film)
            
            Question: {question} 
            Context: {context} 
            """
        ),
    ]
)

# Create a chatbot Question & Answer chain from the retriever
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | chat_model
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)


query_constructor.invoke(
    {
        "query": question
    }
)
# Only prints final answer
# for chunk in rag_chain_with_source.stream(question):
#     for key in chunk:
#         if key == 'answer':
#             print(chunk[key], end="", flush=True)

# Prints everything
output = {}
curr_key = None
for chunk in rag_chain_with_source.stream(question):
    for key in chunk:
        if key not in output:
            output[key] = chunk[key]
        else:
            output[key] += chunk[key]
        if key != curr_key:
            print(f"\n\n{key}: {chunk[key]}", end="", flush=True)
        else:
            print(chunk[key], end="", flush=True)
        curr_key = key
output



question: Drama films by A24.

context: [Document(page_content='Overview: A reclusive English teacher suffering from severe obesity attempts to reconnect with his estranged teenage daughter for one last chance at redemption.. Keywords: regret, nurse, missionary, idaho, bible, redemption, overweight man, addiction, based on play or musical, teacher, grief, neighbor, obesity, religion, death of lover, election, rebellious daughter, guilt, death, lgbt, sister-in-law, eating disorder, father daughter reunion, empathy, shame, english teacher, abandonment, one location, father daughter relationship, 2010s, gay theme, apartment, essay, food addiction, religious symbolism', metadata={'Actors': ['Hong Chau', 'Ty Simpkins', 'Sadie Sink', 'Samantha Morton', 'Brendan Fraser'], 'Buy': ['Apple TV', 'Amazon Video', 'Google Play Movies', 'YouTube', 'Vudu', 'Microsoft Store'], 'Directors': ['Darren Aronofsky'], 'Genre': ['Drama'], 'Language': 'English', 'Production Companies': ['A24', 'Protozoa Pictu

{'question': 'Drama films by A24.',
 'context': [Document(page_content='Overview: A reclusive English teacher suffering from severe obesity attempts to reconnect with his estranged teenage daughter for one last chance at redemption.. Keywords: regret, nurse, missionary, idaho, bible, redemption, overweight man, addiction, based on play or musical, teacher, grief, neighbor, obesity, religion, death of lover, election, rebellious daughter, guilt, death, lgbt, sister-in-law, eating disorder, father daughter reunion, empathy, shame, english teacher, abandonment, one location, father daughter relationship, 2010s, gay theme, apartment, essay, food addiction, religious symbolism', metadata={'Actors': ['Hong Chau', 'Ty Simpkins', 'Sadie Sink', 'Samantha Morton', 'Brendan Fraser'], 'Buy': ['Apple TV', 'Amazon Video', 'Google Play Movies', 'YouTube', 'Vudu', 'Microsoft Store'], 'Directors': ['Darren Aronofsky'], 'Genre': ['Drama'], 'Language': 'English', 'Production Companies': ['A24', 'Protozoa