In [None]:
!pip install langchain duckdb==0.7.1 sentence-transformers lark openai pinecone-client tiktoken
import os

os.environ['OPENAI_API_KEY'] =  
os.environ['HUGGINGFACEHUB_API_TOKEN'] = 
os.environ["PINECONE_API_KEY"]= 
os.environ["PINECONE_ENV"]= 

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
import torch

# Hugging Face checkpoint used to embed the document chunks and the queries.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Run on the GPU when one is available, otherwise fall back to the CPU so the
# cell still works on a runtime without CUDA instead of raising.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
# Model files are cached under /kaggle/working.
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    cache_folder='/kaggle/working',
    model_kwargs=model_kwargs,
)


In [5]:
import duckdb
from langchain.document_loaders import DuckDBLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Query that pulls the text columns and the metadata columns from the
# movie_plots table.
movie_query = """SELECT
    title,
    stars,
    directors,
    year,
    genre,
    runtime,
    ratingCount,
    plot,
    summary,
    imdb_rating,
    source FROM movie_plots m"""

# Columns that make up each document's text ...
content_columns = ["summary", "plot"]
# ... and columns carried along as per-document metadata.
metadata_columns = [
    "source",
    "title",
    "stars",
    "directors",
    "year",
    "genre",
    "runtime",
    "imdb_rating",
    "ratingCount",
]

loader = DuckDBLoader(
    movie_query,
    database="/content/drive/MyDrive/Colab Notebooks/db.duckdb_ex",
    page_content_columns=content_columns,
    metadata_columns=metadata_columns,
)
data = loader.load()

# Split the loaded documents into chunks of at most 1000 characters, no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)

# Show one chunk as a sanity check.
print(docs[1])

[Document(page_content='summary: {{Short description,1965 film by Roman Polanski}}\n{{Use British English,date=November 2021}}\n{{Infobox film\n, name           = Repulsion\n, image          = Repulsion (1965 film poster).jpg\n, caption        = Theatrical release poster\n, director       = Roman Polanski\n, screenplay     = {{plainlist,\n* Roman Polanski\n* Gérard Brach\n* David Stone{{efn,Adaptation and additional dialogue}}\n}}\n, story          = {{plainlist,\n* Roman Polanski\n* Gérard Brach\n}}\n, producer       = Gene Gutowski\n, starring       = {{plainlist,\n* Catherine Deneuve\n* Ian Hendry\n* John Fraser (actor),John Fraser\n* Patrick Wymark\n* Yvonne Furneaux\n}}\n, cinematography = Gilbert Taylor\n, editing        = Alastair McIntyre\n, music          = Chico Hamilton\n, studio         = {{plainlist,\n* Compton Films\n* Tekli British Productions\n}}\n, distributor    = Compton Films\n, released       = {{Film date,df=yes,1965,6,10,United Kingdom}}\n, runtime        = 105 m

In [7]:
from getpass import getpass

from langchain.vectorstores import Pinecone
import pinecone

# SECURITY: the API key must not live in the notebook. It is read from the
# environment and only prompted for when missing. The key that used to be
# hardcoded in this cell should be treated as leaked and rotated.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("PINECONE_API_KEY: ")
# The environment name is not a secret; keep the previous value as the default
# but let an already-configured environment win.
if not os.environ.get("PINECONE_ENV"):
    os.environ["PINECONE_ENV"] = 'northamerica-northeast1-gcp'

index_name = "cinemattr"

pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_ENV"])

# The index is rebuilt from scratch on every run. Only delete it when it
# actually exists, so the very first run does not fail on a missing index.
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# The dimension must equal the size of the vectors produced by `embeddings`
# (768 for the all-mpnet-base-v2 model configured earlier in this notebook).
pinecone.create_index(index_name, dimension=768)

# Embed and upload every chunk. The returned store is already bound to the
# index, so re-opening it with from_existing_index afterwards is unnecessary.
vectorstore = Pinecone.from_documents(
    docs, embeddings, index_name=index_name
)

In [8]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# (name, description, type) of every metadata field the self-query retriever is
# told about. The "description" and "certificate" fields are deliberately not
# exposed.
_field_specs = (
    ("title", "The title of the movie (in lowercase). Case sensitive", "string"),
    ("genre", "The genres of the movie (in lowercase). Case sensitive", "string or list[string]"),
    ("year", "The year the movie was released. Only integers allowed", "integer"),
    ("stars", "The name of the movie actors (in lowercase). Case sensitive", "string or list[string]"),
    ("directors", "The name of the movie directors (in lowercase). Case sensitive", "string or list[string]"),
    ("runtime", "The runtime of the movie in minutes", "string"),
    ("imdb_rating", "A 1-10 rating for the movie on IMDB", "float"),
    ("ratingCount", "How many people rated the movie on IMDB. Indicator of movie's popularity", "integer"),
)

metadata_field_info = [
    AttributeInfo(name=field, description=blurb, type=kind)
    for field, blurb, kind in _field_specs
]

# Short description of the document text, passed to the retriever alongside
# the field descriptions.
document_content_description = "Summary and plot of the movie"

llm = OpenAI(temperature=0, model_name="gpt-3.5-turbo")
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    verbose=True,
)
# Ask the vector store for ten hits per query.
retriever.search_kwargs = dict(k=10)

In [17]:
import re

from langchain.callbacks import get_openai_callback

query = "movies with Tom Cruise in the cast"
# Keep only ASCII letters, digits, spaces, dots and hyphens before the text is
# handed to the retriever; every other character is dropped.
query = re.sub(r'[^0-9A-Za-z .-]', '', query)
print(query)

# Track OpenAI usage for the retrieval call only. The earlier throw-away
# "Tell me a joke" request was leftover debugging: it cost an extra API call
# and inflated the totals reported by the callback.
with get_openai_callback() as cb:
    results = retriever.get_relevant_documents(query)

# Despite the name, this list holds each hit's `source` metadata value.
titles = [doc.metadata["source"] for doc in results]
print(titles)
print(cb)

query='Indian college life' filter=None


[Document(page_content='college.', metadata={'certificate': 'Approved', 'description': 'Four very different college girls drive to Fort Lauderdale, Florida for spring break and seek out various adventures and romance for themselves.', 'directors': ['Henry Levin'], 'genre': ['Comedy', ' Drama', ' Romance'], 'imdb_rating': 6.599999904632568, 'ratingCount': 2870.0, 'runtime': '99 min', 'source': 'tt0054469', 'stars': ['Dolores Hart', 'George Hamilton', 'Yvette Mimieux', 'Jim Hutton'], 'title': 'Where the Boys Are', 'year': 1960.0}),
 Document(page_content='plot:  An American son of Indian immigrants, Krishnagopal Reddy, or "Kris" as he prefers to be called (to distance himself from his heritage), does not associate with the Indian culture that his parents have pushed upon him. Upon arriving at Rutgers University, he finds out to his dismay that all of his roommates are of Indian descent, and all except one are international students from India. His roommates include Ajay (Kal Penn), who i

In [None]:
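# Disabled experiment: incremental upsert. For each chunk, search the index
# filtered by the chunk's `source`; skip the chunk when the best match scores
# above 0.99, otherwise add it to the vector store.
# for doc in reversed(docs):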
#   title = doc.metadata['source']
#   print (title)
#   query = vectorstore.similarity_search_with_score(doc.page_content,filter={
#           "source":title ,
#       })
#   #No title
#   if len(query)>0:
#       score = query[0][1]
#       print("Score:",score)
#       #Title but a different chunk
#       if (score)>0.99:
#         continue
#       else:
#         vectorstore.add_texts([doc.page_content],[doc.metadata])
#   else:
#     vectorstore.add_texts([doc.page_content],[doc.metadata])


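# Disabled alternative: build a Chroma store persisted to Drive instead of Pinecone.
# NOTE: `hf` is not defined in the visible notebook; the embedding object
# created above is named `embeddings`.
# from langchain.vectorstores import Chroma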
# vectorstore = Chroma.from_documents(
#     data, hf,persist_directory="/content/drive/MyDrive/Colab Notebooks/chromadb"
# )