In [3]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [5]:
# Path of the PDF that the rest of the notebook indexes and queries.
local_path = "tsla-20231231-gen.pdf"

# Fail fast when no path is configured. Merely printing a hint here would
# leave `data` undefined, and the notebook would only break later (with a
# confusing NameError) in the cell that splits `data` into chunks.
if not local_path:
    raise ValueError("Upload a PDF file")

# Load the local PDF; `data` is what the text splitter consumes next.
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ankit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ankit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [7]:
# Cut the loaded documents into overlapping chunks (size 7500, overlap 100)
# so each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=7500,
)
chunks = text_splitter.split_documents(data)

In [9]:
# Embed every chunk with the "nomic-embed-text" Ollama model (progress bar
# enabled) and store the result in a Chroma collection named "local-ai".
embedding_model = OllamaEmbeddings(show_progress=True, model="nomic-embed-text")
vector_db = Chroma.from_documents(
    collection_name="local-ai",
    embedding=embedding_model,
    documents=chunks,
)

OllamaEmbeddings: 100%|████████████████████████████████████████████████████████████████| 63/63 [03:28<00:00,  3.31s/it]


In [10]:
# Name of the model requested from Ollama for chat completions.
local_model = "mistral"
# Chat model handle; it is passed to the multi-query retriever and also used
# as the answering step of the chain built further down.
llm = ChatOllama(model=local_model)

In [11]:
# Prompt that asks the LLM to rephrase the user's question into three
# alternative versions, one per line. Its only placeholder is {question}.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate three
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

In [12]:
# Wrap the vector store's retriever in a multi-query retriever that uses the
# chat model together with QUERY_PROMPT.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

In [13]:
# Answer prompt: the model is instructed to rely solely on the supplied
# context. It exposes two placeholders, {context} and {question}.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [14]:
# Input mapping for the answer prompt: "context" comes from the retriever,
# while "question" forwards the invoke() argument unchanged.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Pipeline: fill the prompt, call the chat model, return the reply as a string.
chain = rag_inputs | prompt | llm | StrOutputParser()

In [15]:
# Pass the question directly instead of wrapping it in input(). With input()
# the string was only the stdin prompt text, so the cell blocked until someone
# retyped the question, and it cannot run unattended (Restart & Run All,
# nbconvert, CI).
question = "What is the document all about?"
chain.invoke(question)

What is the document all about? What is the document all about? ↑↓ for history. Search history with c-↑/c-↓


OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.01s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.08s/it]
OllamaEmbeddings: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.08s/it]


" The document appears to be an excerpt from a company's Annual Report or similar financial filing, specifically Item 8 (Consolidated Financial Statements) and Part IV (Exhibits). It includes various sections such as Principal Accountant Fees and Services, Certain Relationships and Related Transactions, Director Independence, Security Ownership of Certain Beneficial Owners and Management, and certain exhibits like the Amended and Restated Certificate of Incorporation. The document provides financial information about the company and details about various aspects related to its management, relationships, and financial transactions."

In [16]:
import streamlit as st

ModuleNotFoundError: No module named 'streamlit'