<a href="https://colab.research.google.com/github/arunpandey2023/deep/blob/main/LangChain_Master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1 INSTALL ALL THE LIBRARIES

In [None]:
pip install openai python-dotenv langchain pypdf yt_dlp Chroma chromadb kaleido python-multipart tiktoken lark pydub duckduckgo-search langchain[docarray] langchain_experimental langchain-cli

#2 IMPORT ALL LIBRARIES I.E. OPENAI/LANGCHAIN/DUCKDUCKGO

In [None]:
import os, openai, sys
from langchain.document_loaders import PyPDFLoader,WebBaseLoader,TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.retrievers import ContextualCompressionRetriever
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.tools import DuckDuckGoSearchRun
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.memory import ConversationBufferMemory
from langchain_experimental.utilities import PythonREPL
import numpy as np
import datetime
import panel as pn
import param

#3 SET THE ENVIRONMENT AND OPENAI_API_KEY


In [None]:
from getpass import getpass
from dotenv import load_dotenv, find_dotenv

llm_name = "gpt-3.5-turbo"
persist_directory = '/sample_data/chroma'
sys.path.append('../..')

# Read a .env file first: load_dotenv does not overwrite variables that are
# already set, so the key must not be placed in os.environ before this call.
_ = load_dotenv(find_dotenv())

# Never hardcode the key in the notebook; prompt for it only when neither the
# process environment nor .env supplied one.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
openai.api_key = os.environ['OPENAI_API_KEY']

# Chat model shared by the retrieval cells, and the LLM-based extractor used
# by the contextual-compression retrievers.
llm = ChatOpenAI(model_name=llm_name, temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

# Embedding function and a Chroma handle on the persisted collection.
embedding = OpenAIEmbeddings()
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)

#4 LOAD THE FILES YOU WANT TO USE FOR TESTING

##4.1 Using the web-based loader to load documents

In [None]:
from langchain.document_loaders import WebBaseLoader

# List of URLs to load files from
# Google Docs share links to fetch through the web loader
urls = [
    "https://docs.google.com/document/d/10D6fWeQu87YDeKLqOxjwh9eczaLTPEOWrxR9jNBV7ho/edit?usp=sharing",
    "https://docs.google.com/document/d/1W9Gw6XwF5dVTzVEOHk-OJOJE9rrkPtC84bjt-gNvZNI/edit?usp=sharing",
    "https://docs.google.com/document/d/1X6OyNNoAXHc4X0ryhu4x-J9cj-wntCER7OxN55_FWUo/edit?usp=sharing",
]

# Load every URL in turn and gather all returned documents in one flat list
loaded_documents = []
for url in urls:
    loaded_documents += WebBaseLoader(url).load()

# Now `loaded_documents` contains the documents loaded from all URLs

In [None]:
# Five PDFs read from the Drive folder (no path is repeated in this list)
pdf_paths = [
    "/content/drive/MyDrive/c1.pdf",
    "/content/drive/MyDrive/c2.pdf",
    "/content/drive/MyDrive/c3.pdf",
    "/content/drive/MyDrive/c4.pdf",
    "/content/drive/MyDrive/c5.pdf",
]
loaders = [PyPDFLoader(path) for path in pdf_paths]

# Flatten the documents returned by each loader into a single list
docs = [page for loader in loaders for page in loader.load()]

#5 SET THE SPLITTER AND VECTORDB

In [None]:
# Cut the loaded documents into 1500-character chunks that overlap by 150
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
splits = text_splitter.split_documents(docs)

# Embed the chunks and store them in the Chroma collection under persist_directory
vectordb = Chroma.from_documents(
    splits,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [None]:
#vectordb.delete_collection()

#6 CHECK THE DOCS AND VECTORDB

In [None]:
# Number of loaded documents, then number of entries held by the Chroma collection
loaded_doc_count = len(docs)
stored_chunk_count = vectordb._collection.count()
print(loaded_doc_count)
print(stored_chunk_count)

10
11


#7 START Q&A USING SIMILARITY SEARCH

In [None]:
question = "what is status of the application?"
top_k = 3  # number of documents to return
# NOTE(review): this rebinds `docs`, which until now held the loaded PDF documents
docs = vectordb.similarity_search(question, k=top_k)

In [None]:
# Display the text of the element at index 2 of `docs`
docs[2].page_content

#8 FIND THE DOCUMENTS USING COMPRESSION SEARCH

In [None]:
def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Documents are numbered from 1 and separated by a rule of 100 dashes.
    Each item in ``docs`` must expose a ``page_content`` string.
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for position, document in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + document.page_content)
    print(separator.join(sections))

In [None]:
# Wrap the default vector-store retriever so each hit is passed through the LLM extractor
base_retriever = vectordb.as_retriever()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=base_retriever
)

In [None]:
# Retrieve through the compression retriever and print what it returns
question = "what are the names of the counsels?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

#9 FIND THE DOCUMENTS USING MMR (MAXIMAL MARGINAL RELEVANCE) SEARCH

In [None]:
# Same compression wrapper, but the underlying retriever now uses search_type "mmr"
mmr_retriever = vectordb.as_retriever(search_type="mmr")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=mmr_retriever
)

#10 RETRIEVE CONTENT USING RETRIEVAL QA

In [None]:
# Print the compressed hits for this question
question = "what were the sections applied?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

# Build a RetrievalQA chain over the plain (uncompressed) vector-store retriever
qa_retriever = vectordb.as_retriever()
qa_chain = RetrievalQA.from_chain_type(llm, retriever=qa_retriever)



Document 1:

the requirement of compulsory packaging of cement and
fertilizer in jute bags has been done away with


In [None]:
# Ask the RetrievalQA chain; it performs its own retrieval, so no separate
# similarity_search call is needed (the previous one was never used and
# rebound the global `docs`).
question = "find all the judges mentioned?"
result = qa_chain({"query": question})
result["result"]

"The judges mentioned in the given context are:\n1. HON'BLE MR. JUSTICE MANMOHAN\n2. HON'BLE MR. JUSTICE SAURABH BANERJEE"

#11 CHATBOT WHICH CAN WORK ON SELECTIVE DOCUMENTS

In [None]:
# This will initialize your database and retriever chain
def load_db(file, chain_type, k):
    """Build a conversational retrieval chain over a single PDF.

    Args:
        file: Path of the PDF handed to ``PyPDFLoader``.
        chain_type: Forwarded as ``chain_type`` to
            ``ConversationalRetrievalChain.from_llm``.
        k: Forwarded to the retriever as ``search_kwargs={"k": k}``.

    Returns:
        The chain, configured to also return its source documents and the
        generated question. Chat history is supplied by the caller on each call.
    """
    # Load the PDF and split it into 1000-character chunks overlapping by 150
    pages = PyPDFLoader(file).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    chunks = splitter.split_documents(pages)

    # Embed the chunks into an in-memory store and expose it as a similarity retriever
    store = DocArrayInMemorySearch.from_documents(chunks, OpenAIEmbeddings())
    retriever = store.as_retriever(search_type="similarity", search_kwargs={"k": k})

    chat_model = ChatOpenAI(model_name=llm_name, temperature=0)
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
        return_generated_question=True,
    )

In [None]:
class cbfs(param.Parameterized):
    """State and callbacks for the Panel chatbot dashboard.

    Holds the retrieval chain built by ``load_db`` together with the chat
    history and the outcome of the most recent retrieval. The methods return
    Panel objects that the dashboard displays.
    """

    # (question, answer) tuples passed back to the chain on every turn
    chat_history = param.List([])
    # Answer text of the most recent turn
    answer = param.String("")
    # "generated_question" returned by the chain for the most recent turn
    db_query  = param.String("")
    # "source_documents" returned by the chain for the most recent turn
    db_response = param.List([])

    def __init__(self,  **params):
        """Create the panel list and build the chain for the default PDF."""
        super().__init__(**params)
        self.panels = []
        self.loaded_file = "/content/drive/MyDrive/0_GENAIDATA/Introduction.pdf"
        self.qa = load_db(self.loaded_file, "stuff", 4)

    def call_load_db(self, count):
        """Rebuild the chain from the uploaded PDF and report the loaded file.

        Args:
            count: Click count of the load button; 0 means nothing was clicked yet.

        Returns:
            A Markdown pane naming the currently loaded file.
        """
        if count == 0 or file_input.value is None:  # init or no file specified
            return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

        file_input.save("temp.pdf")  # local copy
        self.loaded_file = file_input.filename
        button_load.button_style = "outline"
        try:
            self.qa = load_db("temp.pdf", "stuff", 4)
        finally:
            # Restore the button even when loading fails, so it is not left
            # stuck in the "outline" state.
            button_load.button_style = "solid"
        self.clr_history()
        return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

    def convchain(self, query):
        """Run one chat turn and return the conversation so far as a WidgetBox.

        Args:
            query: Text entered by the user; a falsy value yields an empty row
                and does not call the chain.
        """
        if not query:
            return pn.WidgetBox(pn.Row('User:', pn.pane.Markdown("", width=600)), scroll=True)
        result = self.qa({"question": query, "chat_history": self.chat_history})
        self.chat_history.append((query, result["answer"]))
        self.db_query = result["generated_question"]
        self.db_response = result["source_documents"]
        self.answer = result['answer']
        # `styles` (not `style`) matches the keyword used by every other pane in this class.
        self.panels.extend([
            pn.Row('User:', pn.pane.Markdown(query, width=600)),
            pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, styles={'background-color': '#F6F6F6'}))
        ])
        inp.value = ''  # clears loading indicator when cleared
        return pn.WidgetBox(*self.panels, scroll=True)

    @param.depends('db_query')
    def get_lquest(self):
        """Return a column showing the last generated question, or a placeholder."""
        if not self.db_query:
            return pn.Column(
                pn.Row(pn.pane.Markdown("Last question to DB:", styles={'background-color': '#F6F6F6'})),
                pn.Row(pn.pane.Str("no DB accesses so far"))
            )
        return pn.Column(
            pn.Row(pn.pane.Markdown("DB query:", styles={'background-color': '#F6F6F6'})),
            pn.pane.Str(self.db_query)
        )

    @param.depends('db_response')
    def get_sources(self):
        """Return a WidgetBox listing the last source documents, or None if there are none."""
        if not self.db_response:
            return
        rlist = [pn.Row(pn.pane.Markdown("Result of DB lookup:", styles={'background-color': '#F6F6F6'}))]
        for doc in self.db_response:
            rlist.append(pn.Row(pn.pane.Str(doc)))
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    @param.depends('convchain', 'clr_history')
    def get_chats(self):
        """Return a WidgetBox listing every stored exchange, or a placeholder if empty."""
        if not self.chat_history:
            return pn.WidgetBox(pn.Row(pn.pane.Str("No History Yet")), width=600, scroll=True)
        rlist = [pn.Row(pn.pane.Markdown("Current Chat History variable", styles={'background-color': '#F6F6F6'}))]
        for exchange in self.chat_history:
            rlist.append(pn.Row(pn.pane.Str(exchange)))
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    def clr_history(self, count=0):
        """Reset the chat history; ``count`` is accepted but not used."""
        self.chat_history = []

In [None]:
# Build the chatbot state object; its __init__ calls load_db for the default PDF.
cb = cbfs()

# Widgets. The cbfs methods refer to file_input, button_load and inp by these global names.
file_input = pn.widgets.FileInput(accept='.pdf')
button_load = pn.widgets.Button(name="Load DB", button_type='primary')
button_clearhistory = pn.widgets.Button(name="Clear History", button_type='warning')
# on_click passes its argument positionally; clr_history receives it as `count`.
button_clearhistory.on_click(cb.clr_history)
inp = pn.widgets.TextInput( placeholder='Enter text here…')

# Bind the callbacks to the load button's click count and to the text input.
bound_button_load = pn.bind(cb.call_load_db, button_load.param.clicks)
conversation = pn.bind(cb.convchain, inp)

# NOTE(review): presumably './img/convchain.jpg' must exist relative to the working directory - confirm.
jpg_pane = pn.pane.Image( './img/convchain.jpg')

# Tab 1: question input and the running conversation.
tab1 = pn.Column(
    pn.Row(inp),
    pn.layout.Divider(),
    pn.panel(conversation,  loading_indicator=True, height=300),
    pn.layout.Divider(),
)
# Tab 2: last generated question and the source documents of the last answer.
tab2= pn.Column(
    pn.panel(cb.get_lquest),
    pn.layout.Divider(),
    pn.panel(cb.get_sources ),
)
# Tab 3: stored chat history.
tab3= pn.Column(
    pn.panel(cb.get_chats),
    pn.layout.Divider(),
)
# Tab 4: file upload, load button, clear-history button and the image.
tab4=pn.Column(
    pn.Row( file_input, button_load, bound_button_load),
    pn.Row( button_clearhistory, pn.pane.Markdown("Clears chat history. Can use to start a new topic" )),
    pn.layout.Divider(),
    pn.Row(jpg_pane.clone(width=400))
)
# Assemble the title and the four tabs.
dashboard = pn.Column(
    pn.Row(pn.pane.Markdown('# ChatWithYourData_Bot')),
    pn.Tabs(('Conversation', tab1), ('Database', tab2), ('Chat History', tab3),('Configure', tab4))
)
pn.extension()
# Last expression of the cell, so the dashboard is shown as the cell output.
dashboard

#12 USING AGENTS

In [None]:
# Neither of these names is imported anywhere else in the notebook, so the
# cell failed with NameError on a fresh kernel.
import pandas as pd
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent

# Load the CSV and build an agent over the resulting dataframe.
df = pd.read_csv("/content/sample_data/california_housing_test.csv")
agent = create_pandas_dataframe_agent(OpenAI(temperature=0), df, verbose=True)

In [None]:
# Ask the dataframe agent a question in natural language
agent.run("how many rows are there?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find out how many rows are in the dataframe
Action: python_repl_ast
Action Input: len(df)[0m
Observation: [36;1m[1;3m3000[0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: There are 3000 rows in the dataframe.[0m

[1m> Finished chain.[0m


'There are 3000 rows in the dataframe.'

In [None]:
# Display the dataframe's shape directly, for comparison with the agent's answer
df.shape

(3000, 9)

#13 GENERATE PYTHON CODE USING AN LLM CHAIN AND EXECUTE THAT CODE

In [None]:
# System instruction: reply with nothing but a fenced python block
template = """Write some python code to solve the user's problem.

Return only python code in Markdown format, e.g.:

```python
....
```"""

# The user's request fills the {input} slot of the human message
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", template),
        ("human", "{input}"),
    ]
)

model = ChatOpenAI()

def _sanitize_output(text: str):
    """Extract the code from the first Markdown code fence in ``text``.

    Args:
        text: Model reply, expected to contain a ```python fenced block.

    Returns:
        The text between the first ```python fence and the next ``` marker.
        If there is no ```python fence, the first bare ``` fence is used
        instead; if there is no fence at all, ``text`` is returned unchanged.
    """
    # partition never raises, unlike tuple-unpacking split(), which failed
    # when the reply held zero or more than one ```python fence.
    _, fence, after = text.partition("```python")
    if not fence:
        _, fence, after = text.partition("```")
        if not fence:
            return text
    return after.split("```")[0]

# WARNING: the final step executes whatever Python the model wrote, in this
# process. Only run this with prompts and in environments you trust.
python_repl = PythonREPL()
chain = prompt | model | StrOutputParser() | _sanitize_output | python_repl.run

request = {"input": "generate a python script to create a pandas dataframe"}
result = chain.invoke(request)
print(result)

     Name  Age      City
0    John   25  New York
1    Emma   28    London
2    Mike   22     Paris
3  Sophia   30     Tokyo



#14 USE DUCKDUCKGO TO GENERATE GENERIC SEARCHES

In [None]:
search = DuckDuckGoSearchRun()

# Prompt asking the model to turn the user's input into a search query
template = """Turn the following user input into a search query for a search engine:

{input}"""
prompt = ChatPromptTemplate.from_template(template)

model = ChatOpenAI()

# prompt -> model -> plain string -> DuckDuckGo search tool
chain = prompt | model | StrOutputParser() | search

search_request = {"input": "who is the prime minister of united kingdom?"}
search_result = chain.invoke(search_request)
print(search_result)

Deputy Prime Minister of United Kingdom Oliver Dowden addresses the 78th session of the United Nations General Assembly, Friday, Sept. 22, 2023, at United Nations headquarters. The prime minister of United Kingdom is the head of the government in the United Kingdom.12 May 1980 born Rishi Sunak held two Cabinet positions under Prime Minister Boris Johnson and has been a Member of Parliament (MP) for Richmond (Yorks) since 2015.Liz Truss left the office after the shortest-serving period of 49 days in history she just ...


#15 INPUT/OUTPUT IN DIFFERENT FORMATS

In [None]:
from langchain.schema.messages import HumanMessage, SystemMessage
chat = ChatOpenAI()

# A system message followed by the user's question
system_message = SystemMessage(content="you are the M S Dhoni")
human_message = HumanMessage(content="which brands do you endorse")
messages = [system_message, human_message]

response = chat.invoke(messages)
print(response)

content='As an AI, I do not have personal endorsements. However, Mahendra Singh Dhoni has been associated with several brands over the years, including Pepsi, Reebok, Sony, Gulf Oil, GoDaddy, and many more. Please note that endorsement deals may change over time.'


In [None]:
from typing import List
from langchain.llms import OpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain.pydantic_v1 import BaseModel, Field, validator

# Initialize the language model.
# "text-davinci-003" has been retired by OpenAI; "gpt-3.5-turbo-instruct" is
# the completion-style replacement.
model = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0.0)

# Define your desired data structure using Pydantic.
# Documentation is kept in comments rather than a class docstring so that the
# schema produced for this model is unaffected.
class Joke(BaseModel):
    setup: str = Field(description="question to set up a joke")
    punchline: str = Field(description="answer to resolve the joke")

    @validator("setup")
    def question_ends_with_question_mark(cls, field):
        # endswith() also handles an empty string, where field[-1] raised
        # IndexError instead of the intended ValueError.
        if not field.endswith("?"):
            raise ValueError("Badly formed question!")
        return field

# Parser that converts the model's reply into a Joke instance
parser = PydanticOutputParser(pydantic_object=Joke)
joke_format_instructions = parser.get_format_instructions()

# The format instructions are pre-filled; only {query} is supplied per call
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": joke_format_instructions},
)

query = "Tell me a joke."

# Run prompt -> model, then parse the raw reply as a separate step
prompt_and_model = prompt | model
output = prompt_and_model.invoke({"query": query})
parsed_result = parser.invoke(output)

# The result is a structured object
print(parsed_result)

setup='Why did the chicken cross the road?' punchline='To get to the other side!'


In [None]:
from langchain.output_parsers.json import SimpleJsonOutputParser

# Prompt requesting a JSON object with two specific keys
json_prompt = PromptTemplate.from_template(
    "Return a JSON object with `birthdate` and `birthplace` key that answers the following question: {question}"
)
json_parser = SimpleJsonOutputParser()

# prompt -> model -> JSON parser
json_chain = json_prompt | model | json_parser

# Collect every item yielded by the stream into a list
json_question = {"question": "When and where was Elon Musk born?"}
result_list = [chunk for chunk in json_chain.stream(json_question)]

print(result_list)

[{}, {'birthdate': ''}, {'birthdate': 'June'}, {'birthdate': 'June 28'}, {'birthdate': 'June 28,'}, {'birthdate': 'June 28, 1971'}, {'birthdate': 'June 28, 1971', 'birthplace': ''}, {'birthdate': 'June 28, 1971', 'birthplace': 'P'}, {'birthdate': 'June 28, 1971', 'birthplace': 'Pret'}, {'birthdate': 'June 28, 1971', 'birthplace': 'Pretoria'}, {'birthdate': 'June 28, 1971', 'birthplace': 'Pretoria,'}, {'birthdate': 'June 28, 1971', 'birthplace': 'Pretoria, South'}, {'birthdate': 'June 28, 1971', 'birthplace': 'Pretoria, South Africa'}]


In [None]:
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI

# Parser for a comma-separated reply, and the instructions it wants in the prompt
output_parser = CommaSeparatedListOutputParser()
format_instructions = output_parser.get_format_instructions()

# Only {subject} is filled in at call time
prompt = PromptTemplate(
    template="List five {subject}.\n{format_instructions}",
    input_variables=["subject"],
    partial_variables={"format_instructions": format_instructions},
)

query = "Indian Cricket Team"

# Format the prompt, call the model directly, then parse the reply
formatted_prompt = prompt.format(subject=query)
output = model(formatted_prompt)
parsed_result = output_parser.parse(output)

print(parsed_result)

['Virat Kohli', 'Rohit Sharma', 'Shikhar Dhawan', 'Jasprit Bumrah', 'Ravindra Jadeja']


In [None]:
from langchain.prompts import PromptTemplate
from langchain.output_parsers import DatetimeOutputParser
from langchain.chains import LLMChain
from langchain.llms import OpenAI

# Parser that converts the model's reply into a datetime
output_parser = DatetimeOutputParser()

# Prompt text: the question followed by the parser's format instructions
template = """
Answer the user's question:
{question}
{format_instructions}
"""

datetime_format_hint = output_parser.get_format_instructions()
prompt = PromptTemplate.from_template(
    template,
    partial_variables={"format_instructions": datetime_format_hint},
)

# Chain the prompt with a default OpenAI completion model
chain = LLMChain(prompt=prompt, llm=OpenAI())

query = "when did Neil Armstrong land on the moon in terms of GMT?"

# Run the chain, then parse its text output
output = chain.run(query)
parsed_result = output_parser.parse(output)

print(parsed_result)

1969-07-20 20:17:40


#16 RETRIEVAL IN LANGCHAIN

In [None]:
from langchain.document_loaders import TextLoader

# Load a plain-text file with TextLoader
text_path = "/content/drive/MyDrive/1_Gen_AI.txt"
loader = TextLoader(text_path)
document = loader.load()

In [None]:
from langchain.document_loaders.csv_loader import CSVLoader

# Load the CSV file with CSVLoader
csv_path = '/content/sample_data/california_housing_test.csv'
loader = CSVLoader(file_path=csv_path)
documents = loader.load()


In [None]:
'''
loader = CSVLoader(file_path='./example_data/mlb_teams_2012.csv', csv_args={
    'delimiter': ',',
    'quotechar': '"',
    'fieldnames': ['MLB Team', 'Payroll in millions', 'Wins']
})
documents = loader.load()
'''

'\nloader = CSVLoader(file_path=\'./example_data/mlb_teams_2012.csv\', csv_args={\n    \'delimiter\': \',\',\n    \'quotechar\': \'"\',\n    \'fieldnames\': [\'MLB Team\', \'Payroll in millions\', \'Wins\']\n})\ndocuments = loader.load()\n'

In [None]:
from langchain.document_loaders import PyPDFLoader
# Load the PDF with PyPDFLoader and split it in the same call
intro_pdf_path = "/content/drive/MyDrive/0_GENAIDATA/Introduction Yottaasys 15_October.pdf"
loader = PyPDFLoader(intro_pdf_path)
pages = loader.load_and_split()

In [None]:
'''from langchain.document_loaders import MathpixPDFLoader
loader = MathpixPDFLoader("/content/drive/MyDrive/0_GENAIDATA/Introduction Yottaasys 15_October.pdf")
data = loader.load()
'''

'from langchain.document_loaders import MathpixPDFLoader\nloader = MathpixPDFLoader("/content/drive/MyDrive/0_GENAIDATA/Introduction Yottaasys 15_October.pdf")\ndata = loader.load()\n'

In [None]:
pip install pymupdf

In [None]:
from langchain.document_loaders import PyMuPDFLoader
# Load the same PDF with PyMuPDFLoader
intro_pdf_path = "/content/drive/MyDrive/0_GENAIDATA/Introduction Yottaasys 15_October.pdf"
loader = PyMuPDFLoader(intro_pdf_path)
data = loader.load()
# Second load, this time passing an extra argument for PyMuPDF's get_text() call;
# it replaces the result of the first load
data = loader.load(option="text")



In [None]:
pip install pdfminer.six

In [None]:
from langchain.document_loaders import PDFMinerLoader

# Load the same PDF with PDFMinerLoader
intro_pdf_path = "/content/drive/MyDrive/0_GENAIDATA/Introduction Yottaasys 15_October.pdf"
loader = PDFMinerLoader(intro_pdf_path)
data = loader.load()

In [None]:
pip install amazon-textract-caller

In [None]:
from langchain.document_loaders import AmazonTextractPDFLoader

# Load the same PDF with AmazonTextractPDFLoader.
# Requires an AWS account and configuration.
intro_pdf_path = "/content/drive/MyDrive/0_GENAIDATA/Introduction Yottaasys 15_October.pdf"
loader = AmazonTextractPDFLoader(intro_pdf_path)
documents = loader.load()