## Install dependencies

In [1]:
%pip install --upgrade --quiet  langchain langchain_community langchain-huggingface sentence_transformers pypdf python-dotenv faiss-cpu tiktoken


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## First tests

In [11]:
from langchain_community.document_loaders import PyPDFLoader

# Path of the source PDF, relative to the notebook's working directory.
PDF_PATH = "human-rights.pdf"

# Load the PDF's contents into `pages` for the splitting step.
loader = PyPDFLoader(PDF_PATH)
pages = loader.load()

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# No custom separators are passed, so the splitter's defaults apply; chunk
# length is measured with `len`, i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,  # treat separators as literal strings
    length_function=len,
    chunk_overlap=20,          # characters shared between neighbouring chunks
    chunk_size=100,            # upper bound on the length of one chunk
)

# Split the loaded pages into chunks, then show the container type.
texts = text_splitter.split_documents(pages)
type(texts)

list

In [13]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Embedding model configuration: BGE small (English), run on the CPU, with the
# encoder asked to normalize the vectors it returns.
model_name = "BAAI/bge-small-en"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)

hf = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [24]:
from langchain_community.vectorstores import Chroma

# Build (or refresh) the persisted vector store from the chunked documents.
#
# When no ids are supplied, every document gets a freshly generated id, so
# re-running this cell against the same persist_directory appends another copy
# of each chunk and the retriever starts returning duplicates. Stable ids make
# a re-run overwrite the existing entries instead.
#
# The running position keeps ids unique; the `source` metadata (assumed to be
# set by the PDF loader, with a fallback if it is not) keeps different files
# apart. NOTE(review): copies written by earlier runs keep their old ids, so
# delete ./chroma_db once to clear them.
chunk_ids = [
    f"{doc.metadata.get('source', 'document')}-{position}"
    for position, doc in enumerate(texts)
]

# create vectorstore
vectorstore = Chroma.from_documents(
    documents=texts,
    embedding=hf,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)

In [15]:
# Reopen the store persisted under ./chroma_db, using `hf` as its embedding
# function.
db3 = Chroma(
    embedding_function=hf,
    persist_directory="./chroma_db",
)

<langchain_community.vectorstores.chroma.Chroma at 0x7d5380e7b190>

In [32]:
# Expose the vector store as a retriever (library default search settings);
# the chain below uses it to fetch context for each question.
retriever = vectorstore.as_retriever()

In [28]:
from langchain_community.llms import HuggingFaceHub
from dotenv import load_dotenv
import os

# Copy variables from a local .env file (if present) into the environment.
load_dotenv()

# Read the API token once and hand it to the client explicitly, so a missing
# token is reported here with a pointer to where it should be configured.
huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if not huggingface_token:
    raise ValueError(
        "HUGGINGFACEHUB_API_TOKEN is not set; export it or add it to a .env file."
    )

# Hosted Zephyr model; `model_kwargs` holds the generation settings passed
# along with each request.
llm = HuggingFaceHub(
    repo_id="huggingfaceh4/zephyr-7b-alpha",
    huggingfacehub_api_token=huggingface_token,
    model_kwargs={
        "max_new_tokens":512,
        "repetition_penalty": 1.1,
        "temperature": 0.2,
        "top_p": 0.5,
        "return_full_text":False
    }
)

In [36]:
# Prompt text for the LLM. `{context}` and `{question}` are placeholders that
# are filled in when the template is formatted; blank lines are spelled out as
# explicit "\n" pieces.
template = (
    "\n"
    "User: You are an AI Assistant that follows instructions extremely well.\n"
    "Please be truthful and give direct answers. Please tell 'I don't know' if user query is not in CONTEXT\n"
    "\n"
    "Keep in mind, you will lose the job, if you answer out of CONTEXT questions\n"
    "\n"
    "\n"
    "CONTEXT: {context}\n"
    "Query: {question}\n"
    "\n"
    "Remember only return AI answer\n"
    "Assistant:\n"
)

In [37]:
# ChatPromptTemplate stays imported in case other cells rely on it.
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the list of Document objects would be interpolated into
    the prompt as its Python repr, metadata included.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# `llm` is a plain text-completion model and the template already carries its
# own "User:" / "Assistant:" markers, so use a string prompt. A chat prompt
# would be flattened to text with an extra "Human: " prefix in front of it.
prompt = PromptTemplate.from_template(template)
output_parser = StrOutputParser()

# The input question is used twice: to retrieve context and, unchanged, as the
# `{question}` value of the prompt.
chain = (
    {
        "context": retriever.with_config(run_name="Docs") | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | output_parser
)

In [38]:
# Run the whole pipeline for one question; the string is the chain's input.
article_question = "What the Article 10 says?"
answer = chain.invoke(article_question)

In [39]:
# Show the generated answer as this cell's output.
answer

'According to the given context, Article 10 states "Everyone is entitled in full equality to a fair and public hearing by an independent tribunal in the determination of his rights and obligations and of any criminal charge against him."'