In [1]:
import os
from dotenv import load_dotenv


In [2]:
load_dotenv(dotenv_path='../.env')

True

### Loading text file

In [3]:
from langchain_community.document_loaders import TextLoader
filename = "data/game-of-thrones.txt"

loader = TextLoader(filename, encoding="utf-8")
docs = loader.load() # returns a list of Document objects representing the pages of the document

In [4]:
from langchain_community.document_loaders import PyPDFLoader
filename = "data/a-game-of-thrones.pdf"
loader = PyPDFLoader(filename)
docs = loader.load() # returns a list of Document objects representing the pages of the document
display(docs[0]) # prints the text of the first page

Document(metadata={'source': 'data/a-game-of-thrones.pdf', 'page': 0}, page_content='')

### Splitting into chunks

In [None]:
from langchain_text_splitters.character import CharacterTextSplitter

chunk_size = 500
chunk_overlap = 30
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap)

splits = text_splitter.split_documents(docs)
# display(splits)

### Embedding

In [6]:
import requests
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
import numpy as np
# model_id = "sentence-transformers/all-MiniLM-L6-v2"
hf_token = os.environ["API_TOKEN"]

# api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
# headers = {"Authorization": f"Bearer {hf_token}"}  # API token

# def get_embedding_vectors(splits):
#     response = requests.post(api_url, headers=headers, json={"inputs": splits, "options":{"wait_for_model":True}})
#     return response.json()
# get_embedding_vectors(splits)

embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=hf_token,
    model_name="sentence-transformers/all-MiniLM-l6-v2"
)

# vectors = np.array(embeddings.embed_documents([document.page_content for document in splits]))

### Cosine Similarity

In [7]:
#cosine method give the distance between the two vectors and 1-cosine then gives the similarity. Higher the number, more similar the two vectors are.

from scipy.spatial.distance import cosine

sentence1 = "I love coding with Langchain"
sentence2 = "LangChain is great for coding"
sentence3 = "Bananan contains a lot of potassium"

e1 = embeddings.embed_query(sentence1)
e2 = embeddings.embed_query(sentence2)
e3 = embeddings.embed_query(sentence3)

similarity1_2 = 1 - cosine(e1, e2)
similarity1_3 = 1 - cosine(e1, e3)
similarity2_3 = 1 - cosine(e2, e3)

display(f'Similarity between "{sentence1}" and "{sentence2}": {similarity1_2}')
display(f'Similarity between "{sentence1}" and "{sentence3}": {similarity1_3}')
display(f'Similarity between "{sentence2}" and "{sentence3}": {similarity2_3}')

'Similarity between "I love coding with Langchain" and "LangChain is great for coding": 0.9273254494245174'

'Similarity between "I love coding with Langchain" and "Bananan contains a lot of potassium": 0.08279235239724936'

'Similarity between "LangChain is great for coding" and "Bananan contains a lot of potassium": 0.09143600270105912'

### Vector stores

In [8]:
from langchain_chroma import Chroma

# load it into vector database
db = Chroma.from_documents(splits, embeddings)

# query it
query = "Who are the five kings in the war of five kings?"
docs = db.similarity_search(query, k=5)

# print results
print(docs[0].page_content)


King’s Landing,
—JALABAR XHO, an exile prince from the Summer Isles,—MOON BOY, a jester and fool,—LANCEL and TYREK LANNISTER, squires to the king, the queen’s 
cousins,
—SER ARON SANTAGAR, master-at-arms,
—his Kingsguard:
—SER BARRISTAN SELMY, Lord Commander,—SER JAIME LANNISTER, called the Kingslayer,—SER BOROS BLOUNT,—SER MERYN TRANT,—SER ARYS OAKHEART,


In [9]:
import faiss
from faiss import write_index, read_index
from langchain_community.vectorstores import FAISS

db = FAISS.from_documents(splits, embeddings)
db.save_local("./db") # save it to disk


### Retrieval with semantic search

In [10]:
query = "Who are the five kings in the war of five kings?"
db = FAISS.load_local("./db", embeddings=embeddings, allow_dangerous_deserialization=True) # load the index from disk. allow_dangerous_deserialization=True is required to load the index from disk
retriever = db.as_retriever()
#retriever = db.as_retriever(search_type="similarity_score_threshold",
#                           search_kwargs={"score_threshold": 0.5})

retreived_docs = retriever.invoke(query, k=5)
print(retreived_docs[0].page_content)

King’s Landing,
—JALABAR XHO, an exile prince from the Summer Isles,—MOON BOY, a jester and fool,—LANCEL and TYREK LANNISTER, squires to the king, the queen’s 
cousins,
—SER ARON SANTAGAR, master-at-arms,
—his Kingsguard:
—SER BARRISTAN SELMY, Lord Commander,—SER JAIME LANNISTER, called the Kingslayer,—SER BOROS BLOUNT,—SER MERYN TRANT,—SER ARYS OAKHEART,


### LLM simple LLM chain

In [11]:
from langchain_huggingface import HuggingFaceEndpoint
import os
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_length=128,
    temperature=0.5,
    huggingfacehub_api_token=os.environ["API_TOKEN"],
)

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.
  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\wotcw4\.cache\huggingface\token
Login successful


In [12]:
from langchain.chains import RetrievalQA

retrievalQA = RetrievalQA.from_llm(llm=llm, retriever=retriever)
response = retrievalQA.invoke(query)
print(response)

{'query': 'Who are the five kings in the war of five kings?', 'result': " The five kings in the War of the Five Kings are Balon Greyjoy of the Iron Islands, Renly Baratheon, Stannis Baratheon, Robb Stark of Winterfell, and Joffrey Baratheon of King's Landing.\n\nExplanation:\nThe context does not directly mention all five kings by name, but it does mention Balon Greyjoy, Renly Baratheon, Stannis Baratheon, and Robb Stark. Joffrey Baratheon is not mentioned in this context, but he is one of the five kings in the War of the Five Kings, so the helpful answer includes all five names.\n\nThe context does mention that Aegon Dragonlord had a smaller army than the Two Kings, which implies that there are two other kings in addition to the four mentioned. However, the context does not provide enough information to identify those two kings definitively. Therefore, the helpful answer only lists the four kings that are named in the context.\n\nAdditionally, it's important to note that the War of th

In [13]:
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [
                f"Document {i+1}:\n\n{d.page_content}\nMetadata: {d.metadata}"
                for i, d in enumerate(docs)
            ]
        )
    )

pretty_print_docs(retreived_docs)

Document 1:

King’s Landing,
—JALABAR XHO, an exile prince from the Summer Isles,—MOON BOY, a jester and fool,—LANCEL and TYREK LANNISTER, squires to the king, the queen’s 
cousins,
—SER ARON SANTAGAR, master-at-arms,
—his Kingsguard:
—SER BARRISTAN SELMY, Lord Commander,—SER JAIME LANNISTER, called the Kingslayer,—SER BOROS BLOUNT,—SER MERYN TRANT,—SER ARYS OAKHEART,
Metadata: {'source': 'data/a-game-of-thrones.pdf', 'page': 732}
----------------------------------------------------------------------------------------------------
Document 2:

when he joined with King Mern of the Reach to oppose the Targaryen conquest. That was close on three hundred years ago, when the Seven Kingdoms were kingdoms, and not mere provinces of a greater realm. Between them, the Two Kings had six hundred banners flying, five thousand mounted knights, and ten times as many freeriders and men-at-arms. Aegon Dragonlord had perhaps a fifth that number, the chroniclers said, and most of those were conscripts fr

In [14]:
import pprint
pprint.pprint(response)

{'query': 'Who are the five kings in the war of five kings?',
 'result': ' The five kings in the War of the Five Kings are Balon Greyjoy of '
           'the Iron Islands, Renly Baratheon, Stannis Baratheon, Robb Stark '
           "of Winterfell, and Joffrey Baratheon of King's Landing.\n"
           '\n'
           'Explanation:\n'
           'The context does not directly mention all five kings by name, but '
           'it does mention Balon Greyjoy, Renly Baratheon, Stannis Baratheon, '
           'and Robb Stark. Joffrey Baratheon is not mentioned in this '
           'context, but he is one of the five kings in the War of the Five '
           'Kings, so the helpful answer includes all five names.\n'
           '\n'
           'The context does mention that Aegon Dragonlord had a smaller army '
           'than the Two Kings, which implies that there are two other kings '
           'in addition to the four mentioned. However, the context does not '
           'provide enough in

In [15]:
def rag_qna(query, retriever, llm):
    retrievalQA = RetrievalQA.from_llm(llm=llm, retriever=retriever)
    response = retrievalQA.invoke(query)
    pprint.pprint(response)

In [16]:
input_query = "who were the hand of the king for Robert baratheon?"
rag_qna(input_query, retriever, llm)

{'query': 'who were the hand of the king for Robert baratheon?',
 'result': ' Eddard Stark was the Hand of the King for Robert Baratheon during '
           'the events described in the context. The Hand of the King is the '
           'chief advisor and the second most powerful position in the '
           'kingdom, next to the king himself.'}


In [17]:
input_query = "List out the names of all the dragons"
rag_qna(input_query, retriever, llm)

{'query': 'List out the names of all the dragons',
 'result': ' Balerion, Meraxes, and Vhaghar.\n'
           '\n'
           'Explanation:\n'
           'The context mentions the names of three dragons: Balerion, '
           'Meraxes, and Vhaghar. These are the names of the dragons that '
           'Aegon Targaryen and his sisters had during their conquest of the '
           'Seven Kingdoms.'}


In [18]:
input_query = "what are the names of Daenerys Targaryen's dragons?"
rag_qna(input_query, retriever, llm)

{'query': "what are the names of Daenerys Targaryen's dragons?",
 'result': " Daenerys' dragons are named Drogon, Rhaegal, and Viserion.\n"
           '\n'
           'Explanation:\n'
           'The context mentions that Daenerys is married to Khal Drogo, a '
           'Dothraki warrior, and that she has three dragons: Drogon, Rhaegal, '
           'and Viserion. The names of her dragons are derived from her '
           "husband's name and the names of her deceased brothers, Viserys and "
           'Rhaegar Targaryen.'}


In [19]:
input_query = "who is the warden of winterfell?"
rag_qna(input_query, retriever, llm)

{'query': 'who is the warden of winterfell?',
 'result': ' Robb Stark is the warden of Winterfell in this context.'}


In [22]:
# retrievalQA.stream("What is the name of the dragon that Jon Snow rides?")
retrievalQA.invoke("What is the name of the dragon that Jon Snow rides?")

{'query': 'What is the name of the dragon that Jon Snow rides?',
 'result': " Ghost is not a dragon, it is Jon Snow's direwolf."}