In [4]:
import os
from dotenv import load_dotenv


In [5]:
load_dotenv(dotenv_path='../.env')

True

### Loading text file

In [26]:
from langchain_community.document_loaders import TextLoader
filename = "data/game-of-thrones.txt"

loader = TextLoader(filename, encoding="utf-8")
docs = loader.load() # returns a list of Document objects representing the pages of the document

In [28]:
from langchain_community.document_loaders import PyPDFLoader
filename = "data/a-game-of-thrones.pdf"
loader = PyPDFLoader(filename)
docs = loader.load() # returns a list of Document objects representing the pages of the document
display(docs[0]) # prints the text of the first page

Document(metadata={'source': 'data/a-game-of-thrones.pdf', 'page': 0}, page_content='')

### Splitting into chunks

In [29]:
from langchain_text_splitters.character import CharacterTextSplitter

chunk_size = 500
chunk_overlap = 30
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap)

splits = text_splitter.split_documents(docs)
display(splits)

Created a chunk of size 629, which is longer than the specified 500
Created a chunk of size 541, which is longer than the specified 500
Created a chunk of size 635, which is longer than the specified 500
Created a chunk of size 524, which is longer than the specified 500
Created a chunk of size 538, which is longer than the specified 500
Created a chunk of size 617, which is longer than the specified 500
Created a chunk of size 667, which is longer than the specified 500
Created a chunk of size 981, which is longer than the specified 500
Created a chunk of size 661, which is longer than the specified 500
Created a chunk of size 550, which is longer than the specified 500
Created a chunk of size 589, which is longer than the specified 500
Created a chunk of size 574, which is longer than the specified 500
Created a chunk of size 576, which is longer than the specified 500
Created a chunk of size 608, which is longer than the specified 500
Created a chunk of size 1037, which is longer th

[Document(metadata={'source': 'data/a-game-of-thrones.pdf', 'page': 1}, page_content='A GAME OF THRONES\nBook One of A Song of Ice and Fire\nBy George R.R. Martin\nContents\nMaps\nThe North\nThe South\n●Prologue\n●Chapter 1\n●Chapter 2\n●Chapter 3\n●Chapter 4\n●Chapter 5\n●Chapter 6\n●Chapter 7\n●Chapter 8\n●Chapter 9\n●Chapter 10\n●Chapter 11\n●Chapter 12'),
 Document(metadata={'source': 'data/a-game-of-thrones.pdf', 'page': 2}, page_content='●Chapter 13\n●Chapter 14\n●Chapter 15\n●Chapter 16\n●Chapter 17\n●Chapter 18\n●Chapter 19\n●Chapter 20\n●Chapter 21\n●Chapter 22\n●Chapter 23\n●Chapter 24\n●Chapter 25\n●Chapter 26\n●Chapter 27\n●Chapter 28\n●Chapter 29\n●Chapter 30\n●Chapter 31\n●Chapter 32\n●Chapter 33\n●Chapter 34\n●Chapter 35\n●Chapter 36\n●Chapter 37\n●Chapter 38\n●Chapter 39\n●Chapter 40\n●Chapter 41\n●Chapter 42\n●Chapter 43\n●Chapter 44\n●Chapter 45\n●Chapter 46\n●Chapter 47\n●Chapter 48\n●Chapter 49\n●Chapter 50●Chapter 51\n●Chapter 52\n●Chapter 53'),
 Document(metadata=

### Embedding

In [30]:
import requests
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
import numpy as np
# model_id = "sentence-transformers/all-MiniLM-L6-v2"
hf_token = os.environ["API_TOKEN"]

# api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
# headers = {"Authorization": f"Bearer {hf_token}"}  # API token

# def get_embedding_vectors(splits):
#     response = requests.post(api_url, headers=headers, json={"inputs": splits, "options":{"wait_for_model":True}})
#     return response.json()
# get_embedding_vectors(splits)

embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=hf_token,
    model_name="sentence-transformers/all-MiniLM-l6-v2"
)

# vectors = np.array(embeddings.embed_documents([document.page_content for document in splits]))

### Cosine Similarity

In [9]:
#cosine method give the distance between the two vectors and 1-cosine then gives the similarity. Higher the number, more similar the two vectors are.

from scipy.spatial.distance import cosine

sentence1 = "I love coding with Langchain"
sentence2 = "LangChain is great for coding"
sentence3 = "Bananan contains a lot of potassium"

e1 = embeddings.embed_query(sentence1)
e2 = embeddings.embed_query(sentence2)
e3 = embeddings.embed_query(sentence3)

similarity1_2 = 1 - cosine(e1, e2)
similarity1_3 = 1 - cosine(e1, e3)
similarity2_3 = 1 - cosine(e2, e3)

display(f'Similarity between "{sentence1}" and "{sentence2}": {similarity1_2}')
display(f'Similarity between "{sentence1}" and "{sentence3}": {similarity1_3}')
display(f'Similarity between "{sentence2}" and "{sentence3}": {similarity2_3}')

'Similarity between "I love coding with Langchain" and "LangChain is great for coding": 0.9273254494245174'

'Similarity between "I love coding with Langchain" and "Bananan contains a lot of potassium": 0.08279235239724936'

'Similarity between "LangChain is great for coding" and "Bananan contains a lot of potassium": 0.09143600270105912'

### Vector stores

In [10]:
from langchain_chroma import Chroma

# load it into vector database
db = Chroma.from_documents(splits, embeddings)

# query it
query = "Who are the five kings in the war of five kings?"
docs = db.similarity_search(query, k=5)

# print results
print(docs[0].page_content)


#### **Season 2: The War of the Five Kings**
Robert Baratheon’s death ignites a civil war as several claimants vie for the Iron Throne. Robb Stark, declared King in the North, wages war against the Lannisters. Stannis Baratheon, Robert’s brother, claims the throne, as does his brother Renly. Meanwhile, Balon Greyjoy declares himself King of the Iron Islands. The war leads to brutal battles and shifting alliances.
#### **Season 3: The Red Wedding and Beyond**


In [31]:
import faiss
from faiss import write_index, read_index
from langchain_community.vectorstores import FAISS

db = FAISS.from_documents(splits, embeddings)
db.save_local("./db") # save it to disk


### Retrieval with semantic search

In [32]:
query = "Who are the five kings in the war of five kings?"
db = FAISS.load_local("./db", embeddings=embeddings, allow_dangerous_deserialization=True) # load the index from disk. allow_dangerous_deserialization=True is required to load the index from disk
retriever = db.as_retriever()
#retriever = db.as_retriever(search_type="similarity_score_threshold",
#                           search_kwargs={"score_threshold": 0.5})

retreived_docs = retriever.invoke(query, k=5)
print(retreived_docs[0].page_content)

King’s Landing,
—JALABAR XHO, an exile prince from the Summer Isles,—MOON BOY, a jester and fool,—LANCEL and TYREK LANNISTER, squires to the king, the queen’s 
cousins,
—SER ARON SANTAGAR, master-at-arms,
—his Kingsguard:
—SER BARRISTAN SELMY, Lord Commander,—SER JAIME LANNISTER, called the Kingslayer,—SER BOROS BLOUNT,—SER MERYN TRANT,—SER ARYS OAKHEART,


### LLM simple LLM chain

In [33]:
from langchain_huggingface import HuggingFaceEndpoint
import os
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_length=128,
    temperature=0.5,
    huggingfacehub_api_token=os.environ["API_TOKEN"],
)

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\wotcw4\.cache\huggingface\token
Login successful


In [35]:
from langchain.chains import RetrievalQA

retrievalQA = RetrievalQA.from_llm(llm=llm, retriever=retriever)
response = retrievalQA.invoke(query)
print(response)

{'query': 'Who are the five kings in the war of five kings?', 'result': ' The five kings in the War of the Five Kings are King Renly Baratheon, King Joffrey Baratheon, King Balon Greyjoy, King Robb Stark, and King Stannis Baratheon. However, the context provided does not mention all of their names directly, but it does mention that there were two kings (Mern of the Reach and Aegon Dragonlord) who fought each other three hundred years prior to the events of the story, and six kings are mentioned currently in the story. From this context, we can infer that the other three kings are Robb Stark, Balon Greyjoy, and Joffrey Baratheon (Renly and Stannis are mentioned explicitly in the text).\n\nExplanation:\nThe context mentions two kings who fought against each other three hundred years ago, and six kings who are currently in power in the Seven Kingdoms. The two historical kings are identified as King Mern of the Reach and Aegon Dragonlord. From this information, we can infer that the other 

In [36]:
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [
                f"Document {i+1}:\n\n{d.page_content}\nMetadata: {d.metadata}"
                for i, d in enumerate(docs)
            ]
        )
    )

pretty_print_docs(retreived_docs)

Document 1:

King’s Landing,
—JALABAR XHO, an exile prince from the Summer Isles,—MOON BOY, a jester and fool,—LANCEL and TYREK LANNISTER, squires to the king, the queen’s 
cousins,
—SER ARON SANTAGAR, master-at-arms,
—his Kingsguard:
—SER BARRISTAN SELMY, Lord Commander,—SER JAIME LANNISTER, called the Kingslayer,—SER BOROS BLOUNT,—SER MERYN TRANT,—SER ARYS OAKHEART,
Metadata: {'source': 'data/a-game-of-thrones.pdf', 'page': 732}
----------------------------------------------------------------------------------------------------
Document 2:

when he joined with King Mern of the Reach to oppose the Targaryen conquest. That was close on three hundred years ago, when the Seven Kingdoms were kingdoms, and not mere provinces of a greater realm. Between them, the Two Kings had six hundred banners flying, five thousand mounted knights, and ten times as many freeriders and men-at-arms. Aegon Dragonlord had perhaps a fifth that number, the chroniclers said, and most of those were conscripts fr

In [37]:
import pprint
pprint.pprint(response)

{'query': 'Who are the five kings in the war of five kings?',
 'result': ' The five kings in the War of the Five Kings are King Renly '
           'Baratheon, King Joffrey Baratheon, King Balon Greyjoy, King Robb '
           'Stark, and King Stannis Baratheon. However, the context provided '
           'does not mention all of their names directly, but it does mention '
           'that there were two kings (Mern of the Reach and Aegon Dragonlord) '
           'who fought each other three hundred years prior to the events of '
           'the story, and six kings are mentioned currently in the story. '
           'From this context, we can infer that the other three kings are '
           'Robb Stark, Balon Greyjoy, and Joffrey Baratheon (Renly and '
           'Stannis are mentioned explicitly in the text).\n'
           '\n'
           'Explanation:\n'
           'The context mentions two kings who fought against each other three '
           'hundred years ago, and six kings who ar