In [2]:
import os
from dotenv import load_dotenv


In [3]:
# Read key=value pairs from the .env file one directory above the notebook into
# the process environment. The call is left as the cell's last expression so its
# boolean result is shown as the cell output.
load_dotenv("../.env")

True

### Loading text file

In [4]:
from langchain_community.document_loaders import TextLoader

# Source text, resolved relative to the notebook's working directory.
filename = "game-of-thrones.txt"

# The encoding is pinned explicitly because the text contains non-ASCII
# punctuation (typographic apostrophes).
loader = TextLoader(file_path=filename, encoding="utf-8")

# load() returns a list of Document objects; each carries the text in
# `page_content` and the originating path in `metadata["source"]`.
docs = loader.load()

### Splitting into chunks

In [5]:
from langchain_text_splitters.character import CharacterTextSplitter

# Target chunk length and the overlap carried between neighbouring chunks.
chunk_size, chunk_overlap = 500, 30

# Split on newlines only. chunk_size is a target rather than a hard cap: the
# recorded run of this cell logged a 563-character chunk.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# One Document per chunk; the metadata of the source document is carried over.
splits = text_splitter.split_documents(docs)
display(splits)

Created a chunk of size 563, which is longer than the specified 500


[Document(metadata={'source': 'game-of-thrones.txt'}, page_content='*Game of Thrones* is a sprawling epic fantasy series based on George R.R. Martin’s *A Song of Ice and Fire* novels. Set in the fictional continents of Westeros and Essos, the series is renowned for its intricate plots, diverse characters, and unpredictable twists. The story is centered on the power struggles among noble families as they vie for control of the Iron Throne of the Seven Kingdoms.\n### The Setting'),
 Document(metadata={'source': 'game-of-thrones.txt'}, page_content='### The Setting\nThe narrative is set in a world where seasons can last for decades. Westeros is divided into several regions, each ruled by a prominent house. The Seven Kingdoms are governed by the Iron Throne, a seat of immense power located in the capital, King’s Landing. Across the Narrow Sea lies Essos, a continent known for its diverse cultures and cities like Braavos and Volantis.\n### The Major Houses'),
 Document(metadata={'source': '

### Embedding

In [11]:
import requests
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
import numpy as np

# Hugging Face access token. It must already be present in the environment
# (for example via the .env file loaded earlier); a missing variable raises KeyError.
hf_token = os.environ["API_TOKEN"]

# Embedding client backed by the hosted Hugging Face Inference API, so no model
# weights are downloaded locally.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=hf_token,
    model_name="sentence-transformers/all-MiniLM-l6-v2"
)

# Embed every chunk and stack the results; the expected layout is one row per chunk.
vectors = np.array(embeddings.embed_documents([document.page_content for document in splits]))

# Guard against a non-embedding response. A hosted endpoint may answer with an
# error payload (bad token, model unavailable) instead of a list of vectors, and
# np.array() would accept that without complaint. Fail here with a clear message
# rather than later inside a vector store.
# NOTE(review): exact error-payload shape not verified against the API - the check
# only relies on the result not being a (n_chunks, dim) array.
if vectors.ndim != 2 or vectors.shape[0] != len(splits):
    raise ValueError(
        f"Expected {len(splits)} embedding vectors from the inference API, "
        f"got an array of shape {vectors.shape}; check API_TOKEN and model availability."
    )

### Cosine Similarity

In [41]:
# scipy's `cosine` returns the cosine *distance* between two vectors, so
# `1 - cosine(a, b)` is the cosine similarity: the higher the value, the more
# alike the two vectors are.
# NOTE(review): in the recorded output of this cell the two LangChain sentences
# score ~0.07 while both pairs involving the potassium sentence score ~0.91,
# the opposite of what is expected - TODO confirm what the embedding endpoint
# returned for these inputs.

from scipy.spatial.distance import cosine

sentence1 = "I love coding with Langchain"
sentence2 = "LangChain is great for coding"
sentence3 = "Bananan contains a lot of potassium"

# Embed the three sentences in order.
e1, e2, e3 = (
    embeddings.embed_query(sentence)
    for sentence in (sentence1, sentence2, sentence3)
)

# Pairwise similarities.
similarity1_2 = 1 - cosine(e1, e2)
similarity1_3 = 1 - cosine(e1, e3)
similarity2_3 = 1 - cosine(e2, e3)

# Report each pair together with its score.
for left, right, score in (
    (sentence1, sentence2, similarity1_2),
    (sentence1, sentence3, similarity1_3),
    (sentence2, sentence3, similarity2_3),
):
    display(f'Similarity between "{left}" and "{right}": {score}')

'Similarity between "I love coding with Langchain" and "LangChain is great for coding": 0.07267455057548255'

'Similarity between "I love coding with Langchain" and "Bananan contains a lot of potassium": 0.9172076476027506'

'Similarity between "LangChain is great for coding" and "Bananan contains a lot of potassium": 0.9085639972989409'

### Vector stores

In [43]:
from langchain_chroma import Chroma

# Embed the chunks and index them in a Chroma vector store.
db = Chroma.from_documents(documents=splits, embedding=embeddings)

# Ask for the five chunks closest to the question.
# NOTE(review): this rebinds `docs`, which up to here held the documents loaded
# from the text file; re-running the splitting cell after this one would split
# these search hits instead - consider a dedicated name once downstream cells
# have been checked.
query = "Who are the five kings in the war of five kings?"
docs = db.similarity_search(query=query, k=5)

# Show the text of the best match.
print(docs[0].page_content)


#### **Season 2: The War of the Five Kings**
Robert Baratheon’s death ignites a civil war as several claimants vie for the Iron Throne. Robb Stark, declared King in the North, wages war against the Lannisters. Stannis Baratheon, Robert’s brother, claims the throne, as does his brother Renly. Meanwhile, Balon Greyjoy declares himself King of the Iron Islands. The war leads to brutal battles and shifting alliances.
#### **Season 3: The Red Wedding and Beyond**


In [26]:
import faiss
from faiss import read_index, write_index
from langchain_community.vectorstores import FAISS

# Build a FAISS index over the same chunks. This rebinds `db`, replacing the
# vector store created in the previous cell.
db = FAISS.from_documents(documents=splits, embedding=embeddings)

# Persist the index under ./db so that it can be reloaded without re-embedding.
db.save_local(folder_path="./db")


### Retrieval with semantic search

In [29]:
query = "Who are the five kings in the war of five kings?"

# Reload the FAISS index that was saved under ./db.
# SECURITY: loading relies on pickle deserialization, which is why the explicit
# allow_dangerous_deserialization=True opt-in is required. Only load index
# directories written by this notebook or another trusted source.
db = FAISS.load_local("./db", embeddings=embeddings, allow_dangerous_deserialization=True)

# The number of hits is retriever configuration: set it through search_kwargs.
# Passing k=5 to invoke() is not the documented way to set it and, depending on
# the installed langchain-core version, may be ignored so that the default
# result count is used instead.
retriever = db.as_retriever(search_kwargs={"k": 5})
# Alternative: keep only hits above a relevance threshold.
#retriever = db.as_retriever(search_type="similarity_score_threshold",
#                           search_kwargs={"score_threshold": 0.5})

retreived_docs = retriever.invoke(query) # returns a list of Document objects
print(retreived_docs[0].page_content)

#### **Season 2: The War of the Five Kings**
Robert Baratheon’s death ignites a civil war as several claimants vie for the Iron Throne. Robb Stark, declared King in the North, wages war against the Lannisters. Stannis Baratheon, Robert’s brother, claims the throne, as does his brother Renly. Meanwhile, Balon Greyjoy declares himself King of the Iron Islands. The war leads to brutal battles and shifting alliances.
#### **Season 3: The Red Wedding and Beyond**


### Simple LLM chain