In [1]:
# Source:
# * https://github.com/pinecone-io/examples/blob/master/learn/generation/langchain/handbook/xx-langchain-chunking.ipynb
# * https://github.com/pinecone-io/examples/blob/master/learn/generation/llm-field-guide/llama-2/llama-2-13b-retrievalqa.ipynb
# * https://medium.com/@onkarmishra/using-langchain-for-question-answering-on-own-data-3af0a82789ed
# * https://medium.com/international-school-of-ai-data-science/implementing-rag-with-langchain-and-hugging-face-28e3ea66c5f7

In [2]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.schema import Document
from tqdm.auto import tqdm
import sys
import hashlib
sys.path.append('../src/')
from common import load_docs_from_jsonl, save_docs_to_jsonl

In [3]:
PATH_DOCS = '../Documents/main/'

## Data Loading

In [4]:
## Option 1
loader = DirectoryLoader(PATH_DOCS, glob="*.txt")
all_docs1 = loader.load()

In [5]:
## Option 2
all_docs2 = load_docs_from_jsonl('../Documents/main_docs.jsonl')

In [6]:
all_docs2

 Document(page_content='\n\n## Skip links\n\nSkip to Content\n\nplay\n\n##  Live\n\nShow navigation menu\n\n## Navigation menu\n\n  * NewsShow more news sections\n\n    * Middle East\n    * Africa\n    * Asia\n    * US & Canada\n    * Latin America\n    * Europe\n    * Asia Pacific\n\n  * Israel War on Gaza\n  * Features\n  * Opinion\n  * Video\n  * MoreShow more sections\n\n    * Economy\n    * Ukraine war\n    * Coronavirus\n    * Climate Crisis\n    * Investigations\n    * Interactives\n    * In Pictures\n    * Science & Technology\n    * Sport\n    * Podcasts\n\nplay\n\n##  Live\n\nClick here to searchsearch\n\nNews\n\n# Japan earthquake death toll crosses 100\n\n _Hundreds of people are still missing after a magnitude 7.6 earthquake struck\nJapan’s west coast on January 1._\n\nA man looks at debris left on the streets of the city of Suzu, Ishikawa\nprefecture, on January 6, 2024 [JIJI Press via AFP]\n\nPublished On 6 Jan 20246 Jan 2024\n\nThe death toll from Japan’s New Year’s Day

## Docs Splitting

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=350,
    chunk_overlap=25,  # number of tokens overlap between chunks
    # length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [8]:
chunks = text_splitter.split_text(all_docs2[5].page_content)

In [9]:
chunks

['Skip to main content\n\nOpen menu Close menu\n\nSpace Space\n\nSearch Search Space\n\nSubscribe\n\nRSS\n\nAll About Space Magazine\n\nWhy subscribe?',
 '* Subscribe this Christmas season!\n  * Jam packed issues filled with the latest cutting-edge research, technology and theories delivered in an entertaining and visually stunning way, aiming to educate and inspire readers of all ages\n  * Engaging articles, breathtaking images and expert knowledge\n  * Issues delivered straight to your door',
 'From$26.49\n\nView',
 '* __\n  * Space Exploration\n  * The Universe\n  * Stargazing\n  * Launches & Spacecraft\n  * Videos\n  * The Moon\n  * More ____\n    * Solar System\n    * Artemis\n    * James Webb Space Telescope\n    * Entertainment\n    * Search for Life\n    * Tech\n    * Forums\n    * Subscribe to "All about Space" magazine\n    * Space.com Store\n    * About Us',
 '* About Us\n    * Web Notifications',
 "Trending\n\n  * Next Full Moon\n  * Peregrine moon lander failure cause\n  *

In [10]:
# Change to desired format

In [31]:
documents = []
m = hashlib.md5()  # this will convert URL into unique
for doc in tqdm(all_docs2):
    url = doc.metadata['source']
    m.update(url.encode('utf-8'))
    uid = m.hexdigest()[:12]
    chunks = text_splitter.split_text(doc.page_content)
    for i, chunk in enumerate(chunks):
        obj =  {
#             
            'page_content': chunk,
            'metadata': {
                'id': f'{uid}-{i}',
                'source': url
                }
            }
        obj = Document(**obj)
        documents.append(obj)
        
save_docs_to_jsonl(documents,"../Documents/split_docs.jsonl")

  0%|          | 0/7 [00:00<?, ?it/s]

In [32]:
# documents = []
# m = hashlib.md5()  # this will convert URL into unique ID
# for doc in tqdm(all_docs2):
#     url = doc.metadata['source']
#     m.update(url.encode('utf-8'))
#     uid = m.hexdigest()[:12]
#     chunks = text_splitter.split_text(doc.page_content)
#     for i, chunk in enumerate(chunks):
#         obj =  {
#             'id': f'{uid}-{i}',
#             'text': chunk,
#             'source': url
#             }
#         documents.append(obj)

# save_docs_to_jsonl(documents,"../Documents/split_docs.jsonl")

## Embedding

In [33]:

from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)

In [34]:
embed_model.embed_query("just testing")

[0.011062849313020706,
 -0.04128526523709297,
 -0.011026446707546711,
 0.008643227629363537,
 0.047911398112773895,
 -0.07127462327480316,
 -0.029645757749676704,
 0.027396446093916893,
 -0.006226712372153997,
 0.005826866719871759,
 0.007963497191667557,
 -0.05756664648652077,
 -0.002841239795088768,
 0.010483230464160442,
 -0.023804638534784317,
 -0.03553685545921326,
 0.05072366073727608,
 -0.03719549998641014,
 -0.1263161152601242,
 0.04863246902823448,
 -0.1002393439412117,
 -0.01212237123399973,
 -0.07015921175479889,
 0.0005458503728732467,
 -0.05106617137789726,
 -0.02152702398598194,
 -0.06728058308362961,
 0.059879597276449203,
 0.120863176882267,
 -0.006624198518693447,
 -0.005206495523452759,
 0.04940183833241463,
 0.05089988186955452,
 0.01071919035166502,
 0.03333247825503349,
 -0.013492285273969173,
 0.06532233953475952,
 0.03977071866393089,
 0.030155999585986137,
 -0.04105069488286972,
 -0.002320002531632781,
 -0.13054929673671722,
 0.03872948884963989,
 0.034455254673

## Vector store

In [20]:
from langchain_community.vectorstores import FAISS

In [35]:
vdb = FAISS.from_documents(documents, embed_model)

In [36]:
## Testing
question = "what happen in Noto earthquake?"
searchDocs = vdb.similarity_search(question)

In [38]:
print(searchDocs[2])

page_content='the Noto Peninsula, particularly in the towns of Wajima, Suzu and Anamizu. All\nof the 220 fatalities and 26 missing individuals were reported in Ishikawa\nwhile over 1,000 were injured across multiple prefectures, making it the\ndeadliest earthquake in Japan since the 2016 Kumamoto earthquakes.[4]' metadata={'id': 'f60ddb659058-12', 'source': 'https://en.wikipedia.org/wiki/2024_Noto_earthquake'}


In [40]:
## Save
vdb.save_local("../Documents/vdb_faiss_index")
# docsearch = FAISS.load_local("faiss_index", embed_model)