# Retrieving Documents by Query
This notebook demonstrates how to retrieve the documents most relevant to a user query from a corpus of text data.

## Data Preprocessing

### Import Text File

In [127]:
import json

def txt_to_json(path, dump=None, debug=False):
    """Parse a plain-text scripture file into nested book/chapter/verse dicts.

    Each stripped line is classified as one of:

    * blank -- ignored;
    * a book title -- a single word that does not end in punctuation and is
      not "Selah", or two words whose first word is numeric ("1 Samuel");
    * a chapter heading -- starts with "Chapter" or "Psalm " and ends in a
      digit; the text after the first space is the chapter number;
    * a verse -- starts with a digit; the text before the first space is the
      verse number and the rest is the verse text;
    * anything else -- a continuation, appended (space-separated) to the
      most recent verse.

    Args:
        path: Path of the text file to read.
        dump: Optional path of a JSON file to write. When truthy, the parsed
            data is written there (indent=2) and nothing is returned.
        debug: When true, print the 0-based index of every line as it is read.

    Returns:
        ``{book: {chapter: {verse: text}}}`` with int chapter/verse keys when
        ``dump`` is falsy; ``None`` when the data was dumped to a file.

    Raises:
        ValueError: If a chapter, verse, or continuation line appears before
            the book/chapter/verse it must belong to, or if a chapter or
            verse number is not a valid integer.
    """
    closing_punctuation = (".", "?", ":", ";", ",", "!", ")")
    data = {}
    current_book = current_chapter = current_verse = None

    # The context manager guarantees the handle is closed even if parsing fails.
    with open(path, 'r') as file:
        for i, raw_line in enumerate(file):
            if debug:
                print(i)
            line = raw_line.strip()
            if not line:
                continue

            words = line.split()
            first_space = line.find(" ")

            # NOTE(review): a two-word verse such as "3 Selah." also matches
            # the numbered-title rule below -- confirm the corpus has none.
            is_plain_title = (
                len(words) == 1
                and line[-1:] not in closing_punctuation
                and line != "Selah"
            )
            is_numbered_title = len(words) == 2 and line[:first_space].isnumeric()

            if is_plain_title or is_numbered_title:
                current_book = line
                data[current_book] = {}
                # A new book invalidates the previous chapter/verse context.
                current_chapter = current_verse = None
            elif line.startswith(("Chapter", "Psalm ")) and line[-1:].isnumeric():
                if current_book is None:
                    raise ValueError(
                        f"line {i + 1}: chapter heading {line!r} appears before any book title"
                    )
                current_chapter = int(line[first_space + 1:])
                data[current_book][current_chapter] = {}
                current_verse = None
            elif line[:1].isnumeric():
                if current_chapter is None:
                    raise ValueError(
                        f"line {i + 1}: verse {line!r} appears before any chapter heading"
                    )
                current_verse = int(line[:first_space])
                data[current_book][current_chapter][current_verse] = line[first_space + 1:]
            else:
                if current_verse is None:
                    raise ValueError(
                        f"line {i + 1}: text {line!r} does not belong to any verse"
                    )
                data[current_book][current_chapter][current_verse] += f' {line}'

    if not dump:
        return data

    with open(dump, 'w') as f:
        json.dump(data, f, indent=2)
    return None

# txt_to_json('sample_text.txt', "sample_data.json")

In [128]:
# Parse the complete corpus and write the result to tanakh.json.
txt_to_json(path='tanakh.txt', dump='tanakh.json')

### Turn JSON Into Chunks of N Words

In [None]:
# Load the parsed text back from disk; the context manager closes the file
# handle instead of leaving it open for the rest of the session.
with open("tanakh.json", "r") as fp:
    tanakh_json = json.load(fp)

In [140]:
# Greedily pack consecutive verses into chunks of at most `n` words each.
n = 500  # maximum number of words per chunk
chunks = []
chunk = ""
chunk_words = 0  # running word count of `chunk`, so it is not re-split per verse
for book, book_content in tanakh_json.items():
    for chapter, chapter_content in book_content.items():
        for verse, verse_content in chapter_content.items():
            verse_words = len(verse_content.split())
            if verse_words > n:
                # A single verse cannot fit in any chunk: report where it is.
                # NOTE(review): `break` also skips the rest of this chapter --
                # confirm that is intended rather than skipping only the verse.
                print(book, chapter, verse)
                break
            if chunk_words + verse_words > n:
                # Adding this verse would overflow: flush and start a new chunk.
                chunks.append(chunk)
                chunk = verse_content
                chunk_words = verse_words
            else:
                # Join with a single space, without a leading space on an empty chunk.
                chunk = f"{chunk} {verse_content}" if chunk else verse_content
                chunk_words += verse_words

# Flush the trailing partial chunk so the end of the corpus is not dropped.
if chunk:
    chunks.append(chunk)

In [145]:
# Write all chunks to chunks.txt; each chunk is wrapped in newlines, so
# consecutive chunks end up separated by a blank line.
with open("chunks.txt", "w") as f:
    f.writelines(f"\n{chunk}\n" for chunk in chunks)

## Turn Into a Chroma DB - Vector Store

### Import `langchain`

In [1]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [None]:


# NOTE(review): this cell indexes "state_of_the_union.txt", not the chunks
# written to chunks.txt earlier in the notebook -- confirm the intended corpus.
# Read the whole document; the context manager closes the file handle.
with open("state_of_the_union.txt", "r") as source_file:
    full_text = source_file.read()

# Split the document into pieces (chunk_size=1000, chunk_overlap=100).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_text(full_text)

# Build a Chroma store from the pieces using HuggingFace embeddings, then
# expose it as a retriever for querying.
embeddings = HuggingFaceEmbeddings()
db = Chroma.from_texts(texts, embeddings)
retriever = db.as_retriever()