In [1]:
!pip install langchain openai chromadb sentence-transformers

Collecting langchain
  Downloading langchain-0.0.330-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb
  Downloading chromadb-0.4.15-py3-none-any.whl (479 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m479.8/479.8 kB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.1-py3-none-any.whl (27 kB)
Coll

# Scraping Feynman Lectures with metadata tags

In [2]:
from langchain.text_splitter import HTMLHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [5]:
def split_feynman(url, chunk_size=1000, chunk_overlap=0):
  """Fetch one lecture page and split it into header-tagged text chunks.

  The page is first split on its <h2> and <h3> headers, whose text is stored
  in each piece's metadata under "Chapter" and "Section". Those pieces are
  then re-split by character count.

  Args:
    url: Address of the HTML page to fetch and split.
    chunk_size: Passed to RecursiveCharacterTextSplitter as ``chunk_size``.
      Defaults to 1000.
    chunk_overlap: Passed to RecursiveCharacterTextSplitter as
      ``chunk_overlap``. Defaults to 0.

  Returns:
    The result of ``RecursiveCharacterTextSplitter.split_documents`` applied
    to the header-level splits.
  """
  headers_to_split_on = [
    # ("h1", "Header 1"),
    ("h2", "Chapter"),
    ("h3", "Section")]

  html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

  # for local file use html_splitter.split_text_from_file(<path_to_file>)
  html_header_splits = html_splitter.split_text_from_url(url)

  # Second pass: split the header-level pieces by character count.
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size, chunk_overlap=chunk_overlap
  )
  return text_splitter.split_documents(html_header_splits)


In [6]:
base_url = "https://www.feynmanlectures.caltech.edu/"
vols = ['I', 'II', "III"]
chaps = [52, 42, 21]  # number of chapters in each volume, in the same order as `vols`
# Loop through volumes and all chapters for each volume
all_chunks = []
for volume_num, chapter_count in zip(vols, chaps):
    for ch in range(1, chapter_count + 1):
        # Chapter pages are named "<volume>_<two-digit chapter>.html".
        # A single f-string with a format spec avoids mixing f-string and
        # %-formatting, which breaks if an interpolated value contains "%".
        vol_chapter_url = f"{base_url}{volume_num}_{ch:02d}.html"
        splits = split_feynman(vol_chapter_url)
        for doc in splits:
            # Add volume metadata
            doc.metadata['Volume'] = volume_num
        all_chunks.extend(splits)

In [7]:
# Display a ten-item slice of the collected chunks as a spot check.
all_chunks[40:50]

[Document(page_content='Now we go on to another process. In Fig.\xa01–6 we see, from an atomic point of view, a solid dissolving in water. If we put a crystal of salt in the water, what will happen? Salt is a solid, a crystal, an organized arrangement of “salt atoms.” Figure\xa01–7 is an illustration of the three-dimensional structure of common salt, sodium chloride. Strictly speaking, the crystal is not made of atoms, but of what we call ions. An ion is an atom which either has a few extra electrons or has lost a few electrons. In a salt crystal we find chlorine ions (chlorine atoms with an extra electron) and sodium ions (sodium atoms with one electron missing). The ions all stick together by electrical attraction in the solid salt, but when we put them in the water we find, because of the attractions of the negative oxygen and positive hydrogen for the ions, that some of the ions jiggle loose. In Fig.\xa01–6 we see a chlorine ion getting loose, and other atoms floating in the water 

In [8]:
# Total number of chunks collected across all volumes and chapters.
len(all_chunks)

8536

# Create Embeddings and Populate Vector Database

In [10]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.embeddings import HuggingFaceBgeEmbeddings

from langchain.vectorstores import Chroma


In [11]:

# Embedding model configuration: BGE small (English), run on CPU,
# with normalize_embeddings switched on for encoding.
model_name = "BAAI/bge-small-en"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)

hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

Downloading (…)421f3/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)93c10421f3/README.md:   0%|          | 0.00/90.2k [00:00<?, ?B/s]

Downloading (…)c10421f3/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)421f3/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)93c10421f3/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)10421f3/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [13]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db_hf'

## The embeddings used below come from `hf`, the HuggingFaceBgeEmbeddings instance
## created earlier in this notebook. Alternatives left here for reference:
# embedding = OpenAIEmbeddings()

# embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")




# Build the Chroma store from every chunk, embedding each one with `hf`.
vectordb = Chroma.from_documents(documents=all_chunks,
                                 embedding=hf,
                                 persist_directory=persist_directory)

In [49]:
# Persist the db to disk.
vectordb.persist()
# Drop the in-memory reference so the store has to be re-created from
# persist_directory before it is used again.
vectordb = None

In [50]:
# Now we can load the persisted database from disk, and use it as normal.
# The embedding function must be the same one used to build the store (`hf`);
# the name `embedding` is never assigned in this notebook (its only
# assignments are commented out), so referencing it raises NameError.
vectordb = Chroma(persist_directory=persist_directory,
                  embedding_function=hf)

In [51]:
# Similarity-search retriever over the vector store, configured with k=12.
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 12})

In [52]:
# Free-text query passed to the retriever.
# NOTE(review): the string ends with "about " and names no topic — confirm this is intentional.
query = "create a lesson plan about "

In [53]:
# Retrieve the documents the retriever considers relevant to `query`.
# (Alternative query used while experimenting: "kinematics")
docs_k = retriever.get_relevant_documents(query)

In [54]:
# Display the retrieved documents together with their metadata.
docs_k

[Document(page_content='By the following amount: if instead of letting the planet go around the sun, we were to change the direction (but not the magnitude) of its velocity and make it move radially, and then we let it fall from some special radius to the radius of interest, the new speed would be the same as the speed it had in the actual orbit, because this is just another example of a complicated path. So long as we come back to the same distance, the kinetic energy will be the same. So, whether the motion is the', metadata={'Chapter': '13Work and Potential Energy (A)', 'Section': '13–2Work done by gravity', 'Volume': 'I'}),
 Document(page_content='Now let us see how we can calculate the motion of Neptune, Jupiter, Uranus, or any other planet. If we have a great many planets, and let the sun move too, can we do the same thing? Of course we can. We calculate the force on a particular planet, let us say planet number\xa0$i$, which has a position $x_i,y_i,z_i$ ($i=1$ may represent the 

# Popularity of top-k results by metadata

In [None]:
# Popularity of top k: tally how often each metadata value occurs among the
# retrieved documents.
# Layout of the tally: {metadata key: {metadata value: number of documents}}
metadata_count = {}

for document in docs_k:
    metadata = document.metadata
    for key, value in metadata.items():
        # Create the per-key tally on first sight, then bump this value's count.
        per_key = metadata_count.setdefault(key, {})
        per_key[value] = per_key.get(value, 0) + 1

# Report the tallies grouped by metadata key
for key in metadata_count:
    print(f"{key}:")
    for value, count in metadata_count[key].items():
        print(f"  {value}: {count} occurrences")

# Semantic similarity to metadata

In [None]:


# Similarity with chapters/sections:
# semantic search restricted to chunks whose metadata matches a filter
# (e.g. a specific chapter or section).
# The filter is defined in this cell so the retriever does not depend on a
# `metadata` variable that happens to be left over from another cell.
metadata = {'Section': '18–1The center of mass'}
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"filter": metadata, "k": 5})
# TODO: Find ways to best use multiple sections, maybe cohere reranker
# TODO: Consider parent child retrieval

In [None]:
# Fetch every stored entry whose 'Section' metadata equals the chosen section.
metadata = {'Section': '18–1The center of mass'}
vectordb.get(where=metadata)


{'ids': ['22554ee8-7b59-11ee-8f2e-0242ac1c000c',
  '22554f38-7b59-11ee-8f2e-0242ac1c000c',
  '22554f92-7b59-11ee-8f2e-0242ac1c000c',
  '22555028-7b59-11ee-8f2e-0242ac1c000c',
  '22555078-7b59-11ee-8f2e-0242ac1c000c',
  '225550d2-7b59-11ee-8f2e-0242ac1c000c',
  '2255512c-7b59-11ee-8f2e-0242ac1c000c',
  '22555186-7b59-11ee-8f2e-0242ac1c000c',
  '225551e0-7b59-11ee-8f2e-0242ac1c000c',
  '22555262-7b59-11ee-8f2e-0242ac1c000c',
  '225552bc-7b59-11ee-8f2e-0242ac1c000c',
  '22555316-7b59-11ee-8f2e-0242ac1c000c',
  '22555366-7b59-11ee-8f2e-0242ac1c000c',
  '225553c0-7b59-11ee-8f2e-0242ac1c000c',
  '2255541a-7b59-11ee-8f2e-0242ac1c000c',
  '22555474-7b59-11ee-8f2e-0242ac1c000c',
  '225554ce-7b59-11ee-8f2e-0242ac1c000c',
  '22555528-7b59-11ee-8f2e-0242ac1c000c',
  '22555578-7b59-11ee-8f2e-0242ac1c000c',
  '225555d2-7b59-11ee-8f2e-0242ac1c000c'],
 'embeddings': None,
 'metadatas': [{'Chapter': '18Rotation in Two Dimensions',
   'Section': '18–1The center of mass',
   'Volume': 'I'},
  {'Chapter':

# Experimenting with RAG vs. prompting for data within the training set