In [None]:
!pip install chromadb langchain_huggingface langchain_chroma langchain_experimental langchain_text_splitters

Collecting chromadb
  Downloading chromadb-1.1.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain_huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting langchain_chroma
  Downloading langchain_chroma-0.2.6-py3-none-any.whl.metadata (1.1 kB)
Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opente

In [None]:
import os
import random
import re

import torch
from langchain_chroma import Chroma, vectorstores
from langchain_core.documents import Document
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_core.prompts import PromptTemplate, FewShotPromptTemplate
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Embedding model: 334M parameters (the small variant is 33.4M)
model_name = "BAAI/bge-large-en-v1.5"
# Use the GPU when torch can see one; otherwise fall back to CPU so the
# notebook still runs (slowly) on a machine without CUDA instead of failing here.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs)


In [None]:
# Persistent Chroma collection backed by the embedding model above.
# "hnsw:space": "cosine" is set so the collection is configured for cosine similarity.
vector_store = Chroma(
    embedding_function=model,
    collection_name="collection",
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="./chroma_langchain_db",
)

In [None]:
# before embedding a query append this instruction: "Generate a representation for this sentence to retrieve related articles: "
# no need for an instruction for embedding documents


# metadata of each document:
## - parent_folder: the direct parent folder name
## - root_folder: the top root folder name, one of: "events", "general_info_and_history", "music", "food_festivals", "museums", "sports", "tax", "operating_budget"
## - file: name of file
## - depth: the depth of the article starting from the 'documents' folder, e.g. file "pitts_cultural_trust" has a depth of 2
## - path: path of file starting from "documents" folder
## - title: main title of the text file

# WON'T USE HEADING AND SUBHEADING FOR NOW
## - heading: one of the main headings of a text file, could be None
## - subheading: one of the subheadings of the main heading, if there is any, else None


# each chunk should have:
  # main title of text file
  # heading(could be None),
  # subheading(could be None)
  # its content


In [None]:
# Tokenizer used to count tokens per chunk. Loaded from `model_name` so it always
# matches the embedding model instead of repeating the model id as a second literal.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Splitter whose sizes are measured in whitespace-separated words (not characters):
# chunk_size=300 and chunk_overlap=20 are word counts because of length_function.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", " ", ""],
    length_function=lambda text: len(text.split()),
    chunk_size=300,
    chunk_overlap=20,
)

In [None]:
def return_smaller_chunks(chunk, chunker):
  """
  Break an oversized chunk into smaller pieces that keep the chunk's heading context.

  Args:
    - chunk (str): chunk text; may contain a heading line (starts with "- ")
      and a subheading line (starts with "--", but not a "-----" rule)
    - chunker (): any object exposing `split_text(str)` that returns the pieces

  Returns:
    - result_chunks (List[str]): the pieces produced by `chunker`, each prefixed
      with the heading (followed by a blank line) and the subheading, when present
  """
  heading, subheading = "", ""
  body_lines = []

  # separate the heading / subheading markers from the actual content;
  # if several heading or subheading lines occur, the last one wins
  for line in chunk.splitlines():
    marker = line[:2]
    if marker == "- ":
      heading = line
    elif marker == "--" and line[2:5] != "---":
      subheading = line
    else:
      body_lines.append(line)

  # the same prefix is attached to every piece
  prefix = ""
  if heading:
    prefix += heading + "\n\n"
  if subheading:
    prefix += subheading + "\n"

  pieces = chunker.split_text("\n".join(body_lines))
  return [prefix + piece for piece in pieces]



In [None]:
def return_docs(file_path, text_splitter ,max_token_size = 512):
  """
  Chunk one text file by its headings/subheadings and wrap the chunks as Documents.

  Args:
  - file_path: str, file path starts with 'documents' directory; it must contain at
    least one folder component (separated by '/' or by a backslash)
  - text_splitter: splitter handed to `return_smaller_chunks` for chunks that are too long
  - max_token_size: token limit per chunk; 70 tokens of it are reserved for the
    title that is prepended to every chunk

  Returns:
  - documents: List[Document] (an empty list for an empty file)

  Raises:
  - FileNotFoundError: if file_path contains no folder separator

  each chunk contains:
  - main title of text file (first multiple lines until an empty line is detected)
  - heading, if there is any (represented in text file by '-')
  - subheading, if there is any (represented by '--')
  - content

  Token counts come from the notebook-level `tokenizer`.
  """
  # Validate the path first: the metadata below needs a folder and a file name.
  if "/" in file_path:
    splitted_path = file_path.lower().split("/")
  # Windows
  elif "\\" in file_path:
    splitted_path = file_path.lower().split("\\")
  else:
    raise FileNotFoundError("File path is unusual", file_path)

  print(file_path)
  with open(file_path, "r") as f:
    lines = f.readlines()

  # an empty file has neither a title nor content
  if not lines:
    return []

  # extract title, which could be multiple lines
  # a title is defined as the group of subsequent lines until an empty line break
  # in this example, the title is the first two lines
  #
  #   Article 1
  #   Taxes
  #
  #   .....
  for j, line in enumerate(lines):
    if line.strip(' ') == '\n':
      break

  title = "".join(lines[:j]).rstrip()

  # matches a chunk that consists of a heading line and nothing else
  heading_only = re.compile(r"^\n?- (.+)\n+")

  chunks = []
  chunk_content = ""
  current_heading = ""
  for line in lines[j:]:

    # get new heading with new content
    if line[:2] == "- ":
      current_heading = line

      if chunk_content.strip() != "" and heading_only.fullmatch(chunk_content) is None:
        chunks.append(chunk_content.rstrip())

      chunk_content = ""

    # get new subheading content with same heading
    elif line[:2] == "--" and line[2:5] != "---":

      if heading_only.fullmatch(chunk_content) is None:
        chunks.append(chunk_content.rstrip())

      chunk_content = current_heading + "\n"

    # remove references
    chunk_content += re.sub(r"\[\d+\]", "", line)

  # Flush the chunk that is still open once every line has been consumed.
  # Flushing only after the loop keeps the file's last line inside its chunk.
  if heading_only.fullmatch(chunk_content) is None:
    chunks.append(chunk_content.rstrip())

  limit_respecting_chunks = []
  # if a chunk's tokens exceed model limit, split further
  for chunk in chunks:
    tokens = tokenizer.encode(chunk, add_special_tokens=True)
    # remove 70 tokens to accomodate for the title that will be added, which is a max of three short lines
    if len(tokens) > max_token_size - 70:
      limit_respecting_chunks.extend(return_smaller_chunks(chunk, text_splitter))
    else:
      limit_respecting_chunks.append(chunk)

  # add title to each chunk
  final_chunks = [title + "\n\n" + chunk for chunk in limit_respecting_chunks]

  # text chunks to documents (a fresh metadata dict per document)
  documents = []
  for text in final_chunks:
    documents.append(
      Document(
        page_content=text,
        metadata={"parent_folder": splitted_path[-2],
                  "root_folder": splitted_path[1],
                  "file": splitted_path[-1],
                  "depth": len(splitted_path) - 1,
                  "path": file_path,
                  "title": title.lower()}
      )
    )

  return documents


In [None]:
def vectorize(file_path, db):
  """
  recursively search 'file_path' directory for text files
  then transform text file into chunks and add them to the database

  Args:
    - file_path (str): a file, or a directory that is searched recursively
    - db: vector store receiving the chunks; must expose `add_documents(list)`

  Returns:
    - None

  Files are chunked with `return_docs` using the notebook-level `text_splitter`.
  """
  if os.path.isfile(file_path):
    docs = return_docs(file_path, text_splitter)
    # write to the store that was passed in (not a notebook-level global),
    # and never hand the store an empty batch
    if docs:
      db.add_documents(docs)
    return

  # sorted so the ingestion order does not depend on the directory listing order
  for name in sorted(os.listdir(file_path)):
    vectorize(os.path.join(file_path, name), db)



In [None]:
# Chunk every file under "documents" and store the chunks in the vector store.
# NOTE(review): re-running this cell against the same persist_directory presumably
# adds the same chunks again — confirm before re-executing.
vectorize(file_path="documents", db=vector_store)

Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors


documents/Operating_Budget/4_Revenue
documents/Operating_Budget/3_five_year_financial_forecast
documents/Operating_Budget/0_Budget_Authorizing_Legislation_article_1
documents/Operating_Budget/2_budget_guide
documents/Operating_Budget/1_American_Rescue_Plan_article_1
documents/Operating_Budget/0_Budget_Authorizing_Legislation_article_2
documents/Operating_Budget/1_American_Rescue_Plan_article_2
documents/Operating_Budget/6_Special_Revenue_Funds/Solid Waste Trust Fund/Solid Waste Trust Fund_article
documents/Operating_Budget/6_Special_Revenue_Funds/Regional Asset District - Parks and Recreation Trust Fund/Regional Asset District - Parks and Recreation Trust Fund_article
documents/Operating_Budget/6_Special_Revenue_Funds/Schenley Park Rink Trust Fund/Schenley Park Rink Trust Fund_article
documents/Operating_Budget/6_Special_Revenue_Funds/Special Events Trust Fund/Special Events Trust Fund_article
documents/Operating_Budget/6_Special_Revenue_Funds/Senior Citizens Program Trust Fund/Senior 