In [1]:
import openai
import requests
import json
import os
import pandas as pd
import numpy as np

import tiktoken
from openai import OpenAI
from utils.embeddings_utils import get_embedding

openai.api_key = os.environ["OPENAI_API_KEY"]
client = OpenAI()

In [2]:
embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

In [3]:
def normalize_l2(x):
    x = np.array(x)
    if x.ndim == 1:
        norm = np.linalg.norm(x)
        if norm == 0:
            return x
        return x / norm
    else:
        norm = np.linalg.norm(x, 2, axis=1, keepdims=True)
        return np.where(norm == 0, x, x / norm)

In [4]:
# Note: you need to be using OpenAI Python v0.27.0 for the code below to work
# Test OpenAI API access
import openai
import requests
import json

conversation = [{"role": "system", "content": "You are an assistant and can response to a user's query based on the information provided. Please only only response with confidence. Otherwise, just say you are not able to answer the question based on the limited information."}]
def chatgpt(user_input, messages):
    messages.append({'role':'user','content':user_input})
    response = client.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=messages
    )
    reply = response.choices[0].message.content
    messages.append({'role':'assistant','content':reply})
    return reply

query =  "Tell me briefly about American Revolution in about 100 words"

while len(query) > 0:
    response = chatgpt(query, conversation)
    conversation.append({"role":"assistant","content":response})
    print("ChatGPT answer: " + response)
    query = input("Input your query:  ")
    conversation.append({"role":"user","content":query})

ChatGPT answer: The American Revolution was a colonial revolt that took place between 1765 and 1783, resulting in the Thirteen American Colonies gaining independence from British rule. Key events include the Boston Tea Party, Battles of Lexington and Concord, Declaration of Independence, Battle of Saratoga, and the Treaty of Paris. The revolution was fueled by grievances over taxation without representation, limits on self-governance, and other political and economic issues. The conflict ultimately led to the establishment of the United States of America as a sovereign nation and inspired democratic movements worldwide.
Input your query:  


In [5]:
import re

regx_array = [r"(?:^|\n|\s+)# ",r"(?:^|\n|\s+)## ",r"(?:^|\n|\s+)### ",r"(?:^|\n|\s+)#### ",r"(?:^|\n|\s+)##### ",r"(?:^|\n|\s+)###### "]
deli_array = ["\n# ","\n## ","\n### ","\n#### ","\n##### ","\n###### "]
levels = ["chapters", "sections","sub_sections", "subsub_section","subsubsub_section","paragraphs", "sentences","words"]

def chunk_recursive(input_chunk, c_level):
    print("input chunk", input_chunk)
    regx = regx_array[c_level]
    chunks = re.split(regx,input_chunk)
    print("chunks", chunks)
    chapters = []
    for i in range(len(chunks)):
        if (chunks[i] == ""):
            continue
        chapters.append(chunks[i])
    if (len(chapters) == 0):
        return ""
    topic = chapters[0]
    print("Topic from chunk_recursive = ", topic)
    if (len(chapters) == 1):
        return chapters[0]
    sections = []
    for i in range(1,len(chapters)):
        sections.append(chunk_recursive(chapters[i],c_level+1))
    return {"topic":topic, "content":sections}
    
def chunk_top(input_md,topic):
    c_level = 0
    regx = regx_array[c_level]
    # Top level split
    chunks = re.split(regx,input_md)
    chapters = []
    for i in range(len(chunks)):
        if (chunks[i] == ""):
            continue
        chapters.append(chunks[i])
    doc_chapters = []
    print("Chapter 0",chapters[0])
    print("End of C0")
    for chapt in chapters:
        doc_chapters.append(chunk_recursive(chapt, c_level+1))
    doc_tree = {"topic":topic, "content":doc_chapters}
    return doc_tree

def extract_all_markdown_tables(md_text):
    # Split the markdown text into lines to facilitate line number tracking
    lines = md_text.splitlines()

    # Regular expression to capture any line as the title that immediately precedes a markdown table, followed by the table and the following notes up to a blank line
    pattern = r"^(.*?(?:\r?\n))(^\|.*?\|.*?(?:\r?\n))+([\s\S]*?(?=\r?\n\r?\n|\Z))"

    # Find all matches in the markdown text
    matches = re.finditer(pattern, md_text, re.MULTILINE)

    # Collect all matched tables along with their title and notes, and line numbers
    tables = []
    for match in matches:
        start_line = md_text.count('\n', 0, match.start())
        end_line = md_text.count('\n', 0, match.end())

        table_dict = {
            "title": match.group(1).strip(),
            "content": match.group(2).strip(),
            "notes": match.group(3).strip() if match.group(3).strip() else None,
            "start_line": start_line,
            "end_line": end_line
        }
        tables.append(table_dict)
    
    return tables

def extract_preformatted_text(markdown_text):
    # Regular expression to find <pre>...</pre> blocks
    pattern = r'<pre>([\s\S]*?)</pre>'
    
    # List to store results, including line numbers
    results = []
    
    # Finding all matches with re.finditer to keep track of line numbers
    for match in re.finditer(pattern, markdown_text):
        start_index = match.start()
        end_index = match.end()
        
        # Extract text inside <pre> tags
        content = match.group(0)
        
        # Count line numbers up to the start of the match
        start_line = markdown_text.count('\n', 0, start_index)
        end_line = markdown_text.count('\n', 0, end_index)
        
        # Store results with line numbers
        results.append((start_line, end_line, content))
    
    return results
    
# Split input text into tables, pre-formatted chunks, and paragraphs
def text_chunks(input_text):
    #print("input_text = ", input_text)
    text_lines = input_text.split("\n")
    line_numbers = [0, len(text_lines)]
    # Chunks text and type
    chunks = {}
    # Extract the tables and print each
    extracted_tables = extract_all_markdown_tables(input_text)
    for i, table in enumerate(extracted_tables, 1):
        line_numbers.append(table['start_line'])
        line_numbers.append(table['end_line']+1)
        hash_tag = table['start_line']
        chunk_type = "table"
        content = text_lines[table['start_line']:table['end_line']+1]
        content = "\n".join(content)
        chunks[hash_tag] = (chunk_type, content)
    
    # Extract pre-formatted text
    extracted_text = extract_preformatted_text(input_text)
    for result in extracted_text:
        #print(f"Start Line: {result[0]}, End Line: {result[1]}")
        line_numbers.append(result[0])
        line_numbers.append(result[1]+1)
        hash_tag = result[0]
        chunk_type = "pre-formatted"
        content = text_lines[result[0]:result[1]+1]
        content = "\n".join(content)
        chunks[hash_tag] = (chunk_type, content)        
    line_numbers = list(set(line_numbers))
    line_numbers.sort()
    #print(line_numbers)
    #print("line_numbers = ",line_numbers)
    # Create remaining chunks
    for i in range(len(line_numbers)-1):
        hash_tag = line_numbers[i]
        chunk_type = "text"
        content = text_lines[line_numbers[i]:line_numbers[i+1]]
        #print("content from chunk_text ",content)
        content = "\n".join(content)
        if (hash_tag not in chunks):
            chunks[hash_tag] = (chunk_type, content)        
    #print("Output chunks = ", chunks)
    return chunks

def paragraph_splitter(input_text):
    paragraphs = re.split(r"\n\s*\n",input_text);
    clean_paragraphs = []
    for p in paragraphs:
        p = p.strip()
        if (len(p) > 0):
            clean_paragraphs.append(p)
    return clean_paragraphs
        
def tree_iterator(doc_node):
    if (isinstance(doc_node, dict)):
        for key in doc_node.keys():
            doc_node[key] = tree_iterator(doc_node[key])
        return doc_node
    elif (isinstance(doc_node, list)):
        for i in range(len(doc_node)):
            item = doc_node[i]
            if (isinstance(item, str)):
                #print(item)
                chunks = text_chunks(item)
                para_table = []
                for key in chunks.keys():
                    if (chunks[key][0] == "text"):
                        #print(chunks[key][1])
                        paragraphs = paragraph_splitter(chunks[key][1])
                        for p in paragraphs:
                            print("P: ",p)
                            # Split into sentences
                            sentences = p.split(". ")
                            para_table.append(sentences)
                    else:
                        print("P: ",chunks[key][1])
                        para_table.append(chunks[key][1])
                item = para_table
                doc_node[i] = item
            else:
                doc_node[i] = tree_iterator(item)
        return doc_node
    else:
        #print(doc_node)
        chunks = text_chunks(doc_node)
        para_table = []
        for key in chunks.keys():
            if (chunks[key][0] == "text"):
                #print(chunks[key][1])
                paragraphs = paragraph_splitter(chunks[key][1])
                for p in paragraphs:
                    print("P: ",p)
                    # Split into sentences
                    sentences = p.split(". ")
                    para_table.append(sentences)
            else:
                print("P: ",chunks[key][1])
                para_table.append(chunks[key][1])
        doc_node = para_table
        return doc_node
        
def sentence_iterator(doc_node):
    if (isinstance(doc_node, dict)):
        for key in doc_node.keys():
            sentence_iterator(doc_node[key])
    elif (isinstance(doc_node, list)):
        for item in doc_node:
            if (isinstance(item, str)):
                print(item)
            else:
                sentence_iterator(item)
    else:
        print(doc_node)
    
# Example markdown text with two tables
markdown_text = """
# Sample Doc in Markdown Format

Chapter_topic_sentence

## Making your doc RAG-Ready

<pre>
Author: James Lee
        ABC Inc
        101 Dove Canyon, CA 92127
        
        April 1, 2024
</pre>

# Chapter 1: Introduction

More text here.

<pre>
Second Preformatted Text
More lines
End of second text
</pre>

Conclusion here.

## Section 1: What is a RAG-ready format?

In this section, we will explain what is the RAG-ready markdown format for a document.

This is an example table
|Table Row |Example Column1|Example Column2|
|---|---|---|
|Row1 |Col11|Col12|
|Row2|Col21|Col22|
Summary of table one.
Additional explanation follows here.
Details to be noted.

## Section 2: Additional Information

Further Example of Table
|Header |Data1|Data2|
|---|---|---|
|Info1 |D11|D12|
|Info2|D21|D22|
Insights about the second table.
Further elaboration is given here.

Continue with more content here after the tables.
"""

# Extract the tables and print each
extracted_tables = extract_all_markdown_tables(markdown_text)
for i, table in enumerate(extracted_tables, 1):
    print(f"Table {i}:")
    print(f"Title: {table['title']}")
    print(f"Content:\n{table['content']}")
    if table['notes']:
        print(f"Notes:\n{table['notes']}")
    print(f"Start Line: {table['start_line']}, End Line: {table['end_line']}")
    print()  # Just to add a space between outputs

# Extract pre-formatted text
extracted_text = extract_preformatted_text(markdown_text)
for result in extracted_text:
    print(f"Start Line: {result[0]}, End Line: {result[1]}")
    print(result[2])
    print("---------------")


Table 1:
Title: This is an example table
Content:
|Row2|Col21|Col22|
Notes:
Summary of table one.
Additional explanation follows here.
Details to be noted.
Start Line: 31, End Line: 38

Table 2:
Title: Further Example of Table
Content:
|Info2|D21|D22|
Notes:
Insights about the second table.
Further elaboration is given here.
Start Line: 42, End Line: 48

Start Line: 7, End Line: 13
<pre>
Author: James Lee
        ABC Inc
        101 Dove Canyon, CA 92127
        
        April 1, 2024
</pre>
---------------
Start Line: 19, End Line: 23
<pre>
Second Preformatted Text
More lines
End of second text
</pre>
---------------


In [27]:
doc_text = """
# Sample Doc in Markdown Format

Topic summary: an example of markdown doc. 

## Making your doc RAG-Ready

<pre>
Author: James Lee
        ABC Inc
        101 Dove Canyon, CA 92127
        
April 1, 2024

</pre>

More text here.

<pre>
Second Preformatted Text
More lines
End of second text
</pre>

# Chapter 1: Introduction

This sample markdown doc can be used as a template for creating a RAG-ready document. The RAG-ready document can optionally contains prompt instructions specific to the document. For example, how to parse data, how to generate. For pre

## Section 1: What is a RAG-ready format?

In this section, we will explain what is the RAG-ready markdown format for a document.

This is an example table
|Table Row |Example Column1|Example Column2|
|---|---|---|
|Row1 |Col11|Col12|
|Row2|Col21|Col22|
Summary of table one.
Additional explanation follows here.
Details to be noted.

Further Example of Table
|Header |Data1|Data2|
|---|---|---|
|Info1 |D11|D12|
|Info2|D21|D22|
Insights about the second table.
Further elaboration is given here.

Continue with more content here after the tables.

More content follows.

### SubSection 1:

A document in RAG-ready markdown format means that this document is well organized in property hierarchical structure so that the Retrieval Augmented Generation (RAG) application can build the most efficient and accurate Q&A system and knowledge system that powered by LLMs. 

An advanced RAG-application can take advantage of the hierarchical structures of chapter->section->sub-section->paragraph->sentence for building a robust Q&A system: more accurate document chunk retrieval with the relevant details and in the meantime, also keep a holistic view of more document context. 

#### subsubsection 1
Test1

##### sub3_section 1
test sub3 section

Add some more text.

##### sub3_section 2
Test sub3 section2

#### subsubsection 2
Test2

### SubSection 2:
A RAG-ready document can be eitehr in markdown or HTML format as long as the hierarchical structure is expressed in the format. In markdow, the hierarchical structure can be expressed by the level of headers. 

## Section 2: Why the RAG-ready is important?

RAG-ready format can garantee the hierarchical structure of a document can be preserved and the document can be properly processed into chunks and create proper meta data for each chunk. A clean and well structured information is crucial for the RAG performance.

# Chapter 2: How to create a RAG-ready document?

This chapter explain how to create a RAG-ready document.

## Section 1: raw document formats and tools

The raw document format can be in many different forms, such as PDF, Word, PPT or even plain text. 
Depending on the the original document format, one can choose the corresponding tools to process.

For example, if the raw document is in PDF, one can use Llambda-Parse (cloud service) to covert it into markdown format. Other output format is also possible. 

## Section 2: data process procedures

No tools are perfect. Although Llama-Parse is fast and conveninet to use, the generated markdown output may still have a lot of errors. The best way to clean the error is by human corrections. This includes a lot of editing. 

Certain part of document has pre-formatted, thus this pre-foramt should be reserved in the RAG-ready format. 

# Conclusion

In this short document, the RAG-ready format is explained. 

"""

In [28]:
doc_root = chunk_top(doc_text, "Sample Markdown Doc")
print("----------------------------------------------------------------------------------")
doc_tree = tree_iterator(doc_root)

Chapter 0 Sample Doc in Markdown Format

Topic summary: an example of markdown doc. 

## Making your doc RAG-Ready

<pre>
Author: James Lee
        ABC Inc
        101 Dove Canyon, CA 92127
        
April 1, 2024

</pre>

More text here.

<pre>
Second Preformatted Text
More lines
End of second text
</pre>
End of C0
input chunk Sample Doc in Markdown Format

Topic summary: an example of markdown doc. 

## Making your doc RAG-Ready

<pre>
Author: James Lee
        ABC Inc
        101 Dove Canyon, CA 92127
        
April 1, 2024

</pre>

More text here.

<pre>
Second Preformatted Text
More lines
End of second text
</pre>
chunks ['Sample Doc in Markdown Format\n\nTopic summary: an example of markdown doc.', 'Making your doc RAG-Ready\n\n<pre>\nAuthor: James Lee\n        ABC Inc\n        101 Dove Canyon, CA 92127\n        \nApril 1, 2024\n\n</pre>\n\nMore text here.\n\n<pre>\nSecond Preformatted Text\nMore lines\nEnd of second text\n</pre>']
Topic from chunk_recursive =  Sample Doc in Markd

In [8]:
sentence_iterator(doc_tree)

Sample Markdown Doc
Sample Doc in Markdown Format
Topic summary: an example of markdown doc.
<pre>
Author: James Lee
        ABC Inc
        101 Dove Canyon, CA 92127
        
April 1, 2024

</pre>
<pre>
Second Preformatted Text
More lines
End of second text
</pre>
Making your doc RAG-Ready
More text here.
Chapter 1: Introduction
This sample markdown doc can be used as a template for creating a RAG-ready document
The RAG-ready document can optionally contains prompt instructions specific to the document
For example, how to parse data, how to generate
For pre
This is an example table
|Table Row |Example Column1|Example Column2|
|---|---|---|
|Row1 |Col11|Col12|
|Row2|Col21|Col22|
Summary of table one.
Additional explanation follows here.
Details to be noted.
Further Example of Table
|Header |Data1|Data2|
|---|---|---|
|Info1 |D11|D12|
|Info2|D21|D22|
Insights about the second table.
Further elaboration is given here.
Section 1: What is a RAG-ready format?
In this section, we will explai

In [9]:
import re
from collections import defaultdict

class Node:
    def __init__(self, level, header, parent=None):
        self.level = level
        self.header = header
        self.parent = parent
        self.content = []
        self.children = []

    def __repr__(self):
        return f"{self.header} ({len(self.children)} subsections)"

def parse_markdown_to_tree(markdown_text):
    lines = markdown_text.split('\n')
    root = Node(level=0, header='root')
    current_node = root

    content_block = ''
    in_pre_block = False
    pre_block = ''

    for line in lines:
        if line.strip().startswith('<pre>') and not in_pre_block:
            in_pre_block = True
            pre_block += line+"\n"
            continue
        elif '</pre>' in line and in_pre_block:
            in_pre_block = False
            pre_block += line+"\n"
            current_node.content.append(pre_block.strip())
            pre_block = ''
            continue
        elif in_pre_block:
            pre_block += line + '\n'
            continue

        match = re.match(r'^(#+)\s*(.*)', line)
        if match:
            if content_block:
                current_node.content.append(content_block.strip())
                content_block = ''
            level = len(match.group(1))
            header = match.group(2).strip()
            # Create new node
            new_node = Node(level=level, header=header, parent=current_node)
            # Find correct parent for the new node
            while current_node.level >= level:
                current_node = current_node.parent
            current_node.children.append(new_node)
            current_node = new_node
        else:
            if line.strip() == '' and content_block:
                current_node.content.append(content_block.strip())
                content_block = ''
            else:
                content_block += line + '\n'

    if content_block:  # Append last block if exists
        current_node.content.append(content_block.strip())

    return root

# Example Markdown text
markdown_text = """# Introduction

Welcome to our guide.

Here we will cover several important aspects.

<pre>
Author    James Lee
          ABC Inc, 
          USA

April 25, 2024</pre>

## Setup Instructions

First, ensure you have the following tools installed:

- Tool A
- Tool B

Please follow these steps to get started.

## Configuration

Modify the configuration files as shown below:

config_setting_1 = true
config_setting_2 = false

Remember to restart the service after changing the config files.

## Data Format

Our system uses the following data structure:

Table 1: Table of names and their values.
| ID | Name   | Value |
|----|--------|-------|
| 1  | Item 1 | 100   |
| 2  | Item 2 | 200   |
Note: more data will be added.

Please make sure your data conforms to this table.

## Summary

This guide should help you get started with the basic setup and configuration.

For more details, visit our [website](http://example.com).

Thank you for reading!

"""

# Parse the Markdown and build the tree
tree = parse_markdown_to_tree(doc_text)

# Function to print tree for visualization
def print_tree(node, indent=""):
    content_summary = ' | '.join(node.content[:5]) + ('...' if len(node.content) > 5 else '')
    print(f"{indent}{node.header}: {content_summary} ({len(node.children)} children)")
    for child in node.children:
        print_tree(child, indent + "  ")

print_tree(tree)


root:  (4 children)
  Sample Doc in Markdown Format: Topic summary: an example of markdown doc. (1 children)
    Making your doc RAG-Ready: <pre>
Author: James Lee
        ABC Inc
        101 Dove Canyon, CA 92127
        
April 1, 2024

</pre> |  | More text here. | <pre>
Second Preformatted Text
More lines
End of second text
</pre> |  (0 children)
  Chapter 1: Introduction: This sample markdown doc can be used as a template for creating a RAG-ready document. The RAG-ready document can optionally contains prompt instructions specific to the document. For example, how to parse data, how to generate. For pre (2 children)
    Section 1: What is a RAG-ready format?: In this section, we will explain what is the RAG-ready markdown format for a document. | This is an example table
|Table Row |Example Column1|Example Column2|
|---|---|---|
|Row1 |Col11|Col12|
|Row2|Col21|Col22|
Summary of table one.
Additional explanation follows here.
Details to be noted. | Further Example of Table
|Header |

In [10]:
print(len(tree.children))

4


In [11]:
print(len(tree.children[1].children[0].children))

2


In [12]:
print(tree.children[1].children[0].content)
print("------")
print(tree.children[1].children[1].content)


['In this section, we will explain what is the RAG-ready markdown format for a document.', 'This is an example table\n|Table Row |Example Column1|Example Column2|\n|---|---|---|\n|Row1 |Col11|Col12|\n|Row2|Col21|Col22|\nSummary of table one.\nAdditional explanation follows here.\nDetails to be noted.', 'Further Example of Table\n|Header |Data1|Data2|\n|---|---|---|\n|Info1 |D11|D12|\n|Info2|D21|D22|\nInsights about the second table.\nFurther elaboration is given here.', 'Continue with more content here after the tables.', 'More content follows.']
------
['RAG-ready format can garantee the hierarchical structure of a document can be preserved and the document can be properly processed into chunks and create proper meta data for each chunk. A clean and well structured information is crucial for the RAG performance.']


In [13]:
def find_full_context(node):
    context = node.header
    while node.parent:
        if (node.header != ""):
            context = node.parent.header + " - "+ context
        node = node.parent
    return context

def content_iterator(tree):
    if (isinstance(tree, Node)):
        full_context = find_full_context(tree)
        print("Level ", tree.level, "Header:",full_context)
        print("Content:",tree.content)
        for c in tree.children:
            content_iterator(c)

def get_node_content(node):
    tag = "##########"
    level = int(node.level)
    header_tag = tag[:level]+" "
    content = header_tag + node.header + "\n\n" + "\n\n".join(node.content)
    for c in node.children:
        content = content + "\n\n" + get_node_content(c)
    return content
    
tree.header = "Mardown Ready Doc"
content_iterator(tree)

Level  0 Header: Mardown Ready Doc
Content: ['']
Level  1 Header: Mardown Ready Doc - Sample Doc in Markdown Format
Content: ['Topic summary: an example of markdown doc.']
Level  2 Header: Mardown Ready Doc - Sample Doc in Markdown Format - Making your doc RAG-Ready
Content: ['<pre>\nAuthor: James Lee\n        ABC Inc\n        101 Dove Canyon, CA 92127\n        \nApril 1, 2024\n\n</pre>', '', 'More text here.', '<pre>\nSecond Preformatted Text\nMore lines\nEnd of second text\n</pre>', '']
Level  1 Header: Mardown Ready Doc - Sample Doc in Markdown Format - Making your doc RAG-Ready - Chapter 1: Introduction
Content: ['This sample markdown doc can be used as a template for creating a RAG-ready document. The RAG-ready document can optionally contains prompt instructions specific to the document. For example, how to parse data, how to generate. For pre']
Level  2 Header: Mardown Ready Doc - Sample Doc in Markdown Format - Making your doc RAG-Ready - Chapter 1: Introduction - Section 1: Wh

In [14]:
print(get_node_content(tree.children[3]))

# Conclusion

In this short document, the RAG-ready format is explained.




In [15]:
with open('data/vb2000_short.md','r') as file:
    vb2000_content = file.read()
print(len(vb2000_content))

80004


In [16]:
#Parse the Markdown and build the tree
tree = parse_markdown_to_tree(vb2000_content)


In [17]:
print(get_node_content(tree))

 root



# VB2000



## An ab initio Valence Bond Program based on the Generalized Product Function Method and the Algebrant Algorithm Version 3.0

Jiabo Li 1, Brian Duke 2, Roy McWeeny 3, David W. O. de Sousa 4 , and Rodrigo S. Bitzer 4

1 SciNet Technologies, 9943 Fieldthorn St., San Diego CA 92127, USA

2 Monash Institute of Pharmaceutical Sciences, Monash University 381 Royal Pde, Parkville, Victoria, 3052, Australia

3 Department of Chemistry, University of Pisa, 56100 Pisa, ITALY

4 Chemistry Institute, Federal University of Rio de Janeiro, Brazil.

Date code finalized: September 2021

Date of most recent change in manual: August 30th, 2021

Copyright © 2000-2017 by Jiabo Li, Brian Duke and Roy McWeeny All Rights Reserved.

Copyright (C) 2018- Jiabo Li, Brian Duke, and Roy McWeeny

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, 

In [33]:
# vector DB
# each record: parent_node_id, content_id, vector, metadata: doc_id, headers, header_vector
# list of nodes, list of contents, 
class rag_chunk:
    def __init__(self, p_id,c_id, header, content, h_vector, c_vector):
        self.p_id = p_id
        self.c_id = c_id
        self.header = header
        self.content = content
        self.h_vector = h_vector
        self.c_vector = c_vector

    def __str__(self):
        return f"{self.header}, {self.p_id}, {self.c_id}, {self.header}, {self.content}"

def get_node_list(tree,node_list = []):
    node_list.append(tree)
    for c in tree.children:
        get_node_list(c,node_list)

def add_node_id(tree):
    node_list = []
    get_node_list(tree,node_list)
    for node_id in range(len(node_list)):
        node_list[node_id].node_id = node_id

def node_iterator(node, node_list=[],content_list=[]):
    if (isinstance(node, Node)):
        node_list.append(node)
        node_id = len(node_list)-1
        for cont in node.content:
            if (len(cont)  == 0):
                continue
            content = rag_chunk(node.parent.node_id, -1, node.header, cont, [0.0,0.0],[0.1,0.1])
            content_list.append(content)
        for c in node.children:
            node_iterator(c, node_list, content_list)    

cont_list = []
def content_iterator(node):
    if (isinstance(node, Node)):
        print("header:",node.header)
        cont_list.append(node.header)
        for cont in node.content:
            if (len(cont)  == 0):
                continue
            print("c:",cont)
            cont_list.append(cont)
        for c in node.children:
            content_iterator(c)       

In [34]:
content_iterator(tree)

header: root
header: VB2000
header: An ab initio Valence Bond Program based on the Generalized Product Function Method and the Algebrant Algorithm Version 3.0
c: Jiabo Li 1, Brian Duke 2, Roy McWeeny 3, David W. O. de Sousa 4 , and Rodrigo S. Bitzer 4
c: 1 SciNet Technologies, 9943 Fieldthorn St., San Diego CA 92127, USA
c: 2 Monash Institute of Pharmaceutical Sciences, Monash University 381 Royal Pde, Parkville, Victoria, 3052, Australia
c: 3 Department of Chemistry, University of Pisa, 56100 Pisa, ITALY
c: 4 Chemistry Institute, Federal University of Rio de Janeiro, Brazil.
c: Date code finalized: September 2021
c: Date of most recent change in manual: August 30th, 2021
c: Copyright © 2000-2017 by Jiabo Li, Brian Duke and Roy McWeeny All Rights Reserved.
c: Copyright (C) 2018- Jiabo Li, Brian Duke, and Roy McWeeny
c: This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, e

In [36]:
print(len(cont_list))
print(cont_list)

325
['root', 'VB2000', 'An ab initio Valence Bond Program based on the Generalized Product Function Method and the Algebrant Algorithm Version 3.0', 'Jiabo Li 1, Brian Duke 2, Roy McWeeny 3, David W. O. de Sousa 4 , and Rodrigo S. Bitzer 4', '1 SciNet Technologies, 9943 Fieldthorn St., San Diego CA 92127, USA', '2 Monash Institute of Pharmaceutical Sciences, Monash University 381 Royal Pde, Parkville, Victoria, 3052, Australia', '3 Department of Chemistry, University of Pisa, 56100 Pisa, ITALY', '4 Chemistry Institute, Federal University of Rio de Janeiro, Brazil.', 'Date code finalized: September 2021', 'Date of most recent change in manual: August 30th, 2021', 'Copyright © 2000-2017 by Jiabo Li, Brian Duke and Roy McWeeny All Rights Reserved.', 'Copyright (C) 2018- Jiabo Li, Brian Duke, and Roy McWeeny', 'This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either versio

In [42]:
# Tokenlization
cont_tokens = []
for c in cont_list:
    c = c.lower()
    if (c[-1] == '.'):
        c = c[:-1]
    cont_tokens.append(c.split())
print(cont_tokens[30])
    

['the', 'development', 'of', 'vb2000', 'has', 'been', 'motivated', 'by', 'two', 'main', 'considerations.', 'first,', 'there', 'is', 'a', 'need', 'to', 'obtain', 'high-precision', 'electronic', 'wave', 'functions,', 'capable', 'of', 'giving', 'quantitative', 'substance', 'to', 'the', 'empirical', 'and', 'intuitive', 'ideas', 'distilled', 'from', 'more', 'than', 'two', 'hundred', 'years', 'of', 'experimental', 'chemistry.', 'there', 'exist', 'within', 'molecules', '"structural', 'units"', 'such', 'as', 'chemical', 'bonds', 'and', 'functional', 'groups,', 'often', 'with', 'highly', 'individual', 'properties,', 'which', 'are', 'transferable', 'from', 'one', 'environment', 'to', 'another', 'with', 'little', 'change;', 'the', 'behavior', 'of', 'such', 'units', 'conforms,', 'in', 'many', 'cases,', 'to', 'empirical', '"additivity', 'rules"', 'which', 'have', 'never', 'received', 'a', 'convincing', 'theoretical', 'explanation.', 'second,', 'although', 'valence', 'bond', 'theory,', 'as', 'develo

In [19]:
add_node_id(tree)
node_list = []
content_list = []
node_iterator(tree, node_list, content_list)

In [24]:
print(len(node_list))
print(len(content_list))

for c in content_list:
    print(c.header)
    print(c.content)

68
257
An ab initio Valence Bond Program based on the Generalized Product Function Method and the Algebrant Algorithm Version 3.0
Jiabo Li 1, Brian Duke 2, Roy McWeeny 3, David W. O. de Sousa 4 , and Rodrigo S. Bitzer 4
An ab initio Valence Bond Program based on the Generalized Product Function Method and the Algebrant Algorithm Version 3.0
1 SciNet Technologies, 9943 Fieldthorn St., San Diego CA 92127, USA
An ab initio Valence Bond Program based on the Generalized Product Function Method and the Algebrant Algorithm Version 3.0
2 Monash Institute of Pharmaceutical Sciences, Monash University 381 Royal Pde, Parkville, Victoria, 3052, Australia
An ab initio Valence Bond Program based on the Generalized Product Function Method and the Algebrant Algorithm Version 3.0
3 Department of Chemistry, University of Pisa, 56100 Pisa, ITALY
An ab initio Valence Bond Program based on the Generalized Product Function Method and the Algebrant Algorithm Version 3.0
4 Chemistry Institute, Federal Univers

In [20]:
for node in node_list:
    print("Node ID = ", node.node_id)

c_list = []
h_list = []
for content in content_list:
    c_list.append(content.content)
    h_list.append(content.header)
    print(content)

Node ID =  0
Node ID =  1
Node ID =  2
Node ID =  3
Node ID =  4
Node ID =  5
Node ID =  6
Node ID =  7
Node ID =  8
Node ID =  9
Node ID =  10
Node ID =  11
Node ID =  12
Node ID =  13
Node ID =  14
Node ID =  15
Node ID =  16
Node ID =  17
Node ID =  18
Node ID =  19
Node ID =  20
Node ID =  21
Node ID =  22
Node ID =  23
Node ID =  24
Node ID =  25
Node ID =  26
Node ID =  27
Node ID =  28
Node ID =  29
Node ID =  30
Node ID =  31
Node ID =  32
Node ID =  33
Node ID =  34
Node ID =  35
Node ID =  36
Node ID =  37
Node ID =  38
Node ID =  39
Node ID =  40
Node ID =  41
Node ID =  42
Node ID =  43
Node ID =  44
Node ID =  45
Node ID =  46
Node ID =  47
Node ID =  48
Node ID =  49
Node ID =  50
Node ID =  51
Node ID =  52
Node ID =  53
Node ID =  54
Node ID =  55
Node ID =  56
Node ID =  57
Node ID =  58
Node ID =  59
Node ID =  60
Node ID =  61
Node ID =  62
Node ID =  63
Node ID =  64
Node ID =  65
Node ID =  66
Node ID =  67
An ab initio Valence Bond Program based on the Generalized

In [21]:
# Do the embedding for both content and header
from openai import OpenAI
import numpy as np
import time
import pandas as pd
import tiktoken
import os

from utils.embeddings_utils import get_embedding

client = OpenAI()

def normalize_l2(x):
    x = np.array(x)
    if x.ndim == 1:
        norm = np.linalg.norm(x)
        if norm == 0:
            return x
        return x / norm
    else:
        norm = np.linalg.norm(x, 2, axis=1, keepdims=True)
        return np.where(norm == 0, x, x / norm)


time1 = time.time()

input_str = ['This is a test','Test OpenAI API']
response = client.embeddings.create(
    model="text-embedding-3-small", input=c_list, encoding_format="float", dimensions=1536
)
print("Time = ", time.time()-time1)

print(len(response.data))
print(len(response.data[1].embedding))

Time =  2.461479902267456
257
1536


In [22]:
print(len(c_list[23]))

1052


In [23]:
c_vectors = []
h_vectors = []
for i in range(len(response.data)):
    c_vectors.append(normalize_l2(response.data[i].embedding))

In [24]:
response = client.embeddings.create(
    model="text-embedding-3-small", input=h_list, encoding_format="float", dimensions=1536
)
print("Time = ", time.time()-time1)

print(len(response.data))
print(len(response.data[1].embedding))

Time =  13.255935907363892
257
1536


In [25]:
for i in range(len(response.data)):
    h_vectors.append(normalize_l2(response.data[i].embedding))

In [26]:
for i in range(len(content_list)):
    content_list[i].h_vector = h_vectors[i]
    content_list[i].c_vector = c_vectors[i]
print(content_list[-1].c_vector)
print(content_list[-1].h_vector)

[-0.01017923 -0.01964828  0.06423174 ... -0.02368841 -0.03471986
  0.02787058]
[-0.02185916  0.00699718  0.06751248 ... -0.01714028 -0.00730928
  0.0241437 ]


In [50]:
import numpy as np

time1 = time.time()
query = "Show me an example input of VB2000. Using methane (CH4) as an example"

response = client.embeddings.create(
    model="text-embedding-3-small", input=query, encoding_format="float", dimensions=1536
)

q_vector = normalize_l2(response.data[0].embedding)
c_scores = []
h_scores = []
m_scores = []

for i in range(len(c_vectors)):
    c_scores.append(q_vector@c_vectors[i])
    h_scores.append(q_vector@h_vectors[i])
    m_scores.append(q_vector@c_vectors[i]+q_vector@h_vectors[i])
c_id = np.argmax(m_scores)

print(c_list[c_id])

print("Time = ", time.time()-time1)


The first example is a VB calculation of 8 electrons on the 4 C-H bonds of methane. To do the calculation, just copy or write the following lines into a new input file (it is already available at TESTINP/extra/extra01_ch4vb8.inp):
Time =  0.4086627960205078


In [51]:
time1 = time.time()
query = "Is this software free licensed?"

response = client.embeddings.create(
    model="text-embedding-3-small", input=query, encoding_format="float", dimensions=1536
)

q_vector = normalize_l2(response.data[0].embedding)
c_scores = []
h_scores = []
m_scores = []

for i in range(len(c_vectors)):
    c_scores.append(q_vector@c_vectors[i])
    h_scores.append(q_vector@h_vectors[i])
    m_scores.append(q_vector@c_vectors[i]+q_vector@h_vectors[i])
c_id = np.argmax(m_scores)

print(c_list[c_id])

print("Time = ", time.time()-time1)

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
Time =  0.6236920356750488


In [53]:
import math
from collections import Counter
from typing import List

class BM25:
    def __init__(self, corpus: List[List[str]], k1=1.5, b=0.75):
        self.corpus = corpus
        self.corpus_size = len(corpus)
        self.avgdl = sum(len(doc) for doc in corpus) / self.corpus_size
        self.doc_freqs = []
        self.idf = {}
        self.k1 = k1
        self.b = b
        self.inv_index = {}
        self.initialize()

    def initialize(self):
        df = {}
        invdex ={}
        # Term frequency in a doc
        for i, document in enumerate(self.corpus):
            # Count frequencies of terms in documents
            frequencies = Counter(document)
            self.doc_freqs.append(frequencies)
            print("Frequences = ",frequencies)
            
            # Document frequency calculation for terms (words)
            for word, freq in frequencies.items():
                if word in df:
                    df[word] += 1
                else:
                    df[word] = 1
                if (word in invdex):
                    doc_list = invdex[word]
                    doc_list.append(i)
                    invdex[word] = doc_list
                else:
                    invdex[word] = [i]   
        self.inv_index = invdex
        # Calculating inverse document frequency
        # Make sure it is always greater than 1, so that the log is a positive value
        for word, freq in df.items():
            self.idf[word] = math.log((self.corpus_size - freq + 0.5) / (freq + 0.5) + 1)

    def get_score(self, document: List[str], query: List[str]):
        score = 0.0
        doc_len = len(document)
        frequencies = Counter(document)

        for word in query:
            if word in frequencies:
                tf = frequencies[word]
                # Term frequency and document length normalization
                denom = tf + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
                score += self.idf.get(word, 0) * tf * (self.k1 + 1) / denom

        return score

    def get_scores(self, query: List[str]):
        scores = [self.get_score(doc, query) for doc in self.corpus]
        return scores

# Example usage
docs = [["the", "quick", "brown", "fox"], ["jumped", "over", "the", "lazy", "dog", "the","fox", "was","jumping","around"]]
bm25 = BM25(cont_tokens)
query = "Example input of Methane"
query_words = query.split(" ")
scores = bm25.get_scores(query_words)
print(scores)

idx = np.argmax(scores)
print(np.argmax(scores))
print(cont_list[idx])

Frequences =  Counter({'root': 1})
Frequences =  Counter({'vb2000': 1})
Frequences =  Counter({'the': 2, 'an': 1, 'ab': 1, 'initio': 1, 'valence': 1, 'bond': 1, 'program': 1, 'based': 1, 'on': 1, 'generalized': 1, 'product': 1, 'function': 1, 'method': 1, 'and': 1, 'algebrant': 1, 'algorithm': 1, 'version': 1, '3.0': 1})
Frequences =  Counter({'4': 2, 'jiabo': 1, 'li': 1, '1,': 1, 'brian': 1, 'duke': 1, '2,': 1, 'roy': 1, 'mcweeny': 1, '3,': 1, 'david': 1, 'w.': 1, 'o.': 1, 'de': 1, 'sousa': 1, ',': 1, 'and': 1, 'rodrigo': 1, 's.': 1, 'bitzer': 1})
Frequences =  Counter({'1': 1, 'scinet': 1, 'technologies,': 1, '9943': 1, 'fieldthorn': 1, 'st.,': 1, 'san': 1, 'diego': 1, 'ca': 1, '92127,': 1, 'usa': 1})
Frequences =  Counter({'monash': 2, '2': 1, 'institute': 1, 'of': 1, 'pharmaceutical': 1, 'sciences,': 1, 'university': 1, '381': 1, 'royal': 1, 'pde,': 1, 'parkville,': 1, 'victoria,': 1, '3052,': 1, 'australia': 1})
Frequences =  Counter({'of': 2, 'pisa,': 2, '3': 1, 'department': 1, 

In [17]:
bm25.inv_index

{'the': [0, 1],
 'quick': [0],
 'brown': [0],
 'fox': [0, 1],
 'jumped': [1],
 'over': [1],
 'lazy': [1],
 'dog': [1],
 'was': [1],
 'jumping': [1],
 'around': [1]}