In [None]:
with open("explore/Test.md") as f:
    doc_md = f.read()
print(doc_md)

In [None]:
import re
import numpy as np
import pickle
import time

from collections import defaultdict
from FlagEmbedding import BGEM3FlagModel
from sklearn.metrics.pairwise import cosine_similarity

class Content:
    def __init__(self, content_text, content_type, parent=None):
        self.content_text = content_text
        self.sentences = []
        self.content_type = content_type  # paragraph, table, pre-formatted, figure description, etc.
        self.parent = parent  # parent node
        self.create_sentence_list()

    def __repr__(self):
        return f"Type: {self.content_type}, Content: {self.content_text}\n"

    def split_to_sentences(self, language='english'):
        if self.content_type == 'paragraph':
            self.sentences = split_paragraph_into_sentences(self.content_text, language)
            return self.sentences
        else:
            return None

    def create_sentence_list(self):
        if self.content_type == 'paragraph':
            self.sentences = split_paragraph_into_sentences(self.content_text)
        elif self.content_type == 'table':
            # Extract sentences from table title or notes (if any)
            table_notes = re.findall(r'Note: (.*?)$', self.content_text, re.MULTILINE)
            self.sentences = [note.strip() for note in table_notes if note.strip()]
        elif self.content_type == 'pre-formatted':
            # Extract comments from code
            comments = re.findall(r'#.*|//.*|/\*.*?\*/', self.content_text, re.DOTALL)
            self.sentences = [comment.strip() for comment in comments if comment.strip()]

    def get_function_names(self):
        # For pre-formatted text, extract Python function names
        if self.content_type != 'pre-formatted':
            return []
        pattern = r'def\s+([a-zA-Z_]\w*)\s*\('
        return re.findall(pattern, self.content_text)

class Node:
    def __init__(self, level, header, parent=None):
        self.level = level
        self.header = header
        self.parent = parent
        self.content = ""
        self.chunk_list = []
        self.children = []
        self.header_embedding = None
        self.content_embeddings = None

    def __repr__(self):
        return f"{self.header} ({len(self.children)} subsections)"

def load_embedding_model(model_path='../bge-m3.model'):
    with open(model_path, 'rb') as f:
        return pickle.load(f)

def normalize_l2(x):
    x = np.array(x)
    norm = np.linalg.norm(x, axis=-1, keepdims=True)
    return np.where(norm == 0, x, x / norm)

def split_markdown(text):
    # Define regular expressions for tables, pre-formatted text, and paragraphs
    table_regex = r'((?:\|.*?\|\n)+)'  # Markdown table pattern
    preformatted_regex = r'(```[\s\S]*?```|<pre>[\s\S]*?</pre>)'  # Code blocks
    
    # Split text into pre-formatted and other content
    chunks = re.split(preformatted_regex, text)
    result = []

    for chunk in chunks:
        if re.match(preformatted_regex, chunk):
            result.append({'type': 'pre-formatted', 'content': chunk})
        else:
            parts = re.split(table_regex, chunk)
            for part in parts:
                if re.match(table_regex, part):
                    result.append({'type': 'table', 'content': part})
                else:
                    paragraphs = re.split(r'\n{2,}', part)
                    result.extend([{'type': 'paragraph', 'content': p} for p in paragraphs if p.strip()])
    
    return result

def split_paragraph_into_sentences(paragraph, language='english'):
    # Split paragraph into sentences using common punctuation patterns
    sentence_regex = r'(?<=[.!?])\s+(?=[A-Z])'
    sentences = re.split(sentence_regex, paragraph)
    return [s.strip() for s in sentences if s.strip()]

def parse_markdown_to_tree(markdown_text):
    lines = markdown_text.split('\n')
    root = Node(level=0, header='root')
    current_node = root
    content_block = ''

    for line in lines:
        match = re.match(r'^(#+)\s*(.*)', line)
        if match:
            if content_block:
                add_content_to_node(current_node, content_block)
                content_block = ''
            level = len(match.group(1))
            header = match.group(2).strip()
            # Create new node
            new_node = Node(level=level, header=header, parent=current_node)
            # Find correct parent for the new node
            while current_node.level >= level:
                current_node = current_node.parent
            current_node.children.append(new_node)
            current_node = new_node
        else:
            content_block += line + '\n'

    if content_block:
        add_content_to_node(current_node, content_block)

    return root

def add_content_to_node(node, content_block):
    content_block = content_block.strip()
    node.content = content_block
    chunks = split_markdown(content_block)
    for chunk in chunks:
        content_type = chunk['type']
        content_text = chunk['content']
        node.chunk_list.append(Content(content_text, content_type, node))

def create_embeddings_for_tree(root_node, embedding_model):
    def traverse_and_embed(node):
        # Create embeddings for header and content
        node.header_embedding = embedding_model.encode([node.header])['dense_vecs']
        if node.chunk_list:
            sentences = [sentence for chunk in node.chunk_list for sentence in chunk.sentences]
            if sentences:
                node.content_embeddings = embedding_model.encode(sentences)['dense_vecs']
        # Recursively create embeddings for child nodes
        for child in node.children:
            traverse_and_embed(child)
    
    traverse_and_embed(root_node)

def collect_all_embeddings(root_node):
    embeddings_with_nodes = []

    def traverse_and_collect(node):
        if node.header_embedding is not None:
            embeddings_with_nodes.append((node.header_embedding, node))
        if node.content_embeddings is not None:
            embeddings_with_nodes.append((node.content_embeddings, node))
        for child in node.children:
            traverse_and_collect(child)
    
    traverse_and_collect(root_node)
    return embeddings_with_nodes

def print_tree(node, indent=""):
    hashes = "##########"
    hash_tag = hashes[:node.level]
    if node.header != "root":
        print(f"{hash_tag} {node.header}\n")
    for child in node.children:
        print_tree(child, indent + "  ")

def string_tree(node, tree_str=""):
    hashes = "##########"
    hash_tag = hashes[:node.level]
    if node.header != "root":
        tree_str += f"{hash_tag} {node.header}\n\n"
    tree_str += f"{node.content}\n\n"
    for child in node.children:
        tree_str += string_tree(child)
    return tree_str

def calculate_similarity_score(query_embedding, content_embeddings, header_embedding, weight_header=0.3, weight_content=0.7):
    # Calculate cosine similarity between the query and content embeddings using dot product for normalized vectors
    content_similarity = np.dot(query_embedding, content_embeddings.T).max() if content_embeddings is not None else 0
    header_similarity = np.dot(query_embedding, header_embedding.T)
    
    # Weighted sum of similarities
    score = weight_header * header_similarity + weight_content * content_similarity
    return score

def retrieve_relevant_nodes(query, root_node, embedding_model, top_n=3, threshold=0.5):
    query_embedding = embedding_model.encode([query])['dense_vecs']
    relevant_nodes = []

    def evaluate_node(node):
        if node.header_embedding is None:
            return
        score = calculate_similarity_score(query_embedding, node.content_embeddings, node.header_embedding)
        if score >= threshold:
            relevant_nodes.append((node, score))

    def traverse_tree(node):
        evaluate_node(node)
        for child in node.children:
            traverse_tree(child)

    traverse_tree(root_node)
    relevant_nodes = sorted(relevant_nodes, key=lambda x: x[1], reverse=True)
    return [node for node, score in relevant_nodes[:top_n]]

def call_llm_api(prompt, model_name, api_url, temperature=0.1, max_tokens=1024):
    """
    Helper function to call LLM API and generate text based on the given prompt.

    Args:
        prompt (str): The input prompt for the model.
        model_name (str): Name of the model to be used.
        api_url (str): URL endpoint for the LLM API.
        temperature (float): Sampling temperature for response generation.
        max_tokens (int): Maximum number of tokens to generate.

    Returns:
        str: Generated response text.
    """
    payload = {
        'model': model_name,
        'prompt': prompt,
        'temperature': temperature,
        'max_tokens': max_tokens
    }

    response = requests.post(api_url, json=payload, stream=True)

    generated_text = ''
    if response.status_code == 200:
        for line in response.iter_lines():
            if line:
                data = json.loads(line.decode('utf-8'))
                if 'response' in data:
                    generated_text += data['response']
    else:
        print('Error:', response.status_code, response.text)

    return generated_text

def genai_by_llm(input_text, system_prompt, model_name='llama3.2-3B-16k', api_url='http://localhost:11434/api/generate', temperature=0.1, max_tokens=1024):
    """
    Generate text based on system prompt and input text.

    Args:
        input_text (str): The input text for generating the response.
        system_prompt (str): The system-level prompt to control the behavior of the LLM.
        model_name (str): Name of the LLM to be used.
        api_url (str): URL endpoint for the LLM API.
        temperature (float): Sampling temperature for response generation.
        max_tokens (int): Maximum number of tokens to generate.

    Returns:
        str: Generated response text.
    """
    prompt_text = system_prompt + "\n\n" + input_text
    return call_llm_api(prompt_text, model_name, api_url, temperature, max_tokens)

def general_answer(query, model_name='llama3.2-3B-16k', api_url='http://localhost:11434/api/generate', temperature=0.1, max_tokens=1024):
    """
    Generate an answer based solely on the user query.

    Args:
        query (str): The query for which an answer is required.
        model_name (str): Name of the LLM to be used.
        api_url (str): URL endpoint for the LLM API.
        temperature (float): Sampling temperature for response generation.
        max_tokens (int): Maximum number of tokens to generate.

    Returns:
        str: Generated response text.
    """
    return call_llm_api(query, model_name, api_url, temperature, max_tokens)


# Example Markdown text
markdown_text = """
# Introduction

Welcome to our guide.

Here we will cover several important aspects.

<pre>
Author    James Lee
          ABC Inc,
          USA

April 25, 2024
</pre>

## Setup Instructions

First, ensure you have the following tools installed:

- Tool A
- Tool B

Please follow these steps to get started.

## Configuration

Modify the configuration files as shown below:

```
config_setting_1 = true
config_setting_2 = false
```

Remember to restart the service after changing the config files.

## Data Format

Our system uses the following data structure:

Table 1: Table of names and their values.
| ID | Name   | Value |
|----|--------|-------|
| 1  | Item 1 | 100   |
| 2  | Item 2 | 200   |
Note: more data will be added.

Please make sure your data conforms to this table.

# Summary

This guide should help you get started with the basic setup and configuration.

For more details, visit our [website](http://example.com).

Thank you for reading!
"""

# Parse the Markdown and build the tree
tree = parse_markdown_to_tree(markdown_text)

# Create embeddings for the document tree
embedding_model = load_embedding_model()
create_embeddings_for_tree(tree, embedding_model)

# Collect all embeddings with their parent nodes
embeddings_with_nodes = collect_all_embeddings(tree)

# Print the content of each child node
for node in tree.children:
    tree_str = string_tree(node)
    print(tree_str)

# Example of retrieving relevant nodes for a query
query = "How do I configure the settings?"
relevant_nodes = retrieve_relevant_nodes(query, tree, embedding_model)
print("\nMost relevant nodes:\n")
for node in relevant_nodes:
    print(node.header)

In [53]:
import math
from collections import Counter
from typing import List

class BM25:
    def __init__(self, corpus: List[List[str]], k1=1.5, b=0.75):
        self.corpus = corpus
        self.corpus_size = len(corpus)
        self.avgdl = sum(len(doc) for doc in corpus) / self.corpus_size
        self.doc_freqs = []
        self.idf = {}
        self.k1 = k1
        self.b = b
        self.inv_index = {}
        self.initialize()

    def initialize(self):
        df = {}
        invdex ={}
        # Term frequency in a doc
        for i, document in enumerate(self.corpus):
            # Count frequencies of terms in documents
            frequencies = Counter(document)
            self.doc_freqs.append(frequencies)
            print("Frequences = ",frequencies)
            
            # Document frequency calculation for terms (words)
            for word, freq in frequencies.items():
                if word in df:
                    df[word] += 1
                else:
                    df[word] = 1
                if (word in invdex):
                    doc_list = invdex[word]
                    doc_list.append(i)
                    invdex[word] = doc_list
                else:
                    invdex[word] = [i]   
        self.inv_index = invdex
        # Calculating inverse document frequency
        # Make sure it is always greater than 1, so that the log is a positive value
        for word, freq in df.items():
            self.idf[word] = math.log((self.corpus_size - freq + 0.5) / (freq + 0.5) + 1)

    def get_score(self, document: List[str], query: List[str]):
        score = 0.0
        doc_len = len(document)
        frequencies = Counter(document)

        for word in query:
            if word in frequencies:
                tf = frequencies[word]
                # Term frequency and document length normalization
                denom = tf + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
                score += self.idf.get(word, 0) * tf * (self.k1 + 1) / denom

        return score

    def get_scores(self, query: List[str]):
        scores = [self.get_score(doc, query) for doc in self.corpus]
        return scores

Frequences =  Counter({'root': 1})
Frequences =  Counter({'vb2000': 1})
Frequences =  Counter({'the': 2, 'an': 1, 'ab': 1, 'initio': 1, 'valence': 1, 'bond': 1, 'program': 1, 'based': 1, 'on': 1, 'generalized': 1, 'product': 1, 'function': 1, 'method': 1, 'and': 1, 'algebrant': 1, 'algorithm': 1, 'version': 1, '3.0': 1})
Frequences =  Counter({'4': 2, 'jiabo': 1, 'li': 1, '1,': 1, 'brian': 1, 'duke': 1, '2,': 1, 'roy': 1, 'mcweeny': 1, '3,': 1, 'david': 1, 'w.': 1, 'o.': 1, 'de': 1, 'sousa': 1, ',': 1, 'and': 1, 'rodrigo': 1, 's.': 1, 'bitzer': 1})
Frequences =  Counter({'1': 1, 'scinet': 1, 'technologies,': 1, '9943': 1, 'fieldthorn': 1, 'st.,': 1, 'san': 1, 'diego': 1, 'ca': 1, '92127,': 1, 'usa': 1})
Frequences =  Counter({'monash': 2, '2': 1, 'institute': 1, 'of': 1, 'pharmaceutical': 1, 'sciences,': 1, 'university': 1, '381': 1, 'royal': 1, 'pde,': 1, 'parkville,': 1, 'victoria,': 1, '3052,': 1, 'australia': 1})
Frequences =  Counter({'of': 2, 'pisa,': 2, '3': 1, 'department': 1, 

In [17]:
# Example usage
docs = [["the", "quick", "brown", "fox"], ["jumped", "over", "the", "lazy", "dog", "the","fox", "was","jumping","around"]]
bm25 = BM25(cont_tokens)
query = "Example input of Methane"
query_words = query.split(" ")
scores = bm25.get_scores(query_words)
print(scores)

idx = np.argmax(scores)
print(np.argmax(scores))
print(cont_list[idx])

bm25.inv_index

{'the': [0, 1],
 'quick': [0],
 'brown': [0],
 'fox': [0, 1],
 'jumped': [1],
 'over': [1],
 'lazy': [1],
 'dog': [1],
 'was': [1],
 'jumping': [1],
 'around': [1]}