### Helpers

In [13]:
import hashlib
import os
from typing import Dict, List, Optional

def hash_file(content: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``content`` encoded as UTF-8."""
    encoded = content.encode('utf-8')
    return hashlib.sha256(encoded).hexdigest()

def read_file(path: str) -> str:
    """Return the full text of the file at ``path``, decoded as UTF-8."""
    with open(path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return text
    
def write_file(path: str, content: str) -> None:
    """Write ``content`` to ``path`` as UTF-8, replacing any existing file contents."""
    with open(path, mode='w', encoding='utf-8') as sink:
        sink.write(content)

def read_files_recursive(
        directory: str,
        extensions: List[str],
        ignore_folders: Optional[List[str]] = None
) -> List[str]:
    """Collect the paths of files under ``directory`` that end with one of ``extensions``.

    Parameters:
    - directory: root folder to walk.
    - extensions: file-name suffixes to keep (e.g. ``[".py", ".md"]``).
    - ignore_folders: optional text fragments; a sub-folder is skipped, together
      with everything below it, when its path *relative to* ``directory``
      contains any of the fragments. The name of ``directory`` itself (and of
      its parents) is never tested, so the search root cannot be excluded by
      accident.

    Returns:
    - List[str], the matching file paths, each formed by joining the folder
      path yielded by ``os.walk`` with the file name.
    """
    suffixes = tuple(extensions)
    ignored = tuple(ignore_folders or ())

    def is_ignored(folder: str) -> bool:
        # Only the part of the path below the search root is examined.
        relative = os.path.relpath(folder, directory)
        return any(fragment in relative for fragment in ignored)

    files = []
    for root, dirs, filenames in os.walk(directory):
        # Prune in place so os.walk never descends into ignored folders.
        dirs[:] = [d for d in dirs if not is_ignored(os.path.join(root, d))]
        files.extend(
            os.path.join(root, filename)
            for filename in filenames
            if filename.endswith(suffixes)
        )
    return files

class Document(Dict):
    """A document that contains metadata and content.

    Instances are dictionaries: fetch_documents builds them with keyword
    arguments, so each field below is stored as a dict key (``doc["path"]``).
    The annotations only declare the expected keys; they do not create
    attributes or enforce types.
    """
    # Path of the file as returned by read_files_recursive.
    path: str
    # Folder part of ``path`` (os.path.dirname).
    directory: str
    # File name part of ``path`` (os.path.basename).
    name: str
    # Text of the file, read as UTF-8.
    content: str
    # Extension component from os.path.splitext(path)[1].
    extension: str
    # Value of os.path.getsize(path).
    size: int
    # SHA-256 hex digest of ``content`` (see hash_file).
    hash: str

def fetch_documents(folder_path, extensions, ignore_folders=None) -> List[Document]:
    """Load every matching file under ``folder_path`` into a Document.

    File discovery is delegated to read_files_recursive using the given
    ``extensions`` and ``ignore_folders``. Each file is read as text and
    stored together with its path parts, its size from ``os.path.getsize``
    and the hash of its content.
    """
    def _to_document(file_path):
        # Read once; the same text feeds both the content and the hash.
        text = read_file(file_path)
        return Document(
            path=file_path,
            directory=os.path.dirname(file_path),
            name=os.path.basename(file_path),
            content=text,
            extension=os.path.splitext(file_path)[1],
            size=os.path.getsize(file_path),
            hash=hash_file(text)
        )

    matching_paths = read_files_recursive(folder_path, extensions, ignore_folders)
    return [_to_document(file_path) for file_path in matching_paths]

### LLM

In [8]:
import dspy

# Name of the model as known to the local Ollama server.
model_name = "mistral:7b-instruct"
# dspy.LM sends requests through LiteLLM, which selects the backend from a
# "<provider>/" prefix on the model string. A bare Ollama tag has no prefix and
# cannot be routed, so the Ollama chat provider is named explicitly here while
# model_name keeps the plain tag for any other use.
lm = dspy.LM(
    model=f"ollama_chat/{model_name}",
    api_base="http://localhost:11434",
    api_key="",
)
# Make this LM the default for every DSPy module created afterwards.
dspy.configure(lm=lm)

In [9]:
import ollama

def embed_text(text: str) -> List[float]:
    """Embed the given text using Ollama's embedding model.

    Parameters:
    - text: str, the text to embed.

    Returns:
    - List[float], the embedding vector produced by the "nomic-embed-text" model.
    """
    response = ollama.embed(model="nomic-embed-text", input=text)
    # ollama.embed answers with a batch under "embeddings" (one vector per
    # input); the singular "embedding" key belongs to the older
    # ollama.embeddings() call. A single string yields exactly one vector.
    return response['embeddings'][0]

In [12]:
# llm interface tools
import dspy

# DSPy signature for the code-summarization task: one input field (`code`)
# and one output field (`summary`).
# NOTE(review): DSPy appears to use a Signature's docstring and the field
# `desc` strings as prompt text, so treat them as runtime strings — confirm
# before rewording any of them.
class CodeSummary(dspy.Signature):
    """Write a detailed summary of the given code including low level details which explains the steps involved."""
    code = dspy.InputField(desc="The code to be summarized.")
    summary = dspy.OutputField(desc="Summary of the code with all the details that make up the logic.")

# Chain-of-thought module built from the CodeSummary signature.
summarize_code = dspy.ChainOfThought(CodeSummary)

# Chain-of-thought module built from an inline signature string:
# `document` in, `summary` out.
condense_summary = dspy.ChainOfThought("document -> summary")

# Chain-of-thought module built from an inline signature string:
# `context` and `question` in, `response` out.
ask_question = dspy.ChainOfThought("context, question -> response")

### RAPTOR

In [16]:
# Models

class SummaryDocument(Dict):
    """A document that contains a summary of the code.

    NOTE(review): like Document, this is a dict subclass whose annotations only
    declare the expected fields; presumably they are stored as dict keys — no
    construction site is visible here, confirm against the caller.
    """
    # The Document objects this summary covers.
    target_docs: List[Document]
    # The summary text.
    summary: str

In [None]:
from typing import Tuple
import pandas as pd

def recursive_embed_cluster_summarize(
    texts: List[SummaryDocument],
    level: int = 1,
    n_levels: int = 3
) -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Build a level-by-level hierarchy of clustered summaries.

    Each level runs embed_cluster_summarize_texts on its input and records the
    two DataFrames it returns. The values of that level's "summaries" column
    become the input of the next level. Recursion stops once ``n_levels`` is
    reached or the summaries fall into at most one distinct cluster.

    Parameters:
    - texts: List[SummaryDocument], items to process at this level.
    - level: int, current recursion level (starts at 1).
    - n_levels: int, maximum depth of recursion.

    Returns:
    - Dict[int, Tuple[pd.DataFrame, pd.DataFrame]], mapping each level reached
      to its (clusters DataFrame, summaries DataFrame) pair.
    """
    clusters_frame, summary_frame = embed_cluster_summarize_texts(texts, level)

    # This level's entry; deeper levels are merged in below.
    per_level = {level: (clusters_frame, summary_frame)}

    distinct_clusters = summary_frame["cluster"].nunique()

    # Stop when the depth limit is hit or there is nothing left to group.
    if level >= n_levels or distinct_clusters <= 1:
        return per_level

    # The summaries produced here are the texts of the next level.
    deeper_levels = recursive_embed_cluster_summarize(
        summary_frame["summaries"].tolist(), level + 1, n_levels
    )
    per_level.update(deeper_levels)

    return per_level