# Create Vector Database

In [None]:
import os
import subprocess
import argparse
import re
from uuid import uuid4
import nbformat
import json

from langchain_community.document_loaders import NotebookLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document

## (Optional) Pull code from Git

In [None]:
! git clone https://github.com/fabric-testbed/jupyter-examples.git
! git clone https://github.com/fabric-testbed/teaching-materials.git

## Pre-processing Functions

In [None]:
def construct_url(local_directory, full_path, base_url="https://github.com/fabric-testbed/jupyter-examples/blob/main"):
    """
    Constructs the git url for a notebook from its location inside the local checkout
    Arguments: 
        - local_directory: the directory where files are stored locally
          (the root of the local copy of the repository)
        - full_path: full path to the file locally; expected to be located
          inside local_directory
        - base_url: base git url for the repository 
    Returns: 
        - url: the final git url for the given file 
    """
    # Compute the path relative to the checkout root instead of using
    # str.replace(): replace() strips *every* occurrence of the directory
    # string (even inside file names) and yields a URL without a separating
    # "/" when local_directory ends with a slash.
    relative_path = os.path.relpath(full_path, start=local_directory)

    # URLs always use forward slashes, whatever the local OS separator is.
    url_path = relative_path.replace(os.sep, "/")

    # Join with exactly one "/" between the base url and the relative path.
    url = f"{base_url.rstrip('/')}/{url_path}"
    return url

In [None]:
def find_notebooks(directory) -> list:
    """
    Recursively finds all Jupyter Notebooks in the given directory and
    records each notebook's git url in its metadata.

    NOTE: this function modifies the notebooks on disk: every notebook found
    is re-written with a "url" entry added to its top-level metadata.

    Arguments:
        - directory: the directory to be traversed to find notebooks to process
    Returns: 
        - notebooks: a list of the paths to the notebooks(.ipynb files) found 
    """
    notebooks = []
    for root, dirs, files in os.walk(directory):
        # Prune Jupyter's autosave folders in place so os.walk does not
        # descend into them; their "*-checkpoint.ipynb" copies would
        # otherwise be processed as duplicate notebooks.
        dirs[:] = [d for d in dirs if d != ".ipynb_checkpoints"]

        for file in files:
            if not file.endswith(".ipynb"):
                continue

            full_path = os.path.join(root, file)
            # Load the notebook
            nb = nbformat.read(full_path, as_version=4)
            # Add url as custom metadata
            nb.metadata["url"] = construct_url(directory, full_path)
            # Save back
            nbformat.write(nb, full_path)
            # add path to the notebooks list
            notebooks.append(full_path)
    return notebooks

In [None]:
def convert_notebook_to_markdown(notebook_path, output_dir) -> str:
    """
    Converts a Jupyter Notebook to Markdown using nbconvert
    Arguments: 
        - notebook_path: path to a single notebook file that is to be converted
        - output_dir: path to directory that will store the converted files
    Returns:
        - markdown_path: a path to the converted file
        - None: returns None if the conversion failed (nbconvert could not be
          started, exited with an error, or markdown_path doesn't exist)
    """
    # Construct a path to the converted file by swapping the extension for .md
    notebook_stem, _ = os.path.splitext(os.path.basename(notebook_path))
    markdown_path = os.path.join(output_dir, notebook_stem + ".md")

    # Convert using nbconvert. The command is passed as an argument list
    # (no shell) so paths containing spaces or shell metacharacters are
    # handed to nbconvert unchanged.
    command = [
        "jupyter", "nbconvert",
        "--to", "markdown",
        notebook_path,
        "--output-dir", output_dir,
    ]
    try:
        result = subprocess.run(command, check=False)
    except OSError:
        # The jupyter executable is missing or cannot be executed
        return None

    # A non-zero exit status means the conversion failed; do not report a
    # stale file left over from an earlier run as a successful conversion.
    if result.returncode != 0:
        return None

    # Return the path if it exists and None if it doesn't 
    return markdown_path if os.path.exists(markdown_path) else None

In [None]:
def convert_notebook_to_py(notebook_path, output_dir) -> str:
    """
    Converts a Jupyter Notebook to a Python script using nbconvert and
    appends the notebook's url (read from its metadata) to the script as a
    trailing comment block.
    Arguments: 
        - notebook_path: path to a single notebook file that is to be converted
        - output_dir: path to directory that will store the converted files
    Returns:
        - py_path: a path to the converted file
        - None: returns None if the conversion failed (nbconvert could not be
          started, exited with an error, or py_path doesn't exist)
    """
    # Construct a path to the converted file by swapping the extension for .py
    notebook_stem, _ = os.path.splitext(os.path.basename(notebook_path))
    py_path = os.path.join(output_dir, notebook_stem + ".py")

    # Convert the .ipynb file to .py file using nbconvert. The command is
    # passed as an argument list (no shell) so paths containing spaces or
    # shell metacharacters are handed to nbconvert unchanged.
    command = [
        "jupyter", "nbconvert",
        "--to", "script",
        notebook_path,
        "--output-dir", output_dir,
    ]
    try:
        result = subprocess.run(command, check=False,
                                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError:
        # The jupyter executable is missing or cannot be executed
        return None

    # Only continue after a successful conversion. Checking the exit status
    # as well as the file prevents appending the metadata block a second
    # time to a stale script left over from an earlier run.
    if result.returncode != 0 or not os.path.exists(py_path):
        return None

    # Get the url from notebook's metadata 
    with open(notebook_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)
    url = nb.metadata.get("url")

    if url:
        metadata_comment = (
            "# === Notebook Metadata ===\n"
            f"# url: {url}\n"
            "# =========================\n\n"
        )

        # Append the url as a comment block at the end of the .py file
        with open(py_path, "a", encoding="utf-8") as f:
            f.write(metadata_comment)

    return py_path

In [None]:
def process_notebooks(notebook_directory, output_dir, doc_type):
    """
    Process all notebooks and convert them as needed
    Arguments: 
        - notebook_directory: directory where notebooks for the vectorstore are located
        - output_dir: directory where converted files will be stored
        - doc_type: type of file that notebooks should be converted to;
          either "markdown_whole_page" or "py_whole_page"
    Raises:
        - ValueError: if doc_type is not one of the supported values
    """
    # Reject unsupported types up front: previously an unknown doc_type made
    # the loop a silent no-op after the notebooks had already been rewritten.
    supported_doc_types = ("markdown_whole_page", "py_whole_page")
    if doc_type not in supported_doc_types:
        raise ValueError(
            f"Unsupported doc_type {doc_type!r}; expected one of {supported_doc_types}"
        )

    notebooks = find_notebooks(notebook_directory)

    # Process each notebook, one at a time
    for notebook_path in notebooks:
        if doc_type == "markdown_whole_page":
            # Convert notebook to Markdown
            page_path = convert_notebook_to_markdown(notebook_path, output_dir)
            if page_path is None:
                print(f"Failed to convert {notebook_path} to Markdown.")
        else:
            # Convert notebook to Python 
            page_path = convert_notebook_to_py(notebook_path, output_dir)
            if page_path is None:
                print(f"Failed to convert {notebook_path} to Python script.")

## Loading, Splitting and Creating Vectorstore functions

In [None]:
# *** Known issue (from the original author): this function currently doesn't
# work as intended. The urls associated with each file are mismatched and
# don't point to the right files ***
def load_markdown_content(markdown_dir) -> list:
    """
    Load entire content of each Markdown file in the given directory into one document each 
    Arguments: 
        - markdown_dir: directory that has the files to load
    Returns:
        - documents: a list of the documents created from all the .md files
          in the directory (empty if there are none)
    """

    documents = []

    # Only regular .md files are loaded. Sub-directories or files of other
    # types in the directory are skipped, since opening a directory raises
    # an error. Sorting keeps the document order (and ids) reproducible.
    markdown_files = sorted(
        name for name in os.listdir(markdown_dir)
        if name.endswith(".md") and os.path.isfile(os.path.join(markdown_dir, name))
    )

    # Go through each Markdown file in the directory 
    for i, filename in enumerate(markdown_files):
        # Construct the file path for the file
        filepath = os.path.join(markdown_dir, filename)

        # Read the whole file; it is closed as soon as the content is in memory
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()

        # Add file name as a header
        extended_markdown_content = f"# {filename} \n\n{content}" 

        # Convert to a Document and add to list
        # Include file path as metadata 
        document = Document(page_content=extended_markdown_content, 
                            metadata={"source": filepath}, id=i)
        documents.append(document)
    
    return documents 

In [None]:
def load_py_content(py_dir) -> list:
    """
    Load entire content of each Python file in the given directory into one document each 
    Arguments: 
        - py_dir: directory that has the files to load
    Returns:
        - documents: a list of the documents created from all the .py files
          in the directory (empty if there are none). Each document carries
          the file path ("source") and the notebook url ("url", empty string
          when no "# url: " comment is found) as metadata.
    """
    # Number of leading characters kept in front of the inserted file-name
    # header. NOTE(review): presumably the length of the preamble nbconvert
    # writes at the top of a script ("#!/usr/bin/env python", "# coding: utf-8"
    # and a blank line) — confirm against the nbconvert version in use.
    header_length = 39

    # Matches the "# In[ ]:" / "# In[12]:" cell markers
    cell_marker = re.compile(r'^\s*# In\[\s*\d*\s*\]:\s*\n?', flags=re.MULTILINE)

    documents = []

    # Only regular .py files are loaded. Sub-directories or files of other
    # types in the directory are skipped, since opening a directory raises
    # an error. Sorting keeps the document order (and ids) reproducible.
    py_files = sorted(
        name for name in os.listdir(py_dir)
        if name.endswith(".py") and os.path.isfile(os.path.join(py_dir, name))
    )

    # Go through each Python file in the directory 
    for i, filename in enumerate(py_files):
        filepath = os.path.join(py_dir, filename)

        # Read the whole file; it is closed as soon as the content is in memory
        with open(filepath, 'r', encoding="utf-8") as f:
            content = f.read()

        # Get url from metadata comment (the last "# url: " line wins)
        url = ""
        for line in content.splitlines():
            if line.startswith("# url: "):
                url = line.split(":", 1)[1].strip()

        # Add file name as a header
        extended_py_content = f"{content[:header_length]} #{filename}\n\n {content[header_length:]}"

        # Remove lines that match "# In[ ]:"
        cleaned_py_content = cell_marker.sub('', extended_py_content)

        # Convert to a Document and add to list
        # Include file path as metadata 
        document = Document(page_content=cleaned_py_content,
                            metadata={"source": filepath, "url": url}, id=i)
        documents.append(document)

    return documents

In [None]:
def create_vectorstore(documents, database_loc, embedding="all-mpnet-base-v2"):
    """
    Builds a Chroma vector store persisted at database_loc and loads the
    given documents into it.
    Arguments: 
        - documents: documents to be loaded into the vectorstore
        - database_loc: the location to store the vectorstore
        - embedding: name of the HuggingFace embedding model used to embed
          the documents
    """
    # One freshly generated unique id per document
    document_ids = [str(uuid4()) for _ in documents]

    # Set up the vector store backed by the requested embedding model
    store = Chroma(
        embedding_function=HuggingFaceEmbeddings(model_name=embedding),
        persist_directory=database_loc,
    )

    # Insert every document under its generated id
    store.add_documents(documents, ids=document_ids)

    print("All notebooks have been processed and stored.")

## Pipeline Function

In [None]:
def run_db_pipeline(notebook_directory, output_dir, database_loc,
                    doc_type, embedding="all-mpnet-base-v2"):
    """
    Pre-processes the notebooks in the provided notebook_directory,
    loads all the markdown/python content converted from them and finally
    creates the vectorDB
    Arguments:
        - notebook_directory: directory where notebooks for the vectorstore are located
        - output_dir: directory where converted files will be stored
        - database_loc: the location to store the vectorstore
        - doc_type: type of file that notebooks should be converted to;
          either "markdown_whole_page" or "py_whole_page"
        - embedding: the embedding model to use in creating the vectorstore
    Raises:
        - ValueError: if doc_type is not one of the supported values
    """
    # Validate before doing any work: an unknown doc_type previously left
    # `documents` unassigned and failed with an UnboundLocalError only after
    # the pre-processing step had already run.
    supported_doc_types = ("markdown_whole_page", "py_whole_page")
    if doc_type not in supported_doc_types:
        raise ValueError(
            f"Unsupported doc_type {doc_type!r}; expected one of {supported_doc_types}"
        )

    # Call process function to pre-process notebooks 
    process_notebooks(notebook_directory, output_dir, doc_type)
    
    # Based on doc type, load the content from the found and converted notebooks 
    if doc_type == "markdown_whole_page":
        documents = load_markdown_content(output_dir)
    else:
        documents = load_py_content(output_dir)

    # Call create vectorstore function to perform embedding and create vectorstore 
    create_vectorstore(documents, database_loc, embedding=embedding)

## Set Locations

In [None]:
# Pipeline settings passed to run_db_pipeline below.
# NOTE(review): the three paths look like placeholders — presumably they must
# be replaced with real locations before running.
# Directory that is searched recursively for .ipynb notebooks
notebook_directory = "/root/dir/for/notebooks"
# Directory that receives the converted (.md / .py) files
output_files_dir = "/root/dir/for/converted/files"
# Directory in which the vector store is persisted
database_loc = "/path/to/vectorDB/dir"
# Format the notebooks are converted to before being embedded
doc_type = "py_whole_page" # or "markdown_whole_page"

## Run the Pipeline

In [None]:
# Make sure the directory for the converted files exists before the pipeline
# runs; exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(output_files_dir, exist_ok=True)

In [None]:
# Convert the notebooks, load the converted files and build the vector store
# (uses the pipeline's default embedding model).
run_db_pipeline(notebook_directory, output_files_dir, database_loc, doc_type)