In [None]:
import os
import constants

from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.document_loaders import GitLoader
from langchain.document_loaders import NotebookLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
import pickle

In [None]:
# Hand the OpenAI key from the local constants module to the environment,
# where the OpenAI embedding client looks for it.
os.environ["OPENAI_API_KEY"] = constants.APIKEY

# Local checkouts of the repositories to ingest
aeon_mecha = "path/to/aeon_mecha"
aeon_analysis = "path/to/aeon_analysis"

# Splitters: markdown is cut into small chunks with no overlap, while
# Python code gets larger chunks that overlap each other.
markdown_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN,
    chunk_size=500,
    chunk_overlap=0,
)
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=2000,
    chunk_overlap=200,
)

# Ingest Aeon Mecha
# Python sources are parsed with LanguageParser, which is expected to:
#   - keep each top-level function and class together in a single document,
#   - put whatever code remains into a separate document,
#   - retain metadata about where each piece comes from.
loader = GenericLoader.from_filesystem(
    aeon_mecha + "/aeon",  # drop the "/aeon" suffix to load every python file in the repo
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(language=Language.PYTHON),
)
documents = loader.load()
texts = python_splitter.split_documents(documents)

# Markdown files are collected from the whole repository through GitLoader;
# a branch can be passed to GitLoader if a specific one is wanted.
loader = GitLoader(
    repo_path=aeon_mecha,
    file_filter=lambda file_path: file_path.endswith(".md"),
)
documents = loader.load()
texts += markdown_splitter.split_documents(documents)

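# NOTE: Disabled alternative for loading markdown with UnstructuredMarkdownLoader.
# For an unknown reason it gets stuck in loader.load(), so it is kept commented out.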
# for root, dirs, files in os.walk(aeon_mecha):
#     for file in files:
#         if file.endswith(".md"):
#             path = os.path.abspath(os.path.join(root, file))
#             print(path)
#             loader = UnstructuredMarkdownLoader(path)
#             documents = loader.load()
#             texts.extend(markdown_splitter.split_documents(documents))

# Ingest Aeon Analysis
# Python sources: parsed with LanguageParser and split with the Python splitter.
loader = GenericLoader.from_filesystem(
    os.path.join(aeon_analysis, "aeon_analysis"),  # pass aeon_analysis itself to load all the python files from the repo
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(language=Language.PYTHON),
)
documents = loader.load()
texts.extend(python_splitter.split_documents(documents))

# Notebooks: walk the whole repository and load every .ipynb file found.
for root, dirs, files in os.walk(aeon_analysis):
    # Prune in place so os.walk does not descend into these directories:
    # .ipynb_checkpoints holds Jupyter autosave copies, which would otherwise
    # be ingested as duplicates of the real notebooks; .git is repository
    # metadata. Sorting makes the ingestion order deterministic across runs.
    dirs[:] = sorted(d for d in dirs if d not in (".git", ".ipynb_checkpoints"))
    for file in sorted(files):
        if file.endswith(".ipynb"):
            path = os.path.abspath(os.path.join(root, file))
            loader = NotebookLoader(path)
            documents = loader.load()
            texts.extend(python_splitter.split_documents(documents))

# Chroma
# Embed every chunk with OpenAI embeddings and build a Chroma collection that
# uses 'chroma_vectorstore' as its persist directory and cosine as the HNSW space.
# NOTE(review): some older chromadb releases only write to disk after an
# explicit vectorstore.persist() call — confirm against the installed version.
vectorstore = Chroma.from_documents(
    documents=texts,
    embedding=OpenAIEmbeddings(),
    persist_directory="chroma_vectorstore",
    collection_metadata={"hnsw:space": "cosine"},
)

# FAISS
# vectorstore = FAISS.from_documents(texts, OpenAIEmbeddings())
# with open("vectorstore.pkl", "wb") as f:
# 	pickle.dump(vectorstore, f)