# In [11]:
import os
import tiktoken


def codebase_to_md(directory):
    """Dump a project's JS/TS/JSON sources into ``output.md`` and count tokens.

    Walks ``directory`` recursively, collects ``.js``/``.jsx``,
    ``.ts``/``.tsx`` and ``.json`` files, and writes them to ``output.md`` in
    the current working directory. Each file becomes a bold path line followed
    by a fenced code block. JavaScript files are written first, then
    TypeScript, then JSON.

    Filtering is by plain substring match:

    * a folder (and everything below it) is skipped when its walked path
      contains one of the excluded path markers. The walked path starts with
      ``directory`` itself, so a marker inside ``directory`` excludes
      everything;
    * a file is skipped when its name contains one of the excluded name
      markers.

    Args:
        directory: Root folder to scan.

    Returns:
        A ``(content, tokens)`` tuple. ``content`` is the generated Markdown
        with every ``<|endoftext|>`` occurrence removed (the file on disk
        keeps them); ``tokens`` is the length of ``content`` in
        ``cl100k_base`` tokens.

    Raises:
        OSError: If ``output.md`` or a collected source file cannot be opened.
        UnicodeDecodeError: If a collected file is not valid UTF-8.
    """
    output_file = "output.md"
    excluded_path_markers = (
        "node_modules",
        ".next",
        "components/ui",
        "components/home",
        "docs",  # substring match, so this also covers "docs-agixt"
    )
    excluded_name_markers = (
        "node_modules",
        "package-lock.json",
        ".stories.",
        ".next",
        ".eslintrc.json",
    )
    # (fence language, matching suffixes, collected paths), in output order.
    groups = (
        ("javascript", (".js", ".jsx"), []),
        ("typescript", (".ts", ".tsx"), []),
        ("json", (".json",), []),
    )

    for root, dirs, files in os.walk(directory):
        # Compare with forward slashes so the "components/..." markers also
        # match on platforms whose path separator is a backslash.
        normalized_root = root.replace(os.sep, "/")
        if any(marker in normalized_root for marker in excluded_path_markers):
            # Every path below contains the same marker, so do not descend
            # (this avoids crawling all of node_modules just to discard it).
            dirs[:] = []
            continue
        for file in files:
            if any(marker in file for marker in excluded_name_markers):
                continue
            for _file_type, suffixes, collected in groups:
                if file.endswith(suffixes):
                    collected.append(os.path.join(root, file))
                    break

    # Mode "w" truncates a previous output.md, so no explicit removal is
    # needed. The sections are kept in memory as well, which avoids reading
    # the file back after writing it.
    sections = []
    with open(output_file, "w", encoding="utf-8") as markdown_file:
        for file_type, _suffixes, file_paths in groups:
            for file_path in file_paths:
                # Make sure the file isn't output.md
                if output_file in file_path:
                    continue
                with open(file_path, "r", encoding="utf-8") as code_file:
                    source = code_file.read()
                section = f"**{file_path}**\n```{file_type}\n{source}\n```\n\n"
                markdown_file.write(section)
                sections.append(section)

    content = "".join(sections).replace("<|endoftext|>", "")
    encoding = tiktoken.get_encoding("cl100k_base")
    # By default encode() raises ValueError when the text contains any special
    # token string (e.g. "<|fim_prefix|>"). Source code may legitimately
    # contain such text, so encode it as ordinary text instead.
    tokens = len(encoding.encode(content, disallowed_special=()))
    return content, tokens


# Export the project rooted at `directory` and report its size in tokens.
# "." means the current working directory; point it at your own folder.
directory = "."
content, tokens = codebase_to_md(directory=directory)
print(f"Tokens: {tokens}")

# Output: Tokens: 121342
