1. Audio

In [None]:
from typing import Optional
from uuid import UUID

from logger import get_logger
from models.settings import common_dependencies
from pydantic import BaseModel

logger = get_logger(__name__)


class User(BaseModel):
    """A user plus helpers for its usage rows in the Supabase ``users`` table.

    Each method fetches the shared dependencies through
    ``common_dependencies()`` and uses the client stored under ``"supabase"``.
    """

    id: UUID
    email: Optional[str]
    # Optional OpenAI API key attached to the user; None when not provided.
    user_openai_api_key: Optional[str] = None
    # Updated in memory by increment_user_request_count.
    requests_count: int = 0

    # [TODO] Rename the user table and its references to 'user_usage'
    def create_user(self, date):
        """Insert a ``users`` row for this user and ``date`` with a count of 1.

        Args:
            date: Value written to the ``date`` column of the new row.

        Returns:
            The result of executing the insert query.
        """
        deps = common_dependencies()
        logger.info(f"New user entry in db document for user {self.email}")

        users_table = deps["supabase"].table("users")
        new_row = {
            "user_id": self.id,
            "email": self.email,
            "date": date,
            "requests_count": 1,
        }
        return users_table.insert(new_row).execute()

    def get_user_request_stats(self):
        """Return the ``data`` of all ``users`` rows whose ``user_id`` is this id."""
        supabase = common_dependencies()["supabase"]
        query = supabase.from_("users").select("*").filter("user_id", "eq", self.id)
        return query.execute().data

    def fetch_user_requests_count(self, date):
        """Return ``requests_count`` of the first row for this user on ``date``.

        Args:
            date: Value the ``date`` column must equal.

        Returns:
            The ``requests_count`` of the first matching row, or 0 when the
            query yields no rows.
        """
        supabase = common_dependencies()["supabase"]
        result = (
            supabase.from_("users")
            .select("*")
            .filter("user_id", "eq", self.id)
            .filter("date", "eq", date)
            .execute()
        )
        # Only the first returned row is relevant; no rows means no requests.
        for row in result.data or []:
            return row["requests_count"]
        return 0

    def increment_user_request_count(self, date):
        """Store the current count for ``date`` plus one and mirror it on ``self``.

        The current value is read with ``fetch_user_requests_count`` and the
        incremented value is written with an update matched on ``user_id``
        and ``date``.

        Args:
            date: Value identifying the row (together with the user id).
        """
        commons = common_dependencies()
        requests_count = self.fetch_user_requests_count(date) + 1
        logger.info(f"User {self.email} request count updated to {requests_count}")

        pending_update = (
            commons["supabase"]
            .table("users")
            .update({"requests_count": requests_count})
        )
        row_selector = {"user_id": self.id, "date": date}
        pending_update.match(row_selector).execute()

        self.requests_count = requests_count

2. Common

In [None]:
import time

from langchain.schema import Document
from models.brains import Brain
from models.files import File
from models.settings import CommonsDep
from utils.vectors import Neurons


async def process_file(
    commons: CommonsDep,
    file: File,
    loader_class,
    enable_summarization,
    brain_id,
    user_openai_api_key,
):
    """Create a vector for each document of ``file`` and attach it to a brain.

    ``file.compute_documents(loader_class)`` is called first; each entry of
    ``file.documents`` is then wrapped in a ``Document`` carrying file-level
    metadata, passed to ``Neurons.create_vector`` and registered on the brain
    through ``Brain.create_brain_vector``.

    Args:
        commons: Shared dependencies handed to ``Neurons``.
        file: File whose documents are processed.
        loader_class: Loader class forwarded to ``file.compute_documents``.
        enable_summarization: Truthy values are stored as ``"true"`` in the
            metadata, falsy values as ``"false"``.
        brain_id: Id used to build the ``Brain`` that receives the vectors.
        user_openai_api_key: Key forwarded to ``create_vector``.

    Returns:
        None.
    """
    today = time.strftime("%Y%m%d")
    summarization_flag = "true" if enable_summarization else "false"

    file.compute_documents(loader_class)

    for chunk in file.documents:  # pyright: ignore reportPrivateUsage=none
        chunk_metadata = {
            "file_sha1": file.file_sha1,
            "file_size": file.file_size,
            "file_name": file.file_name,
            "chunk_size": file.chunk_size,
            "chunk_overlap": file.chunk_overlap,
            "date": today,
            "summarization": summarization_flag,
        }
        enriched_doc = Document(
            page_content=chunk.page_content, metadata=chunk_metadata
        )

        vector_ids = Neurons(commons=commons).create_vector(
            enriched_doc, user_openai_api_key
        )
        # The first element of the returned value is used as the vector id.
        vector_id = vector_ids[0]  # pyright: ignore reportPrivateUsage=none

        Brain(id=brain_id).create_brain_vector(vector_id, file.file_sha1)

3. Csv

In [None]:
from langchain.document_loaders import CSVLoader
from models.files import File
from models.settings import CommonsDep

from .common import process_file


async def process_csv(
    commons: CommonsDep,
    file: File,
    enable_summarization,
    brain_id,
    user_openai_api_key,
):
    """Process ``file`` with ``process_file`` using ``CSVLoader``.

    ``process_file`` is a coroutine function, so this wrapper is declared
    ``async`` and awaits it instead of handing back a bare coroutine from a
    plain function. Callers still ``await process_csv(...)`` as before.

    Args:
        commons: Shared dependencies forwarded to ``process_file``.
        file: File to process.
        enable_summarization: Forwarded unchanged.
        brain_id: Forwarded unchanged.
        user_openai_api_key: Forwarded unchanged.

    Returns:
        Whatever ``process_file`` returns.
    """
    return await process_file(
        commons,
        file,
        CSVLoader,
        enable_summarization,
        brain_id,
        user_openai_api_key,
    )

4. Docx

In [None]:
from langchain.document_loaders import Docx2txtLoader
from models.files import File
from models.settings import CommonsDep

from .common import process_file


async def process_docx(
    commons: CommonsDep,
    file: File,
    enable_summarization,
    brain_id,
    user_openai_api_key,
):
    """Process ``file`` with ``process_file`` using ``Docx2txtLoader``.

    ``process_file`` is a coroutine function, so this wrapper is declared
    ``async`` and awaits it instead of handing back a bare coroutine from a
    plain function. Callers still ``await process_docx(...)`` as before.

    Args:
        commons: Shared dependencies forwarded to ``process_file``.
        file: File to process.
        enable_summarization: Forwarded unchanged.
        brain_id: Forwarded unchanged.
        user_openai_api_key: Forwarded unchanged.

    Returns:
        Whatever ``process_file`` returns.
    """
    return await process_file(
        commons,
        file,
        Docx2txtLoader,
        enable_summarization,
        brain_id,
        user_openai_api_key,
    )

5. Epub

In [None]:
from langchain.document_loaders.epub import UnstructuredEPubLoader
from models.files import File
from models.settings import CommonsDep

from .common import process_file


async def process_epub(
    commons: CommonsDep,
    file: File,
    enable_summarization,
    brain_id,
    user_openai_api_key,
):
    """Process ``file`` with ``process_file`` using ``UnstructuredEPubLoader``.

    ``process_file`` is a coroutine function, so this wrapper is declared
    ``async`` and awaits it instead of handing back a bare coroutine from a
    plain function. Callers still ``await process_epub(...)`` as before.

    Args:
        commons: Shared dependencies forwarded to ``process_file``.
        file: File to process.
        enable_summarization: Forwarded unchanged.
        brain_id: Forwarded unchanged.
        user_openai_api_key: Forwarded unchanged.

    Returns:
        Whatever ``process_file`` returns.
    """
    return await process_file(
        commons,
        file,
        UnstructuredEPubLoader,
        enable_summarization,
        brain_id,
        user_openai_api_key,
    )

6. Github

In [None]:
import os
import time

from langchain.document_loaders import GitLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from models.brains import Brain
from models.files import File
from models.settings import CommonsDep
from utils.file import compute_sha1_from_content
from utils.vectors import Neurons


async def process_github(
    commons: CommonsDep,  # pyright: ignore reportPrivateUsage=none
    repo,
    enable_summarization,
    brain_id,
    user_openai_api_key,
):
    """Clone ``repo``, split its files into chunks and index them for a brain.

    The repository is cloned by ``GitLoader`` into a randomly named directory
    under ``/tmp``; that directory is removed once loading finishes, whether
    or not loading succeeded. The loaded documents are split into 500-token
    chunks without overlap. Chunks whose ``file_type`` metadata is in the
    ignore list are skipped. For every other chunk a vector is created unless
    a file with the same content SHA-1 already exists, and the file is added
    and linked to the brain unless it is already there.

    Args:
        commons: Shared dependencies handed to ``Neurons``.
        repo: Clone URL passed to ``GitLoader``.
        enable_summarization: Truthy values are stored as ``"true"`` in the
            chunk metadata, falsy values as ``"false"``.
        brain_id: Id of the brain the chunks are attached to.
        user_openai_api_key: Key forwarded to ``create_vector``.

    Returns:
        A dict with a ``"message"`` and ``"type": "success"``. The number in
        the message is the count of chunks after splitting, including the
        ones that were skipped.
    """
    # Local import: only needed for the clone cleanup below.
    import shutil

    ignored_file_types = {
        ".pyc",
        ".png",
        ".svg",
        ".env",
        ".lock",
        ".gitignore",
        ".gitmodules",
        ".gitattributes",
        ".gitkeep",
        ".git",
        ".json",
    }

    random_dir_name = os.urandom(16).hex()
    clone_path = "/tmp/" + random_dir_name
    dateshort = time.strftime("%Y%m%d")
    loader = GitLoader(
        clone_url=repo,
        repo_path=clone_path,
    )
    try:
        documents = loader.load()
    finally:
        # Always drop the clone, even when loading fails; a missing directory
        # is not an error (same tolerance as ``rm -rf``).
        shutil.rmtree(clone_path, ignore_errors=True)

    chunk_size = 500
    chunk_overlap = 0
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    documents = text_splitter.split_documents(documents)
    print(documents[:1])

    summarization = "true" if enable_summarization else "false"

    for doc in documents:
        if doc.metadata["file_type"] in ignored_file_types:
            continue

        # One hash per chunk; it identifies both the metadata and the File.
        file_sha1 = compute_sha1_from_content(doc.page_content.encode("utf-8"))
        metadata = {
            "file_sha1": file_sha1,
            "file_size": len(doc.page_content) * 8,
            "file_name": doc.metadata["file_name"],
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "date": dateshort,
            "summarization": summarization,
        }
        doc_with_metadata = Document(page_content=doc.page_content, metadata=metadata)

        file = File(file_sha1=file_sha1)

        file_exists = file.file_already_exists()

        if not file_exists:
            print(f"Creating entry for file {file.file_sha1} in vectors...")
            neurons = Neurons(commons=commons)
            created_vector = neurons.create_vector(
                doc_with_metadata, user_openai_api_key
            )
            print("Created vector sids ", created_vector)
            print("Created vector for ", doc.metadata["file_name"])

        file_exists_in_brain = file.file_already_exists_in_brain(brain_id)

        if not file_exists_in_brain:
            file.add_file_to_brain(brain_id)  # pyright: ignore reportPrivateUsage=none
            brain = Brain(id=brain_id)
            file.link_file_to_brain(brain)
    return {
        "message": f"✅ Github with {len(documents)} files has been uploaded.",
        "type": "success",
    }

7. Html

In [None]:
import re
import unicodedata

import requests
from langchain.document_loaders import UnstructuredHTMLLoader
from models.files import File
from models.settings import CommonsDep

from .common import process_file


async def process_html(
    commons: CommonsDep,
    file: File,
    enable_summarization,
    brain_id,
    user_openai_api_key,
):
    """Process ``file`` with ``process_file`` using ``UnstructuredHTMLLoader``.

    ``process_file`` is a coroutine function, so this wrapper is declared
    ``async`` and awaits it instead of handing back a bare coroutine from a
    plain function. Callers still ``await process_html(...)`` as before.

    Args:
        commons: Shared dependencies forwarded to ``process_file``.
        file: File to process.
        enable_summarization: Forwarded unchanged.
        brain_id: Forwarded unchanged.
        user_openai_api_key: Forwarded unchanged.

    Returns:
        Whatever ``process_file`` returns.
    """
    return await process_file(
        commons,
        file,
        UnstructuredHTMLLoader,
        enable_summarization,
        brain_id,
        user_openai_api_key,
    )


def get_html(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout the call can block indefinitely on an
            unresponsive server.

    Returns:
        The response text when the status code is 200, otherwise None.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests``
            (connection failures, timeouts, ...).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text


def slugify(text):
    """Turn ``text`` into a lowercase, hyphen-separated ASCII slug.

    Steps, in order:
      1. NFKD-normalise and drop every character that is not ASCII.
      2. Remove everything except word characters, whitespace and hyphens.
      3. Strip surrounding whitespace and lowercase the result.
      4. Collapse each run of hyphens/whitespace into a single hyphen.

    Args:
        text: String to convert.

    Returns:
        The slug. Leading or trailing hyphens present in the input are kept
        (as a single hyphen).
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8')
    cleaned = re.sub(r'[^\w\s-]', '', ascii_only)
    cleaned = cleaned.strip().lower()
    return re.sub(r'[-\s]+', '-', cleaned)

8. Markdown

In [None]:
from langchain.document_loaders import UnstructuredMarkdownLoader
from models.files import File
from models.settings import CommonsDep

from .common import process_file


async def process_markdown(
    commons: CommonsDep,
    file: File,
    enable_summarization,
    brain_id,
    user_openai_api_key,
):
    """Process ``file`` with ``process_file`` using ``UnstructuredMarkdownLoader``.

    ``process_file`` is a coroutine function, so this wrapper is declared
    ``async`` and awaits it instead of handing back a bare coroutine from a
    plain function. Callers still ``await process_markdown(...)`` as before.

    Args:
        commons: Shared dependencies forwarded to ``process_file``.
        file: File to process.
        enable_summarization: Forwarded unchanged.
        brain_id: Forwarded unchanged.
        user_openai_api_key: Forwarded unchanged.

    Returns:
        Whatever ``process_file`` returns.
    """
    return await process_file(
        commons,
        file,
        UnstructuredMarkdownLoader,
        enable_summarization,
        brain_id,
        user_openai_api_key,
    )

9. Notebook

In [None]:
from langchain.document_loaders import NotebookLoader
from models.files import File
from models.settings import CommonsDep

from .common import process_file


async def process_ipnyb(
    commons: CommonsDep,
    file: File,
    enable_summarization,
    brain_id,
    user_openai_api_key,
):
    """Process ``file`` with ``process_file`` using ``NotebookLoader``.

    ``process_file`` is a coroutine function, so this wrapper is declared
    ``async`` and awaits it instead of handing back a bare coroutine from a
    plain function. Callers still ``await process_ipnyb(...)`` as before.

    Args:
        commons: Shared dependencies forwarded to ``process_file``.
        file: File to process.
        enable_summarization: Forwarded unchanged.
        brain_id: Forwarded unchanged.
        user_openai_api_key: Forwarded unchanged.

    Returns:
        Whatever ``process_file`` returns.
    """
    return await process_file(
        commons,
        file,
        NotebookLoader,
        enable_summarization,
        brain_id,
        user_openai_api_key,
    )

10. Odt

In [None]:
from langchain.document_loaders import PyMuPDFLoader
from models.files import File
from models.settings import CommonsDep

from .common import process_file


async def process_odt(
    commons: CommonsDep,
    file: File,
    enable_summarization,
    brain_id,
    user_openai_api_key,
):
    """Process ``file`` with ``process_file`` using ``PyMuPDFLoader``.

    ``process_file`` is a coroutine function, so this wrapper is declared
    ``async`` and awaits it instead of handing back a bare coroutine from a
    plain function. Callers still ``await process_odt(...)`` as before.

    NOTE(review): this ODT entry point uses ``PyMuPDFLoader``, a PDF-oriented
    loader. Confirm that it can actually read ODT input; if not, a dedicated
    ODT loader is needed here.

    Args:
        commons: Shared dependencies forwarded to ``process_file``.
        file: File to process.
        enable_summarization: Forwarded unchanged.
        brain_id: Forwarded unchanged.
        user_openai_api_key: Forwarded unchanged.

    Returns:
        Whatever ``process_file`` returns.
    """
    return await process_file(
        commons,
        file,
        PyMuPDFLoader,
        enable_summarization,
        brain_id,
        user_openai_api_key,
    )

11. Pdf

In [None]:
from langchain.document_loaders import PyMuPDFLoader
from models.files import File
from models.settings import CommonsDep

from .common import process_file


async def process_pdf(
    commons: CommonsDep,
    file: File,
    enable_summarization,
    brain_id,
    user_openai_api_key,
):
    """Process ``file`` with ``process_file`` using ``PyMuPDFLoader``.

    ``process_file`` is a coroutine function, so this wrapper is declared
    ``async`` and awaits it instead of handing back a bare coroutine from a
    plain function. Callers still ``await process_pdf(...)`` as before.

    Args:
        commons: Shared dependencies forwarded to ``process_file``.
        file: File to process.
        enable_summarization: Forwarded unchanged.
        brain_id: Forwarded unchanged.
        user_openai_api_key: Forwarded unchanged.

    Returns:
        Whatever ``process_file`` returns.
    """
    return await process_file(
        commons,
        file,
        PyMuPDFLoader,
        enable_summarization,
        brain_id,
        user_openai_api_key,
    )

12. Powerpoint

In [None]:
from langchain.document_loaders import UnstructuredPowerPointLoader
from models.files import File
from models.settings import CommonsDep

from .common import process_file


async def process_powerpoint(
    commons: CommonsDep,
    file: File,
    enable_summarization,
    brain_id,
    user_openai_api_key,
):
    """Process ``file`` with ``process_file`` using ``UnstructuredPowerPointLoader``.

    ``process_file`` is a coroutine function, so this wrapper is declared
    ``async`` and awaits it instead of handing back a bare coroutine from a
    plain function. Callers still ``await process_powerpoint(...)`` as before.

    Args:
        commons: Shared dependencies forwarded to ``process_file``.
        file: File to process.
        enable_summarization: Forwarded unchanged.
        brain_id: Forwarded unchanged.
        user_openai_api_key: Forwarded unchanged.

    Returns:
        Whatever ``process_file`` returns.
    """
    return await process_file(
        commons,
        file,
        UnstructuredPowerPointLoader,
        enable_summarization,
        brain_id,
        user_openai_api_key,
    )

13. Txt

In [None]:
from langchain.document_loaders import TextLoader
from models.files import File
from models.settings import CommonsDep

from .common import process_file


async def process_txt(
    commons: CommonsDep,
    file: File,
    enable_summarization,
    brain_id,
    user_openai_api_key,
):
    """Process ``file`` with ``process_file`` using ``TextLoader``.

    Args:
        commons: Shared dependencies forwarded to ``process_file``.
        file: File to process.
        enable_summarization: Forwarded unchanged.
        brain_id: Forwarded unchanged.
        user_openai_api_key: Forwarded unchanged.

    Returns:
        Whatever ``process_file`` returns.
    """
    result = await process_file(
        commons,
        file,
        TextLoader,
        enable_summarization,
        brain_id,
        user_openai_api_key,
    )
    return result