In [1]:
# import chromadb
# from chromadb.utils import embedding_functions
# from huggingface_hub import InferenceClient
# from sentence_transformers import SentenceTransformer
import os
import orjson
from chromadb.utils import embedding_functions

# import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.documents import Document
import chromadb


class Chatbot:
    """Retrieval-augmented question answering over ``./data_uzbek.txt``.

    Call ``LoadVectorstoreandChatbot`` once to build the vector store and
    the RAG chain, then call ``Query`` to ask questions.
    """

    def __init__(self):
        # Both attributes are populated by LoadVectorstoreandChatbot().
        self.chatbot = None
        self.vectorstore = None

    def Query(self, query_string):
        """Run ``query_string`` through the RAG chain and return its output.

        Raises:
            RuntimeError: if LoadVectorstoreandChatbot() has not been called.
        """
        if self.chatbot is None:
            raise RuntimeError(
                "Chatbot is not loaded; call LoadVectorstoreandChatbot() first."
            )
        return self.chatbot.invoke(query_string)

    def LoadVectorstoreandChatbot(self, username):
        """Index ``./data_uzbek.txt`` and build the RAG chain.

        Stores the chain in ``self.chatbot`` and the Chroma vector store in
        ``self.vectorstore``.

        Args:
            username: currently unused; kept so existing callers keep working.

        Raises:
            RuntimeError: if the OPENAI_API_KEY environment variable is unset.
        """
        # Assigning os.getenv(...) back into os.environ is a no-op when the
        # key is set and an opaque TypeError when it is not, so check
        # explicitly and fail with a readable message instead.
        if not os.getenv("OPENAI_API_KEY"):
            raise RuntimeError("OPENAI_API_KEY environment variable is not set.")
        os.environ["LANGCHAIN_TRACING_V2"] = "false"
        # os.environ["LANGCHAIN_API_KEY"] = ""

        # Load the source text and split it into overlapping chunks.
        loader = TextLoader(file_path="./data_uzbek.txt")
        docs = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200
        )
        splits = text_splitter.split_documents(docs)

        def format_docs(docs):
            # Concatenate the retrieved chunks into one context string.
            return "\n\n".join(doc.page_content for doc in docs)

        # Embed the chunks with OpenAI embeddings and index them in Chroma.
        vectorstore = Chroma.from_documents(
            documents=splits, embedding=OpenAIEmbeddings()
        )
        retriever = vectorstore.as_retriever()
        prompt = hub.pull("rlm/rag-prompt")
        llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

        # The incoming question is used both to retrieve context and,
        # unchanged, as the "question" input of the prompt.
        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

        self.chatbot = rag_chain
        self.vectorstore = vectorstore

    def FormatForGeneralAnswer(self, result):
        """Print ``result`` and return it wrapped in an "answer" envelope.

        Returns:
            The output of ``orjson.dumps`` for
            ``{"type": "answer", "message": result}``.
        """
        print(result)

        return orjson.dumps({"type": "answer", "message": result})


class TrainonDocuments:
    """Collect raw documents and index them into a per-user Chroma store."""

    def __init__(self, username) -> None:
        # Parallel lists: entry i of each list describes the same document.
        self.documents = []
        self.file_types = []
        self.filenames = []
        # Keep the username given at construction; previously it was dropped
        # and Train() failed unless SetUsername() had been called first.
        self.username = username

    def AddDocument(self, document, file_type, filename):
        """Queue one document (its text, file type and file name) for Train()."""
        self.documents.append(document)
        self.file_types.append(file_type)
        self.filenames.append(filename)

    def SetUsername(self, username):
        """Override the username used to build the storage path in Train()."""
        self.username = username

    def Train(self):
        """Split the queued documents and add the chunks to Chroma.

        The store lives at ``"./data_chatbot" + self.username`` and the chunks
        go into the ``demo_collection`` collection, embedded with the
        ``all-MiniLM-L6-v2`` sentence-transformer model.

        Returns:
            A ``(collection, client)`` tuple: the chromadb collection and the
            persistent client that owns it.
        """
        documents = [
            Document(
                page_content=content,
                metadata={
                    "source": filename,
                    "filename": filename,
                    "file_type": file_type,
                },
            )
            for content, file_type, filename in zip(
                self.documents, self.file_types, self.filenames
            )
        ]
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200
        )
        splits = text_splitter.split_documents(documents=documents)

        chromaaa = chromadb.PersistentClient("./data_chatbot" + self.username)
        sentence_transformer_ef = (
            embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name="all-MiniLM-L6-v2"
            )
        )
        # chromadb binds the embedding function to the collection when it is
        # obtained; Collection.add() then embeds the supplied texts with it.
        collection = chromaaa.get_or_create_collection(
            "demo_collection", embedding_function=sentence_transformer_ef
        )
        print("adding documents")
        # chromadb collections have no add_documents(); add() takes the raw
        # texts and metadata as parallel lists. Skip the call when there is
        # nothing to add.
        if splits:
            collection.add(
                ids=[str(index) for index in range(len(splits))],
                documents=[split.page_content for split in splits],
                metadatas=[split.metadata for split in splits],
            )

        return collection, chromaaa


# Path: sentence-classification/sentece_classifier_bot/initializers.p
print("I cam getting crazy here")
if __name__ == "__main__":
    # Smoke test: index data_uzbek.txt for user "test", then ask one question
    # through a RAG chain built on the resulting collection.

    # Fail before the training run if the key the LLM needs is missing.
    # Assigning os.getenv(...) back into os.environ would raise an opaque
    # TypeError in that case and is a no-op otherwise.
    if not os.getenv("OPENAI_API_KEY"):
        raise RuntimeError("OPENAI_API_KEY environment variable is not set.")

    train = TrainonDocuments("test")
    print("Training started")
    # Use a context manager so the file handle is closed after reading.
    with open("data_uzbek.txt", "r") as source_file:
        d = source_file.read()
    train.AddDocument(d, "txt", "data_uzbek.txt")
    train.SetUsername("test")
    collection, chromaaaa = train.Train()
    os.environ["LANGCHAIN_TRACING_V2"] = "false"
    print("Training is done")
    # NOTE(review): no embedding function is passed to the LangChain Chroma
    # wrapper here, while Train() embeds with all-MiniLM-L6-v2 - confirm that
    # queries end up embedded with the same model.
    chroma_instance = Chroma(collection_name="demo_collection", client=chromaaaa)
    retriever = chroma_instance.as_retriever()

    def format_docs(docs):
        # Concatenate the retrieved chunks into one context string.
        return "\n\n".join(doc.page_content for doc in docs)

    prompt = hub.pull("rlm/rag-prompt")
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    print("asking question")
    response = rag_chain.invoke("What is this document about ?")
    print(response)


In [29]:
import json


def strip_number_prefix(heading):
    """Return ``heading`` without a leading "<digits>." enumeration marker.

    Works for any number of digits ("1. ", "30. ", ...). A heading without
    such a marker is returned unchanged.
    """
    number, dot, rest = heading.partition(".")
    if dot and number.strip().isdigit():
        return rest.lstrip()
    return heading


def parse_component(component):
    """Turn one text component into a ``{"question", "answer"}`` dict.

    The first line (after any leading blank lines) is the numbered question;
    the remaining lines are concatenated, without a separator, as the answer.
    """
    # Components after the first start with a newline left over from the
    # blank-line separator; drop it so every component parses the same way.
    lines = component.lstrip("\n").split("\n")
    return {
        "question": strip_number_prefix(lines[0]),
        "answer": "".join(lines[1:]),
    }


with open("test.txt", "r") as f:
    # NOTE: __sizeof__() is the in-memory size of the file object, not the
    # length of test.txt.
    print(f.__sizeof__())
    text = f.read()

components = text.split("\n\n")
print(len(components))
print(components[0])
# Only peek at the 30th component when the file actually has that many.
if len(components) > 29:
    print(components[29])

# Only the first 30 components are converted.
objects = [parse_component(component) for component in components[:30]]
with open("questions.json", "w") as j:
    json.dump(objects, j)
print(objects)

192
42
1. What is the difference between a debit card and a credit card?
A debit card is a card that allows its holder to pay for goods and services, make transfers, and withdraw cash from an ATM. It holds the cardholder's personal funds.
Credit card - a card enabling the holder to pay for goods and services at the expense of the bank. The money on the card belongs to the bank, and the bank lends it on certain conditions. Interest is accrued on the money spent.

30. Demand Deposits
This is the deposit with no fixed withdrawal period and the depositor can withdraw the funds at any time.
[{'question': 'What is the difference between a debit card and a credit card?', 'answer': "A debit card is a card that allows its holder to pay for goods and services, make transfers, and withdraw cash from an ATM. It holds the cardholder's personal funds.Credit card - a card enabling the holder to pay for goods and services at the expense of the bank. The money on the card belongs to the bank, and the b