In [None]:
"""
The Telegram bot code is adapted from https://github.com/cmd410/OrigamiBot
and extended with our LLM bot to hold conversations with users.
"""

# import packages
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.vectorstores.chroma import Chroma
from langchain_huggingface import HuggingFacePipeline
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM

# Below will use HuggingFace - sentence-transformers
# https://huggingface.co/sentence-transformers
from langchain_huggingface import HuggingFaceEmbeddings

# Define directories
# Source documents are read from this directory by the DirectoryLoader below.
pdf_file_dir_path = "custom_data_chatbot/pdfs"
# Shared location: used as the Hugging Face cache for the embedding model,
# tokenizer and LLM weights, and also as the Chroma persist directory.
model_path = "custom_data_chatbot/models"

# Default chunk size for outgoing Telegram messages (the Bot API caps message
# text at 4096 characters, so this stays just under the limit).
MAX_MESSAGE_LENGTH = 4095


def split_message(message, max_length=MAX_MESSAGE_LENGTH):
    """Split a message into consecutive chunks of at most ``max_length`` characters.

    Args:
        message: Text to split.
        max_length: Maximum number of characters per chunk. Defaults to
            ``MAX_MESSAGE_LENGTH``.

    Returns:
        A list of substrings that concatenate back to ``message``. An empty
        message yields an empty list.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # step silently yields no chunks, so reject both up front.
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    return [
        message[start:start + max_length]
        for start in range(0, len(message), max_length)
    ]

# Load  ................................................................................................................
# Load the source documents found under pdf_file_dir_path.
loader = DirectoryLoader(pdf_file_dir_path)

# Split the documents into small chunks for better management.
text_splitter = RecursiveCharacterTextSplitter(
    # Chunks of up to 1000 units as measured by length_function (len, i.e.
    # characters), with no overlap between consecutive chunks.
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)

# Load the documents and split them into chunks in a single step; `pages`
# is what gets embedded and indexed further down.
pages = loader.load_and_split(text_splitter=text_splitter)

# ======================================================================================================================
# Defining global settings for easy and fast work

# load text embedding model from HuggingFaceHub to generate vector embeddings ..........................................
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",
    cache_folder=model_path,
    # cpu because on AWS we are not using GPU
    model_kwargs={
        "device": "cpu",
    },  # make it to "cuda" in case of GPU
    encode_kwargs={"normalize_embeddings": False},
    multi_process=True,
)

# Embed every chunk in `pages` and store the vectors in a Chroma collection
# persisted under model_path.
# NOTE(review): this runs on every execution; presumably the same chunks are
# re-added to an already persisted collection on each start - confirm.
chroma_db = Chroma.from_documents(pages, embed_model, persist_directory=model_path)

# Retrieve .............................................................................................................
# Define the retriever that fetches the documents related to a question.
retriever = chroma_db.as_retriever(
    search_type="mmr",  # Maximum Marginal Relevance
    search_kwargs={"k": 1},  # max relevant docs to retrieve
)

# Load the GPT-2 tokenizer and causal language model; downloads are cached
# under model_path.
tokenizer = AutoTokenizer.from_pretrained("gpt2", cache_dir=model_path)
model = AutoModelForCausalLM.from_pretrained("gpt2", cache_dir=model_path)

# Define pipeline ......................................................................................................
text_generator = pipeline(
    task="text-generation",
    model=model,
    # NOTE(review): placeholder value - supply a real Hugging Face token (or
    # confirm one is needed at all for this model) before running.
    token="PUT_HERE_HUGGINGFACEHUB_API_TOKEN",
    trust_remote_code=True,
    # NOTE(review): the -1 / 0 values mentioned below look like settings of the
    # separate `device` argument rather than `device_map` - confirm.
    device_map="auto",  # make it "auto" for auto selection between GPU and CPU, -1 for CPU, 0 for GPU
    tokenizer=tokenizer,
    max_length=1024,  # generate token sequences of 1024 including input and output token sequences
)

# Wrap the transformers pipeline so LangChain can use it as an LLM.
# NOTE: despite the variable name, the wrapped model is "gpt2" (loaded above).
ms_dialo_gpt_hf = HuggingFacePipeline(pipeline=text_generator)

# Question-answering chain: the retriever fetches the chunk most relevant to
# the question and the LLM is prompted with that chunk AND the question.
# The prompt must reference {question}; with only {context} the user's question
# is used for retrieval but never reaches the language model.
retrievalQA = RetrievalQA.from_llm(
    llm=ms_dialo_gpt_hf,
    retriever=retriever,
    prompt=PromptTemplate(
        input_variables=["context", "question"],
        template=(
            "Use the following context to answer the question.\n\n"
            "Context:\n{context}\n\n"
            "Question: {question}\n"
            "Answer:"
        ),
    ),
)

# telegram related stuff -----------------------------------------------------------------------------------------------
class BotsCommands:
    """
    Command holder for the bot. The chat commands it provides are:
    /start  greets the user
    /echo   sends the given value back to the chat
    Methods whose name starts with an underscore are not treated as commands.
    """

    def __init__(self, bot: Bot):  # Can initialize however you like
        # Keep the bot around so the commands can reply through it.
        self.bot = bot

    def start(self, message):  # /start command
        """Reply to the chat the command came from with a fixed greeting."""
        greeting = "Hello user!\nThis is an example bot."
        self.bot.send_message(message.chat.id, greeting)

    def echo(self, message, value: str):  # /echo [value: str] command
        """Send ``value`` back to the chat the command came from."""
        chat_id = message.chat.id
        self.bot.send_message(chat_id, value)

    def _not_a_command(self):  # This method not considered a command
        """Private helper; only prints a marker line."""
        print("I am not a command")


class MessageListener(Listener):  # Event listener must inherit Listener
    """
    Message listener responsible for the conversation with the user.

    Every incoming text message is passed to the retrieval QA chain and the
    generated answer is sent back to the same chat, split into chunks that
    fit into a single Telegram message.
    """

    def __init__(self, bot):
        self.bot = bot
        self.m_count = 0  # number of messages seen since start-up

    def on_message(self, message):  # called on every message
        """Answer ``message`` through the retrieval QA chain."""
        self.m_count += 1
        print(f"Total messages: {self.m_count}")
        # Non-text messages (photos, stickers, ...) have no text; there is
        # nothing to ask, so skip them instead of invoking the chain with None.
        if not message.text:
            return
        ans = retrievalQA.invoke(message.text)
        # The answer may exceed the per-message limit, so send it piecewise.
        for chunk in split_message(ans["result"]):
            self.bot.send_message(message.chat.id, chunk)

    def on_command_failure(self, message, err=None):  # When command fails
        """Tell the user that a command failed, including the error if one is given."""
        if err is None:
            self.bot.send_message(message.chat.id, "Command failed to bind arguments!")
        else:
            self.bot.send_message(message.chat.id, f"Error in command:\n{err}")


if __name__ == "__main__":
    # Imported locally: the import section above does not bring `sleep` (or
    # `os`) into scope, so the keep-alive loop below would fail without it.
    import os
    from time import sleep

    # Prefer a token supplied through the TELEGRAM_BOT_TOKEN environment
    # variable so the secret does not have to live in the source; fall back to
    # the in-file placeholder otherwise.
    token = os.environ.get("TELEGRAM_BOT_TOKEN", "PUT_TELEGRAM_TOKEN_HERE")
    bot = Bot(token)  # Create instance of OrigamiBot class

    # Add an event listener
    bot.add_listener(MessageListener(bot))

    # Add a command holder
    bot.add_commands(BotsCommands(bot))

    # We can add as many command holders
    # and event listeners as we like

    bot.start()  # start bot's threads
    print("*" * 25)
    print("Bot has been started!!!")
    # Keep the main thread alive while the bot's threads do the work.
    while True:
        sleep(1)
        # Can also do some useful work in the main thread,
        # like autoposting to channels for example





Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


*************************
Bot has been started!!!
Total messages: 1
