# Google Drive Connection

In [None]:
# Mount Google Drive at /content/gdrive so the project files are reachable
# from the notebook (requires the google.colab runtime).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Move into the project folder on the mounted Drive and list its contents.
# The relative paths used later in the notebook are resolved from here.
%cd /content/gdrive/MyDrive/QA-CHATBOT-LLM/RAG
%ls

# Libraries installation

In [None]:
from IPython.display import clear_output

# Install the notebook dependencies.
# Only sentence_transformers and huggingface-hub are pinned; every other
# package resolves to whatever version pip selects at install time.
# NOTE(review): consider pinning the remaining packages so that a fresh
# runtime reproduces the same environment.
! pip install sentence_transformers==2.2.2
!pip install huggingface-hub==0.25.0
!pip install langchain
!pip install torch
!pip install transformers
! pip install -qq -U tiktoken
! pip install -qq -U pypdf
! pip install -qq -U faiss-gpu
! pip install -qq -U InstructorEmbedding
! pip install -qq -U accelerate
! pip install -qq -U bitsandbytes
! pip install -qq -U langchain_community

# Hide the pip logs once the installation is finished.
clear_output()

# Libraries import

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import os
import glob
import textwrap
import time

import langchain

### loaders
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

### splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

### prompts
from langchain import PromptTemplate, LLMChain

### vector stores
from langchain.vectorstores import FAISS

### models
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings

### retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

clear_output()

In [None]:
# Report the versions of the main libraries loaded in this runtime.
for label, module in (
    ('langchain', langchain),
    ('torch', torch),
    ('transformers', transformers),
):
    print(f'{label}:', module.__version__)

# Environment configuration

In [None]:
class CFG:
    """Central configuration read by the model, splitting, embedding and retrieval steps."""

    # LLMs
    model_name = 'Llama' # backbone key handled by get_model ('Llama' or 'Mistral')
    # generation parameters forwarded to the transformers pipeline
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15

    # splitting
    # chunk size and overlap given to RecursiveCharacterTextSplitter
    split_chunk_size = 800
    split_overlap = 0

    # embeddings
    # model repo used to embed the chunks and to reload the vector store
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

    # similar passages
    # number of passages requested from the retriever
    k = 6

    # paths
    # folder scanned for the "<source>.txt" dataset files
    input_path = '/content/gdrive/MyDrive/QA-CHATBOT-LLM/RAG/Dataset/InputDataFormatted/'
    # folder the FAISS index is loaded from (relative to the working directory)
    Embeddings_path = 'qa-chatbot-llm-vectordb/faiss_index_hp'
    # base folder under which get_result saves the index ("<output_path>/faiss_index_hp")
    output_path = './qa-chatbot-llm-vectordb'

# Model definition



In [None]:
def get_model(model = CFG.model_name):
    """Load the tokenizer and the 4-bit quantized causal LM for a backbone.

    Args:
        model: Backbone key, either 'Mistral' or 'Llama'.
            Defaults to CFG.model_name.

    Returns:
        A (tokenizer, model, max_len) tuple, where max_len is the length
        limit associated with the selected backbone.

    Raises:
        ValueError: If `model` is not one of the supported keys. The check
            runs before anything is downloaded.
    """
    # Resolve the per-backbone settings first so an unsupported key fails
    # immediately with a clear message instead of an UnboundLocalError.
    if model == 'Mistral':
        model_repo = 'Moxoff/Mistral-Ita'
        tokenizer_kwargs = {}
        model_kwargs = {}
        max_len = 1024
    elif model == 'Llama':
        model_repo = 'daryl149/llama-2-13b-chat-hf'
        tokenizer_kwargs = {'use_fast': True}
        model_kwargs = {'trust_remote_code': True}
        max_len = 2048 # 8192
    else:
        raise ValueError(
            f"Not implemented model (tokenizer and backbone): {model!r}. "
            "Supported values are 'Mistral' and 'Llama'."
        )

    print('\nDownloading model: ', model, '\n\n')

    tokenizer = AutoTokenizer.from_pretrained(model_repo, **tokenizer_kwargs)

    # Same 4-bit NF4 quantization setup for every backbone
    bnb_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = "nf4",
        bnb_4bit_compute_dtype = torch.float16,
        bnb_4bit_use_double_quant = True,
    )

    loaded_model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        quantization_config = bnb_config,
        device_map = 'auto',
        low_cpu_mem_usage = True,
        **model_kwargs
    )

    return tokenizer, loaded_model, max_len

# HuggingFace access

In [None]:
# Read the Hugging Face token from the Colab secrets and log in through the CLI.
from google.colab import userdata
secret_hf = userdata.get('HUGGINGFACE_TOKEN')
# NOTE(review): the token is interpolated into a shell command line; consider
# a programmatic login so the secret never appears in a process argument list.
!huggingface-cli login --token $secret_hf

# Model access

In [None]:
# Load the tokenizer, the model and the length limit of the backbone selected in CFG.
tokenizer, model, max_len = get_model(model = CFG.model_name)

# Clear the loading logs from the cell output.
clear_output()

In [None]:
# Put the model in evaluation (inference) mode; the returned model is shown as the cell output.
model.eval()

In [None]:
### Show how Accelerate split the model across the available devices (GPUs)
model.hf_device_map

In [None]:
### Hugging Face text-generation pipeline built on the loaded model and tokenizer
# NOTE(review): do_sample is left commented out, so temperature / top_p are
# presumably not used by the generation — confirm against the transformers docs.
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    pad_token_id = tokenizer.eos_token_id,
#   do_sample = True,
    max_length = max_len,
    temperature = CFG.temperature,
    top_p = CFG.top_p,
    repetition_penalty = CFG.repetition_penalty
)

### LangChain wrapper around the pipeline, used as the LLM of the QA chain
llm = HuggingFacePipeline(pipeline = pipe)

In [None]:
# Display the LangChain LLM wrapper as the cell output.
llm

# RAG

In [None]:
from langchain.document_loaders import DirectoryLoader, TextLoader


# Load dataset
def get_loader(source):
    """Load the text file of one subject from the input folder.

    Args:
        source: File name, without the ".txt" extension, looked up inside
            CFG.input_path.

    Returns:
        The list of documents produced by the directory loader.
    """
    file_pattern = "".join(("./", source, ".txt"))

    documents = DirectoryLoader(
        CFG.input_path,
        glob=file_pattern,
        loader_cls=TextLoader,
        show_progress=True,
        use_multithreading=True,
    ).load()

    print('Loading sample dataset\n')
    print(f'We have {len(documents)} total page')
    return documents


# Split the documents into chunks
def chunk_split(documents):
    """Split the loaded documents into chunks.

    The chunk size and overlap come from CFG.split_chunk_size and
    CFG.split_overlap.

    Args:
        documents: Documents to split.

    Returns:
        The list of chunks produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap = CFG.split_overlap,
        chunk_size = CFG.split_chunk_size,
    )

    chunks = splitter.split_documents(documents)
    print(f'{len(chunks)} chunk of {len(documents)} pages created')
    return chunks


# Delete the files inside a folder (sub-folders are kept)
def clear_folder(path):
    """Delete the files stored directly in `path` when it holds a saved FAISS index.

    The folder is only cleaned when `path` itself contains an "index.faiss"
    file, so a folder that does not hold an index (or does not exist) is
    left untouched. Sub-folders are never removed.

    Args:
        path: Folder to clean.
    """
    # Check the folder that is actually going to be cleaned, not a global path
    if not os.path.exists(os.path.join(path, 'index.faiss')):
        return

    for item in os.listdir(path):
        item_path = os.path.join(path, item)
        if os.path.isdir(item_path):
            continue
        os.remove(item_path)
        print("Deleted file: " + item_path)


# Create embeddings
def create_embeddings(path, texts):
    """Embed the chunks, build a FAISS index and save it under `path`.

    Files of a previous index found in `path` are removed first.

    Args:
        path: Folder where the FAISS index is saved.
        texts: Document chunks to embed.

    Returns:
        The FAISS vector store that was built and saved.
    """
    clear_folder(path)

    embeddings = HuggingFaceInstructEmbeddings(
        model_name = CFG.embeddings_model_repo,
        model_kwargs = {"device": "cuda"}
    )

    vectordb = FAISS.from_documents(
        documents = texts,
        embedding = embeddings
    )

    # Save first, then report: the message is only true once the index is on disk
    vectordb.save_local(path)
    print("Embeddings created at: " + path)

    # Return the store itself rather than the None that save_local yields
    return vectordb


# Load vectordb
def load_vectordb(vectordb):
    """Load the FAISS index stored at CFG.Embeddings_path.

    Args:
        vectordb: Not read by this function; the index is always loaded
            from disk.

    Returns:
        The FAISS vector store loaded from CFG.Embeddings_path.
    """
    instruct_embeddings = HuggingFaceInstructEmbeddings(
        model_kwargs = {"device": "cuda"},
        model_name = CFG.embeddings_model_repo,
    )

    ### load the vector DB from disk
    # Warning: allow_dangerous_deserialization is enabled, only load index
    # files that come from a trusted source.
    loaded_db = FAISS.load_local(
        CFG.Embeddings_path,
        instruct_embeddings,
        allow_dangerous_deserialization=True,
    )

    print("vectordb loaded")
    clear_output()
    return loaded_db

# Create retriever
def create_retriever(vectordb, k, search_type):
    """Build a retriever on top of the vector store.

    Args:
        vectordb: Vector store exposing `as_retriever`.
        k: Number of passages to retrieve per query.
        search_type: Search strategy of the retriever (e.g. "similarity").

    Returns:
        The retriever returned by `vectordb.as_retriever`.
    """
    # search_type is an argument of as_retriever itself; only the options of
    # the search (such as k) belong in search_kwargs.
    return vectordb.as_retriever(
        search_type = search_type,
        search_kwargs = {"k": k}
    )


# Prompt definition
# Prompt given to the LLM: {context} and {question} are the two placeholders
# declared as input variables below. The template ends with "Answer:", the
# marker that get_result later uses to cut the answer out of the output.
prompt_template = """
Answer specifically only the input question and do not answer the questions present in the context.
If the answer is not provided in the context, simply write "Answer not available in the context", and do not provide the wrong answer.
Use only the following context to answer the final question.

Context: {context}

Question: {question}
Answer:"""


# Prompt object passed to the QA chain
PROMPT = PromptTemplate(
    template = prompt_template,
    input_variables = ["context", "question"]
)

# Create qa_chain
def create_qa_chain(llm, retriever, PROMPT):
    """Assemble the RetrievalQA chain.

    Args:
        llm: Language model used to generate the answer.
        retriever: Retriever providing the context passages.
        PROMPT: Prompt template handed to the chain.

    Returns:
        A RetrievalQA chain of type "stuff" that also returns the source
        documents.
    """
    chain_options = {
        "llm": llm,
        "chain_type": "stuff", # map_reduce, map_rerank, stuff, refine
        "retriever": retriever,
        "chain_type_kwargs": {"prompt": PROMPT},
        "return_source_documents": True,
        "verbose": False,
    }
    return RetrievalQA.from_chain_type(**chain_options)


# Output post-processing
def wrap_text_preserve_newlines(text, width=700):
    """Wrap every line of `text` to `width` characters, keeping the line breaks.

    Args:
        text: Text to wrap.
        width: Maximum length of a wrapped line.

    Returns:
        The wrapped text, with the original newlines preserved.
    """
    return '\n'.join(
        textwrap.fill(single_line, width=width)
        for single_line in text.split('\n')
    )


def process_llm_response(llm_response):
    """Extract and wrap the answer text of a QA chain response.

    Only the 'result' entry is used; the source documents of the response
    are not part of the returned text.

    Args:
        llm_response: Mapping returned by the QA chain, holding the answer
            under the 'result' key.

    Returns:
        The answer text wrapped by wrap_text_preserve_newlines.
    """
    # The list of source file names used to be computed here but was never
    # returned; it has been dropped so that a response without source
    # metadata cannot fail for a value nobody reads.
    return wrap_text_preserve_newlines(llm_response['result'])


def extract_text_after_substring(s, substring1):
    """Return what follows the first occurrence of `substring1` in `s`.

    Args:
        s: Text to search.
        substring1: Marker to look for.

    Returns:
        The text after the first occurrence of the marker, or an empty
        string when the marker is not found.
    """
    position = s.find(substring1)
    marker_found = position >= 0
    return s[position + len(substring1):] if marker_found else ""


# LLM response
def llm_ans(qa_chain, query):
    """Run the QA chain on a query and return the processed answer.

    Args:
        qa_chain: Chain exposing `invoke`.
        query: Question to ask.

    Returns:
        The answer text produced by process_llm_response.
    """
    # The elapsed time used to be measured here but was never used or
    # reported, so the timing code has been removed.
    llm_response = qa_chain.invoke(query)
    return process_llm_response(llm_response)

# Output
def get_result(source, query):
    """Answer `query` using the documents of subject `source`.

    The whole pipeline runs on every call: load the subject file, split it
    into chunks, rebuild and save the FAISS index, reload it, build the
    retriever and the QA chain, then query the LLM.

    Args:
        source: Name of the dataset file (without extension) to use as
            knowledge base.
        query: Question to ask.

    Returns:
        The text following the "Answer:" marker in the LLM output, or the
        whole output when the marker is not present.
    """
    documents = get_loader(source)
    texts = chunk_split(documents)

    # destination folder of the embeddings
    output_path = f"{CFG.output_path}/faiss_index_hp"
    vectordb = create_embeddings(output_path, texts)

    # TODO: still to be tested. load_vectordb reloads the index from
    # CFG.Embeddings_path and does not read its argument.
    vectordb = load_vectordb(vectordb)

    retriever = create_retriever(vectordb, CFG.k, "similarity")
    qa_chain = create_qa_chain(llm, retriever, PROMPT)
    ans = llm_ans(qa_chain, query)

    # Keep only what follows the prompt's final "Answer:" marker. When the
    # output does not contain the marker (presumably when the LLM wrapper
    # already strips the prompt — depends on the installed version), return
    # the output as is instead of an empty string.
    marker = "Answer:"
    if marker not in ans:
        return ans
    return extract_text_after_substring(ans, marker)

# Quick check of the whole pipeline on one sample question.
print(get_result("Machine-Learning", "How many type of Machine Learning exist?"))

In [None]:
# Make locale.getpreferredencoding always report UTF-8.
# NOTE(review): presumably a workaround for shell commands failing on a
# non-UTF-8 locale in this runtime — confirm it is still required.
import locale
locale.getpreferredencoding = lambda: "UTF-8"

# Install (or upgrade to) the latest gradio release; the version is not pinned.
! pip install --upgrade gradio -qq

import gradio as gr
print(gr.__version__)

import gradio as gr


def respond_to_message(state, item, message):
    """Answer a chat message and update the conversation history.

    Args:
        state: Chat history, a list of (user message, response) tuples.
        item: Subject selected in the dropdown; passed to get_result as
            the knowledge base name.
        message: Text typed by the user.

    Returns:
        A (chat history, chat state, textbox value) tuple matching the
        outputs of the event listeners: the updated history is returned
        for both the chatbot and the state, and the textbox is cleared.
    """
    # Nothing to answer: keep the history as it is and just clear the textbox
    if not message or not message.strip():
        return state, state, ""

    if not item:
        # Without a subject get_result cannot build the dataset file name
        response = "Please select a subject before sending a message."
    else:
        # Generate a response based on the selected subject and the message
        response = get_result(item, message)

    # Append the user message and the response to the chat history
    state.append((message, response))
    return state, state, ""

def reset_chat(state=None):
    """Reset the conversation.

    Args:
        state: Current chat history. It is not read; the parameter is
            optional so the handler also works when the event listener
            passes no inputs.

    Returns:
        An (empty chat, empty state, empty textbox value) tuple.
    """
    return [], [], ""

with gr.Blocks() as demo:

    # Chat history shared between events, initialised as an empty list
    state = gr.State([])

    with gr.Tab("💬 QA Chatbot"):
        gr.Markdown("## 🗣️ Hi, how can I help you?")
        with gr.Row():
            item_dropdown = gr.Dropdown(["World-History", "Machine-Learning", "Health-and-Wellness"], label="Select your subject")
        with gr.Row():
            direct_chatbot = gr.Chatbot(label="💬 Chat")
        with gr.Row():
            with gr.Column(scale=5):
                direct_textbox = gr.Textbox(placeholder="💭 Insert your message here...")
            direct_submit_btn = gr.Button(value="Send")
        with gr.Row():
            direct_reset_btn = gr.Button(value="🗑️ Reset Chat")

    # Inputs of the chat handler: current history, selected subject, typed message
    chat_inputs = [state, item_dropdown, direct_textbox]
    # Outputs of every handler: chatbot content, chat history, textbox content
    chat_outputs = [direct_chatbot, state, direct_textbox]

    # Send the message when the button is pressed...
    direct_submit_btn.click(
        respond_to_message,
        inputs=chat_inputs,
        outputs=chat_outputs
    )

    # ...or when Enter is pressed in the textbox
    direct_textbox.submit(
        respond_to_message,
        inputs=chat_inputs,
        outputs=chat_outputs
    )

    # reset_chat takes the chat state as argument, so it has to be listed
    # as input; an empty input list would call the handler without it.
    direct_reset_btn.click(
        reset_chat,
        inputs=[state],
        outputs=chat_outputs
    )


if __name__ == "__main__":
    demo.launch()
