In [None]:
!pip install pdfplumber
!pip install cohere



In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 --upgrade

In [None]:
!pip install langchain einops accelerate transformers bitsandbytes scipy

In [None]:
!pip install xformers sentencepiece

In [None]:
!pip install -U bitsandbytes


In [None]:
import gradio as gr
from PyPDF2 import PdfReader
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
import cohere
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch

def load_llama_model():
    """Load the Llama 3.1 8B Instruct tokenizer and a 4-bit quantized model.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable so that no credential is stored in the notebook. If the variable
    is unset or empty, ``None`` is passed as the token.

    Returns:
        tuple: ``(tokenizer, model)`` as returned by
        ``AutoTokenizer.from_pretrained`` and
        ``AutoModelForCausalLM.from_pretrained``.
    """
    name = "meta-llama/Llama-3.1-8B-Instruct"
    auth_token = os.environ.get("HF_TOKEN") or None

    # Create tokenizer
    tokenizer = AutoTokenizer.from_pretrained(name, cache_dir='./model/', use_auth_token=auth_token)

    # 4-bit NF4 quantization via bitsandbytes. The keyword is `load_in_4bit`
    # (singular); a misspelled keyword is reported as an unused kwarg and the
    # model is then loaded without quantization.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    model = AutoModelForCausalLM.from_pretrained(
        name,
        cache_dir='./model/',
        use_auth_token=auth_token,
        torch_dtype=torch.float16,  # dtype for the non-quantized parts of the model
        quantization_config=bnb_config,
        device_map="auto"  # let accelerate place the weights on the available device(s)
    )
    return tokenizer, model

def split_into_sentence_chunks(text, max_length):
    """Group the sentences of ``text`` into chunks of at most ``max_length`` characters.

    Sentences are delimited by ``". "``. Consecutive sentences are joined with
    a single space for as long as the joined chunk stays within ``max_length``.
    A single sentence longer than ``max_length`` is kept whole as its own chunk.

    Args:
        text (str): The text to split.
        max_length (int): Maximum chunk length in characters.

    Returns:
        list[str]: The chunks in document order. No chunk is empty, so empty
        or whitespace-only text yields an empty list.
    """
    pieces = text.split(". ")
    last_index = len(pieces) - 1

    chunks = []
    current_chunk = ""

    for index, piece in enumerate(pieces):
        sentence = piece.strip()
        if not sentence:
            continue

        # split() consumed the period of every sentence except the last one,
        # so restore it only there; the final piece keeps its own ending
        # (this avoids producing "..", or a "." that was never in the text).
        if index != last_index:
            sentence += "."

        if not current_chunk:
            # Never flush an empty chunk, even if this sentence alone is too long.
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) <= max_length:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def generate_embedding(text):
    """Embed one piece of text with the module-level Cohere client.

    Args:
        text: The text to embed.

    Returns:
        The first entry of the ``embeddings`` attribute of the embed response.
    """
    embeddings = cohere_client.embed(texts=[text]).embeddings
    return embeddings[0]

# Function to process PDF files and return the filename with a success message
def process_pdfs(file_list):
    """Extract text from the uploaded PDFs, chunk it, and embed every chunk.

    Args:
        file_list: Uploaded files from the Gradio ``File`` component. Each item
            may be a path string or an object whose ``name`` attribute is the path.

    Returns:
        str: One ``'<file>' was processed successfully.<br>`` entry per file,
        or ``"Please upload a file."`` when ``file_list`` is empty.

    Side effects:
        Replaces the module-level ``documents`` and
        ``stored_document_embeddings``. Both cover the chunks of *all* files
        and are index-aligned (``documents[i]`` was embedded into
        ``stored_document_embeddings[i]``).
    """
    global stored_document_embeddings
    global documents

    if not file_list:
        return "Please upload a file."

    success_messages = ""
    all_chunks = []           # chunk texts of every file, in order
    document_embeddings = []  # one embedding per entry of all_chunks

    for file in file_list:
        # Accept both plain path strings and file objects exposing `.name`.
        file_path = getattr(file, "name", file)
        file_name = os.path.basename(file_path)

        # Read the PDF; guard against pages whose extract_text() gives None.
        reader = PdfReader(file_path)
        pdf_text = "".join(page.extract_text() or "" for page in reader.pages)

        # Split the text into roughly ten chunks and drop empty ones so that
        # no empty string is sent to the embedding API.
        chunk_size = len(pdf_text) // 10
        chunks = [chunk for chunk in split_into_sentence_chunks(pdf_text, chunk_size) if chunk]

        # Accumulate across files so texts and embeddings stay aligned.
        all_chunks.extend(chunks)
        document_embeddings.extend(generate_embedding(chunk) for chunk in chunks)

        # Report success only after the file has actually been embedded.
        success_messages += f"'{file_name}' was processed successfully.<br>"

    documents = all_chunks
    stored_document_embeddings = document_embeddings

    return success_messages.strip()  # Return only success messages

def get_top_n_relevant_documents(query_embedding, document_embeddings, documents, top_n=3):
    """Return the ``top_n`` chunks most similar to the query embedding.

    Args:
        query_embedding: Embedding vector of the query.
        document_embeddings: Sequence of embedding vectors, index-aligned with ``documents``.
        documents: Sequence of chunk texts.
        top_n (int): Maximum number of chunks to return.

    Returns:
        tuple: ``(relevant_documents, details)`` where ``relevant_documents``
        is the selected chunks, each followed by a newline, ordered from most
        to least similar, and ``details`` is a list of
        ``(chunk_text, similarity)`` pairs in the same order. Returns
        ``("", [])`` when there are no embeddings or ``top_n`` is not positive.
    """
    # cosine_similarity rejects an empty matrix, and a slice of [-0:] would
    # select every document, so handle both cases up front.
    if len(document_embeddings) == 0 or top_n <= 0:
        return "", []

    scores = cosine_similarity([query_embedding], document_embeddings)[0]

    # argsort is ascending: take the last top_n indices and reverse them.
    top_indices = scores.argsort()[-top_n:][::-1]

    details = [(documents[idx], scores[idx]) for idx in top_indices]
    # Separate chunks with a newline so they do not run into each other in the prompt.
    relevant_documents = "".join(f"{chunk}\n" for chunk, _ in details)

    return relevant_documents, details

def generate_query_embedding(query):
    """Embed a user query by delegating to ``generate_embedding``."""
    query_vector = generate_embedding(query)
    return query_vector

# Function to handle chat input and return a response
def respond_to_user(message, history):
    """Answer a chat message using the stored chunks most relevant to it.

    Args:
        message (str): The user's latest chat message.
        history: Conversation history passed in by ``gr.ChatInterface``; not used here.

    Returns:
        str: The model's reply only (without the prompt), which is the value
        ``gr.ChatInterface`` expects from its handler. If no PDF has been
        processed yet, a short instruction to upload one is returned instead.
    """
    # Nothing to retrieve from until process_pdfs has filled the store.
    if len(stored_document_embeddings) == 0:
        return "Please upload and process a PDF before asking questions."

    query_embedding = generate_query_embedding(message)
    relevant_documents, _ = get_top_n_relevant_documents(query_embedding, stored_document_embeddings, documents)
    prompt = message+"\nUse these Above Documents and Give me an Answer:\n"+relevant_documents
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=50,  # Limit the number of output tokens (adjust as needed)
        do_sample=True,     # temperature/top_p only take effect when sampling is enabled
        temperature=0.7,    # Control randomness (higher value = more random)
        top_p=0.9,          # Control diversity (higher value = more diverse)
        use_cache=True
    )

    # generate() returns the prompt tokens followed by the new tokens; decode
    # only the new ones so the reply does not echo the prompt.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    return generated_text.strip()



def file_upload_ui():
    """Build the Gradio Blocks app: a PDF upload section followed by a chat panel.

    Returns:
        The object bound by ``with gr.Blocks() as ...``; it is not launched here.
    """
    with gr.Blocks() as app:
        gr.Markdown("""### DocuMind \nUpload PDF file, and the chatbot will process their contents.""")

        # Upload widget sits in its own row.
        with gr.Row():
            uploaded_pdfs = gr.File(label="Upload PDFs", file_types=[".pdf"], file_count="multiple")

        process_button = gr.Button("Process PDFs")
        status_markdown = gr.Markdown("")  # filled with the result of process_pdfs

        # Clicking the button runs process_pdfs on the uploads and shows its message.
        process_button.click(process_pdfs, inputs=uploaded_pdfs, outputs=status_markdown)

        # Chat section, answered by respond_to_user.
        gr.Markdown("""### Chat with the bot""")
        gr.ChatInterface(fn=respond_to_user)

    return app

# Module-level stores: process_pdfs replaces them, respond_to_user reads them.
stored_document_embeddings = []
documents = []

# Launch the Gradio app
if __name__ == "__main__":
    # Read the Cohere key from the environment so no credential lives in the notebook.
    cohere_api_key = os.environ.get("COHERE_API_KEY")
    if not cohere_api_key:
        raise RuntimeError("Set the COHERE_API_KEY environment variable before launching the app.")

    cohere_client = cohere.Client(cohere_api_key)
    tokenizer, model = load_llama_model()
    ui = file_upload_ui()
    ui.launch(debug=True)

Unused kwargs: ['load_in_8bits', 'bnb_8bit_use_double_quant', 'bnb_8bit_quant_type', 'bnb_8bit_compute_dtype']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://cddd807a71da34f811.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
    Output components:
        [state, chatbot]
    Output values returned:
        ["What is the Moral Of the story?
Relevant Documents:
The sweetness of the fruit tasted even better when shared, and Raju became known as the boy who learned patience.
Moral: Good things come to those who wait. Patience brings rewards that haste cannot.
.Late at night, he climbed the tree and ﬁlled his basket with green mangoes.
The next morning, he eagerly bit into one. To his surprise, it was sour and bitter. He had missed the sweetest part of the mangoes.
The boy then decided to come back the next day.
The next day, he ate the mangoes and found that they were sweet and delicious.
Raju had learned that waiting for a little", [('What is the Moral Of the story?', 'What is the Moral Of the story?\nRelevant Documents:\nThe sweetness of the fruit tasted even better when shared, and Raju became known as the boy who learned patience.\nM

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://cddd807a71da34f811.gradio.live


In [None]:
import gradio as gr
from PyPDF2 import PdfReader
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
import cohere
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch


# Load the LLaMA model
def load_llama_model():
    """Load the Llama 3.1 8B Instruct tokenizer and a 4-bit quantized model.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable so that no credential is stored in the notebook. If the variable
    is unset or empty, ``None`` is passed as the token.

    NOTE(review): a token literal that was ever saved in this cell should be
    treated as leaked and revoked.

    Returns:
        tuple: ``(tokenizer, model)`` as returned by
        ``AutoTokenizer.from_pretrained`` and
        ``AutoModelForCausalLM.from_pretrained``.
    """
    name = "meta-llama/Llama-3.1-8B-Instruct"
    auth_token = os.environ.get("HF_TOKEN") or None

    # Create tokenizer
    tokenizer = AutoTokenizer.from_pretrained(name, cache_dir='./model/', use_auth_token=auth_token)

    # 4-bit NF4 quantization via bitsandbytes. The keyword is `load_in_4bit`
    # (singular); the misspelled `load_in_4bits` is reported as an unused
    # kwarg and the model is then loaded without quantization.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    model = AutoModelForCausalLM.from_pretrained(
        name,
        cache_dir='./model/',
        use_auth_token=auth_token,
        torch_dtype=torch.float16,  # dtype for the non-quantized parts of the model
        quantization_config=bnb_config,
        device_map="auto"  # let accelerate place the weights on the available device(s)
    )
    return tokenizer, model


# Split text into chunks based on maximum length
def split_into_sentence_chunks(text, max_length):
    """Group the sentences of ``text`` into chunks of at most ``max_length`` characters.

    Sentences are delimited by ``". "``. Consecutive sentences are joined with
    a single space for as long as the joined chunk stays within ``max_length``.
    A single sentence longer than ``max_length`` is kept whole as its own chunk.

    Args:
        text (str): The text to split.
        max_length (int): Maximum chunk length in characters.

    Returns:
        list[str]: The chunks in document order. No chunk is empty, so empty
        or whitespace-only text yields an empty list.
    """
    pieces = text.split(". ")
    last_index = len(pieces) - 1

    chunks = []
    current_chunk = ""

    for index, piece in enumerate(pieces):
        sentence = piece.strip()
        if not sentence:
            continue

        # split() consumed the period of every sentence except the last one,
        # so restore it only there; the final piece keeps its own ending
        # (this avoids producing "..", or a "." that was never in the text).
        if index != last_index:
            sentence += "."

        if not current_chunk:
            # Never flush an empty chunk, even if this sentence alone is too long.
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) <= max_length:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks


# Generate embeddings using Cohere API
def generate_embedding(text):
    """Return the Cohere embedding for a single text.

    The text is sent as a one-element batch through the module-level
    ``cohere_client`` and the first embedding of the response is returned.
    """
    batch = [text]
    result = cohere_client.embed(texts=batch)
    first_embedding = result.embeddings[0]
    return first_embedding


# Process PDF files and extract embeddings
def process_pdfs(file_list):
    """Extract text from the uploaded PDFs, chunk it, and embed every chunk.

    Args:
        file_list: Uploaded files from the Gradio ``File`` component. Each item
            may be a path string or an object whose ``name`` attribute is the path.

    Returns:
        str: One ``'<file>' was processed successfully.<br>`` entry per file,
        or ``"Please upload a file."`` when ``file_list`` is empty.

    Side effects:
        Replaces the module-level ``documents`` and
        ``stored_document_embeddings``. Both cover the chunks of *all* files
        and are index-aligned (``documents[i]`` was embedded into
        ``stored_document_embeddings[i]``).
    """
    global stored_document_embeddings
    global documents

    if not file_list:
        return "Please upload a file."

    success_messages = ""
    all_chunks = []           # chunk texts of every file, in order
    document_embeddings = []  # one embedding per entry of all_chunks

    for file in file_list:
        # Accept both plain path strings and file objects exposing `.name`.
        file_path = getattr(file, "name", file)
        file_name = os.path.basename(file_path)

        # Read the PDF; guard against pages whose extract_text() gives None.
        reader = PdfReader(file_path)
        pdf_text = "".join(page.extract_text() or "" for page in reader.pages)

        # Split the text into chunks and drop empty ones so that no empty
        # string is sent to the embedding API.
        chunk_size = 300  # Adjust chunk size based on document content
        chunks = [chunk for chunk in split_into_sentence_chunks(pdf_text, chunk_size) if chunk]

        # Accumulate across files so texts and embeddings stay aligned.
        all_chunks.extend(chunks)
        document_embeddings.extend(generate_embedding(chunk) for chunk in chunks)

        # Report success only after the file has actually been embedded.
        success_messages += f"'{file_name}' was processed successfully.<br>"

    documents = all_chunks
    stored_document_embeddings = document_embeddings

    return success_messages.strip()  # Return only success messages


# Get top N relevant documents based on cosine similarity
def get_top_n_relevant_documents(query_embedding, document_embeddings, documents, top_n=3):
    """Return the ``top_n`` chunks most similar to the query embedding.

    Args:
        query_embedding: Embedding vector of the query.
        document_embeddings: Sequence of embedding vectors, index-aligned with ``documents``.
        documents: Sequence of chunk texts.
        top_n (int): Maximum number of chunks to return.

    Returns:
        tuple: ``(relevant_documents, details)`` where ``relevant_documents``
        is the selected chunks, each followed by a newline, ordered from most
        to least similar, and ``details`` is a list of
        ``(chunk_text, similarity)`` pairs in the same order. Returns
        ``("", [])`` when there are no embeddings or ``top_n`` is not positive.
    """
    # cosine_similarity rejects an empty matrix, and a slice of [-0:] would
    # select every document, so handle both cases up front.
    if len(document_embeddings) == 0 or top_n <= 0:
        return "", []

    similarities = cosine_similarity([query_embedding], document_embeddings)
    scores = similarities[0]

    # argsort is ascending: take the last top_n indices and reverse them.
    top_indices = scores.argsort()[-top_n:][::-1]

    relevant_documents = ""
    details = []

    for idx in top_indices:
        relevant_documents += documents[idx] + "\n"
        details.append((documents[idx], scores[idx]))

    return relevant_documents, details


# Generate embeddings for a query
def generate_query_embedding(query):
    """Embed the user's query with the same routine used for document chunks."""
    embedding = generate_embedding(query)
    return embedding


# Handle chat input and return a response
def respond_to_user(message, history):
    """Answer a chat message using the stored chunks most relevant to it.

    Args:
        message (str): The user's latest chat message.
        history: Conversation history passed in by ``gr.ChatInterface``; not used here.

    Returns:
        str: The model's reply only (without the prompt), which is the value
        ``gr.ChatInterface`` expects from its handler. If no PDF has been
        processed yet, a short instruction to upload one is returned instead.
    """
    # Nothing to retrieve from until process_pdfs has filled the store.
    if len(stored_document_embeddings) == 0:
        return "Please upload and process a PDF before asking questions."

    query_embedding = generate_query_embedding(message)
    relevant_documents, _ = get_top_n_relevant_documents(query_embedding, stored_document_embeddings, documents)

    # Prompt: retrieved context first, then the question, then the instruction.
    prompt = (
        f"The following documents contain useful information:\n\n"
        f"{relevant_documents}\n\n"
        f"User's Question:\n{message}\n\n"
        f"Please provide a detailed answer based on the above information."
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        # A fixed token budget: len(prompt) counts characters, not tokens, and
        # made the budget grow with the amount of retrieved text.
        max_new_tokens=256,
        do_sample=True,  # temperature/top_p only take effect when sampling is enabled
        temperature=0.7,
        top_p=0.9,
        use_cache=True
    )

    # generate() returns the prompt tokens followed by the new tokens; decode
    # only the new ones instead of trying to strip the prompt from the text.
    prompt_length = inputs["input_ids"].shape[-1]
    final_answer = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    return final_answer.strip()


# Create Gradio UI for uploading PDFs and chatting
def file_upload_ui():
    """Assemble the Gradio layout: PDF upload and processing on top, chat below.

    Returns:
        The object bound by ``with gr.Blocks() as ...``; it is not launched here.
    """
    with gr.Blocks() as app:
        gr.Markdown("""### DocuMind \nUpload PDF files, and the chatbot will process their contents.""")

        # Upload widget sits in its own row.
        with gr.Row():
            uploaded_pdfs = gr.File(label="Upload PDFs", file_types=[".pdf"], file_count="multiple")

        process_button = gr.Button("Process PDFs")
        status_markdown = gr.Markdown("")  # filled with the result of process_pdfs

        # Clicking the button runs process_pdfs on the uploads and shows its message.
        process_button.click(process_pdfs, inputs=uploaded_pdfs, outputs=status_markdown)

        # Chat section, answered by respond_to_user.
        gr.Markdown("""### Chat with the bot""")
        gr.ChatInterface(fn=respond_to_user)

    return app


# Module-level stores: process_pdfs replaces them, respond_to_user reads them.
stored_document_embeddings = []
documents = []

# Launch the Gradio app
if __name__ == "__main__":
    # Read the Cohere key from the environment so no credential lives in the notebook.
    # NOTE(review): a key literal that was ever saved in this cell should be
    # treated as leaked and rotated.
    cohere_api_key = os.environ.get("COHERE_API_KEY")
    if not cohere_api_key:
        raise RuntimeError("Set the COHERE_API_KEY environment variable before launching the app.")

    cohere_client = cohere.Client(cohere_api_key)
    tokenizer, model = load_llama_model()
    ui = file_upload_ui()
    ui.launch(debug=True)


Unused kwargs: ['load_in_4bits']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://7fc9b1d97920a0040a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
    Output components:
        [state, chatbot]
    Output values returned:
        ["The following documents contain useful information:

Consequences  
 
World War  II caused immense devastation, with an estimated 70 -85 million deaths, including six 
million Jews killed in the Holocaust. Entire cities were destroyed, and economies lay in ruins.
World War II: A Brief Overview  
 
World War II (1939 –1945) was a global conflict that reshaped the modern world. Spanning six years, 
it involved more than 30 countries and resulted in unprecedented destruction and loss of life.
World War II remains a pivotal chapter in human history, offering lessons on the consequences of 
unchecked aggression and the importance of international cooperation. Its impact continues to 
shape global politics and society today.  
.


User's Question:
Tell Me About the consequences Of World War 2?

Please provide a detailed answer based on

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://7fc9b1d97920a0040a.gradio.live


In [None]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
# Scratch cell: run one generation outside the Gradio app.
# NOTE(review): `prompt` is not defined at notebook scope in the visible cells
# (it is only a local variable inside respond_to_user), so assign it before
# running this cell. `tokenizer` and `model` come from the app cell.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Run the model to generate the output
output = model.generate(
    **inputs,
    max_new_tokens=50,  # Limit the number of output tokens (adjust as needed)
    do_sample=True,     # temperature/top_p only take effect when sampling is enabled
    temperature=0.7,    # Control randomness (higher value = more random)
    top_p=0.9,          # Control diversity (higher value = more diverse)
    use_cache=True
)


In [None]:

# Decode the generated output tokens to text
# NOTE(review): relies on `output` and `tokenizer` left in the kernel by earlier
# cells. `output[0]` is decoded in full, so the text presumably contains the
# prompt as well as the continuation — TODO confirm.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated output
print(generated_text)


In [None]:
!pip install pdfplumber
!pip install nltk
!pip install cohere