In [None]:
!pip install transformers sentence-transformers langchain
!pip install -U langchain-community
!pip install pypdf
!pip install faiss-cpu
!pip install langchain_huggingface


Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting langchain
  Downloading langchain-0.2.12-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.27 (from langchain)
  Downloading langchain_core-0.2.29-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.98-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.27->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.m

In [None]:
import pypdf
import os
from google.colab import userdata
from huggingface_hub import InferenceClient
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer

  from tqdm.autonotebook import tqdm, trange


In [None]:
# LayoutLM checkpoint loaded with a sequence-classification head.
# NOTE(review): `tokenizer` and `model` are not referenced by any other code
# visible in this notebook, and the recorded output of this cell reports that
# the classifier weights are newly initialized (i.e. untrained) -- confirm
# whether this load is still needed.
model_name = "microsoft/layoutlm-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Embedding model handed to create_faiss_index() to embed the PDF text chunks.
embedder = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

def extract_text_from_pdf(pdf_file_path):
    """Extract the text of every page of a PDF file using pypdf.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        str: The text of all pages concatenated in page order, with no
        separator inserted between pages. A page for which no text is
        extracted contributes an empty string.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_file_path, 'rb') as f:
        # The notebook imports the `pypdf` module, not the `PdfReader` name,
        # so the class has to be reached through the module; a bare
        # `PdfReader` raises NameError.
        pdf_reader = pypdf.PdfReader(f)
        # `or ""` keeps the join safe if a page yields None instead of text.
        # The join is evaluated before the file is closed.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def extract_text_from_multiple_pdfs(pdf_file_paths):
    """Return the text of several PDF files as a single string.

    Each path is passed to ``extract_text_from_pdf`` in the order given and
    the results are concatenated directly, with no separator between
    documents.

    Args:
        pdf_file_paths: Iterable of PDF file paths.

    Returns:
        str: The concatenated text; an empty string when no paths are given.
    """
    per_file_text = (extract_text_from_pdf(path) for path in pdf_file_paths)
    return "".join(per_file_text)

def create_faiss_index(text_data, embedder):
    """Build a FAISS vector store from newline-separated text.

    The text is split on newlines; each line is stripped of surrounding
    whitespace and blank lines are dropped, so empty strings are never
    embedded or returned by later similarity searches.

    Args:
        text_data: The full text to index, as one string.
        embedder: Embedding object passed through to ``FAISS.from_texts``.

    Returns:
        The vector store returned by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``text_data`` contains no non-blank line.
    """
    stripped_lines = (line.strip() for line in text_data.split("\n"))
    chunks = [line for line in stripped_lines if line]

    # Fail early with a clear message instead of handing an empty list to FAISS.
    if not chunks:
        raise ValueError("No non-empty text chunks to index")

    return FAISS.from_texts(chunks, embedder)

def handle_user_input(user_question, vectorstore, conversation_history):
    """Answer a question using text retrieved from the vector store.

    Args:
        user_question: The question typed by the user.
        vectorstore: Vector store exposing ``as_retriever()``.
        conversation_history: Earlier conversation turns, forwarded unchanged
            to ``generate_response_from_inference_api``.

    Returns:
        Whatever ``generate_response_from_inference_api`` returns for the
        question, the retrieved text and the history.
    """
    retriever = vectorstore.as_retriever()
    # `invoke` is the current retriever entry point; `get_relevant_documents`
    # is deprecated in langchain-core.
    documents = retriever.invoke(user_question)

    # Forward only the page text. Formatting the Document objects themselves
    # would put their repr (including metadata) into the prompt.
    relevant_text = "\n\n".join(
        getattr(doc, "page_content", str(doc)) for doc in documents
    )

    return generate_response_from_inference_api(user_question, relevant_text, conversation_history)


def generate_response_from_inference_api(
    user_question,
    relevant_text,
    conversation_history,
    model_id="mistralai/Mistral-Nemo-Instruct-2407",
    max_new_tokens=5000,
):
    """Build a prompt and send it to the Hugging Face Inference API.

    Args:
        user_question: The question to answer.
        relevant_text: Context retrieved from the PDFs; interpolated into the
            prompt as-is.
        conversation_history: Earlier turns. A list or tuple is rendered one
            turn per line; any other value is interpolated as-is.
        model_id: Model identifier passed to ``InferenceClient``.
        max_new_tokens: Generation budget passed to ``text_generation``.

    Returns:
        The ``generated_text`` attribute of the API response when it has one,
        otherwise the response object itself.

    Raises:
        ValueError: If the ``HUGGINGFACEHUB_API_TOKEN`` environment variable
            is unset or empty.
    """
    sec_key = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if not sec_key:
        raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable not set")

    # Render the history as plain lines; interpolating the list directly would
    # put its Python repr (brackets, quotes, escaped newlines) in the prompt.
    if isinstance(conversation_history, (list, tuple)):
        history_text = "\n".join(str(turn) for turn in conversation_history)
    else:
        history_text = conversation_history

    prompt = f"""
    You are an assistant that provides detailed answers based on the content of multiple PDF documents and the conversation history.

    Here is the relevant information extracted from the PDFs:
    {relevant_text}

    Here is the history of the conversation so far:
    {history_text}

    The user has asked the following question:
    {user_question}

    Provide a detailed and relevant answer based on the information from all the PDFs and the conversation history.

    Answer:
    """

    client = InferenceClient(model_id, token=sec_key)
    response = client.text_generation(prompt, max_new_tokens=max_new_tokens)
    # Accept both a plain return value and an object carrying `generated_text`.
    return getattr(response, "generated_text", response)


Some weights of LayoutLMForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# PDFs to index. The paths carry no directory component, so the files are
# looked up relative to the notebook's current working directory.
pdf_file_paths = ["budget_speech.pdf", "AGI.pdf", "AI Assistants.pdf"]

# Concatenate the text of all PDFs into one string, then build the FAISS
# vector store that the chat loop searches.
pdf_text = extract_text_from_multiple_pdfs(pdf_file_paths)
vectorstore = create_faiss_index(pdf_text, embedder)

In [None]:
# Interactive chat loop: read a question, answer it from the indexed PDFs, and
# keep a running transcript that is sent along with each later question.
conversation_history = []

while True:
    user_input = input("You: ").strip()

    # Check for the exit command first so it is never stored as a turn.
    if user_input.lower() == "exit":
        break

    # Nothing to answer for a blank line; prompt again.
    if not user_input:
        continue

    # The history passed here holds only earlier turns. The current question is
    # sent separately as `user_input`, so it is recorded only after the answer
    # arrives; appending it first would put it in the prompt twice.
    response = handle_user_input(user_input, vectorstore, conversation_history)
    conversation_history.append(f"You: {user_input}")
    conversation_history.append(f"Bot: {response}")
    print(f"Bot: {response}")


You: how many pdfs r there
Bot:  There are 4 PDFs in total.
You: summarise all 4 pdfs
Bot:  Based on the information provided from the PDFs and our conversation history, here's a summary of the four documents:

     - Document 1: The document discusses the process of summarizing and retrieving information from extensive volumes of textual material. It emphasizes the importance of effectively managing and processing such material.

     - Document 2: This document appears to be a table of contents, with '2 Overview' and '6' listed. It suggests that the document might be structured with an overview section followed by other sections.

     - Document 3: Similar to Document 2, this document also lists '2 Overview'. It could indicate that the document starts with an overview section.

     - Document 4: This document jumps straight to '4 METHODOLOGY'. It suggests that the document might focus on the methodology used in a particular process or study.

     In summary, the PDFs discuss infor