In [None]:
!pip install transformers sentence-transformers langchain
!pip install -U langchain-community
!pip install pypdf
!pip install faiss-cpu
!pip install langchain_huggingface



Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting langchain
  Downloading langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.32 (from langchain)
  Downloading langchain_core-0.2.34-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.104-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.32->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting httpx<1,>=0.23.0 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Colle

In [None]:
import pypdf
import os
from google.colab import userdata
from huggingface_hub import InferenceClient
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import transformers

In [None]:
# LayoutLM checkpoint loaded with a sequence-classification head. The loader
# warning in this cell's output reports that the classifier weights are newly
# initialised, i.e. the head is untrained.
# NOTE(review): neither `tokenizer` nor `model` is referenced by the other cells
# visible here -- confirm whether this download is still needed.
model_name = "microsoft/layoutlm-base-uncased"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)

# Embedding model handed to create_faiss_index() to embed the PDF text chunks.
embedder = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

def extract_text_from_pdf(pdf_file_path):
    """Return the text of every page of a PDF, in page order.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        str: The extracted text of all pages joined with a newline between
        pages, so the last line of one page is not fused with the first line
        of the next. Pages that yield no text contribute an empty string.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_file_path, 'rb') as f:
        pdf_reader = pypdf.PdfReader(f)
        # `or ""` keeps the join safe if a page's extractor yields None.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # Single join instead of repeated `+=` on a growing string.
    return "\n".join(page_texts)

def create_faiss_index(text_data, embedder):
    """Build a FAISS vector store from newline-separated text.

    Each non-blank line of ``text_data`` becomes one indexed chunk. Blank and
    whitespace-only lines are dropped so they are neither embedded nor returned
    as search hits.

    Args:
        text_data: The full document text; chunks are delimited by "\\n".
        embedder: Embedding object passed through to ``FAISS.from_texts``.

    Returns:
        The vector store returned by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``text_data`` contains no non-blank lines (for example
            a PDF from which no text could be extracted).
    """
    stripped_lines = (line.strip() for line in text_data.split("\n"))
    chunks = [line for line in stripped_lines if line]
    if not chunks:
        # Fail with a clear message instead of building an index of empty strings.
        raise ValueError("No text to index: the document contains no non-blank lines.")

    return FAISS.from_texts(chunks, embedder)

def handle_user_input(user_question, pdf_file_path):
    """Answer a question about the PDF using retrieval plus the inference API.

    Uses the module-level ``vectorstore`` when it already exists; otherwise the
    PDF is parsed and indexed once and the result is stored in that global.
    The PDF is only read when the index has to be built, not on every question.

    Note: an existing ``vectorstore`` is reused regardless of ``pdf_file_path``,
    so passing a different PDF after the index exists does not rebuild it.

    Args:
        user_question: The question to answer.
        pdf_file_path: Path of the PDF to index if no index exists yet.

    Returns:
        Whatever ``generate_response_from_inference_api`` returns for the
        question, the retrieved documents, and the module-level
        ``conversation_history``.
    """
    global vectorstore
    try:
        vectorstore
    except NameError:
        # No index yet in this kernel: parse and embed the PDF now.
        vectorstore = create_faiss_index(extract_text_from_pdf(pdf_file_path), embedder)

    retriever = vectorstore.as_retriever()
    # `invoke` is the supported retriever entry point; `get_relevant_documents`
    # is deprecated in the langchain-core versions this notebook installs.
    relevant_text = retriever.invoke(user_question)

    return generate_response_from_inference_api(user_question, relevant_text, conversation_history)



def _as_prompt_text(items):
    """Render a string, or an iterable of strings/documents, as newline-joined text."""
    if items is None:
        return ""
    if isinstance(items, str):
        return items
    # Retrieved documents expose their text as `page_content`; anything else
    # (e.g. history strings) is rendered with str().
    return "\n".join(str(getattr(item, "page_content", item)) for item in items)


def generate_response_from_inference_api(user_question, relevant_text, conversation_history):
    """Ask the hosted Mistral model to answer a question from retrieved context.

    The retrieved passages and the conversation history are inserted into the
    prompt as plain text (one passage / one turn per line) rather than as the
    Python repr of the containing lists.

    Args:
        user_question: The question to answer.
        relevant_text: Retrieved context; a string or an iterable of strings or
            objects with a ``page_content`` attribute.
        conversation_history: Earlier turns; a string or an iterable of strings.

    Returns:
        The generated text. If the client returns an object with a
        ``generated_text`` attribute that attribute is returned, otherwise the
        response itself.
    """
    sec_key = userdata.get("HUGGINGFACEHUB_API_TOKEN")
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = sec_key

    context = _as_prompt_text(relevant_text)
    history = _as_prompt_text(conversation_history)

    prompt = f"""
    The PDF document says: {context}
    Conversation History of previous responses:
    {history}
    Question : {user_question}
    Answer:
    """

    client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407", token=sec_key)
    response = client.text_generation(prompt, max_new_tokens=500)
    return getattr(response, "generated_text", response)

# Module-level chat log; handle_user_input() reads this name and forwards it to
# generate_response_from_inference_api() for inclusion in the prompt.
conversation_history = []

tokenizer_config.json:   0%|          | 0.00/170 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/451M [00:00<?, ?B/s]

Some weights of LayoutLMForSequenceClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Build the index once up front; handle_user_input() reuses the module-level
# `vectorstore` created here.
pdf_file_path = "budget_speech.pdf"
pdf_text = extract_text_from_pdf(pdf_file_path)
vectorstore = create_faiss_index(pdf_text, embedder)

# Start from an empty chat log each time this cell is run.
conversation_history = []

while True:
    user_input = input("You: ")

    # Check for the quit command first so "exit" is never logged as a turn;
    # surrounding whitespace is ignored.
    if user_input.strip().lower() == "exit":
        break

    response = handle_user_input(user_input, pdf_file_path)

    # Record the turn only after the model has answered: the prompt already
    # carries the current question, so logging it beforehand would send it twice.
    conversation_history.append(f"You: {user_input}")
    conversation_history.append(f"Bot: {response}")
    print(f"Bot: {response}")

You: summarise the pdf
Bot:  - The document discusses the digitalization of customs and income tax services in the last few years.
     - It mentions the introduction of simplified measures and the plan to make all remaining services paperless in the next two years.
     - The document also contains a contents section and a section on data and statistics.
You: what is the Vision for Amrit Kaal?
Bot:  - The Vision for Amrit Kaal is to make the region an engine to attain Viksit Bharat.
You: How much the agriculture target will be increased to
Bot:  - The agriculture target will be increased to 1 crore farmers across the country in the next two years.
You: exit
