In [None]:
!pip install streamlit gradio transformers sentence-transformers faiss-cpu PyPDF2


Collecting streamlit
  Downloading streamlit-1.38.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting tenacity<9,>=8.1.0 (from streamlit)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl.metadata (38 kB)
Collecting aiofiles<24.0,>=22.0 (

In [None]:
import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the sentence-embedding model once at module level; process_pdf and
# get_relevant_context both call embedding_model.encode(...) on it.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to process PDF
def process_pdf(pdf_file):
    """Extract text from a PDF, split it into sentences, and index them with FAISS.

    Args:
        pdf_file: Passed straight to ``PyPDF2.PdfReader``.

    Returns:
        tuple: ``(sentences, embeddings, faiss_index)``.
            ``sentences`` is a list of non-empty, stripped strings obtained by
            splitting the document text on ``'. '``; ``embeddings`` is a
            float32 array with one row per sentence; ``faiss_index`` is an
            ``IndexFlatL2`` holding those rows in the same order.
            When no text can be extracted, ``sentences`` is ``[]``,
            ``embeddings`` has shape ``(0, dim)`` and the index is empty, so
            callers can test ``if not sentences`` instead of hitting an error.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # A page may yield no text (e.g. image-only pages); skip those so they
    # neither raise on concatenation nor add stray whitespace.
    page_texts = []
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        if page_text:
            page_texts.append(page_text)

    # Join with a space so the last word of one page does not fuse with the
    # first word of the next.
    document_text = " ".join(page_texts)

    # Drop empty / whitespace-only fragments left behind by the naive split.
    sentences = [fragment.strip() for fragment in document_text.split('. ')]
    sentences = [fragment for fragment in sentences if fragment]

    if sentences:
        # FAISS expects a contiguous float32 matrix.
        embeddings = np.ascontiguousarray(
            embedding_model.encode(sentences), dtype=np.float32
        )
    else:
        # encode([]) has no second axis to read the dimension from, so ask the
        # model for it and build an empty, correctly-shaped matrix.
        dim = embedding_model.get_sentence_embedding_dimension()
        embeddings = np.empty((0, dim), dtype=np.float32)

    faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
    if sentences:
        faiss_index.add(embeddings)
    return sentences, embeddings, faiss_index

# Function to get relevant context
def get_relevant_context(query, faiss_index, sentences, k=3):
    """Return the sentences nearest to ``query``, joined into one context string.

    Args:
        query: The question text to embed and search with.
        faiss_index: Index to search; its row ``i`` is expected to correspond
            to ``sentences[i]``.
        sentences: The sentences that were indexed.
        k: Maximum number of sentences to retrieve (default 3).

    Returns:
        str: The retrieved sentences joined with ``". "``, nearest first, or
        ``""`` when there is nothing to retrieve.
    """
    # Never ask for more neighbours than there are sentences: FAISS pads the
    # missing slots with id -1, which would silently index sentences[-1].
    k = min(k, len(sentences))
    if k <= 0:
        return ""

    query_vector = embedding_model.encode([query])
    _, neighbor_ids = faiss_index.search(query_vector, k)

    # Belt and braces: discard any padding / out-of-range ids that remain.
    relevant_sentences = [
        sentences[i] for i in neighbor_ids[0] if 0 <= i < len(sentences)
    ]
    return ". ".join(relevant_sentences)


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from transformers import pipeline

# Question-answering pipeline, loaded once at module level. answer_question
# calls it with question=/context= and reads the 'answer' key of the result.
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

def answer_question(query, faiss_index, sentences):
    """Answer ``query`` from the indexed document sentences.

    Args:
        query: The user's question.
        faiss_index: Index passed through to ``get_relevant_context``.
        sentences: The indexed sentences; empty means no document is loaded.

    Returns:
        tuple[str, str]: ``(answer, context)``. ``answer`` is either the
        ``'answer'`` field of the QA pipeline result or a short user-facing
        message when there is nothing to answer from; ``context`` is the
        retrieved text the answer was drawn from (``""`` when none was used).
    """
    if not sentences:
        return "Please upload a document first.", ""

    # The QA pipeline rejects empty questions, so answer with a prompt instead
    # of surfacing an exception in the UI.
    if not query or not query.strip():
        return "Please enter a question.", ""

    relevant_context = get_relevant_context(query, faiss_index, sentences)

    # Likewise, an empty context cannot be passed to the pipeline.
    if not relevant_context.strip():
        return "No relevant text was found in the document.", relevant_context

    answer = qa_model(question=query, context=relevant_context)
    return answer['answer'], relevant_context


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
import gradio as gr

def process_and_answer(pdf_file, query):
    """Gradio callback: index the uploaded PDF and answer ``query`` from it.

    Args:
        pdf_file: Value of the file-upload component; ``None`` when nothing
            has been uploaded yet.
        query: The question typed by the user.

    Returns:
        tuple[str, str]: ``(answer, context)`` for the two output textboxes.
    """
    # Submit can be clicked before any upload; handing None to the PDF reader
    # would raise instead of showing the intended prompt.
    if pdf_file is None:
        return "Please upload a document first.", ""

    # The embeddings are not needed past indexing, so they are discarded here.
    sentences, _, faiss_index = process_pdf(pdf_file)
    return answer_question(query, faiss_index, sentences)

# Build the UI. Components are created in the order they should appear:
# title, PDF upload, question box, the two output boxes, then the button.
with gr.Blocks() as demo:
    gr.Markdown("# Interactive QA Bot")
    pdf_input = gr.File(label="Upload PDF")
    query_input = gr.Textbox(label="Ask a question about the document")
    answer_output = gr.Textbox(label="Answer")
    context_output = gr.Textbox(label="Relevant Context")
    submit_button = gr.Button("Submit")

    # On click, call process_and_answer(pdf, query) and route its two return
    # values to the answer and context textboxes, in that order.
    submit_button.click(process_and_answer, inputs=[pdf_input, query_input], outputs=[answer_output, context_output])

# NOTE(review): the recorded Colab output shows launch() falling back to
# share=True and exposing a public gradio.live URL; pass share=False
# explicitly if a public link is not wanted.
demo.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://a35335e31a4b6d4cda.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


