In [3]:
# Install the core dependencies: transformers (summarization pipeline) and faiss-cpu (vector index).
# NOTE(review): `datasets` is not imported by any cell visible in this notebook — confirm it is needed.
!pip install transformers datasets faiss-cpu




In [4]:
import json

# Load the document corpus from the Colab filesystem. Later cells iterate `corpus_data`
# as a sequence of dicts and read keys such as 'body', 'title' and 'author' from each.
# The file is opened as UTF-8 explicitly so decoding does not depend on the platform locale.
with open('/content/corpus.json', 'r', encoding='utf-8') as f:
    corpus_data = json.load(f)


In [5]:
# Install sentence-transformers, which provides the SentenceTransformer encoder used for retrieval.
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.0


In [6]:
# Upgrade sentence-transformers to the newest release (-U); this is the same package
# that the previous install cell already installed.
!pip install -U sentence-transformers




In [7]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Build the retrieval index: embed every document body and store the vectors in FAISS.
# The names defined here (contents, metadata, model, index, content_embeddings) are
# module-level globals that later cells read directly.

# Extract the content for FAISS index and keep metadata
# `contents` and `metadata` are parallel lists: position i in both describes the same
# corpus document, which is how row ids returned by FAISS are mapped back to documents.
# 'body', 'title' and 'author' are required keys (KeyError if missing); the remaining
# metadata fields fall back to None via .get().
contents = [doc['body'] for doc in corpus_data]
metadata = [{"title": doc["title"], "author": doc["author"], "url": doc.get("url"), "source": doc.get("source"),
             "category": doc.get("category"), "published_at": doc.get("published_at")} for doc in corpus_data]

# Use SentenceTransformer to create embeddings for document content
model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v4')

# Create embeddings for the content
# The whole corpus is encoded in one call; the result is used below as a 2-D array
# (its second dimension is taken as the embedding width).
content_embeddings = model.encode(contents)

# Initialize FAISS index
# IndexFlatL2 is sized from the embedding width and filled with every document vector,
# in the same order as `contents`.
index = faiss.IndexFlatL2(content_embeddings.shape[1])
index.add(np.array(content_embeddings))

# Save index and embeddings
# Both files are written to the current working directory.
# NOTE(review): no cell visible here reads them back, so the corpus is re-embedded on every run.
faiss.write_index(index, "content_index.faiss")
np.save("content_embeddings.npy", content_embeddings)


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
from sentence_transformers import SentenceTransformer

# Initialize the SentenceTransformer for encoding (used for FAISS retrieval)
# This is the same checkpoint name that embedded the corpus when the index was built, so
# queries and documents are encoded by the same model.
# NOTE(review): this loads a second copy of the weights already held by `model` from the
# index-building cell — the two names could share one instance.
sentence_model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v4')

def retrieve_documents(query, k=3):
    """Return the corpus documents closest to ``query`` in embedding space.

    Relies on the module-level ``sentence_model``, ``index``, ``contents`` and
    ``metadata`` objects created when the index was built.

    Args:
        query: Natural-language question to search for.
        k: Maximum number of documents to return.

    Returns:
        A tuple ``(retrieved_docs, retrieved_metadata, facts)`` of three parallel
        lists: the document bodies, their metadata dicts, and the first sentence
        of each body (text up to the first '.') used as a short "fact". The lists
        hold fewer than ``k`` items when the index contains fewer than ``k`` vectors.
    """
    # Encoding a one-element batch already yields the (1, dim) matrix FAISS expects;
    # FAISS only accepts float32 input, so enforce the dtype explicitly.
    query_matrix = np.asarray(sentence_model.encode([query]), dtype="float32")

    # Perform FAISS search for the top k documents (distances are not used).
    _, neighbour_ids = index.search(query_matrix, k)

    # FAISS pads the id row with -1 when it finds fewer than k neighbours. Indexing a
    # Python list with -1 would silently return the *last* document, so drop those slots.
    hits = [int(i) for i in neighbour_ids[0] if i >= 0]

    # Map the FAISS row ids back to the parallel content / metadata lists.
    retrieved_docs = [contents[i] for i in hits]
    retrieved_metadata = [metadata[i] for i in hits]
    facts = [doc.split('.')[0] + '.' for doc in retrieved_docs]  # Extract the first sentence as a "fact"

    return retrieved_docs, retrieved_metadata, facts


In [9]:
from transformers import pipeline

# Initialize summarization pipeline
# Use the first GPU when one is available and fall back to CPU (device=-1) otherwise,
# so the cell also works on CPU-only runtimes instead of failing on a hard-coded device=0.
import torch

summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=0 if torch.cuda.is_available() else -1,
)

def summarize_content(content, max_length=130, min_length=30, do_sample=False):
    """Summarize arbitrarily long text by summarizing it chunk by chunk.

    The text is split into chunks of at most 1024 characters, each chunk is passed
    to the module-level ``summarizer`` pipeline, and the partial summaries are
    joined with single spaces.

    Args:
        content: Text to summarize.
        max_length: Forwarded to the summarizer for every chunk.
        min_length: Forwarded to the summarizer for every chunk.
        do_sample: Forwarded to the summarizer for every chunk.

    Returns:
        The concatenated chunk summaries, or ``""`` when ``content`` is empty or
        contains only whitespace.
    """
    import textwrap

    # Budget per chunk, measured in characters (not tokens).
    max_chunk_length = 1024

    # Split on whitespace so no word is cut in half at a chunk boundary; words longer
    # than the budget are still hard-split. Empty / whitespace-only input gives no chunks.
    content_chunks = textwrap.wrap(content, width=max_chunk_length)

    # Summarize each chunk separately
    summaries = []
    for chunk in content_chunks:
        summary = summarizer(
            chunk,
            max_length=max_length,
            min_length=min_length,
            do_sample=do_sample,
            # Safety net: text with many tokens per character could still exceed the
            # model's input limit even within the character budget.
            truncation=True,
        )
        summaries.append(summary[0]['summary_text'])

    # Join all summaries together
    return " ".join(summaries)




config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [10]:
import requests

def generate_answer(query, context):
    """Ask the hosted Llama model to answer and classify ``query`` using ``context``.

    Sends a single prompt to the Hugging Face Inference API and returns the
    ``generated_text`` field of the first result unchanged; no post-processing is
    applied here (the answer / classification lines are parsed by a separate helper).

    The API token is read from the ``HF_TOKEN`` environment variable.

    Args:
        query: The user's question.
        context: Text the model must base its answer on.

    Returns:
        The ``generated_text`` string returned by the API.

    Raises:
        RuntimeError: If ``HF_TOKEN`` is not set, or the API returns an error
            payload, a non-JSON body, or a response of an unexpected shape.
    """
    import os

    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-3B-Instruct"

    # SECURITY: the token used to be hard-coded in this cell. Credentials must never be
    # committed with the notebook; the previously embedded token should be revoked.
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError(
            "HF_TOKEN is not set. Export a Hugging Face access token as the HF_TOKEN "
            "environment variable before calling generate_answer()."
        )
    headers = {"Authorization": f"Bearer {token}"}

    def query_model(payload):
        # A timeout keeps the UI from hanging forever if the endpoint stalls.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
        try:
            return response.json()
        except ValueError as exc:
            raise RuntimeError(
                f"Hugging Face Inference API returned a non-JSON response (HTTP {response.status_code})."
            ) from exc

    # Input query with context
    data = query_model({
        "inputs": f"""
                You are have a task of giving answer of the question asked and classifying the question into one of four categories: 'comparison_query', 'temporal_query', 'inference_query' or 'null_query'.
                Here is a brief description of each category:
                - 'comparison_query': If The question is COMPARING two or more entities, companies etc. (e.g. using words like, 'comparision', 'in contrast to', 'vs' etc. in the question)
                - 'temporal_query': If there is a time or date specified in the question like name of monthes, or any specific year or timeline given.
                - 'null_query': If the question does not have an answer explicitly written in the context then it is a null query and answer it as 'Insufficient data'.
                - 'inference_query': The question has an answer that can be INFERED or derived DIRECTLY from the information given in the context given ONLY.

                Based on the context of the question, provide the a short one line straightforward Answer to the question first and then give Classification.

                    Question: {query},
                    Context: {context},
                Give your answer after writing 'Answer:' and Classification as 'Classification:'
                No need to apply any internet source or your database.
                Please give me BOTH Answer AND Classification based on context ONLY.
                    """
    })

    # The API reports failures (model loading, bad token, rate limit) as {"error": ...}
    # instead of the usual list of generations; surface that message directly.
    if isinstance(data, dict) and data.get("error"):
        raise RuntimeError(f"Hugging Face Inference API error: {data['error']}")

    try:
        generated_text = data[0]['generated_text']
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(
            f"Unexpected response from the Hugging Face Inference API: {data!r}"
        ) from exc

    return generated_text



In [11]:
# Step 4: Parse the answer and the classification out of the generated text
def extract_answer_and_classification(text):
    """Pull the answer and question classification out of the model output.

    The text is scanned line by line. A line is recognised when, after stripping
    surrounding whitespace, it starts with one of the labels ``Answer:``,
    ``Classification:`` or ``Classify:``. When a label occurs more than once the
    last occurrence wins.

    Args:
        text: Raw text returned by the language model.

    Returns:
        A dict containing whichever of these keys were found:

        * ``'Answer'`` - text following ``Answer:``.
        * ``'Classification'`` - text following ``Classification:`` with
          surrounding quotes removed. If the model wrote ``Classify:`` instead,
          that value is used here as well, unless an explicit ``Classification:``
          line is also present.
        * ``'Classify'`` - text following ``Classify:`` (kept for backward
          compatibility).

        An empty dict is returned when no label is present.
    """
    result = {}

    for raw_line in text.split('\n'):
        line = raw_line.strip()

        if line.startswith("Answer:"):
            # Extract the answer after 'Answer:'
            result['Answer'] = line.split("Answer:", 1)[1].strip()
        elif line.startswith("Classification:"):
            # Extract the classification after 'Classification:'; models quote the
            # label with either single or double quotes, so strip both.
            result['Classification'] = line.split("Classification:", 1)[1].strip().strip("'\"")
        elif line.startswith("Classify:"):
            # Extract the classification after 'Classify:'
            label = line.split("Classify:", 1)[1].strip().strip("'\"")
            result['Classify'] = label
            # Callers read 'Classification'; expose the alias there too, without
            # overriding an explicit 'Classification:' line.
            result.setdefault('Classification', label)

    return result


In [12]:
import json
def format_answer(query, answer, question_type, evidence_list,fact):
    """Assemble the structured result record for one answered query.

    Args:
        query: The question that was asked.
        answer: The answer text.
        question_type: The classification label of the question.
        evidence_list: Iterable of metadata dicts, one per supporting document.
            Only the fields listed below are copied; a missing field becomes None.
        fact: Value stored under ``"fact"`` in every evidence entry. The same
            object is attached to each entry as-is.

    Returns:
        A dict with the keys ``"query"``, ``"answer"``, ``"question_type"`` and
        ``"evidence_list"``; the latter is a list of dicts with the keys
        title, author, url, source, category, published_at and fact.
    """
    copied_fields = ("title", "author", "url", "source", "category", "published_at")

    formatted_evidence = []
    for evidence in evidence_list:
        entry = {field: evidence.get(field) for field in copied_fields}
        entry["fact"] = fact
        formatted_evidence.append(entry)

    record = {
        "query": query,
        "answer": answer,
        "question_type": question_type,
        "evidence_list": formatted_evidence,
    }
    return record

In [13]:
# Install the packages for the web UI cell. gradio is the one imported below;
# faiss-cpu, sentence-transformers and transformers were already installed by earlier cells.
# NOTE(review): flask-ngrok, nltk and pyngrok are not imported by any cell visible in this
# notebook — confirm whether they are still needed.
!pip install gradio
!pip install faiss-cpu
!pip install sentence-transformers
!pip install flask-ngrok
!pip install transformers
!pip install nltk

!pip install pyngrok

Collecting gradio
  Downloading gradio-5.0.2-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.2-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.0 (from gradio)
  Downloading gradio_client-1.4.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


In [15]:
import gradio as gr
import json

# Function to extract the answer, question type, and evidence
def process_query(query):
    """Run the full retrieve -> summarize -> generate pipeline for one query.

    Args:
        query: The question typed into the UI.

    Returns:
        A tuple ``(answer, question_type, evidence)`` of three strings, in the
        order the UI output boxes expect. ``evidence`` is a human-readable block
        with one paragraph per retrieved document. When the model output contains
        no answer or classification line, a placeholder string is returned for
        that field instead of raising.
    """
    # Nothing to search for: skip the model calls and the remote API call entirely.
    if not query or not query.strip():
        return "Please enter a query.", "", ""

    # Retrieve documents and metadata
    retrieved_docs, retrieved_metadata, fact = retrieve_documents(query)

    # Combine the retrieved content as context and summarize it
    contexted = " ".join(retrieved_docs)
    context = summarize_content(contexted)

    # Generate the answer using LLaMA
    generated_answer = generate_answer(query, context)

    # Extract the answer and classification from the generated response
    final_res = extract_answer_and_classification(generated_answer)

    def shown(value):
        # The metadata dicts always contain every key, with None for absent values,
        # so a .get() default would never apply; map empty values to 'N/A' here.
        return 'N/A' if value is None or value == '' else value

    # Prepare evidence in a well-formatted string
    evidence_list = "\n\n".join([
        f"Title: {shown(evidence.get('title'))}\n Author: {shown(evidence.get('author'))}\n URL: {shown(evidence.get('url'))}\n Source: {shown(evidence.get('source'))}\n Category: {shown(evidence.get('category'))}\n Published At: {shown(evidence.get('published_at'))}\n Fact: {doc_fact}\n"
        for evidence, doc_fact in zip(retrieved_metadata, fact)
    ])

    # The model does not always follow the requested 'Answer:' / 'Classification:'
    # format (it may write 'Classify:' or omit a line); degrade gracefully.
    answer = final_res.get('Answer') or "No answer could be extracted from the model response."
    question_type = final_res.get('Classification') or final_res.get('Classify') or "unknown"

    # Return answer, question_type, and evidence
    return answer, question_type, evidence_list

# Build and launch the Gradio UI; components are added directly to the Blocks
# container (no Row/Column wrappers), in the order they appear below.
def create_web_app():
    """Assemble the query form and start the Gradio server.

    The page holds a header, a query textbox, a submit button and three read-only
    textboxes (answer, question type, evidence). Clicking the button calls
    ``process_query`` with the query text and writes its three return values into
    the output boxes. ``launch()`` is called after the Blocks context is closed.
    """
    page_css = ".block-title {font-size: 24px; font-weight: bold; text-align: center;}"
    header_html = """
            <h1 style="text-align: center; color: #4A90E2;">Research Query Processor</h1>
            <p style="text-align: center;">Input your query and get concise answers with their question types and sources of evidence.</p>
            <hr style="border: none; border-top: 2px solid #4A90E2; width: 80%;">
            """

    with gr.Blocks(css=page_css) as demo:
        # Page header
        gr.Markdown(header_html)

        # Query entry and submit control
        query_box = gr.Textbox(
            label="Enter your query",
            placeholder="e.g., What are the benefits of AI in healthcare?",
            lines=2,
        )
        run_button = gr.Button("Submit Query", variant="primary")

        # Read-only result areas, each under its own heading
        gr.Markdown("<h2 class='block-title'>Response</h2>")
        answer_box = gr.Textbox(label="Answer", interactive=False, lines=3)

        gr.Markdown("<h2 class='block-title'>Question Type</h2>")
        type_box = gr.Textbox(label="Question Type", interactive=False, lines=2)

        gr.Markdown("<h2 class='block-title'>Evidence</h2>")
        evidence_box = gr.Textbox(label="Evidence", interactive=False, lines=6)

        # Wire the button: one input, three outputs in process_query's return order
        run_button.click(
            process_query,
            inputs=[query_box],
            outputs=[answer_box, type_box, evidence_box],
        )

    demo.launch()

# Run the web app
# Executing this cell builds the interface and calls demo.launch() right away.
create_web_app()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f5a5ea404928534fb5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
