# <center>**QA Model**</center>

In [None]:
import pandas as pd
import requests
import pymupdf
from bs4 import BeautifulSoup
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel, RagTokenizer, RagRetriever, RagSequenceForGeneration
import chromadb
import os
import time

In [None]:
# Pick the compute device once: CUDA when torch reports it as available, CPU otherwise.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

In [None]:
# Read the paper listing into a DataFrame. The path is relative to the working
# directory the notebook is run from. Later cells read the "Paper_URL", "DOI",
# "Title", "Authors" and "Date" columns of this frame.
papers_csv_path = "../1_WEB_SCRAPING/crawl_outputs/biorxiv_genomics_papers_7070.csv"
papers_df = pd.read_csv(papers_csv_path)
print(f"Loaded {len(papers_df)} papers from CSV.")

In [None]:
# Function to get text from HTML abstract
def get_abstract(paper_url, timeout=30):
    """Fetch a paper page and return the text of its abstract block.

    Args:
        paper_url: URL of the paper's HTML page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        The stripped text of the first ``<div class="abstract">`` element, or
        one of two placeholder strings:

        * ``"Abstract not available"`` - the request failed (network error,
          timeout, or a non-200 status code).
        * ``"Abstract not found."`` - the page loaded but contains no
          ``<div class="abstract">`` element.
    """
    # Fixed one-second pause before every request to throttle the crawl.
    time.sleep(1)

    try:
        response = requests.get(
            paper_url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,
        )
    except requests.RequestException:
        # Treat transport-level failures like a bad status code so that one
        # unreachable page does not abort a long-running crawl.
        return "Abstract not available"

    if response.status_code != 200:
        return "Abstract not available"

    soup = BeautifulSoup(response.text, "html.parser")
    abstract_section = soup.find("div", class_="abstract")
    if abstract_section is None:
        return "Abstract not found."
    return abstract_section.text.strip()

In [None]:
# Load the BioBERT tokenizer and encoder from the same checkpoint name, then
# move the encoder's weights onto the device selected earlier in the notebook.
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

In [None]:
# Function to get embeddings
def get_embedding(text, max_length=512):
    """Return a mean-pooled encoder embedding for ``text``.

    Args:
        text: The string to embed.
        max_length: Maximum number of *tokens* fed to the encoder; longer
            inputs are truncated by the tokenizer.

    Returns:
        A NumPy array obtained by averaging the encoder's last hidden state
        over the token axis and dropping the batch axis.
    """
    # Truncate at the token level. Slicing the raw string (text[:max_length])
    # would cut the input to max_length *characters*, which is only a fraction
    # of the token budget and throws away most of a typical abstract.
    # max_length is passed explicitly so truncation does not depend on whatever
    # default the tokenizer happens to be configured with.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        padding=True,
    ).to(device)

    # Inference only: no gradients are needed for embedding extraction.
    with torch.no_grad():
        outputs = model(**inputs)

    # Average over the token axis (dim=1), then remove only the batch axis.
    pooled = outputs.last_hidden_state.mean(dim=1)
    return pooled.squeeze(0).cpu().numpy()

In [None]:
# Open the on-disk ChromaDB store under ./chroma_db and fetch the
# "genomics_papers" collection, creating it if it does not exist yet.
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection(name="genomics_papers")

In [None]:
# Embed each paper's abstract and store the vector plus metadata in ChromaDB.
# Records are written in batches so that at most `batch_size` embeddings are
# held in memory at any time.

# get_abstract() returns one of these placeholder strings instead of real
# abstract text when the page could not be fetched or has no abstract block.
# Such papers must be skipped; otherwise the placeholder sentence itself would
# be embedded and stored as if it were the paper's content.
missing_abstract_markers = ("Abstract not found.", "Abstract not available")

batch_size = 25  # Adjust based on your system capacity
batch = []


def _flush_batch(pending):
    """Write a list of (id, embedding, metadata) tuples to the collection."""
    ids, embeddings, metadata = zip(*pending)
    collection.add(ids=list(ids), embeddings=list(embeddings), metadatas=list(metadata))


for _, paper in tqdm(papers_df.iterrows(), total=len(papers_df), desc="Processing Papers"):
    abstract = get_abstract(paper["Paper_URL"])
    if not abstract or abstract in missing_abstract_markers:
        continue
    embedding = get_embedding(abstract)

    batch.append((paper["DOI"], embedding.tolist(), {
        "Title": paper["Title"],
        "Authors": paper["Authors"],
        "Date": paper["Date"],
        "URL": paper["Paper_URL"],
    }))

    # Process in batches
    if len(batch) >= batch_size:
        _flush_batch(batch)
        batch = []  # Clear the batch to free memory

# Insert any remaining data
if batch:
    _flush_batch(batch)

In [None]:
# Completion message; this cell prints it unconditionally whenever it is executed.
print("✅ All papers processed and stored in ChromaDB!")

In [None]:
# Load RAG model
# Note: this rebinds `model_name`, which an earlier cell set to the BioBERT
# checkpoint; `tokenizer` and `model` themselves are not affected.
# NOTE(review): the checkpoint name says "rag-token" while the class loaded
# below is RagSequenceForGeneration - confirm this pairing is intended.
model_name = "facebook/rag-token-base"
rag_tokenizer = RagTokenizer.from_pretrained(model_name, trust_remote_code=True)
rag_retriever = RagRetriever.from_pretrained(
    model_name,
    indexed_dataset=None,
    trust_remote_code=True,
)
rag_model = RagSequenceForGeneration.from_pretrained(
    model_name,
    retriever=rag_retriever,
    trust_remote_code=True,
)
rag_model = rag_model.to(device)

In [None]:
# Function for RAG-based Q&A
def retrieve_and_answer(query, n_results=3):
    """Answer ``query`` using the nearest stored papers as context.

    The query is embedded with ``get_embedding``, the closest entries are
    looked up in the ChromaDB collection, and their titles and URLs are joined
    into a context string that is passed, together with the question, to the
    RAG model.

    Args:
        query: The natural-language question.
        n_results: How many nearest papers to pull from the collection.

    Returns:
        The decoded text of the first generated sequence.
    """
    query_embedding = get_embedding(query)
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=n_results,
    )

    # Query results hold one list per query embedding. Only one embedding is
    # sent, so the metadata dicts of the matches are in results["metadatas"][0].
    # results["documents"] cannot be used here: it holds document texts (none
    # are stored for this collection), not records carrying metadata.
    matched_metadata = (results["metadatas"] or [[]])[0]
    context = " ".join(
        f'{meta["Title"]} {meta["URL"]}' for meta in matched_metadata if meta
    )

    input_text = f"question: {query} context: {context}"
    inputs = rag_tokenizer(input_text, return_tensors="pt").to(device)
    # Pass only the tensors generate() names explicitly, so any additional
    # fields the tokenizer may emit are not forwarded as stray keyword arguments.
    output_ids = rag_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs.get("attention_mask"),
    )
    return rag_tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# Evaluation questions sent to the question-answering function. The wording
# matters: each string is embedded and used for retrieval as-is, so spelling
# errors in a question end up in its embedding.
queries = [
    "What are the key advancements in CRISPR technology?",
    "How does gene editing using CRISPR/Cas9 work?",
    "What are the recent discoveries in cancer genomics?",
    "What is the role of long non-coding RNAs in gene regulation?",
    "What are the ethical concerns surrounding gene editing?",
    "How is genome sequencing used in personalized medicine?",
    "What are the most commonly studied human genome variations?",
    "What is epigenetic regulation and its significance in genomics?",
    "How are CRISPR and gene therapy related?",
    "What is the significance of genomic data in disease prediction?",
]

In [None]:
# Run every question through the RAG pipeline and print each question
# together with the generated answer.
for question in queries:
    print(f"Query: {question}\nAnswer: {retrieve_and_answer(question)}\n")

In [None]:
# Clean up temporary files
# Nothing in the visible notebook creates temp.pdf, so it may not exist.
# A missing file must not stop the cleanup before the client is closed.
try:
    os.remove("temp.pdf")
except FileNotFoundError:
    pass  # nothing to delete
chroma_client.close()