In [None]:
!pip install PyPDF2 transformers faiss-cpu gradio torch scipy

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft

In [None]:
import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch
import faiss
import numpy as np
import gradio as gr
import os
import requests
import json
from scipy.spatial.distance import cosine


In [None]:
# --- User-supplied settings (fill these in before running) ---
PDF_PATH = ''  # path of the PDF to index
GEMINI_API_KEY = ''  # key sent with every Gemini request
GEMINI_MODEL_NAME = 'gemini-2.0-flash'

# --- Hugging Face checkpoints for retrieval and re-ranking ---
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
RE_RANKING_MODEL_NAME = 'cross-encoder/ms-marco-TinyBERT-L-6'

# --- Shared pipeline state, empty until the system is initialized ---
text_chunks = []
faiss_index = None
tokenizer = model = None
rerank_model = rerank_tokenizer = None

In [None]:

!pip install PyPDF2 transformers faiss-cpu gradio torch scipy

import PyPDF2
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch
import faiss
import numpy as np
import gradio as gr
import os
import requests
import json
from scipy.spatial.distance import cosine

# Document to index and the Hugging Face checkpoints used for retrieval.
PDF_PATH = 'Advanced_Cybersecurity_Intro.pdf'
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
RE_RANKING_MODEL_NAME = 'cross-encoder/ms-marco-TinyBERT-L-6'

# Read the key from the environment instead of committing it to the notebook.
# An unset variable yields "", which is what the "key is not set" guard in
# call_gemini_llm tests for; a non-empty placeholder string would slip past it.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL_NAME = "gemini-2.0-flash"

# Shared pipeline state; stays empty until initialize_rag_system() runs.
text_chunks = []
faiss_index = None
tokenizer = None
model = None
rerank_model = None
rerank_tokenizer = None

def extract_text_from_pdf(pdf_path):
    """Read a PDF and return the text of all its pages as one string.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The non-empty page texts joined by newlines, or "" when the file is
        missing, cannot be parsed, or yields no text at all.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` keeps a page whose extraction yields None from
            # aborting the whole document.
            page_texts = [page.extract_text() or "" for page in reader.pages]
        print(f"Successfully extracted text from {pdf_path}")
    except FileNotFoundError:
        print(f"Error: PDF file not found at {pdf_path}")
        return ""
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""
    # Join with a separator so the last word of one page is not fused with
    # the first word of the next; skipping empty pages keeps the result ""
    # (falsy) when nothing could be extracted.
    return "\n".join(page_text for page_text in page_texts if page_text)

def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            0 <= overlap < chunk_size so the window always moves forward.

    Returns:
        A list of chunk strings (empty when the text contains no words).

    Raises:
        ValueError: If chunk_size or overlap is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size.")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, any further window would only
        # repeat words already covered by this chunk.
        if start + chunk_size >= len(words):
            break
    print(f"Text chunked into {len(chunks)} segments.")
    return chunks

def initialize_embedding_model(model_name):
    """Load the embedding tokenizer and model into the module-level globals.

    Args:
        model_name: Checkpoint identifier handed to `from_pretrained`.

    Returns:
        The `(tokenizer, model)` pair, or `(None, None)` if loading failed;
        the globals `tokenizer` and `model` are set to the same values.
    """
    global tokenizer, model
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
    except Exception as load_error:
        print(f"Error loading embedding model: {load_error}")
        # Never leave a half-initialized pair behind.
        tokenizer = model = None
    else:
        print(f"Embedding model '{model_name}' loaded successfully.")
    return tokenizer, model

def get_embeddings(texts, batch_size=32):
    """Embed texts with the globally loaded tokenizer/model.

    Each text is tokenized (padded and truncated), run through the model
    without gradients, mean-pooled with the attention mask and L2-normalized.

    Args:
        texts: A string or a sequence of strings to embed.
        batch_size: Number of texts per forward pass. Batching bounds peak
            memory when a whole document is embedded at once.

    Returns:
        A NumPy array with one row per text, or an empty array when the
        model is not initialized or `texts` is empty.

    Raises:
        ValueError: If batch_size is not positive.
    """
    if tokenizer is None or model is None:
        print("Embedding model not initialized. Cannot generate embeddings.")
        return np.array([])
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer.")

    # A bare string is treated as a single text, as the tokenizer would.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # Nothing to encode; matches the "no embeddings" sentinel above.
        return np.array([])

    batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = model(**encoded_input)

        pooled = mean_pooling(model_output, encoded_input['attention_mask'])
        pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
        batches.append(pooled.cpu().numpy())

    return np.concatenate(batches, axis=0)

def mean_pooling(model_output, attention_mask):
    """Mask-weighted mean of the model's first output along dimension 1.

    Args:
        model_output: Indexable model output; element 0 is used as the
            per-token embeddings (presumably (batch, tokens, hidden) -- TODO confirm).
        attention_mask: Mask with one entry per token; positions with 0 do
            not contribute to the average.

    Returns:
        The masked sum divided by the mask count, with the count clamped to
        at least 1e-9 so a fully masked row yields zeros instead of NaN.
    """
    token_vectors = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_vectors.size()).float()
    masked_total = (token_vectors * mask).sum(1)
    token_count = mask.sum(1).clamp(min=1e-9)
    return masked_total / token_count

def build_faiss_index(embeddings):
    """Create a flat L2 FAISS index holding the given embedding rows.

    Args:
        embeddings: 2-D array with one vector per row.

    Returns:
        The populated `faiss.IndexFlatL2`, or None when `embeddings` is empty.
    """
    if embeddings.size:
        vector_dim = embeddings.shape[1]
        flat_index = faiss.IndexFlatL2(vector_dim)
        flat_index.add(embeddings)
        print(f"FAISS index built with {flat_index.ntotal} vectors.")
        return flat_index

    print("No embeddings to build FAISS index.")
    return None

def initialize_rerank_model(model_name):
    """Load the re-ranking tokenizer and classifier into the module globals.

    Args:
        model_name: Checkpoint identifier handed to `from_pretrained`.

    Returns:
        The `(rerank_tokenizer, rerank_model)` pair, or `(None, None)` if
        loading failed; the matching globals are set to the same values.
    """
    global rerank_model, rerank_tokenizer
    try:
        rerank_tokenizer = AutoTokenizer.from_pretrained(model_name)
        rerank_model = AutoModelForSequenceClassification.from_pretrained(model_name)
    except Exception as load_error:
        print(f"Error loading re-ranking model: {load_error}")
        # Never leave a half-initialized pair behind.
        rerank_model = rerank_tokenizer = None
    else:
        print(f"Re-ranking model '{model_name}' loaded successfully.")
    return rerank_tokenizer, rerank_model

def rerank_chunks(query, chunks):
    """Order chunks by the re-ranking model's relevance score for the query.

    Args:
        query: The user question.
        chunks: Candidate text chunks to score against the query.

    Returns:
        The chunks sorted from most to least relevant. When the re-ranking
        model is not initialized the input is returned unchanged; an empty
        input yields an empty list.
    """
    if rerank_model is None or rerank_tokenizer is None:
        print("Re-ranking model not initialized.")
        return chunks

    chunks = list(chunks)
    if not chunks:
        return []

    # Tokenize as (query, chunk) sentence pairs so the tokenizer inserts the
    # separator and segment ids itself, instead of relying on a literal
    # " [SEP] " embedded in one string.
    encoding = rerank_tokenizer(
        [query] * len(chunks),
        chunks,
        return_tensors='pt',
        padding=True,
        truncation=True,
    )
    with torch.no_grad():
        logits = rerank_model(**encoding).logits

    if logits.dim() == 2 and logits.shape[1] > 1:
        # Multi-label head: probability of label index 1, as before.
        relevance = torch.softmax(logits, dim=1)[:, 1]
    else:
        # Single-logit head (the ms-marco cross-encoders appear to use one):
        # there is no column 1 to index, so the raw logit is the score.
        relevance = logits.reshape(-1)
    scores = relevance.cpu().tolist()

    # Sort positions by score only; ties keep their retrieval order rather
    # than falling back to comparing chunk text.
    order = sorted(range(len(chunks)), key=scores.__getitem__, reverse=True)
    return [chunks[position] for position in order]

def retrieve_chunks(query, query_embedding, top_k=3):
    """Fetch the chunks nearest to the query embedding and re-rank them.

    Args:
        query: The user question, forwarded to the re-ranker.
        query_embedding: Embedding of the query; reshaped to a single row.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The retrieved chunks as ordered by `rerank_chunks`, or an empty list
        when no index exists or nothing could be retrieved.
    """
    if faiss_index is None:
        print("FAISS index not built. Cannot retrieve chunks.")
        return []

    # Never ask for more neighbours than the index holds: FAISS pads the
    # result with -1, and text_chunks[-1] would silently return the last chunk.
    neighbour_count = min(top_k, faiss_index.ntotal)
    if neighbour_count <= 0:
        return []

    query_embedding = query_embedding.reshape(1, -1)
    _distances, indices = faiss_index.search(query_embedding, neighbour_count)
    retrieved_content = [
        text_chunks[i] for i in indices[0] if 0 <= i < len(text_chunks)
    ]
    print(f"Retrieved {len(retrieved_content)} chunks.")
    if not retrieved_content:
        return []
    return rerank_chunks(query, retrieved_content)

def call_gemini_llm(prompt_text, timeout=60):
    """Send a prompt to the Gemini `generateContent` endpoint.

    Args:
        prompt_text: The full prompt to send.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the UI indefinitely.

    Returns:
        The generated text of the first candidate, or a human-readable error
        string. This function does not raise; failures are reported in the
        returned string.
    """
    if not GEMINI_API_KEY:
        return "Error: Gemini API Key is not set."

    # The key travels in a header rather than the query string: HTTP error
    # messages include the request URL, and that message is returned to the
    # caller (and shown in the UI) below.
    api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL_NAME}:generateContent"
    headers = {
        "Content-Type": "application/json",
        "x-goog-api-key": GEMINI_API_KEY,
    }
    payload = {
        "contents": [{"parts": [{"text": prompt_text}]}],
        "generationConfig": {
            "temperature": 0.7,
            "maxOutputTokens": 500
        }
    }

    try:
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
        print("Status Code:", response.status_code)

        response.raise_for_status()
        result = response.json()

        candidates = result.get("candidates") or []
        if candidates:
            # A candidate may come back without content/parts; treat that as
            # "no valid response" instead of surfacing a KeyError.
            parts = (candidates[0].get("content") or {}).get("parts") or []
            answer = "".join(part.get("text", "") for part in parts)
            if answer:
                return answer
        return "No valid response from Gemini."
    except Exception as e:
        return f"Gemini API error: {e}"

def rag_answer_question(user_question):
    """Answer a question from the indexed document via retrieval + Gemini.

    Args:
        user_question: The question text entered by the user.

    Returns:
        The string returned by `call_gemini_llm`, or a fixed message when the
        question is blank, the system is not initialized, or no chunks were
        retrieved.
    """
    if user_question.strip() == "":
        return "Please enter a question."

    system_ready = (
        faiss_index is not None and tokenizer is not None and model is not None
    )
    if not system_ready:
        return "System not fully initialized."

    question_vector = get_embeddings([user_question])
    ranked_chunks = retrieve_chunks(user_question, question_vector)
    if not ranked_chunks:
        return "No relevant information found in the document for your question."

    # Retrieved chunks are separated by a blank line inside the prompt.
    context = "\n\n".join(ranked_chunks)
    prompt = f"""
    You are an expert in advanced cybersecurity.
    Based on the following context from a cybersecurity document, answer the user's question accurately and concisely.
    If the answer is not available in the provided context, state that you don't have enough information.

    Context:
    {context}

    Question: {user_question}

    Answer:
    """
    return call_gemini_llm(prompt)

def initialize_rag_system():
    """Build the retrieval state from the PDF at `PDF_PATH`.

    Extracts the document text, chunks it, loads the embedding and
    re-ranking models, embeds the chunks and stores the resulting index in
    the `faiss_index` global (and the chunks in `text_chunks`). When no text
    can be extracted the globals are left untouched.
    """
    global text_chunks, faiss_index
    print("--- Initializing RAG System ---")

    document_text = extract_text_from_pdf(PDF_PATH)
    if document_text:
        text_chunks = chunk_text(document_text)
        initialize_embedding_model(EMBEDDING_MODEL_NAME)
        initialize_rerank_model(RE_RANKING_MODEL_NAME)
        chunk_vectors = get_embeddings(text_chunks)
        faiss_index = build_faiss_index(chunk_vectors)
        return

    print("Initialization failed: No text extracted.")

if __name__ == "__main__":
    # Build the index before the UI starts accepting questions.
    initialize_rag_system()

    question_box = gr.Textbox(lines=2, placeholder="Ask a cybersecurity question...")
    demo = gr.Interface(
        fn=rag_answer_question,
        inputs=question_box,
        outputs="text",
        title="Cybersecurity RAG Assistant",
        description="Ask questions about the content in the uploaded cybersecurity PDF.",
    )
    demo.launch()


--- Initializing RAG System ---
Successfully extracted text from Advanced_Cybersecurity_Intro.pdf
Text chunked into 9 segments.
Embedding model 'sentence-transformers/all-MiniLM-L6-v2' loaded successfully.
Re-ranking model 'cross-encoder/ms-marco-TinyBERT-L-6' loaded successfully.
FAISS index built with 9 vectors.
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://af12c85447fb9fe659.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
# Rebuilds the index (reloading both models) and starts a Gradio app.
initialize_rag_system()

question_box = gr.Textbox(lines=2, placeholder="Ask a cybersecurity question...")
demo = gr.Interface(
    fn=rag_answer_question,
    inputs=question_box,
    outputs="text",
    title="Cybersecurity RAG Assistant",
    description="Ask questions about the content in the uploaded cybersecurity PDF.",
)
demo.launch()

--- Initializing RAG System ---
Successfully extracted text from Advanced_Cybersecurity_Intro.pdf
Text chunked into 9 segments.
Embedding model 'sentence-transformers/all-MiniLM-L6-v2' loaded successfully.
Re-ranking model 'cross-encoder/ms-marco-TinyBERT-L-6' loaded successfully.
FAISS index built with 9 vectors.
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7c2277062878dc93c5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


