<h2>Group 24</h2>

    1. Ashutosh Mishra(2023aa05913)
    2. Waquar Haseeb(2023aa05970)
    3. Ashish Verma(2023aa05919)
    4. Swati Agarwal(2023aa05819)
    5. Aditya Mittal(2023aa05589)


# Downloading the Marker models


In [None]:
# Download each model repository listed below from the Hugging Face Hub
# into ./local_models/ (one huggingface-cli invocation per repository).
!huggingface-cli download datalab-to/surya_layout --cache-dir ./local_models/
!huggingface-cli download datalab-to/texify --cache-dir ./local_models/
!huggingface-cli download vikp/surya_rec2 --cache-dir ./local_models/
!huggingface-cli download datalab-to/surya_tablerec --cache-dir ./local_models/
!huggingface-cli download vikp/surya_det3 --cache-dir ./local_models/
!huggingface-cli download datalab-to/inline_math_det0 --cache-dir ./local_models/
!huggingface-cli download datalab-to/inline_math_rec --cache-dir ./local_models/
!huggingface-cli download datalab-to/surya_ocr --cache-dir ./local_models/

# API for converting PDF files to Markdown


In [None]:
from fastapi import FastAPI, UploadFile, File, Form
import os
import tempfile
import requests
from pathlib import Path
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility
from transformers import AutoTokenizer, AutoModel
import torch
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.config.parser import ConfigParser
from io import BytesIO
import atexit
import multiprocessing
import warnings
import numpy as np

# Exit-time housekeeping for multiprocessing leftovers and warning noise
def cleanup_resources():
    """Reap finished child processes and silence ``UserWarning`` messages.

    Takes no arguments and returns ``None``; it only has side effects on the
    ``multiprocessing`` and ``warnings`` module state.
    """
    # The returned list is not needed: calling active_children() is enough to
    # join child processes that have already finished.
    _ = multiprocessing.active_children()
    # Install a global filter so later UserWarning messages are not shown.
    warnings.filterwarnings("ignore", category=UserWarning)

# FastAPI application object; the /convert and /index routes below attach to it.
app = FastAPI()

# Base Directory for File Storage
# Created at import time so the endpoints can build per-user folders under it.
BASE_DIR = r"D:\WorkSpace_0\CAI\infrence\ConvData\MarkerData"
os.makedirs(BASE_DIR, exist_ok=True)

# Milvus Configuration
# Host/port consumed by connect_to_milvus().
MILVUS_HOST = "127.0.0.1"
MILVUS_PORT = "19530"

# Embedding Model (Local Path) - Use your retriever model
RETRIEVER_MODEL_DIR = r"D:\WorkSpace_0\CAI\infrence\retriever_model"

# Load the retriever model and tokenizer
# These module-level objects are what embed_text() uses for every chunk.
tokenizer = AutoTokenizer.from_pretrained(RETRIEVER_MODEL_DIR)
model = AutoModel.from_pretrained(RETRIEVER_MODEL_DIR)

# Calculate vector dimension from the model
with torch.no_grad():
    # Get sample embedding to determine dimension
    sample_inputs = tokenizer("This is a test", return_tensors="pt")
    sample_outputs = model(**sample_inputs)
    # Only the width of the hidden state is read here (shape[1] of the
    # first-token slice); embed_text() itself uses mean pooling, which
    # produces a vector of the same width.
    VECTOR_DIMENSION = sample_outputs.last_hidden_state[:, 0, :].shape[1]
    print(f"Using vector dimension: {VECTOR_DIMENSION}")

# Set Hugging Face Local Model Path
# NOTE(review): this runs after transformers/marker have been imported and
# after the retriever has been loaded; if the Hugging Face libraries read
# HF_HOME at import time this assignment may come too late - TODO confirm.
os.environ["HF_HOME"] = r"D:\WorkSpace_0\CAI\infrence\local_models_marker"

def embed_text(text):
    """Embed ``text`` with the module-level retriever and return a NumPy vector.

    The input is tokenized (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the token states are averaged using
    the attention mask as weights. The first row of the pooled batch is
    returned as a CPU NumPy array.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        # Broadcast the attention mask to the hidden-state shape so masked
        # positions contribute zero to the sum.
        mask = encoded["attention_mask"].unsqueeze(-1).expand(hidden_states.size()).float()
        summed = torch.sum(hidden_states * mask, 1)
        # clamp() guards the division when a row has no attended tokens.
        token_counts = torch.clamp(mask.sum(1), min=1e-9)
        pooled = summed / token_counts
        return pooled[0].cpu().numpy()

def get_collection_name(username, project_name):
    """Build the collection name ``NV_<username>_<project_name>``."""
    return "NV_{}_{}".format(username, project_name)

def connect_to_milvus():
    """Open the ``default`` Milvus connection using MILVUS_HOST / MILVUS_PORT."""
    connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)

def create_collection(collection_name):
    """Return a loaded Milvus collection, creating it first when it is missing.

    A new collection gets three fields (auto-generated INT64 ``id``, VARCHAR
    ``text`` up to 8192, FLOAT_VECTOR ``embedding`` of VECTOR_DIMENSION) and an
    IVF_FLAT / L2 index on ``embedding``. In both cases ``load()`` is called
    before the collection is returned.
    """
    # Fast path: the collection already exists, just load and hand it back.
    if utility.has_collection(collection_name):
        existing = Collection(collection_name)
        existing.load()
        return existing

    schema = CollectionSchema(
        [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=8192),
            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=VECTOR_DIMENSION),
        ],
        description=f"Collection for {collection_name}",
    )
    created = Collection(collection_name, schema)
    vector_index = {"metric_type": "L2", "index_type": "IVF_FLAT", "params": {"nlist": 128}}
    created.create_index(field_name="embedding", index_params=vector_index)
    created.load()
    return created

def convert_pdf_to_md(pdf_bytes, output_md_path, output_img_dir):
    """Render a PDF (given as bytes) to Markdown and write it plus its images.

    Args:
        pdf_bytes: Raw content of the PDF file.
        output_md_path: Destination path of the generated Markdown file.
        output_img_dir: Directory that receives the extracted images as PNG.

    Returns:
        The Markdown text that was written to ``output_md_path``.
    """
    os.makedirs(output_img_dir, exist_ok=True)

    parser = ConfigParser({
        "output_format": "markdown",
        "use_llm": False,
        "force_ocr": False,
        "strip_existing_ocr": False
    })
    converter = PdfConverter(
        config=parser.generate_config_dict(),
        artifact_dict=create_model_dict(),
        processor_list=parser.get_processors(),
        renderer=parser.get_renderer(),
        llm_service=parser.get_llm_service()
    )

    # delete=False: the converter is handed the path after this handle is
    # closed, so the file must survive the ``with`` block.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as pdf_handle:
        pdf_handle.write(pdf_bytes)
        temp_pdf_path = pdf_handle.name

    try:
        text, _, images = text_from_rendered(converter(temp_pdf_path))

        for img_name, img_obj in images.items():
            img_path = os.path.join(output_img_dir, f"{img_name}.png")
            png_buffer = BytesIO()
            img_obj.save(png_buffer, format="PNG")
            with open(img_path, "wb") as img_sink:
                img_sink.write(png_buffer.getvalue())
            # Point the Markdown image reference at the file just written.
            text = text.replace(f"![]({img_name})", f"![]({img_path})")

        # Post-process pipe tables before persisting the document.
        text = fix_markdown_tables(text)

        with open(output_md_path, "w", encoding="utf-8") as md_sink:
            md_sink.write(text)
        print(f"Converted PDF to Markdown: {output_md_path} with images in {output_img_dir}")
        return text
    finally:
        # The temporary PDF is removed whether or not conversion succeeded.
        if os.path.exists(temp_pdf_path):
            os.remove(temp_pdf_path)

def _is_separator_row(line):
    """Return True when ``line`` is a Markdown table delimiter row (``|---|:--:|``)."""
    stripped = line.strip()
    return "|" in stripped and "-" in stripped and set(stripped) <= set("|-: ")


def fix_markdown_tables(text):
    """Ensure every pipe table in ``text`` has a header separator row.

    A table is a run of consecutive lines containing ``|``. The first line of
    each run is treated as the header. A ``|---|...|`` separator is inserted
    after it only when one is not already there, so tables that already carry
    a delimiter row are returned unchanged instead of getting a second one.

    Args:
        text: Markdown document as a single string.

    Returns:
        The document with missing separator rows added; all other lines are
        passed through untouched.
    """
    lines = text.split("\n")
    fixed_lines = []
    inside_table = False

    for index, line in enumerate(lines):
        fixed_lines.append(line)

        if "|" not in line:
            inside_table = False
            continue
        if inside_table:
            continue

        # First row of a new table.
        inside_table = True
        following = lines[index + 1] if index + 1 < len(lines) else ""
        if _is_separator_row(line) or _is_separator_row(following):
            # Separator already present: adding another would render as a
            # bogus data row of dashes.
            continue

        # Count cells after dropping optional outer pipes, so rows written
        # as "a | b" get the same width as "| a | b |".
        column_count = len(line.strip().strip("|").split("|"))
        fixed_lines.append("|" + "---|" * column_count)

    return "\n".join(fixed_lines)

@app.post("/convert")
async def convert_pdf(
    username: str = Form(...),
    project_name: str = Form(...),
    file: UploadFile = File(...)
):
    """Store an uploaded PDF, convert it to Markdown, and report the output paths.

    Args:
        username: Form field; selects the per-user folder under BASE_DIR.
        project_name: Form field; selects the project folder and the sibling
            ``<project_name>_parsed`` folder.
        file: The uploaded PDF.

    Returns:
        A dict with a status message, the saved PDF path, the Markdown path,
        the image directory and the generated Markdown text.
    """
    pdf_bytes = await file.read()

    # file.filename is client-controlled (and may be missing). Keep only its
    # last path component so a name such as "../../x.pdf" cannot escape the
    # project directory; both "/" and "\" are treated as separators.
    raw_name = (file.filename or "").replace("\\", "/")
    safe_name = os.path.basename(raw_name)
    if safe_name in ("", ".", ".."):
        safe_name = "upload.pdf"

    # Define user, project, and parsed directories
    # NOTE(review): username and project_name are used as path components
    # without validation - consider restricting them as well.
    user_dir = Path(BASE_DIR) / username
    project_dir = user_dir / project_name
    parsed_dir = user_dir / f"{project_name}_parsed"

    # Ensure directories exist
    project_dir.mkdir(parents=True, exist_ok=True)
    parsed_dir.mkdir(parents=True, exist_ok=True)

    # Define file paths
    pdf_path = project_dir / safe_name  # Original PDF location
    base_name = os.path.splitext(safe_name)[0]  # File name without extension
    output_md_path = parsed_dir / f"{base_name}.md"  # Markdown file
    output_img_dir = parsed_dir / f"{base_name}_images"  # Images directory
    output_img_dir.mkdir(exist_ok=True)  # Ensure image directory exists

    # Save the uploaded PDF
    with open(pdf_path, "wb") as f:
        f.write(pdf_bytes)

    # NOTE(review): convert_pdf_to_md is a plain synchronous call, so the
    # event loop is occupied for the whole conversion.
    markdown_text = convert_pdf_to_md(pdf_bytes, output_md_path, output_img_dir)

    return {
        "message": "Conversion successful!",
        "pdf_saved_at": str(pdf_path),
        "markdown_saved_at": str(output_md_path),
        "images_saved_in": str(output_img_dir),
        "markdown_content": markdown_text
    }

# Register cleanup at exit
# atexit invokes cleanup_resources() (defined earlier in this cell) with no
# arguments when the interpreter shuts down normally.
atexit.register(cleanup_resources)

@app.post("/index")
async def index_markdown(username: str, project_name: str):
    """Embed the parsed Markdown files of a project and store them in Milvus.

    Reads every ``*.md`` file in ``BASE_DIR/<username>/<project_name>_parsed``,
    splits it on blank lines into chunks (chunks of 10 characters or fewer
    after stripping are dropped), embeds each chunk with ``embed_text`` and
    inserts the ``(text, embedding)`` columns into the project's collection.
    An existing collection whose ``embedding`` dimension differs from
    VECTOR_DIMENSION is dropped and recreated.

    Args:
        username: Owner of the project; part of the directory and collection name.
        project_name: Project whose parsed Markdown should be indexed.

    Returns:
        ``{"error": ...}`` when no Markdown files exist, otherwise
        ``{"message": ...}`` naming the number of files and the collection.
    """
    user_dir = Path(BASE_DIR) / username
    parsed_dir = user_dir / f"{project_name}_parsed"

    md_files = list(parsed_dir.glob("*.md"))
    if not md_files:
        return {"error": "No Markdown files found for indexing"}

    connect_to_milvus()
    collection_name = get_collection_name(username, project_name)

    # Check if collection already exists with different dimensions
    if utility.has_collection(collection_name):
        old_dim = next(
            (
                field.params.get("dim")
                for field in Collection(collection_name).schema.fields
                if field.name == "embedding"
            ),
            None,
        )
        # If dimensions don't match, drop and recreate the collection
        if old_dim is not None and old_dim != VECTOR_DIMENSION:
            print(f"Dropping collection {collection_name} due to dimension mismatch (old: {old_dim}, new: {VECTOR_DIMENSION})")
            utility.drop_collection(collection_name)

    collection = create_collection(collection_name)

    batch_size = 32
    for md_file in md_files:
        with open(md_file, "r", encoding="utf-8") as f:
            markdown_text = f.read()

        text_chunks = [chunk.strip() for chunk in markdown_text.split("\n\n") if len(chunk.strip()) > 10]
        if not text_chunks:
            # Nothing indexable in this file: skip it rather than sending an
            # insert request with zero rows.
            print(f"Skipping {md_file}: no text chunks to index")
            continue

        # NOTE(review): the "text" field is declared with max_length=8192 in
        # create_collection(); longer chunks are presumably rejected by
        # Milvus - TODO confirm and split such chunks if needed.
        total_batches = (len(text_chunks) + batch_size - 1) // batch_size
        all_embeddings = []
        # Chunks are embedded one at a time; the batches only pace progress output.
        for start in range(0, len(text_chunks), batch_size):
            batch_chunks = text_chunks[start:start + batch_size]
            all_embeddings.extend(embed_text(chunk).tolist() for chunk in batch_chunks)
            print(f"Processed batch {start // batch_size + 1}/{total_batches}")

        # Column order matches the non-auto-id schema fields: text, embedding.
        collection.insert([text_chunks, all_embeddings])
        collection.flush()

    return {"message": f"Indexed {len(md_files)} Markdown files in {collection_name}"}

# Start the conversion/indexing API with uvicorn when this file is executed
# directly; it listens on all interfaces, port 10001.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=10001)

# Downloading the RETRIEVER_MODEL and RERANKER_MODEL


In [None]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

# Define model paths
RETRIEVER_MODEL = "facebook/contriever"  # Alternative: "thenlper/gte-large"
# Using a confirmed public reranker model instead
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L6-v2"  # Alternative to the original model

# Set local directories (relative to the current working directory)
retriever_dir = "retriever_model"
reranker_dir = "reranker_model"

print(f"Downloading and saving retriever model: {RETRIEVER_MODEL}")

# Download and save retriever model and tokenizer
retriever_tokenizer = AutoTokenizer.from_pretrained(RETRIEVER_MODEL)
retriever_model = AutoModel.from_pretrained(RETRIEVER_MODEL)

# Save retriever model and tokenizer
retriever_tokenizer.save_pretrained(retriever_dir)
retriever_model.save_pretrained(retriever_dir)

print(f"Retriever model saved to: {retriever_dir}")

print(f"Downloading and saving reranker model: {RERANKER_MODEL}")

# Download and save reranker model and tokenizer
# Optional Hugging Face token; leave empty for public models.
HF_TOKEN = ""  # Replace with your actual token

# An empty string is not a usable credential, so it is passed on as None
# (meaning "no explicit token") instead of being sent as the token value.
reranker_tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL, token=HF_TOKEN or None)
reranker_model = AutoModelForSequenceClassification.from_pretrained(RERANKER_MODEL, token=HF_TOKEN or None)

# Save reranker model and tokenizer
reranker_tokenizer.save_pretrained(reranker_dir)
reranker_model.save_pretrained(reranker_dir)

print(f"Reranker model saved to: {reranker_dir}")

print("Both models have been successfully downloaded and saved.")

# Downloading the Phi-2 model


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Step 1: Download the model files - this will save them to your local cache
# The weights are requested in float16 and placed with device_map="auto".
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2", 
    torch_dtype=torch.float16, 
    device_map="auto", 
    trust_remote_code=True
)

# Step 2: Save the model and tokenizer to a local directory
# (relative to the current working directory)
local_model_path = "./phi-2-local"
model.save_pretrained(local_model_path)
tokenizer.save_pretrained(local_model_path)

print(f"Model and tokenizer saved to {local_model_path}")

# Step 3: Load the model from the local directory
# Same loading options as Step 1, but reading from local_model_path.
local_tokenizer = AutoTokenizer.from_pretrained(local_model_path, trust_remote_code=True)
local_model = AutoModelForCausalLM.from_pretrained(
    local_model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Test the locally loaded model
# The encoded prompt is moved to the device the model reports before generating.
prompt = "Once upon a time,"
inputs = local_tokenizer(prompt, return_tensors="pt").to(local_model.device)
outputs = local_model.generate(**inputs, max_length=50)
print(local_tokenizer.decode(outputs[0], skip_special_tokens=True))

# API for inference


In [None]:
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
import os
import torch
from pymilvus import connections, Collection
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from sentence_transformers import CrossEncoder
import re
import spacy
import logging
import numpy as np

# Configurations
# Local model directories read by load_phi2_model(), load_retriever_model()
# and load_reranker_model() respectively.
PHI_2_MODEL_DIR = "D:\\WorkSpace_0\\CAI\\infrence\\phi-2-local"
RETRIEVER_MODEL_DIR = "D:\\WorkSpace_0\\CAI\\infrence\\retriever_model"
RERANKER_MODEL_DIR = "D:\\WorkSpace_0\\CAI\\infrence\\reranker_model"
# Queries shorter than this many characters are rejected by validate_query().
MIN_QUERY_LENGTH = 5
# Optional file read by load_restricted_words(); one entry per line.
RESTRICTED_WORDS_FILE = "restricted_words.txt"
# Substrings that make filter_response() block a generated answer.
PROFANITY_LIST = ["badword1", "badword2"]  # Expand this list as needed
# Default for QueryRequest.top_k.
DEFAULT_TOP_K = 5

# Setup logging
# Guardrail events and request errors are written to this file at INFO and above.
logging.basicConfig(filename='guardrail_logs.log', level=logging.INFO)

# Initialize FastAPI app
app = FastAPI()

# Global variables for models
# All start as None and are assigned by load_models() at application startup.
tokenizer = None
model = None
retriever_model = None
retriever_tokenizer = None
reranker_model = None
nlp = None

# Startup hook: populate the module-level model handles
@app.on_event("startup")
def load_models():
    """Fill the module-level model globals when the application starts.

    Loads, in order: the Phi-2 tokenizer/model pair, the retriever
    tokenizer/model pair, the reranker, and the spaCy ``en_core_web_sm``
    pipeline.
    """
    global tokenizer, model, retriever_model, retriever_tokenizer, reranker_model, nlp

    # Answer-generation LLM
    phi2_pair = load_phi2_model()
    tokenizer, model = phi2_pair

    # Dense retriever used to embed queries
    retriever_pair = load_retriever_model()
    retriever_tokenizer, retriever_model = retriever_pair

    # Cross-encoder that reorders the retrieved passages
    reranker_model = load_reranker_model()

    # spaCy pipeline used by the response filter for entity recognition
    nlp = spacy.load("en_core_web_sm")

# Define request model
class QueryRequest(BaseModel):
    """JSON body accepted by the POST /generate endpoint."""
    # username and project_name together select the Milvus collection
    # (see get_collection_name).
    username: str
    project_name: str
    # Natural-language question; checked by validate_query() before use.
    query: str
    # Number of passages kept after reranking; main() retrieves top_k * 2
    # candidates first.
    top_k: int = DEFAULT_TOP_K

# API endpoint
@app.post("/generate")
async def generate_response_endpoint(request: QueryRequest):
    """Answer a user query via retrieval, reranking and generation.

    Delegates to ``main`` with the request fields and the module-level models.

    Args:
        request: Parsed request body (username, project_name, query, top_k).

    Returns:
        The dict produced by ``main``.

    Raises:
        HTTPException: 400 when ``main`` raises ``ValueError``; 500 for any
            other exception.
    """
    try:
        return main(
            request.username,
            request.project_name,
            request.query,
            request.top_k,
            tokenizer,
            model,
            retriever_model,
            retriever_tokenizer,
            reranker_model,
            nlp
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        # logging.exception records the traceback as well as the message, so
        # the generic 500 returned to the client can still be diagnosed.
        logging.exception("Error processing request: %s", e)
        raise HTTPException(status_code=500, detail="Internal server error") from e

# Core processing function
def main(username, project_name, query, top_k, tokenizer, model, retriever_model, retriever_tokenizer, reranker_model, nlp):
    """Run the full retrieve -> rerank -> generate -> filter pipeline for one query.

    Args:
        username: Owner of the project; part of the collection name.
        project_name: Project whose collection is searched.
        query: User question; validated with ``validate_query``.
        top_k: Number of passages kept after reranking (``top_k * 2`` are
            retrieved first).
        tokenizer: Tokenizer of the generation model.
        model: Generation model.
        retriever_model: Model used to embed the query.
        retriever_tokenizer: Tokenizer matching ``retriever_model``.
        reranker_model: Reranker passed to ``rerank_documents``.
        nlp: spaCy pipeline passed to ``filter_response``.

    Returns:
        A dict with ``reference_text``, ``query``, ``response``,
        ``reference_doc`` and ``confidence_score``.

    Raises:
        ValueError: When ``validate_query`` rejects the query.
    """
    # Validate first so a rejected query never costs a Milvus connection or
    # a collection load.
    query = validate_query(query)

    connect_to_milvus()
    collection_name = get_collection_name(username, project_name)
    collection = Collection(collection_name)
    collection.load()

    # Create embeddings using the retriever model
    query_embedding = embed_query(query, retriever_model, retriever_tokenizer)

    # Over-fetch candidates so the reranker has something to choose from.
    retrieved_docs, _retrieval_scores = retrieve_documents(collection, query_embedding, query, top_k * 2)

    # Rerank the retrieved documents
    reranked_docs, confidence_scores = rerank_documents(reranker_model, query, retrieved_docs, top_k)

    # De-duplicate sources while keeping rerank order; set() would return
    # them in arbitrary order from run to run.
    unique_sources = list(dict.fromkeys(extract_source(doc) for doc in reranked_docs))

    # Prepare context from the reranked documents
    context = "\n".join(reranked_docs)

    # Truncate context if too long (tokenize once, reuse the ids)
    context_ids = tokenizer.encode(context)
    if len(context_ids) > 1500:
        context = tokenizer.decode(context_ids[:1500])

    prompt = (
        f"Context: {context}\n\n"
        f"Question: {query}\n\n"
        "Only provide the exact answer to the question without additional information.\n"
        "Answer:"
    )

    # Generate the response
    answer = generate_response(model, tokenizer, prompt)
    final_response = filter_response(answer, nlp)

    return {
        "reference_text": context,
        "query": query,
        "response": final_response,
        "reference_doc": unique_sources,
        "confidence_score": max(confidence_scores) if confidence_scores else 0.0
    }

# Helper functions
def get_collection_name(username, project_name):
    """Return the collection name for a user/project pair: ``NV_<user>_<project>``."""
    parts = ["NV", f"{username}", f"{project_name}"]
    return "_".join(parts)

def connect_to_milvus():
    """Open the ``default`` connection to the Milvus server at 127.0.0.1:19530."""
    endpoint = {"host": "127.0.0.1", "port": "19530"}
    connections.connect("default", **endpoint)

def load_retriever_model():
    """Load the retriever from RETRIEVER_MODEL_DIR.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(RETRIEVER_MODEL_DIR),
        AutoModel.from_pretrained(RETRIEVER_MODEL_DIR),
    )

def load_reranker_model():
    """Build and return a ``CrossEncoder`` from RERANKER_MODEL_DIR."""
    reranker = CrossEncoder(RERANKER_MODEL_DIR)
    return reranker

def embed_query(query, retriever_model, retriever_tokenizer, expected_dim=None):
    """Embed ``query`` with the retriever and return the vector as a list of floats.

    The query is tokenized (truncated to at most 512 tokens) and the token
    states are mean-pooled using the attention mask, the same pooling the
    indexing service applies to document chunks.

    By default the vector keeps the model's native width. The previous
    behaviour of forcing every embedding to 384 dimensions cut the vector
    down (or zero-padded it) regardless of the collection it was searched
    against, while the indexing code sizes the collection from the model's
    own hidden size.

    Args:
        query: Text to embed.
        retriever_model: Encoder whose output exposes ``last_hidden_state``.
        retriever_tokenizer: Tokenizer matching ``retriever_model``.
        expected_dim: Optional target length. When given and different from
            the native width, the vector is truncated or zero-padded to it.
            ``None`` (default) returns the embedding unchanged.

    Returns:
        The embedding as a plain Python list.
    """
    inputs = retriever_tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        token_embeddings = retriever_model(**inputs).last_hidden_state
        # Broadcast the attention mask so padded positions add nothing.
        mask = inputs["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
        # clamp() guards the division when a row has no attended tokens.
        pooled = torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
        embedding_np = pooled[0].cpu().numpy()

    if expected_dim is not None:
        actual_dim = embedding_np.shape[0]
        if actual_dim > expected_dim:
            # Truncate if the embedding is larger than requested
            embedding_np = embedding_np[:expected_dim]
        elif actual_dim < expected_dim:
            # Pad with zeros if the embedding is smaller than requested
            padding = np.zeros(expected_dim - actual_dim)
            embedding_np = np.concatenate([embedding_np, padding])

    return embedding_np.tolist()

def retrieve_documents(collection, query_embedding, query_text, limit=10):
    """Run an L2 vector search and return the hit texts with their scores.

    Args:
        collection: Object exposing ``search``; the ``embedding`` field is queried.
        query_embedding: Query vector.
        query_text: Accepted for interface compatibility; not used by the search.
        limit: Maximum number of hits to request.

    Returns:
        ``(texts, scores)`` - two lists aligned by hit order.
    """
    results = collection.search(
        [query_embedding],
        "embedding",
        {"metric_type": "L2", "params": {"nprobe": 10}},
        limit=limit,
        output_fields=["text"]
    )

    # One query vector was sent, so only the first result set is relevant.
    hits = results[0]
    texts = []
    for hit in hits:
        texts.append(hit.entity.get("text"))
    scores = []
    for hit in hits:
        scores.append(hit.score)

    return texts, scores

def rerank_documents(reranker_model, query, documents, top_k=5):
    """Score ``documents`` against ``query`` and keep the ``top_k`` best.

    Args:
        reranker_model: Object whose ``predict`` takes ``(query, doc)`` pairs
            and returns one score per pair.
        query: The user query.
        documents: Candidate passages.
        top_k: Number of passages to keep.

    Returns:
        ``(docs, scores)`` ordered from highest to lowest score; scores are
        converted to ``float``. Two empty lists when ``documents`` is empty.
    """
    if not documents:
        return [], []

    query_doc_pairs = [(query, candidate) for candidate in documents]
    raw_scores = reranker_model.predict(query_doc_pairs)

    # Highest score first; sorted() is stable, so ties keep retrieval order.
    best = sorted(zip(documents, raw_scores), key=lambda item: item[1], reverse=True)[:top_k]

    kept_docs = [candidate for candidate, _ in best]
    kept_scores = [float(value) for _, value in best]
    return kept_docs, kept_scores

def extract_source(text):
    """Return the name from a leading ``Source: <name>`` line, else ``"Unknown"``.

    The header must be at the very start of ``text`` and be terminated by a
    newline to be recognised.
    """
    header = re.match(r"Source:\s*(.+?)\n", text)
    if not header:
        return "Unknown"
    return header.group(1)

def load_restricted_words(path=None):
    """Return the lower-cased restricted words, one per non-blank line of a file.

    Blank or whitespace-only lines are skipped. Previously they produced an
    empty string in the list, and because ``"" in query`` is always true the
    caller's substring check then rejected every query.

    Args:
        path: File to read. Defaults to ``RESTRICTED_WORDS_FILE``.

    Returns:
        The words from the file, or ``["hack", "exploit"]`` when the file
        does not exist.
    """
    words_file = RESTRICTED_WORDS_FILE if path is None else path
    if os.path.exists(words_file):
        with open(words_file, 'r') as f:
            cleaned = (line.strip().lower() for line in f)
            return [word for word in cleaned if word]
    return ["hack", "exploit"]

def validate_query(query):
    """Validate a user query and return it with surrounding whitespace removed.

    Checks, in order: minimum length (MIN_QUERY_LENGTH, measured after
    stripping so a whitespace-only query cannot pass), presence of any
    restricted word as a substring, and a set of SQL-like keyword/comment
    patterns. Every rejection is logged as a warning.

    Args:
        query: Raw query text from the request.

    Returns:
        The stripped query when all checks pass.

    Raises:
        ValueError: When the query is too short, contains restricted content,
            or matches one of the SQL-injection patterns.
    """
    query = query.strip()
    if len(query) < MIN_QUERY_LENGTH:
        logging.warning(f"Query too short: {query}")
        raise ValueError("Invalid query: Too short.")

    # Ignore empty entries: "" is a substring of every string and would
    # otherwise reject all queries.
    restricted_words = [word for word in load_restricted_words() if word]
    lowered = query.lower()
    if any(word in lowered for word in restricted_words):
        logging.warning(f"Restricted word found in query: {query}")
        raise ValueError("Invalid query: Contains restricted content.")

    # NOTE(review): these patterns also match ordinary English words such as
    # "from", "where" or "update" - confirm this strictness is intended.
    sql_injection_patterns = [
        r"\b(select|insert|update|delete|drop|alter)\b",
        r"\b(union|into|from|where)\b",
        r"--|/\*|\*/"
    ]
    for pattern in sql_injection_patterns:
        if re.search(pattern, query, re.IGNORECASE):
            logging.warning(f"Potential SQL injection attempt: {query}")
            raise ValueError("Invalid query: Potential SQL injection attempt.")
    return query

def filter_response(response, nlp):
    """Block profane answers and redact named entities from the rest.

    Args:
        response: Generated answer text.
        nlp: Callable returning an object with ``ents``; each entity exposes
            ``label_`` and ``text``.

    Returns:
        A fixed blocking message when any PROFANITY_LIST entry occurs in the
        lower-cased response; otherwise the response with every PERSON, ORG,
        GPE and LOC entity text replaced by ``[REDACTED]``.
    """
    lowered = response.lower()
    for banned in PROFANITY_LIST:
        if banned in lowered:
            logging.info(f"Profanity detected in response: {response}")
            return "[Response blocked due to inappropriate content.]"

    sensitive_labels = ["PERSON", "ORG", "GPE", "LOC"]
    redacted_response = response
    for entity in nlp(response).ents:
        if entity.label_ not in sensitive_labels:
            continue
        # str.replace removes every occurrence of the entity text, not only
        # the span spaCy reported.
        redacted_response = redacted_response.replace(entity.text, "[REDACTED]")

    logging.info(f"Filtered response: {redacted_response}")
    return redacted_response

def load_phi2_model():
    """Load Phi-2 from PHI_2_MODEL_DIR.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is requested in float16
        with ``device_map="auto"``.
    """
    phi2_tokenizer = AutoTokenizer.from_pretrained(PHI_2_MODEL_DIR, trust_remote_code=True)
    phi2_model = AutoModelForCausalLM.from_pretrained(
        PHI_2_MODEL_DIR,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    return phi2_tokenizer, phi2_model

def generate_response(model, tokenizer, prompt):
    """Generate a completion for ``prompt`` and return only the new text.

    The encoded prompt is moved to ``model.device`` rather than a hard-coded
    ``"cuda"``: the model is loaded with ``device_map="auto"``, so it may sit
    on the CPU when no GPU is available.

    Args:
        model: Generation model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer used for both encoding and decoding.
        prompt: Full prompt text.

    Returns:
        The decoded continuation (at most 200 new tokens), without the prompt
        and without special tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_length = inputs["input_ids"].shape[1]
    outputs = model.generate(**inputs, max_new_tokens=200)
    # generate() returns prompt + continuation; slice the prompt tokens off.
    generated_sequence = outputs[0, input_length:]
    return tokenizer.decode(generated_sequence, skip_special_tokens=True)

# Start the inference API with uvicorn when this file is executed directly;
# it listens on all interfaces, port 10002.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=10002)