# Install needed dependencies

In [None]:
!pip install transformers sentence_transformers docling chromadb rapidocr-onnxruntime

In [None]:
! pip install fastapi uvicorn nest_asyncio

# If running in a local environment

In [None]:
!pip install 'accelerate>=0.26.0'

# Chroma db config

In [4]:
import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings
from chromadb.utils import embedding_functions

# Embedding function used for every collection and for query embeddings.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

# Persistent client: data is stored under ./.chroma_db.
client = chromadb.PersistentClient(
    path="./.chroma_db",
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

# Per-user collections are not created here; upload_pdf creates them on
# demand from the index name it receives, using the same HNSW settings.
general_collection = client.get_or_create_collection(
    name="general_collection",
    embedding_function=sentence_transformer_ef,
    metadata={"hnsw:space": "cosine", "hnsw:search_ef": 100},
)

# Docling scraper config

In [None]:
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat, DocumentStream
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.chunking import HybridChunker
from transformers import AutoTokenizer

# PDF conversion pipeline: OCR and table-structure extraction (with cell
# matching) are both enabled.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
            backend=PyPdfiumDocumentBackend,
        )
    }
)

# NOTE(review): MAX_TOKENS is larger than the input length usually quoted for
# all-MiniLM-L6-v2 -- confirm that long chunks are not truncated at embed time.
MAX_TOKENS = 1000
# NOTE: despite the name, upload_pdf applies this as a number of *characters*
# (it slices the previous chunk's text with it).
OVERLAP_TOKENS = 200
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)
# The keyword was previously misspelled as `tokmizer`, so the tokenizer built
# above was not being passed under the name the chunker expects.
chunker = HybridChunker(
    tokenizer=tokenizer,
    max_tokens=MAX_TOKENS,
)

# Parse the user's PDF, embed it, and store the data in the vector DB

In [17]:
from fastapi import UploadFile, WebSocket, HTTPException
import uuid
from io import BytesIO
from docling.datamodel.base_models import InputFormat, DocumentStream
async def upload_pdf(file: UploadFile,  store_type: str, indexName: str, websocket: WebSocket):
    """Convert an uploaded PDF to chunks, embed them, and store them in Chroma.

    Progress messages are pushed to ``websocket`` before each stage.

    Args:
        file: The uploaded file; must have content type ``application/pdf``.
        store_type: When equal to ``"user_and_general"`` the chunks are also
            added to ``general_collection``.
        indexName: Name of the Chroma collection to create or reuse for the
            user's data.
        websocket: Open WebSocket used for progress notifications.

    Returns:
        A summary string with the number of chunks indexed.

    Raises:
        HTTPException: 400 when no file is given, the file is not a PDF, or
            the file is empty; 422 when no chunks could be produced from it.
    """
    if not file:
        raise HTTPException(status_code=400, detail="No file uploaded")

    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Uploaded file is not a PDF")

    # Read the upload once; the same bytes serve the emptiness check and the
    # conversion, so no rewind of the underlying file is needed.
    pdf_bytes = await file.read()
    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    file_name = file.filename

    print("Processing PDF: Scrapping content...")
    await websocket.send_text("Processing PDF: Scraping content...")
    source = DocumentStream(name=file_name, stream=BytesIO(pdf_bytes))
    doc = doc_converter.convert(source).document

    print("Processing PDF: Chunk results...")
    await websocket.send_text("Processing PDF: Chunking results...")
    chunks = []
    prev_text = ""
    for i, chunk in enumerate(chunker.chunk(doc)):
        # Prefix each chunk with the tail of the previous one. The slice is in
        # characters and yields the whole string when it is shorter than the
        # overlap length.
        overlap_text = prev_text[-OVERLAP_TOKENS:]
        chunks.append({"chunk_id": i, "text": overlap_text + chunk.text})
        prev_text = chunk.text

    if not chunks:
        # Without this guard the empty id/document lists would be handed to
        # collection.add() below.
        raise HTTPException(
            status_code=422,
            detail="No text content could be extracted from the uploaded PDF",
        )

    print("Processing PDF: Embedding results...")
    await websocket.send_text("Processing PDF: Embedding results...")
    texts = [chunk["text"] for chunk in chunks]
    # One batched call instead of one embedding call per chunk.
    embeddings = sentence_transformer_ef(texts)
    metadata_list = [
        {"chunk_id": chunk["chunk_id"], "source": file_name} for chunk in chunks
    ]
    unique_ids = [str(uuid.uuid4()) for _ in chunks]

    print("Processing PDF: Storing data in Vector DB...")
    await websocket.send_text("Processing PDF: Storing data in Vector DB (user index)...")
    user_collection = client.get_or_create_collection(
        name=indexName,
        embedding_function=sentence_transformer_ef,
        metadata={
            "hnsw:space": "cosine",
            "hnsw:search_ef": 100,
        },
    )
    user_collection.add(
        ids=unique_ids,
        documents=texts,
        embeddings=embeddings,
        metadatas=metadata_list,
    )
    if store_type == "user_and_general":
        await websocket.send_text("Processing PDF: Storing data in Vector DB (general index)...")
        general_collection.add(
            ids=unique_ids,
            documents=texts,
            embeddings=embeddings,
            metadatas=metadata_list,
        )

    summary = f"Uploaded and indexed {len(chunks)} chunks from user PDF {file_name} to index {store_type}."
    await websocket.send_text(summary)
    return summary

# Embed and store general data in the vector DB

In [10]:
import json
# file_path = "/kaggle/input/anatomy-data-json/decoded_output.json"
file_path = "./data-json/decoded_output.json"
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Embed and insert in fixed-size batches: one embedding call and one add()
# per batch instead of one embedding call per record followed by a single
# add() for the whole dataset.
# NOTE(review): some Chroma versions cap the number of records per add();
# confirm 1000 is below the limit of the installed version.
INGEST_BATCH_SIZE = 1000
for start in range(0, len(data), INGEST_BATCH_SIZE):
    batch = data[start:start + INGEST_BATCH_SIZE]
    texts = [item["text"] for item in batch]
    general_collection.add(
        # IDs are the record's position in the JSON array, as strings.
        ids=[str(start + offset) for offset in range(len(batch))],
        documents=texts,
        embeddings=sentence_transformer_ef(texts),
        metadatas=[
            {"chunk_id": item["chunk_id"], "source": item["source"]}
            for item in batch
        ],
    )
print(f"Uploaded and indexed {len(data)} chunks from JSON file.")

Uploaded and indexed 12085 chunks from JSON file.


# Retrieval and Generator config

In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
from datetime import datetime

# model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype="auto",
#     device_map="auto"
# )
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# Lazy caches keyed by supported model id; a value of None means "not loaded
# yet". Only ids listed here are accepted by load_model_and_tokenizer.
models = dict.fromkeys(
    (
        "Qwen/Qwen2.5-0.5B-Instruct",
        "Qwen/Qwen2.5-1.5B-Instruct",
        "Qwen/Qwen2.5-3B-Instruct",
    )
)

# Same keys as `models`, held in a separate dict.
tokenizers = dict.fromkeys(models)

def load_model_and_tokenizer(model_name):
    """Return the cached ``(model, tokenizer)`` pair, loading on first use.

    Args:
        model_name: One of the ids present in the ``models`` dict.

    Returns:
        Tuple of the causal-LM model and its tokenizer.

    Raises:
        ValueError: If ``model_name`` is not a key of ``models``.
    """
    if model_name not in models:
        raise ValueError(f"Model {model_name} is not supported.")

    # Each cache slot is checked on its own. Previously only the model slot
    # was checked, so a failure while loading the tokenizer (after the model
    # had been cached) made every later call return (model, None).
    if tokenizers.get(model_name) is None:
        tokenizers[model_name] = AutoTokenizer.from_pretrained(model_name)
    if models[model_name] is None:
        models[model_name] = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto",
        )
    return models[model_name], tokenizers[model_name]

def get_generate_response(query: str, collection_name: str, model_name: str, search_type: str):
    """Answer ``query`` with retrieval-augmented generation.

    The query is embedded, the six nearest documents are fetched from the
    named Chroma collection, and the selected model generates an answer with
    those documents supplied as context.

    Args:
        query: The user's question.
        collection_name: Name of an existing Chroma collection to search.
        model_name: Model id accepted by ``load_model_and_tokenizer``.
        search_type: Accepted for interface compatibility; currently unused.

    Returns:
        A tuple ``(response, sources, trace)``: the generated text, the list
        of distinct source names in retrieval order, and a dict with the
        retrieved documents, token counts and timing metrics.

    Raises:
        ValueError: Propagated from ``load_model_and_tokenizer`` when the
            model is not supported.
    """
    print("in generating method")
    # NOTE: datetime.utcnow() is deprecated since Python 3.12. It is kept
    # because a timezone-aware value would append "+00:00" to queryTimestamp.
    query_timestamp = datetime.utcnow().isoformat()
    total_start_time = time.perf_counter()

    model, tokenizer = load_model_and_tokenizer(model_name)

    # Start the retrieval timer only after the model is available so that a
    # first-use model load is not reported as retrieval time.
    retrieval_start_time = time.perf_counter()
    collection = client.get_collection(collection_name)
    query_embedding = sentence_transformer_ef([query])[0]
    print("start reterival")
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=6,
    )
    retrieval_time = time.perf_counter() - retrieval_start_time

    # Every field falls back to one empty inner list, so indexing [0] is safe
    # even when a key is missing or its value is None.
    documents = (results.get("documents") or [[]])[0]
    metadatas = (results.get("metadatas") or [[]])[0]
    distances = (results.get("distances") or [[]])[0]

    context = "\n".join(documents)
    resources = [(meta or {}).get("source", "Unknown source") for meta in metadatas]
    # dict.fromkeys de-duplicates while keeping retrieval order.
    unique_resources = list(dict.fromkeys(resources))

    messages = [
        {"role": "system", "content": "You are an AI assistant that answers questions based on the provided context.\n"},
        {"role": "system", "content": f"Context: {context}\n"},
        {"role": "user", "content": query},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    print("begin generating a response")
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    input_tokens = model_inputs.input_ids.shape[-1]

    generation_start_time = time.perf_counter()
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
    )
    generation_time = time.perf_counter() - generation_start_time

    # Drop the prompt tokens so only the newly generated tokens remain.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    # Counted after the prompt is removed; counting earlier included the
    # input tokens in the output total.
    output_tokens = sum(len(ids) for ids in generated_ids)

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print("end of generated")

    # Chroma returns distances. The collections in this file are created with
    # "hnsw:space": "cosine", where distance = 1 - cosine similarity, so the
    # value is converted back before being reported as a percentage.
    retrieved_docs = [
        {
            "content": doc,
            "similarity": round((1 - distance) * 100, 2),
            "source": (meta or {}).get("source", "Unknown source"),
        }
        for doc, distance, meta in zip(documents, distances, metadatas)
    ]
    total_time = time.perf_counter() - total_start_time
    stack_trace = {
        "userQuery": query,
        "chatbotResponse": response,
        "retrievedDocs": retrieved_docs,
        "inputTokens": input_tokens,
        "outputTokens": output_tokens,
        "queryTimestamp": query_timestamp,
        "model_name": model_name,
        "sources": unique_resources,
        "performanceMetrics": {
            "totalTime": round(total_time, 4),
            "retrievalTime": round(retrieval_time, 4),
            "generationTime": round(generation_time, 4),
        },
    }

    return response, unique_resources, stack_trace
    


In [13]:
# Manual smoke test: ask one question against the general collection with
# the 1.5B model and keep the results in module-level names.
query = "what is anatomy and Physiology?"
collection_name = search_type = "general_collection"
response, unique_resources, StackTrace = get_generate_response(
    query=query,
    collection_name=collection_name,
    model_name="Qwen/Qwen2.5-1.5B-Instruct",
    search_type=search_type,
)

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# (Re)initialise the model table with nothing loaded.
models = dict.fromkeys(
    (
        "Qwen/Qwen2.5-0.5B-Instruct",
        "Qwen/Qwen2.5-1.5B-Instruct",
        "Qwen/Qwen2.5-3B-Instruct",
    )
)

# Load every supported tokenizer and model once. The loaded objects are not
# stored in `models`; the module-level names `tokenizer` and `model` are
# rebound on each iteration and end up holding the last pair.
for model_name in models:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)


Downloading shards: 100%|██████████| 2/2 [1:01:23<00:00, 1841.95s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:34<00:00, 17.33s/it]


In [None]:
# Show the generated answer, a separator line, then the list of sources.
print(
    response,
    "\n -----------------------------------------------------------------------------",
    unique_resources,
    sep="\n",
)

# Do Translation

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model used by get_translation. This rebinds the module-level names
# `model_name`, `model` and `tokenizer`.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

def get_translation(query):
    """Translate ``query`` with the module-level ``model`` and ``tokenizer``.

    The query is sent as the user turn of a chat whose system prompt asks the
    model to translate into the language the user wants. Returns the decoded
    completion with the prompt tokens removed.
    """
    print("\n ------------------------------- begin translation ----------------------------------------------")
    system_prompt = "You are Qwen, a multilingual assistant. If the user provides a query, you should automatically translate it to user wanted language."
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]

    # Render the chat into a single prompt string ending with the generation cue.
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer([prompt], return_tensors="pt").to(model.device)

    sequences = model.generate(**encoded, max_new_tokens=512)

    # Each returned sequence starts with its prompt; keep only what follows.
    completions = [
        sequence[len(prompt_ids):]
        for prompt_ids, sequence in zip(encoded.input_ids, sequences)
    ]
    translation = tokenizer.batch_decode(completions, skip_special_tokens=True)[0]
    print("\n ------------------------------- end translation ----------------------------------------------")

    return translation


# Create APIs

In [None]:
from fastapi import FastAPI, UploadFile, WebSocket, File, Form, HTTPException
from pydantic import BaseModel
import uvicorn
import nest_asyncio
from fastapi.middleware.cors import CORSMiddleware

# Patch asyncio so an event loop can be started from within an already
# running one (presumably the notebook's -- this file is a notebook export).
nest_asyncio.apply()

app = FastAPI()

# CORS: every origin, method and header is allowed (replace '*' with specific
# domains in production). Credentials are disabled because the FastAPI CORS
# docs state that wildcard origins/methods/headers cannot be combined with
# allow_credentials=True.
# NOTE(review): if a front end sends credentialed requests, list its origin
# explicitly and re-enable credentials instead.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# process_id -> open WebSocket, used to push upload progress to the client.
active_connections = {}

@app.get("/")
def read_root():
    # Static welcome payload for the API root.
    welcome = {"message": "Welcome to the AI Assistant API!"}
    return welcome

@app.websocket("/ws/{process_id}")
async def websocket_endpoint(websocket: WebSocket, process_id: str):
    """Register a progress WebSocket under ``process_id`` until it closes.

    The socket is stored in ``active_connections`` so that ``/upload-pdf`` can
    push progress messages to it. Incoming text frames are read and discarded;
    the loop exists only to detect when the connection ends.

    Args:
        websocket: The incoming WebSocket connection.
        process_id: Client-chosen identifier from the URL path.
    """
    # Local import: WebSocketDisconnect is not among the names imported from
    # fastapi at the top of this cell.
    from fastapi import WebSocketDisconnect

    if not process_id or process_id == "undefined":
        # 1008 (policy violation) replaces 403, which is an HTTP status and
        # not a WebSocket close code.
        await websocket.close(code=1008)
        return

    print(f"New connection with process_id: {process_id}")
    await websocket.accept()
    active_connections[process_id] = websocket
    try:
        while True:
            await websocket.receive_text()
    except WebSocketDisconnect:
        # The peer has already closed the socket; closing it again would fail.
        print(f"WebSocket disconnected: {process_id}")
    except Exception as e:
        print(f"WebSocket error: {e}")
        try:
            await websocket.close()
        except RuntimeError:
            # Deliberately ignored: the socket was already closed.
            pass
    finally:
        # Remove the entry only if it still refers to this socket, so a newer
        # connection that reused the same process_id is left in place.
        if active_connections.get(process_id) is websocket:
            del active_connections[process_id]

@app.post("/upload-pdf")
async def upload_user_pdf(file: UploadFile = File(...), store_type: str = Form(...),  indexName: str = Form(...), process_id: str = Form(...)):
    # The upload needs the progress WebSocket that was registered under the
    # same process_id; reject the request when there is none.
    if process_id not in active_connections:
        raise HTTPException(status_code=400, detail="Invalid process_id or WebSocket connection not active")

    progress_socket = active_connections[process_id]
    summary = await upload_pdf(file, store_type, indexName, progress_socket)
    return {"message": summary}



# Request body for POST /query/.
class QueryInput(BaseModel):
    # The user's question.
    query: str
    # Name of the Chroma collection to search (passed on as collection_name).
    index_name: str
    # Model id handed to get_generate_response.
    model_name: str
    # Passed through to get_generate_response.
    search_type: str

@app.post("/query/")
async def query_index(input: QueryInput):
    """Answer a question using retrieval-augmented generation.

    Returns the generated response, the distinct sources of the retrieved
    documents, and a trace with token counts and timing metrics. Responds with
    HTTP 400 when the request is rejected with a ValueError, for example when
    the requested model is not supported.
    """
    try:
        response, unique_resources, stack_trace = get_generate_response(
            input.query,
            input.index_name,
            input.model_name,
            input.search_type,
        )
    except ValueError as err:
        # load_model_and_tokenizer raises ValueError for unsupported model
        # names; that is a client error, not a server fault.
        raise HTTPException(status_code=400, detail=str(err)) from err
    return {"response": response, "sources": unique_resources, "stackTrace": stack_trace}

# Request body for POST /translate/.
class TranslateQuery(BaseModel):
    # Text handed to get_translation.
    query: str

@app.post("/translate/")
async def translate_query(input: TranslateQuery):
    # Delegate to get_translation and wrap the result for the client.
    return {"translatedQuery": get_translation(input.query)}
# Start the API server on localhost:8000 when this code runs as the main module.
if __name__ == "__main__":
    uvicorn.run(app, host="localhost", port=8000)