<a href="https://colab.research.google.com/github/arekarnarayan/AIModels/blob/dev1/sample_gemma3_ocr_mcp_v0.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch gradio pypdf langchain chromadb sentence-transformers pillow pytesseract

In [None]:
!pip install huggingface_hub langchain_community

In [6]:
# Imports for the OCR + RAG demo (Colab secrets, Transformers, Gradio, Tesseract, LangChain)
import os
from google.colab import userdata
from transformers import pipeline
import torch
import gradio as gr
from PIL import Image
import pytesseract
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Set up HuggingFace authentication.
# The token is read from the Colab secrets store (`userdata`) and also exported
# through the HUGGINGFACEHUB_API_TOKEN environment variable; `sec_key` stays a
# module-level name so later cells can reuse the token.
# NOTE(review): os.environ only accepts str values, so the assignment below
# fails if the lookup ever yields None - confirm how userdata.get reports a
# missing secret.
sec_key = userdata.get("HUGGINGFACEHUB_API_TOKEN")
os.environ["HUGGINGFACEHUB_API_TOKEN"] = sec_key

In [12]:
def initialize_models(
    model_id="google/gemma-3-4b-it",
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
    token=None,
):
    """Load the text-generation pipeline and the embedding model used for RAG.

    Args:
        model_id: Hugging Face model id given to the "text-generation" pipeline.
        embedding_model_name: Model id given to ``HuggingFaceEmbeddings``.
        token: Hugging Face access token. When None, the value of the
            HUGGINGFACEHUB_API_TOKEN environment variable is used (which may
            itself be unset, in which case None is passed through).

    Returns:
        Tuple ``(text_pipeline, embeddings)``.
    """
    if token is None:
        # Take the token from the environment instead of a notebook global so
        # the function does not depend on which cell defined a variable.
        token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

    # Initialize the generation model using pipeline
    text_pipeline = pipeline(
        "text-generation",
        model=model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        token=token,
    )

    # Initialize embeddings for RAG; use the GPU only when one is available
    embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        model_kwargs={"device": embedding_device},
    )

    return text_pipeline, embeddings


In [8]:
# OCR helper backed by Tesseract (through pytesseract)
def perform_ocr(image):
    """Return the text that ``pytesseract.image_to_string`` extracts from ``image``."""
    return pytesseract.image_to_string(image)

# RAG implementation
def setup_rag(text, embeddings, chunk_size=1000, chunk_overlap=200, collection_name=None):
    """Split ``text`` into overlapping chunks and index them in a Chroma store.

    Args:
        text: Document text to index.
        embeddings: Embedding model handed to ``Chroma.from_texts``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.
        collection_name: Chroma collection to write to. When None, a unique
            name is generated for this call.

    Returns:
        The vector store returned by ``Chroma.from_texts``.

    Raises:
        ValueError: If ``text`` is empty or produces no chunks.
    """
    import uuid

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    splits = text_splitter.split_text(text or "")
    if not splits:
        # Fail early with a clear message instead of handing Chroma an empty batch.
        raise ValueError("No text to index: the input produced no chunks.")

    if collection_name is None:
        # A fixed collection name can be shared by every call made in the same
        # process, which would mix chunks from earlier documents into later
        # searches; a fresh name keeps each document isolated.
        collection_name = f"document_store_{uuid.uuid4().hex}"

    vectorstore = Chroma.from_texts(
        texts=splits,
        embedding=embeddings,
        collection_name=collection_name,
    )

    return vectorstore


In [9]:
# Multi-Context Prompting with pipeline
def generate_mcp_response(query, contexts, text_pipeline, max_new_tokens=512):
    """Answer ``query`` with the model, using ``contexts`` as grounding text.

    Args:
        query: The user's question.
        contexts: Iterable of context strings; they are joined with single spaces.
        text_pipeline: Callable text-generation pipeline. It is called with the
            prompt plus sampling keyword arguments and must return a list whose
            first item is a dict with a ``'generated_text'`` entry.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated answer with surrounding whitespace removed.
    """
    # Combine multiple contexts with the query
    context_block = ' '.join(contexts)
    prompt = f"Context:\n{context_block}\n\nQuery: {query}\nResponse:"

    # Budget only the generated tokens (max_new_tokens): a total-length limit
    # would also count the prompt, and a long retrieved context could use it up
    # before any answer is produced. return_full_text=False asks the pipeline
    # for the completion alone.
    outputs = text_pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        return_full_text=False,
    )
    generated = outputs[0]['generated_text']

    # If the prompt is echoed back anyway, remove exactly that prefix. Splitting
    # on "Response:" would cut answers that contain the marker themselves.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]

    return generated.strip()


In [10]:
# Gradio interface
def create_interface():
    """Build the Gradio UI that answers questions about an uploaded image.

    The models are loaded once via ``initialize_models()`` and reused by every
    request. OCR is done by ``perform_ocr`` (pytesseract), whatever the title
    string says.

    Returns:
        The configured ``gr.Interface``; the caller is responsible for
        launching it.
    """
    text_pipeline, embeddings = initialize_models()

    def process_input(image, query):
        """Run OCR -> retrieval -> generation and return a user-facing string."""
        # Nothing uploaded / nothing asked: answer with a hint instead of
        # letting the OCR or retrieval step raise.
        if image is None:
            return "Please upload an image."
        if not query or not query.strip():
            return "Please enter a question."

        # Extract text from image using OCR
        ocr_text = perform_ocr(image)
        if not ocr_text or not ocr_text.strip():
            return "No text could be extracted from the image."

        # Setup RAG with extracted text
        vectorstore = setup_rag(ocr_text, embeddings)
        try:
            # Retrieve relevant contexts
            relevant_docs = vectorstore.similarity_search(query, k=3)
            contexts = [doc.page_content for doc in relevant_docs]
        finally:
            # Drop this request's collection so its chunks cannot show up in
            # searches for a later image (and so memory is released).
            vectorstore.delete_collection()

        # Generate response using MCP
        return generate_mcp_response(query, contexts, text_pipeline)

    # Only offer the example row when the example file is actually present.
    example_image = "example_image.jpg"
    if os.path.exists(example_image):
        examples = [[example_image, "What is the main topic of this document?"]]
    else:
        examples = None

    # Create Gradio interface
    iface = gr.Interface(
        fn=process_input,
        inputs=[
            gr.Image(type="pil", label="Upload Image"),
            gr.Textbox(label="Enter your question")
        ],
        outputs=gr.Textbox(label="Response"),
        title="Gemma 3 + MistralOCR + RAG + MCP Demo",
        description="Upload an image and ask questions about its content.",
        examples=examples
    )

    return iface



In [14]:
# Launch the interface
# Note: create_interface() loads the models, so this cell also triggers the
# model downloads on a fresh runtime.
if __name__ == "__main__":
    # Install required system dependencies
    # (the tesseract-ocr system package for the pytesseract-based OCR step;
    # the leading "!" is an IPython shell escape, so this only runs in a notebook)
    !apt-get install -y tesseract-ocr

    # Create and launch the interface
    # share=True makes Gradio serve the app on a public URL in addition to
    # running it in the notebook.
    interface = create_interface()
    interface.launch(share=True)

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Device set to use cpu
  embeddings = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://44f1631c0552fbc387.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
