In [1]:
!pip install mistralai

Collecting mistralai
  Downloading mistralai-1.5.1-py3-none-any.whl.metadata (29 kB)
Collecting eval-type-backport>=0.2.0 (from mistralai)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting jsonpath-python>=1.0.6 (from mistralai)
  Downloading jsonpath_python-1.0.6-py3-none-any.whl.metadata (12 kB)
Collecting typing-inspect>=0.9.0 (from mistralai)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect>=0.9.0->mistralai)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading mistralai-1.5.1-py3-none-any.whl (278 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.3/278.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading eval_type_backport-0.2.2-py3-none-any.whl (5.8 kB)
Downloading jsonpath_python-1.0.6-py3-none-any.whl (7.6 kB)
Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Downloading mypy_extensions-1.0.0-py3-n

In [2]:
from mistralai import Mistral

api_key = "api"
client = Mistral(api_key=api_key)
text_model = "mistral-small-latest"
ocr_model = "mistral-ocr-latest"

### System and Tool
For the model to be aware of its purpose and what it can do, it's important to provide a clear system prompt with instructions and explanations of any tools it may have access to.

Let's define a system prompt and the tools it will have access to, in this case, `open_urls`.

*Note: `open_urls` can easily be customized with other resources and models ( for summarization, for example ) and many other features. In this demo, we are going for a simpler approach.*

In [3]:
system = """You are an AI Assistant with document understanding via URLs. You will be provided with URLs, and you must answer any questions related to those documents.

# OPEN URLS INSTRUCTIONS
You can open URLs by using the `open_urls` tool. It will open webpages and apply OCR to them, retrieving the contents. Use those contents to answer the user.
Only URLs pointing to PDFs and images are supported; you may encounter an error if they are not; provide that information to the user if required."""

In [4]:
def _perform_ocr(url: str) -> str:
    try:   # Apply OCR to the PDF URL
        response = client.ocr.process(
            model=ocr_model,
            document={
                "type": "document_url",
                "document_url": url
                }
            )
    except Exception:
        try:  # IF PDF OCR fails, try Image OCR
            response = client.ocr.process(
                model=ocr_model,
                document={
                    "type": "image_url",
                    "image_url": url
                    }
                )
        except Exception as e:
            return e  # Return the error to the model if it fails, otherwise return the contents
    return "\n\n".join([f"### Page {i+1}\n{response.pages[i].markdown}" for i in range(len(response.pages))])

In [5]:
def open_urls(urls: list) -> str:
    contents = "# Documents"
    for url in urls:
        contents += f"\n\n## URL: {url}\n{_perform_ocr(url)}"
    return contents

We also have to define the Tool Schema that will be provided to our API and model.

By following the [documentation](https://docs.mistral.ai/capabilities/function_calling/), we can create something like this:

In [6]:
tools = [
    {
        "type": "function",
        "function": {
            "name": "open_urls",
            "description": "Open URLs websites (PDFs and Images) and perform OCR on them.",
            "parameters": {
                "type": "object",
                "properties": {
                    "urls": {
                        "type": "array",
                        "description": "The URLs list.",
                    }
                },
                "required": ["urls"],
            },
        },
    },
]

In [7]:
names_to_functions = {
    'open_urls': open_urls
}

### Test
Everything is ready; we can quickly create a while loop to chat with our model directly in the console.


In [8]:
import json

messages = [{"role": "system", "content": system}]
while True:
    # Insert user input, quit if desired
    user_input = input("User > ")
    if user_input == "quit":
        break
    messages.append({"role": "user", "content": user_input})

    # Loop Mistral Small tool use until no tool called
    while True:
        response = client.chat.complete(
            model = text_model,
            messages = messages,
            temperature = 0,
            tools = tools
        )
        messages.append({"role":"assistant", "content": response.choices[0].message.content, "tool_calls": response.choices[0].message.tool_calls})

        # If tool called, run tool and continue, else break loop and reply
        if response.choices[0].message.tool_calls:
            tool_call = response.choices[0].message.tool_calls[0]
            function_name = tool_call.function.name
            function_params = json.loads(tool_call.function.arguments)
            function_result = names_to_functions[function_name](**function_params)
            messages.append({"role":"tool", "name":function_name, "content":function_result, "tool_call_id":tool_call.id})
        else:
            break

    print("Assistant >", response.choices[0].message.content)

User > Hi
Assistant > Hello! How can I assist you today?
User > summarise
Assistant > Sure, I can help with that. Please provide the URLs of the documents you would like me to summarize.
User > https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/mistral7b.pdf
Assistant > The document titled "Mistral 7B" introduces a 7-billion-parameter language model designed for superior performance and efficiency. Developed by a team including Albert Q. Jiang, Alexandre Sablayrolles, Arthur Mensch, and others, Mistral 7B outperforms other models like Llama 2 and Llama 1 in various benchmarks, particularly in reasoning, mathematics, and code generation. The model uses grouped-query attention (GQA) for faster inference and sliding window attention (SWA) to handle longer sequences efficiently.

### Key Points:

1. **Performance and Efficiency**:
   - Mistral 7B outperforms Llama 2 13B across all benchmarks and Llama 1 34B in specific areas like mathematics and code generatio

## Built-In
Mistral provides a built-in feature that leverages OCR with all models. By providing a URL pointing to a document, you can extract text data that will be provided to the model.


### System and Regex
Let's define a simple system prompt, since there is no tool call required for this demo we can be fairly straightforward.

In [9]:
system = "You are an AI Assistant with document understanding via URLs. You may be provided with URLs, followed by their corresponding OCR."

To extract the URLs, we will use regex to extract any URL pattern from the user query.

*Note: We will assume there will only be PDF files for simplicity.*

In [10]:
import re

def extract_urls(text: str) -> list:
    url_pattern = r'\b((?:https?|ftp)://(?:www\.)?[^\s/$.?#].[^\s]*)\b'
    urls = re.findall(url_pattern, text)
    return urls

### Test

#### Example Prompts ( PDFs )
- Could you summarize what this research paper talks about? https://arxiv.org/pdf/2410.07073
- Explain this architecture: https://arxiv.org/abs/2401.04088

In [12]:
import json

messages = [{"role": "system", "content": system}]
while True:
    user_input = input("User > ")
    if user_input.lower() == "quit":
        break

    # Extract URLs from the user input, assuming they are always PDFs
    document_urls = extract_urls(user_input)
    user_message_content = [{"type": "text", "text": user_input}]
    for url in document_urls:
        user_message_content.append({"type": "document_url", "document_url": url})
    messages.append({"role": "user", "content": user_message_content})

    # Send the messages to the model and get a response
    response = client.chat.complete(
        model=text_model,
        messages=messages,
        temperature=0
    )
    messages.append({"role": "assistant", "content": response.choices[0].message.content})

    print("Assistant >", response.choices[0].message.content)


User > Could you summarize what this research paper talks about? https://arxiv.org/pdf/2410.07073
Assistant > The research paper introduces Pixtral 12B, a 12-billion-parameter multimodal language model designed to understand both natural images and documents. The model is trained to perform high-level reasoning and is capable of multi-turn, multi-image conversations. Pixtral 12B uses a new vision encoder trained from scratch, allowing it to process images at their native resolution and aspect ratio. This flexibility enables the model to handle images in latency-constrained settings or when fine-grained reasoning is required.

Key points from the paper:

1. **Architecture and Training**:
   - Pixtral 12B is based on the transformer architecture and consists of a multimodal decoder and a vision encoder.
   - The vision encoder, named PixtralViT, is designed to process images at variable resolutions and aspect ratios using a 400 million parameter vision transformer.
   - The model uses Ro

SDKError: API error occurred: Status 400
{"object":"error","message":"Invalid document type. text/html is not supported.","type":"invalid_file","param":null,"code":"1901"}