<a href="https://colab.research.google.com/github/arjunchint/mistral_hackathon/blob/main/luis_mistral_hack_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade pip
!pip install mistralai
!pip install PyPDF2

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.0
Collecting mistralai
  Downloading mistralai-0.1.6-py3-none-any.whl.metadata (1.9 kB)
Collecting httpx<0.26.0,>=0.25.2 (from mistralai)
  Downloading httpx-0.25.2-py3-none-any.whl.metadata (6.9 kB)
Collecting orjson<4.0.0,>=3.9.10 (from mistralai)
  Downloading orjson-3.9.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 kB[0m [31m970.7 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas<3.0.0,>=2.2.0 (from mistralai)
  Downloading pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux20

In [None]:
import json
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List

@dataclass
class NodeComponentInput:
    """Input payload handed to a node's ``compute`` method."""

    # Label identifying this input.
    name: str
    # Instruction mapping for the node (DataIngestionNode reads its "prompt" entry).
    jsonInstructions: dict
    # Records to process, one dict per item (DataIngestionNode reads each record's "url").
    data: List[dict]

@dataclass
class NodeComponentOutput:
    """Result produced by a node's ``compute`` method."""

    # Label identifying this output.
    name: str
    # Mapping holding whatever the node produced.
    data: dict

class NodeComponent(ABC):
    """Abstract base class for nodes that turn an input payload into an output."""

    @abstractmethod
    def compute(self, input: NodeComponentInput) -> NodeComponentOutput:
        """Process ``input`` and return the node's output; subclasses must override."""


In [None]:
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
from google.colab import userdata

class Mistral:
    """Small wrapper around ``MistralClient`` for one-shot, single-message chats."""

    def __init__(self):
        # The API key is looked up through google.colab's `userdata.get`
        # under the name 'MISTRAL_API_KEY'.
        key = userdata.get('MISTRAL_API_KEY')
        self.api_key = key
        self.model = "mistral-large-latest"
        self.client = MistralClient(api_key=key)

    def call(self, msg: str):
        """
        Send ``msg`` as a single user message (non-streaming) and return the
        raw chat response object.
        """
        user_turn = ChatMessage(role="user", content=msg)
        return self.client.chat(model=self.model, messages=[user_turn])

    def get_response(self, msg: str) -> str:
        """Return the message content of the first choice in the reply to ``msg``."""
        return self.call(msg).choices[0].message.content


In [None]:
import requests
from PyPDF2 import PdfReader
from io import BytesIO

def get_url_pdf(url: str, timeout: float = 30.0) -> str:
    """
    Download the PDF at ``url`` and return its text content.

    Args:
        url: Address of the PDF document to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The text of every page concatenated in page order. If the download
        or the PDF parsing fails for any reason, the error is printed and an
        empty string is returned.
    """
    try:
        # Download the PDF file; HTTP error statuses are turned into exceptions.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse straight from memory, no temporary file needed.
        pdf_reader = PdfReader(BytesIO(response.content))

        # `or ""` guards against a page for which no text comes back.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        # Deliberate best-effort boundary: callers treat "" as "could not read".
        print(f"Error occurred while reading PDF, url: {url}, e: {e}")
        return ""

# Test it:
# url_1 = 'https://www.cityofartesia.us/DocumentCenter/View/7436/FY-2021-22-ACFR'
# url_2 = "https://issuu.com/mountainviewrecreation/docs/att2_-_acfr_and_independent_auditor_s_report"
# foo = get_url_pdf(url_1)
# print(f"output: {foo[:100]}")
# foo = get_url_pdf(url_2)
# print(f"output: {foo[:100]}")

In [None]:
import re
import ast

def extract_json_array(input_string):
    """
    Pull the JSON text out of a model reply.

    If the reply contains a Markdown code fence (```json ... ``` or a bare
    ``` ... ```), the contents of the first fence are returned. Otherwise the
    whole reply is assumed to be the JSON itself.

    Args:
        input_string: Raw reply text.

    Returns:
        The candidate JSON text with surrounding whitespace removed, ready to
        be passed to ``json.loads``. The text is not validated here.
    """
    fenced = re.search(
        r"```(?:json)?(.*?)```",
        input_string,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Return the captured text itself, not the repr of a list of matches:
    # a repr is a quoted Python literal, which is never valid JSON.
    payload = fenced.group(1) if fenced else input_string
    return payload.strip()

In [None]:
import math

# Determined empirically: when a large PDF was sent in a single call, Mistral
# could not take in all of the data; only about a third of it (roughly 80k
# characters) worked. Longer text is therefore sent in chunks of this size.
MAX_CHAR_LEN = 70000

class DataIngestionNode(NodeComponent):
    """Node that downloads PDFs and asks Mistral to extract structured JSON from them."""

    def __init__(self):
        self.mistral = Mistral()

    def compute(self, input: NodeComponentInput) -> NodeComponentOutput:
        """
        Extract structured data from every document listed in ``input.data``.

        Args:
            input: ``jsonInstructions["prompt"]`` is the instruction text sent
                to the model; each record in ``data`` must carry a non-empty
                ``"url"`` pointing at a PDF.

        Returns:
            A ``NodeComponentOutput`` whose ``data`` maps each successfully
            read URL to the list of items extracted from it. Records without
            a URL, and URLs whose PDF could not be read, are reported with
            ``print`` and skipped.

        Raises:
            KeyError: if ``input.jsonInstructions`` has no ``"prompt"`` entry.
        """
        prompt = input.jsonInstructions["prompt"]
        structured_data = {}

        for map_data in input.data:
            url = map_data.get("url")
            if not url:
                print(f"Error, no URL found: {map_data}")
                continue

            pdf_txt = get_url_pdf(url)
            if not pdf_txt:
                print(f"Failed to get PDF from URL: {url}")
                continue

            structured_data[url] = self._get_structured_data(prompt, pdf_txt)

        # NOTE(review): the output name is the fixed string "name"; confirm
        # whether it should be derived from the input instead.
        return NodeComponentOutput(name="name", data=structured_data)

    def _get_structured_data(self, prompt: str, pdf_txt: str):
        """
        Query the model once per chunk of ``pdf_txt`` and merge the replies.

        The text is cut into pieces of at most ``MAX_CHAR_LEN`` characters and
        each piece is sent after ``prompt``, separated by a newline.

        Returns:
            A list with the items parsed from every reply, in chunk order.
            Replies that cannot be parsed as JSON are reported with ``print``
            and skipped.
        """
        json_array_output = []

        for start_idx in range(0, len(pdf_txt), MAX_CHAR_LEN):
            chunk = pdf_txt[start_idx:start_idx + MAX_CHAR_LEN]
            response = self.mistral.get_response(prompt + "\n" + chunk)

            # Bound before the try so the error message below can always show it.
            json_response = None
            try:
                json_response = extract_json_array(response)
                parsed = json.loads(json_response)
            except Exception as e:
                print(f"JSON loading failed, e: {e} \n\n\n response: {type(response)} {response} \n\n\n json_response: {type(json_response)} {json_response}")
                continue

            # `list += x` would spread a dict's keys or a string's characters,
            # so only arrays are flattened; anything else is kept as one item.
            if isinstance(parsed, list):
                json_array_output.extend(parsed)
            else:
                json_array_output.append(parsed)

        return json_array_output


In [None]:
# Instruction text handed to the ingestion node as jsonInstructions["prompt"].
# It asks for a JSON array of financial data useful to a bond trader, shows
# three example arrays, and ends with "Here's the text:" so that the document
# text can follow it.
prompt_instruction = """
Please give me a valid JSON ARRAY containing financial data that will be relevant and insightful for a bond trader who is looking to price and purchase bonds from this municipality using the below text.

Here are some example JSON ARRAYs, I want something similar though not exactly like this:

[
  {
    "symbol": "AAPL",
    "companyName": "Apple Inc.",
    "stockPrice": 150.25
  },
  {
    "symbol": "GOOGL",
    "companyName": "Alphabet Inc.",
    "stockPrice": 2800.50
  }
]

[
  {
    "category": "Income",
    "amount": 5000
  },
  {
    "category": "Rent",
    "amount": -1500
  }
]

[
  {
    "currency": "EUR",
    "rate": 0.92
  },
  {
    "currency": "JPY",
    "rate": 110.25
  }
]

This JSON ARRAY will be passed to matplotlib to visualize the data.

IMPORTANT: Use the right scale for the numbers as highlighted in the text and positive or negative values accordingly.
IMPORTANT: Only return the JSON ARRAY.

Here's the text:
"""

# Build a single-document test input for the ingestion node.
foo = NodeComponentInput(
    name = "test_input",
    jsonInstructions = {
        "prompt": prompt_instruction,
    },
    data = [
        {
            "url":'https://www.cityofartesia.us/DocumentCenter/View/7436/FY-2021-22-ACFR',
            # NOTE(review): the URL is on cityofartesia.us but the label says
            # "Palo Alto" — confirm which municipality is intended.
            "municipality": "Palo Alto"
        }
    ]
)

# Run the node; this downloads the PDF and calls the Mistral API.
dIN = DataIngestionNode()
result = dIN.compute(foo)

# result.data is keyed by document URL; print each URL followed by its extracted data.
for key, value in result.data.items():
    print(key)
    print(value)


JSON loading failed, e: Expecting value: line 1 column 1 (char 0) 


 response: <class 'str'> Based on the provided text, here's a JSON array containing financial data relevant for a bond trader looking to price and purchase bonds from the municipality:

```json
[
  {
    "Category": "Net Position",
    "Governmental Activities": 36405858,
    "Business-Type Activities": 1340690,
    "Total": 37746548
  },
  {
    "Category": "Change in Net Position",
    "Governmental Activities": 6330813,
    "Business-Type Activities": -172264,
    "Total": 6158549
  },
  {
    "Category": "Revenue",
    "Property Taxes": 2146321,
    "Transient Occupancy Taxes": 410567,
    "Sales Taxes": 5551693,
    "Franchise Taxes": 909908,
    "Business License Taxes": 525901,
    "Other Taxes": 209327,
    "Operating Grants and Contributions": 1623044,
    "Capital Grants and Contributions": 7843268,
    "Charges for Services": 1173584,
    "Total": 20494101
  },
  {
    "Category": "Expenditure",
    "Genera

In [None]:
# Demo: concatenate each document's extracted JSON array with a second copy of
# itself and print the combined array.
for value in result.data.values():
    # DataIngestionNode._get_structured_data returns an already-parsed list,
    # and json.loads raises TypeError on a list, so only decode when the
    # value is still raw JSON text.
    if isinstance(value, (str, bytes, bytearray)):
        json_array_1 = json.loads(value)
        json_array_2 = json.loads(value)
    else:
        json_array_1 = list(value)
        json_array_2 = list(value)
    combined_json_array = json_array_1 + json_array_2

    print(combined_json_array)

[{'Fiscal Year': '2022', 'Net Position': 37746548, 'Unrestricted Net Position': 3074461, 'Total Revenues': 11250895, 'Net (Expense) Revenue and Changes in Net Position': 6158549}, {'Fund': 'General', 'Total Assets': 19306280, 'Total Liabilities': 1003369, 'Fund Balances': 11770470}, {'Fund': 'Capital Projects', 'Total Assets': 2615547, 'Total Liabilities': 3761, 'Fund Balances': 1371179}, {'Fund': 'Billboard', 'Total Assets': 1985656, 'Total Liabilities': 0, 'Fund Balances': 546095}, {'Fund': 'ARPA', 'Total Assets': 1374940, 'Total Liabilities': 0, 'Fund Balances': 6831995}, {'Fund': 'Paid Parking', 'Total Assets': 947072, 'Total Liabilities': 96731, 'Fund Balances': 851251}, {'Fund': 'Artesia Towne Center', 'Total Assets': 489439, 'Total Liabilities': 0, 'Fund Balances': 489439}, {'Fiscal Year': '2022', 'Net Position': 37746548, 'Unrestricted Net Position': 3074461, 'Total Revenues': 11250895, 'Net (Expense) Revenue and Changes in Net Position': 6158549}, {'Fund': 'General', 'Total As