In [180]:
from langchain.agents import create_agent
from langchain.agents.structured_output import ToolStrategy
from dotenv import load_dotenv
# from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_groq import ChatGroq
from pydantic import BaseModel
from typing import Literal, List
from langchain.messages import SystemMessage, HumanMessage

load_dotenv()

True

In [181]:
class LawArticle(BaseModel):
    _id: str
    law_id: str
    article_number: str
    article_index: float
    text: str
    language: Literal['en', 'sl']

class LawArticleResponse(BaseModel):
    article_number: str
    article_index: float
    text: str
    chapter: str
    language: Literal['en', 'sl']

class Response(BaseModel):
    laws: List[LawArticleResponse]

In [190]:
model = ChatGroq(
    model='llama-3.3-70b-versatile',
    temperature=0.1
)

agent = create_agent(
    model=model,
    response_format=Response
)

In [258]:
system_prompt = """
You are a legal text parser for Slovenian laws. Your task is to extract articles (člen) from a law text and return them in a structured JSON format.

Instructions:
IMPORTANT:  If an article has the title in paragraphs (under the article number), keep it in the 'text' field as a separate paragraph.

1. Return **each article** (including sub-articles, e.g., 3.a) as a separate object.
2. Include the following fields in the output:
   - article_number: the exact article number as in the text (e.g., "3", "3.a")
   - article_index: float representing the article's position (sub-articles use decimals, e.g., 3.1)
   - text: the exact text of the article (do not change it)
   - language: "sl"
   - chapter: (optional) the chapter/section this article belongs to, if indicated in the text

3. Output **only a JSON array** of article objects. Example:

[
  {
    "article_number": "3",
    "article_index": 3.0,
    "text": "Slovenija je država vseh svojih državljank in državljanov...",
    "language": "sl",
    "chapter": "I. SPLOŠNE DOLOČBE"
  },
  {
    "article_number": "3.a",
    "article_index": 3.1,
    "text": "Slovenija lahko z mednarodno pogodbo...",
    "language": "sl",
    "chapter": "I. SPLOŠNE DOLOČBE"
  }
]

4. Only include real articles. Do not include titles, preambles, footnotes, or amendments as separate objects.
5. Preserve all characters, accents, and formatting exactly as in the original text.
6. Keep each paragraph exactly as-is. Each line is its own paragraph. 
In the 'text' field, insert literal newline characters between paragraphs, like "\\n". 
Do not merge paragraphs. 
"""


In [344]:
data = """
X. PREHODNE IN KONČNE DOLOČBE
172. člen
Ta ustava začne veljati z razglasitvijo.
173. člen
Določbe te ustave se uporabljajo z dnem razglasitve, razen če v ustavnem zakonu za izvedbo te ustave ni drugače določeno.
174. člen
Za izvedbo te ustave in za zagotovitev prehoda k uporabi določb te ustave se sprejme ustavni zakon.
Ustavni zakon se sprejme z dvotretjinsko večino glasov vseh poslancev v vseh zborih Skupščine Republike Slovenije.
"""

In [345]:
def get_structured_articles(data: str, law_id: str):
    result = agent.invoke(
        {
            'messages': [
                SystemMessage(content=system_prompt),
                HumanMessage(content=data)
            ]
        }
    )

    articles = result['structured_response'].laws
    print(f"Created {len(articles)} articles")

    return [
    {
        '_id': f"{law_id}_{article.article_index}",
        'law_id': law_id,
        **article.model_dump()
    } for article in articles
]


Run `get_structured_articles`

data: raw text with articles

law_id: slug for the law

In [346]:
articles = get_structured_articles(data, 'ustava')

Created 3 articles


In [347]:
articles

[{'_id': 'ustava_172.0',
  'law_id': 'ustava',
  'article_number': '172',
  'article_index': 172.0,
  'text': 'Ta ustava začne veljati z razglasitvijo.',
  'chapter': 'X. PREHODNE IN KONČNE DOLOČBE',
  'language': 'sl'},
 {'_id': 'ustava_173.0',
  'law_id': 'ustava',
  'article_number': '173',
  'article_index': 173.0,
  'text': 'Določbe te ustave se uporabljajo z dnem razglasitve, razen če v ustavnem zakonu za izvedbo te ustave ni drugače določeno.',
  'chapter': 'X. PREHODNE IN KONČNE DOLOČBE',
  'language': 'sl'},
 {'_id': 'ustava_174.0',
  'law_id': 'ustava',
  'article_number': '174',
  'article_index': 174.0,
  'text': 'Za izvedbo te ustave in za zagotovitev prehoda k uporabi določb te ustave se sprejme ustavni zakon.\nUstavni zakon se sprejme z dvotretjinsko večino glasov vseh poslancev v vseh zborih Skupščine Republike Slovenije.',
  'chapter': 'X. PREHODNE IN KONČNE DOLOČBE',
  'language': 'sl'}]

In [348]:
from pymongo.mongo_client import MongoClient
import os

client = MongoClient(os.getenv("MONGODB_URI"))

db = client.get_database('pravni-vodnik')
articles_col = db.get_collection('articles')


In [349]:
articles_col.insert_many(articles)

InsertManyResult(['ustava_172.0', 'ustava_173.0', 'ustava_174.0'], acknowledged=True)

In [352]:
articles_in_db = articles_col.find().to_list()

for article in articles_in_db:
    print(article['article_index'], end=' ')

1.0 2.0 3.0 3.1 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 12.0 13.0 14.0 15.0 16.0 17.0 18.0 19.0 20.0 21.0 22.0 23.0 24.0 25.0 26.0 29.0 30.0 31.0 32.0 33.0 34.0 35.0 36.0 37.0 38.0 39.0 40.0 41.0 42.0 43.0 44.0 45.0 46.0 47.0 48.0 49.0 52.0 53.0 54.0 55.0 56.0 57.0 58.0 59.0 60.0 61.0 62.0 62.1 63.0 64.0 65.0 50.0 51.0 66.0 67.0 68.0 69.0 70.0 70.1 71.0 72.0 73.0 74.0 74.1 75.0 76.0 77.0 78.0 79.0 80.0 81.0 82.0 83.0 84.0 85.0 86.0 87.0 88.0 89.0 90.0 91.0 92.0 93.0 94.0 95.0 96.0 97.0 98.0 99.0 100.0 101.0 102.0 103.0 104.0 105.0 106.0 107.0 108.0 109.0 110.0 111.0 112.0 113.0 114.0 115.0 116.0 117.0 118.0 119.0 120.0 121.0 122.0 123.0 124.0 125.0 126.0 127.0 128.0 129.0 130.0 131.0 132.0 133.0 134.0 138.0 139.0 140.0 141.0 142.0 143.0 144.0 145.0 146.0 147.0 148.0 149.0 150.0 151.0 152.0 153.0 154.0 155.0 156.0 157.0 158.0 159.0 160.0 161.0 162.0 163.0 164.0 165.0 166.0 167.0 168.0 169.0 170.0 171.0 172.0 173.0 174.0 