In [None]:
from groq import Groq
import json
import os
import re
import unicodedata
import dotenv
# Load environment variables from the evalap .env file (the path is relative,
# so it depends on where this code is run from).
# NOTE(review): presumably this is where GROQ_API_KEY is defined -- confirm
# against the .env contents.
dotenv.load_dotenv("../../evalap/.env")

def normalize_text(text: str) -> str:
    """
    Reduce a string to compact, ASCII-only text.

    Steps, in order:
    - Apply Unicode NFKD decomposition, then drop every character that
      cannot be encoded as ASCII (accents are stripped from letters, other
      non-ASCII symbols disappear).
    - Collapse each run of spaces/tabs into a single space.
    - Trim leading/trailing whitespace on every line.
    - Discard lines that end up empty.

    Args:
        text: The raw input text.

    Returns:
        The surviving lines joined with a single newline.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')

    # Newlines are deliberately left out of the character class: line
    # structure is handled separately below.
    collapsed = re.sub(r'[ \t]+', ' ', ascii_only)

    trimmed = (segment.strip() for segment in collapsed.split('\n'))
    return '\n'.join(segment for segment in trimmed if segment)

def get_llm_output(text_content: str, schema: dict) -> dict:
    """
    Ask the Groq chat-completion endpoint for structured JSON describing a text.

    The request constrains the reply with a ``json_schema`` response format
    named ``entity_extraction``; the content of the reply's first choice is
    then parsed as JSON.

    Args:
        text_content: Text sent as the user message.
        schema: JSON schema passed to the API as the expected output format.

    Returns:
        The parsed JSON content of the first completion choice.

    Raises:
        Whatever the client raises for a failed request, plus the errors
        ``json.loads`` raises when the message content cannot be parsed.
    """
    conversation = [
        {"role": "system", "content": "Extract informations from the text according to the provided JSON schema."},
        {"role": "user", "content": text_content},
    ]
    output_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "entity_extraction",
            "schema": schema,
            # "strict": True
        },
    }

    # `client` is resolved from module scope at call time.
    completion = client.chat.completions.create(
        model="moonshotai/kimi-k2-instruct",
        messages=conversation,
        response_format=output_format,
    )
    first_choice = completion.choices[0]
    return json.loads(first_choice.message.content)

# --- Main execution ---
# Runs the extraction on a small sample of dataset folders: each folder's
# text.txt is normalized, sent to the model, and the (query, output) pair is
# collected in `results`.

client = Groq() # api_key is loaded from GROQ_API_KEY environment variable

# NOTE: machine-specific absolute path; adjust it when running elsewhere.
directory = "/home/erwan/Desktop/clients/MinistereTransitionEcologique/mon-devis-sans-oublis-benchmark-gestes"
schema_path = "./mdso-admin-schema.json"
dataset_dir = os.path.join(directory, "dataset")

with open(schema_path, "r", encoding="utf-8") as f:
    schema = json.load(f)

results = []
# os.listdir() returns entries in arbitrary order, so sort before slicing to
# make the processed subset reproducible from run to run. Only the first two
# entries are considered; the slice is applied before the directory check, so
# fewer than two folders may actually be processed.
for folder_name in sorted(os.listdir(dataset_dir))[:2]:
    folder_path = os.path.join(dataset_dir, folder_name)
    if not os.path.isdir(folder_path):
        continue

    text_path = os.path.join(folder_path, "text.txt")
    if not os.path.exists(text_path):
        continue

    print(f"Processing: {text_path}")
    with open(text_path, "r", encoding="utf-8") as f:
        raw_text = f.read()

    # Prepare the data for the model
    text_content = normalize_text(raw_text)

    # Get the model's output
    try:
        llm_output = get_llm_output(text_content, schema)
    except Exception as e:
        # Best-effort batch: report the failure and carry on with the next folder.
        print(f"An error occurred while processing {text_path}: {e}")
    else:
        # Store the results; the model output is kept as a JSON string.
        results.append({
            "query": text_content,
            "output_true": json.dumps(llm_output)
        })


# Print the final results as a JSON array
print(json.dumps(results, indent=2))


In [None]:
import pandas as pd

# Persist the collected query/output pairs as a CSV file, omitting the
# DataFrame index column.
results_frame = pd.DataFrame(results)
results_frame.to_csv("../../datasets/mdso-dataset2.csv", index=False)