In [1]:
import sys
import os
import pandas as pd
import json
from dotenv import load_dotenv
import tiktoken
import pickle

from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.rate_limiters import InMemoryRateLimiter

from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

from langchain.agents import AgentExecutor, Tool, initialize_agent, load_tools
import json


module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.llm import get_azure_embeddings_client, get_llm_client, get_gemini_llm_client

if not load_dotenv():
    raise Exception('Error loading .env file. Make sure to place valid keys in the .env file.')

In [2]:
ARTICLES_CLEAN_DIR = os.path.join("..", "data", "articles_clean")
FILTERED_METADATA_PATH = os.path.join("..", "data", "filtered_metadata.csv")
DB_PATH = os.path.join("..", "data", "db", "sample.db")
RESULTS_DIR =  os.path.join("..", "data", "results")

if not os.path.exists(DB_PATH):
    os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)

if not os.path.exists(RESULTS_DIR):
    os.makedirs(os.path.dirname(RESULTS_DIR), exist_ok=True)

In [3]:
# Default go-to Openrouter LLM - check README for other available models
llm = get_llm_client(
    # Configurable parameters
    max_tokens=1024,
    temperature=0.2,
)

In [9]:
system_prompt = """
You are an expert assistant. Use only the provided news document to create a list of the main entities in the document.
Assign each person a relevance score from 0 to 1 based on their impact in the news article. For example, the main subject 
should have a score of 1, and someone who only appears once to contribute a quote should receive a score near 0. Each entity
should also have an entity_type attribute that describes the type of entity (it should be person if the entity is a human)
Furthermore, create a list of relationships between entity in the story. Each relationship should have a descriptor (for example 
"empolyee of", "enemy of", "brother"), and an attitude attribute (1 for strong allies, -1 for committed enemies, 0 for ambivalent). 
If the nature of the relationship cannot be determined, use 0 as the attitude, and leave the descriptor blank. There should be a 
relationship between each pair of entity in the first list. 
The output should be structured as a JSON file with two fields:
entities: the list of entity, each an object with name and relevance
relationships: the list of relationships, each an object with originator, target, descriptor, and attitude.
Double check that the output is valid JSON.
Document: {context}
"""

prompt_template = PromptTemplate(
    input_variables=["context"], 
    template=system_prompt
)

# Define a tool to validate JSON
def validate_json(output):
    """Checks if output is valid JSON."""
    try:
        json.loads(output)
        return True
    except json.JSONDecodeError:
        return False

# Define the agent
def create_json_agent():
    agent = initialize_agent(
        tools=[],  # No extra tools needed
        llm=llm,
        agent="zero-shot-react-description",
        verbose=True
    )
    return agent

# Create the agent
agent = create_json_agent()

# Define a function to run the query and ensure valid JSON output
def query_with_json_validation(query):
    valid_output = False
    attempts = 0
    max_attempts = 5
    result = ""

    while not valid_output and attempts < max_attempts:
        attempts += 1
        print(f"Attempt {attempts}...")
        response = agent.run(prompt_template.format(query=query))
        if validate_json(response):
            valid_output = True
            result = response
        else:
            print("Invalid JSON. Retrying...")

    if not valid_output:
        raise ValueError("Failed to get valid JSON after multiple attempts.")

    return result

  agent = initialize_agent(


ValueError: Got no tools for ZeroShotAgent. At least one tool must be provided.

In [5]:
filtered_metadata = pd.read_csv(FILTERED_METADATA_PATH)
filtered_metadata.head(5)

Unnamed: 0,id,filename,published_at,author,title,category,section,word_count,financial_crisis,sustainability,fake_news,ai,digitalization,local_journalism,covid,demographics,innovation,valid_indicator
0,a04948c4-0233-45f3-9254-ee5806ee8f1f,verraterische-blutstropfen.json,2015-06-04 20:00:00,Heiner Boberski,Verräterische Blutstropfen,Wissen,Nachrichten,559,0.3887,0.4845,0.3991,0.4591,0.4603,0.4584,0.4906,0.505,0.5192,False
1,e1fe872d-9463-4191-971d-e0b5205c3527,wenn-man-nicht-mehr-aufstehen-kann.json,2021-12-10 06:45:00,Petra Tempfer,Wenn man nicht mehr aufstehen kann,Politik,Nachrichten,868,0.2073,0.2716,0.2995,0.2896,0.263,0.3101,0.3453,0.3499,0.2856,False
2,91153224-95e2-42c4-8cc4-17831dd51c1a,demokratie-und-gerichtsbarkeit.json,2022-12-21 13:00:00,Nikolaus Lehner,Demokratie und Gerichtsbarkeit,Recht,Themen,1229,0.3479,0.4082,0.6284,0.5545,0.461,0.5414,0.6642,0.4497,0.5965,True
3,7ac60fbf-b5d4-498c-a836-3e220290cc8c,gefahr-fur-schutzstandards-oder-nur-pr-gag.json,2021-06-10 17:15:00,Franz Leidenmühler,Gefahr für Schutzstandards oder nur PR-Gag?,Recht,Themen,863,0.3626,0.6611,0.4566,0.4712,0.4566,0.3706,0.4835,0.444,0.4683,True
4,372821fe-b364-41cf-90a4-22f8d625e417,ein-facebook-account-kann-teuer-werden.json,2021-08-10 15:23:00,Katharina Braun,Ein Facebook-Account kann teuer werden,Recht,Themen,741,0.6265,0.5746,0.5079,0.5483,0.5812,0.3298,0.5881,0.6009,0.5552,True


In [6]:
def get_documents_from_path(filenames: list[str]) -> list[Document]:
    documents = []
    
    for file_name in filenames:
        file_path = os.path.join(ARTICLES_CLEAN_DIR, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            file = json.load(file)

        text = file.get("text", "")
        documents.append(Document(page_content=text, metadata={
            "title": file.get("title", ""),
            "author": file.get("author", ""),
            "published_at": file.get("published_at", ""),
            "id": file.get("id", ""),
        }))

    return documents

In [7]:
documents = get_documents_from_path(filtered_metadata["filename"])
print(f"Number of articles: {len(documents)}")

Number of articles: 143


In [8]:
jsons = []
for document in documents:
    result_filename = f'{document.metadata['id']}.json'
    result_filepath = os.path.join(RESULTS_DIR, result_filename)
    print(result_filepath)
    if os.path.exists(result_filepath):
        print('Skipping')
        continue
    query_input = document.page_content
    result: str = query_with_json_validation(query_input)
    jsons.append(result)
    cleaned = result.removeprefix("```json").removesuffix('```')
    try:
        result_json = json.JSONDecoder().decode(cleaned)
        with open(result_filepath, "w") as file:
            json.dump(result_json, file, indent=4)
    except Exception:
        print('Error')
        continue


..\data\results\a04948c4-0233-45f3-9254-ee5806ee8f1f.json
Skipping
..\data\results\e1fe872d-9463-4191-971d-e0b5205c3527.json
Skipping
..\data\results\91153224-95e2-42c4-8cc4-17831dd51c1a.json
Skipping
..\data\results\7ac60fbf-b5d4-498c-a836-3e220290cc8c.json
Skipping
..\data\results\372821fe-b364-41cf-90a4-22f8d625e417.json
Skipping
..\data\results\6c051deb-7b89-452c-9a68-4d58fe17bf63.json
Skipping
..\data\results\93d9b9ae-5561-4735-b3a4-a416c6f95f75.json
Skipping
..\data\results\d819df7e-3050-4e27-be8b-8f1b5244d768.json
Skipping
..\data\results\d93a3ea7-a58c-4199-b995-46bda033c973.json
Skipping
..\data\results\478da68d-93a3-4514-836e-21412566b724.json


NameError: name 'query_with_json_validation' is not defined