# About:

Prepare the data to populate the 'TexteFiscExactCode' column from the 'Textes' table.

In [1]:
# !pip install llama-index
# !pip install llama-index-readers-database
# !pip install llama-index-embeddings-huggingface
# !pip install llama-index-llms-ollama
# !pip install llama-index-postprocessor-cohere-rerank
# !pip install llama-index-postprocessor-flag-embedding-reranker
# !pip install FlagEmbedding
# !pip install openpyxl
# !pip install psycopg2
# !pip install pandas
# !pip install sqlalchemy
# !pip install tiktoken

In [None]:
# !pip install llama-index-llms-openai

In [2]:
# !pip install pyvis

In [3]:
# !pip install llama-index-embeddings-ollama
# !pip install llama-index-postprocessor-colbert-rerank

In [21]:
# !pip install voyageai
# !pip install llama-index-embeddings-voyageai

Collecting llama-index-embeddings-voyageai
  Downloading llama_index_embeddings_voyageai-0.2.2-py3-none-any.whl.metadata (701 bytes)
Downloading llama_index_embeddings_voyageai-0.2.2-py3-none-any.whl (2.8 kB)
Installing collected packages: llama-index-embeddings-voyageai
Successfully installed llama-index-embeddings-voyageai-0.2.2


In [146]:
# !pip install llama-index-postprocessor-voyageai-rerank

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [3]:
COUNTRY_NAME = os.environ.get("COUNTRY_NAME")
SPOKEN_LANGUAGE = "French"

table_column_name = "TexteFiscExactCode"

In [4]:
from general_config import COUNTRY_NAMES_LIST

# validate COUNTRY_NAME
if COUNTRY_NAME in COUNTRY_NAMES_LIST:
    print('country name OK')

country name OK


## Get data from Postgres

In [5]:
from postgres_connection import get_postgress_data
from sql_files import sql_files
import pandas as pd

### This is not necessary if we extract the data using the llama_index db reader

In [6]:
df = get_postgress_data(sql_files['get_docs_per_country'].replace("%country_name%", COUNTRY_NAME))

  df = pd.read_sql(query, conn)


In [7]:
df.shape

(218, 3)

In [8]:
df_droit = get_postgress_data(sql_files['get_droit_type_for_docs_per_country'].replace("%country_name%", COUNTRY_NAME))

In [9]:
df_droit.shape

(218, 2)

In [10]:
df = pd.merge(df,df_droit,how='left', on='title')

### This is mandatory

In [11]:
df_fisc_exacts = get_postgress_data(sql_files['get_textes_fiscaux_exacts'], db='Ferdi')

  df = pd.read_sql(query, conn)


In [12]:
df_fisc_exacts['TexteFiscExactCode'] = df_fisc_exacts['TexteFiscExactCode'].apply(lambda x: x[1:])
df_fisc_exacts['TexteFiscStandardCode'] = df_fisc_exacts['TexteFiscStandardCode'].apply(lambda x: x[1:])

In [13]:
df_fisc_exacts[df_fisc_exacts['TexteFiscExactCode'] == 'Gen_CGI_CGIprocedModif']['TexteFiscExactComplet'][20]

'Modification du Livre de procédures fiscales. Définit les modalités de déclaration et de paiement, établit les délais et sanctions en cas de non-respect, et encadre le contrôle et le contentieux liés à cette taxe destinée à favoriser l’emploi des jeunes. Peut contenir « Livre de procédures fiscales » ou « LPF » dans le titre du document.'

In [14]:
df_fisc_standards = get_postgress_data(sql_files['get_textes_fiscaux_standards'], db='Ferdi')

In [15]:
df_fisc_standards['TexteFiscStandardCode'] = df_fisc_standards['TexteFiscStandardCode'].apply(lambda x: x[1:])

In [16]:
definitions = df_fisc_standards.set_index('TexteFiscStandardCode')['TexteFiscStandardComplet'].to_dict()
definitions

{'Gen_CGI': 'Code général des impôts. Le Code général des impôts est un recueil législatif centralisant toutes les lois et règlements relatifs à la fiscalité. Il définit les types d’impôts, les modalités de calcul, de déclaration et de paiement, ainsi que les obligations des contribuables et les pouvoirs de l’administration fiscale, assurant une gestion cohérente des impôts. Inclut les fonds de logement.',
 'Gen_IT': "Impôt sur le revenu. L'Impôt sur le revenu encadre la taxation des revenus des individus et des entreprises. Il précise les sources de revenus imposables, les barèmes d'imposition, les mécanismes de déclaration et de paiement, ainsi que les dispositions relatives aux exonérations et déductions fiscales.",
 'Gen_CGT': "Impôt sur les plus-values. L'Impôt sur les plus-values régit la taxation des gains réalisés lors de la vente d’actifs tels que biens immobiliers, actions ou autres investissements. Il définit les méthodes de calcul, les obligations de déclaration et de paiem

# RAG approach

## Get data from Postgres using llama-index db reader

In [17]:
from postgres_connection import psql_conn_config
from llama_index.readers.database import DatabaseReader
from sql_files import sql_files

In [18]:
db = DatabaseReader(
    scheme="postgresql",  # Database Scheme
    host=psql_conn_config.get("HOSTNAME"),  # Database Host
    port="5432",  # Database Port
    user=psql_conn_config.get("USERNAME"),  # Database User
    password=psql_conn_config.get("PASSWORD"),  # Database Password
    dbname=psql_conn_config.get("DATABASE"),  # Database Name
)

### Load the data as llama_index documents

In [19]:
from llama_index.core import Document, VectorStoreIndex

# documents = db.load_data(query=sql_files['get_docs_MALI'])

# texts = df[~df['title'].str.contains("JO")][df['content'].str.len()<2000000].to_dict(orient='index')
# texts = df[~df['title'].str.contains("JO")].to_dict(orient='index')
texts = df.to_dict(orient='index')
# texts = dict(list(texts.items())[-30:])
documents = [Document(text=txt['content'].replace('\x92', '\''), metadata={'title': txt['title'], 'law type': txt['tag']}) for txt in texts.values()]

## RAG

### Embeddings model

#### Hugging Face embedding

In [20]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from sentence_transformers import SentenceTransformer

# embed_model = HuggingFaceEmbedding(model_name="dunzhang/stella_en_1.5B_v5", trust_remote_code=True) #Alibaba-NLP/gte-Qwen2-1.5B-instruct
# embed_model = SentenceTransformer("dunzhang/stella_en_1.5B_v5", trust_remote_code=True)

### Vector DataBase

In [21]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings

import os
from llama_index.core import VectorStoreIndex, load_index_from_storage
from llama_index.core.storage import StorageContext
from llama_index.core import Settings

# Settings.embed_model = embed_model # we specify the embedding model to be used

from llama_index.core.node_parser import TokenTextSplitter


In [22]:
# imports

import os
from llama_index.embeddings.voyageai import VoyageEmbedding

# get API key and create embeddings

model_name = "voyage-law-2"  # Please check https://docs.voyageai.com/docs/embeddings for the available models
VOYAGE_API_KEY = "pa-8PHDn6SpL9EILUBXOuUiQYT7J7qNVYsRbV8rBmvlxGk"
voyage_api_key = os.environ.get("VOYAGE_API_KEY", VOYAGE_API_KEY)

embed_model = VoyageEmbedding(
    model_name=model_name, voyage_api_key=voyage_api_key
)

In [23]:
from llama_index.embeddings.openai import OpenAIEmbedding
# embed_model = OpenAIEmbedding(model="text-embedding-3-small") 

In [24]:
from llama_index.core import Settings

Settings.embed_model = embed_model

In [25]:
from llama_index.core.text_splitter import SentenceSplitter

In [26]:
from llama_index.core.node_parser import SemanticSplitterNodeParser

splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
)

# nodes = splitter.get_nodes_from_documents(documents)

In [27]:
from transformers import AutoTokenizer

voyage_tokenizer = AutoTokenizer.from_pretrained('voyageai/voyage-3')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [28]:
chunk_size = int(1024)
overlap = 100 if chunk_size > 1024 else int(chunk_size/10)

transformations_example = [
    TokenTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size/10),
        separator=" ",
    ),
    embed_model,
]

transformations_sentence = [SentenceSplitter(chunk_size=chunk_size, chunk_overlap=overlap)]
# transformations_sentence = [splitter]

storage_name = f"storage_{COUNTRY_NAME}_complete_1024_voyage-law"

if not os.path.exists(storage_name):
    index = VectorStoreIndex.from_documents(documents, show_progress=True, transformations=transformations_example)
    # save index to disk
    index.set_index_id("vector_index")
    index.storage_context.persist(f"./{storage_name}")
else:
    print('loading from local')
    # rebuild storage context
    storage_context = StorageContext.from_defaults(persist_dir=storage_name)
    # load index
    index = load_index_from_storage(storage_context, index_id="vector_index")

vector_index = index

loading from local


In [29]:
doc_titles = pd.Series(list(set([v.get('title') for k,v in index.vector_store.to_dict()['metadata_dict'].items()])))
doc_titles[0]

'img_Décret n°2020-288 (08.12.2020) Convention minière type 2020 (ITIE Mali)'

#### Query pipeline

In [30]:
from llama_index.core import PromptTemplate
from llama_index.core.query_pipeline import QueryPipeline

from llama_index.core.response_synthesizers import ResponseMode
from llama_index.core import get_response_synthesizer

from llama_index.core.response_synthesizers import TreeSummarize

In [41]:
# setting up the llm
# llm = Ollama(
#     model="llama3.1", 
#     temperature=0.01, 
#     request_timeout=180.0,
#     # context_window=chunk_size,
#     context_window=2**14) 

from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-4o-mini", temperature=0)

# os.environ["OPENAI_API_KEY"] = ""

Settings.llm = llm

In [32]:
# retriever = index.as_retriever(similarity_top_k=5)
summarizer = TreeSummarize(llm=llm)

In [33]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.query_pipeline import InputComponent


In [69]:
vector_index = index

In [34]:
# import QueryBundle
from llama_index.core import QueryBundle

# import NodeWithScore
from llama_index.core.schema import NodeWithScore

# Retrievers
from llama_index.core.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
    KeywordTableSimpleRetriever,
)

from typing import List

In [71]:
# doc_name = 'Ordonnance n°2020-013 (21.12.2020) Loi de finances 2021 (Ministère des Finances)'
# doc_name = "Loi n°2001-075 (18.07.2001) Code des douanes 2001 (eRegulations Mali)"
# doc_name ="img_Loi n°2014-056 (26.12.2014) Annexe fiscale Loi de finances 2015 (Droit-Afrique)" # this document contains  MODIFICATION DU CODE GENERAL DES IMPOTS and Les articles 73 (nouveau) et 74 A du Code Général des Impôts sont modifiés ainsi qu’il suit

# doc_name = "Décret n°2004-357 (08.09.2004) Application Code pétrolier 2004 (Droit-Afrique)"
# doc_name = "img_Décret n°2019-006 (10.01.2019) ITIE 2019 (ITIE Mali)"
# doc_name = "img_Loi n°2023-041 (29.08.2023) Contenu local minier 2023 (LinkedIn)"
# doc_name = 'Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP) déverrouillé'
# doc_name = 'Loi n°2019-070 (24.12.2019) Loi de finances 2020 (Ministère des Finances)'
doc_name = 'Décret n°1997-182 (02.06.1997) Application Jeux de hasard 1997 (SGG)'
doc_name = 'img_Loi n°2016-056 (21.12.2016) Loi de finances 2017 (Ministère des Finances)'

#### Trial 1

In [35]:
from llama_index.core.postprocessor import SimilarityPostprocessor

processor = SimilarityPostprocessor(similarity_cutoff=0.5)
# filtered_nodes = processor.postprocess_nodes(nodes)

In [36]:
from llama_index.core.postprocessor import LLMRerank

from llama_index.postprocessor.colbert_rerank import ColbertRerank

In [37]:
from llama_index.postprocessor.voyageai_rerank import VoyageAIRerank

voyageai_rerank = VoyageAIRerank(
    api_key=VOYAGE_API_KEY, top_k=2, model="rerank-2", truncation=False
)

In [42]:
from llama_index.core import get_response_synthesizer
from llama_index.core.response_synthesizers import ResponseMode
from llama_index.core.query_engine import RetrieverQueryEngine

from llama_index.core.tools import QueryEngineTool
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector, LLMMultiSelector
from llama_index.core.selectors import (
    PydanticMultiSelector,
    PydanticSingleSelector,
)

vector_index = vector_index

from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

def create_vector_engine(doc_name):
    
    # vector_index = vector_index_chunk  # VectorStoreIndex(all_nodes, storage_context=storage_context)


    # list_query_engine = summary_index.as_query_engine(
    #     response_mode="tree_summarize",
    #     use_async=True,
    # )
    # # vector_query_engine = vector_index.as_query_engine()

    # list_tool = QueryEngineTool.from_defaults(
    #     query_engine=list_query_engine,
    #     description=(
    #         "Useful for summarization questions related to legal texts from Mali."
    #     ),
    # )

    filters = MetadataFilters(
        filters=[
            MetadataFilter(
                key="title", value=doc_name
            ),
        ]
    )

    reranker = LLMRerank(
                llm=llm,
                choice_batch_size=10,
                top_n=5,
            )

    colbert_reranker = ColbertRerank(
        top_n=5,
        model="colbert-ir/colbertv2.0",
        tokenizer="colbert-ir/colbertv2.0",
        keep_retrieval_score=True,
    )

    vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=10, filters=filters) #, alpha=0.5) #, embed_model=embed_model)
    response_synthesizer = get_response_synthesizer(llm=llm, response_mode=ResponseMode.COMPACT)

    vector_query_engine = RetrieverQueryEngine(
        retriever=vector_retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=[voyageai_rerank]
        # node_postprocessors=[colbert_reranker],
        # node_postprocessors=[processor]
    )

    # vector_tool = QueryEngineTool.from_defaults(
    #     query_engine=vector_query_engine,
    #     description=(
    #         "Useful for classification of legal texts from Mali."
    #     ),
    # )

    # query_engine = RouterQueryEngine(
    #     selector=PydanticSingleSelector.from_defaults(),
    #     query_engine_tools=[
    #         list_tool,
    #         vector_tool,
    #         # vector_query_engine,
    #     ],
    # )
    # return query_engine
    
    return vector_query_engine
    

In [43]:
from difflib import get_close_matches

def find_most_similar_string(target, string_list):
    # Using difflib's get_close_matches function to find the closest match
    # It returns a list of close matches; we take the first (most similar) one
    closest_matches = get_close_matches(target, string_list, n=1, cutoff=0.0)
    return closest_matches[0] if closest_matches else None

def get_large_categ(doc_name, definitions=definitions):
    vector_query_engine = create_vector_engine(doc_name)

    """
    You are a helpful assistant, with legal document analysis expertise. Please analyze the given context.
    """
    response = vector_query_engine.query(
        f"""
        Consider the contents of the document with this title '{doc_name}'. Then, these definitions: {str(definitions)} 
        In which of these categories : {', '.join(list(definitions.keys()))}, is it part of ? Answer with a valid category, keep the category only. 
        Do not write an introduction, summary, rewrite or anything after the category.
        Do not include any "=" in the final answer. 
        If the document does not match any category, then reply with 'None'.
        If the result is none of the above categories, find the most suitable one, using the definitions and the context.
        """
    )

    print(doc_name, len(response.source_nodes))
    print([node.score for node in response.source_nodes])
    print([node.text for node in response.source_nodes])

    large_cat = response.response
    print(large_cat)

    if large_cat[-1] == "_":
        large_cat = large_cat[-1]

    definitions2 = df_fisc_exacts[df_fisc_exacts['TexteFiscStandardCode']==large_cat].set_index(['TexteFiscExactCode'])['TexteFiscExactComplet'].to_dict()
    print(definitions2.keys())

    if not definitions2:
        return "no_texte_fiscale_standard"
    else:
        response2 = vector_query_engine.query(
            f"""
            The following represents a list of definitions, where we define categories, followed by '=' and then the definitions: {", ".join([f"{k}={v}" for k,v in definitions2.items()])}
            Consider the contents of the document with this name '{doc_name}'.
            In which category, is it part of ? Answer with a valid category, keep the category only. Do not include any "=" in the final answer. 
            Do not include any "=" in the final answer. 
            If the document does not match any category, then reply with 'None'.
            If the result is none of the above categories, find the most suitable one, using the definitions and the context.
            """
        )

        print([node.score for node in response2.source_nodes])
        print([node for node in response2.source_nodes])
        print(response2.response)

    if not response2.response or response2.response == "None":
        return response.response
    else:
        if response2.response not in definitions2.keys():
            most_similar = find_most_similar_string(response2.response, definitions2.keys())
            return most_similar
        else:
            return response2.response

In [44]:
response = get_large_categ(doc_titles[0]) # doc_titles[6]

print(response)

img_Décret n°2020-288 (08.12.2020) Convention minière type 2020 (ITIE Mali) 2
[0.8359375, 0.83203125]
["Des droits d’enregistrement hypothécaire\xa0;\ni) De la contribution au Programme de Vérification des Importations (P.V.I.)\xa0;\nj) De la redevance statistique. 23.11 LA SOCIETE titulaire de permis d’exploitation de grande Mine ou de petite Mine est soumise au paiement des impôts, droits et taxes ci-après :\n\na) les impôts, droits et taxes prévus aux articles 108 à 114 du Code minier ;\nb) la Contribution forfaitaire à la charge de l’employeur, au taux en vigueur ;\nc) la taxe-logement, au taux en vigueur ;\nd) les charges et contributions sociales dues pour les employés, telles que prévues par la réglementation en vigueur ;\ne) l’Impôt sur les traitements et salaires dû par les employés ;\nf) les vignettes sur les véhicules, à l’exception des engins lourds exclusivement liés aux opérations d’exploitation ;\ng) la taxe sur les contrats d’assurance, à la l’exception des véhicules di

In [158]:
response.source_nodes[0].score

0.5391332388422562

In [45]:
large_categs = list()

for doc in doc_titles:
    try:
        categ = get_large_categ(doc)
    except Exception as e:
        print(e)

        categ = "non_determined"

    large_categs.append((doc, categ))

    

img_Décret n°2020-288 (08.12.2020) Convention minière type 2020 (ITIE Mali) 2
[0.8359375, 0.83203125]
["Des droits d’enregistrement hypothécaire\xa0;\ni) De la contribution au Programme de Vérification des Importations (P.V.I.)\xa0;\nj) De la redevance statistique. 23.11 LA SOCIETE titulaire de permis d’exploitation de grande Mine ou de petite Mine est soumise au paiement des impôts, droits et taxes ci-après :\n\na) les impôts, droits et taxes prévus aux articles 108 à 114 du Code minier ;\nb) la Contribution forfaitaire à la charge de l’employeur, au taux en vigueur ;\nc) la taxe-logement, au taux en vigueur ;\nd) les charges et contributions sociales dues pour les employés, telles que prévues par la réglementation en vigueur ;\ne) l’Impôt sur les traitements et salaires dû par les employés ;\nf) les vignettes sur les véhicules, à l’exception des engins lourds exclusivement liés aux opérations d’exploitation ;\ng) la taxe sur les contrats d’assurance, à la l’exception des véhicules di

In [46]:
df_TexteFiscExactCode = pd.DataFrame.from_dict(dict(large_categs), orient='index', columns=['TexteFiscExactCode'])

df_TexteFiscExactCode.to_csv(f"output/TexteFiscExactCode_{COUNTRY_NAME}.csv", index=True)

df_TexteFiscExactCode

Unnamed: 0,TexteFiscExactCode
img_Décret n°2020-288 (08.12.2020) Convention minière type 2020 (ITIE Mali),Min_CM_CMconv
Décret n°2016-272 (29.04.2016) Application Code pétrolier 2016 (ITIE Mali),Pétrol_CP_CPappli
Loi n°1997-014 (07.03.1997) Modification CGI 1997 (SGG),Gen_CGI_CGImodif
img_Décret n°2016-520 (22.07.2016) ITIE 2016 (ITIE Mali),Min_CM_CMitie
Ordonnance n°2013-021 (03.12.2013) Annexe fiscale Loi de finances 2014 (Droit-Afrique),Gen_LF_LF
...,...
Loi n°1994-034 (09.06.1994) Modification CGI 1994 (SGG),Gen_CGI_CGImodif
img_Loi n°2014-056 (26.12.2014) Annexe fiscale Loi de finances 2015 (Droit-Afrique),Gen_FI_FI
Décret n°1996-179 (19.06.1996) Application Office malien de l_habitat 1996 (SGG),Gen_CGI
Ordonnance n°1970-006 (27.02.1970) Code général des impôts 1999 (CIPB),Gen_CGI_CGI


In [52]:
large_categs

[('img_Décret n°2020-288 (08.12.2020) Convention minière type 2020 (ITIE Mali)',
  'Min_CM_CMconv'),
 ('Décret n°2016-272 (29.04.2016) Application Code pétrolier 2016 (ITIE Mali)',
  'Pétrol_CP_CPappli'),
 ('Loi n°1997-014 (07.03.1997) Modification CGI 1997 (SGG)',
  'Gen_CGI_CGImodif'),
 ('img_Décret n°2016-520 (22.07.2016) ITIE 2016 (ITIE Mali)', 'Min_CM_CMitie'),
 ('Ordonnance n°2013-021 (03.12.2013) Annexe fiscale Loi de finances 2014 (Droit-Afrique)',
  'Gen_LF_LF'),
 ('Loi n°2012-016 (27.02.2012) Code des investissements 2012 (Droit-Afrique)',
  'Invest_CI_CI'),
 ('img_Loi n°2008-027 (23.07.2008) Modification Code pétrolier 2008 (Ministère des Mines)',
  'Pétrol_CP_CPmodif'),
 ('Décret n°2005-036 (27.01.2005) Impôt spécial sur certains produits 2005 (Droit-Afrique)',
  'Gen_CGI_ISCP'),
 ('Tarif douanier CEDEAO 2017 amendé 2020 (DGD)', 'no_texte_fiscale_standard'),
 ('img_Ordonnance n°2000-013 (10.02.2000) Modification Code minier 2000 (Ministère des Mines)',
  'Min_CM_CMmodif'),


## Load existing categories

In [100]:
# COUNTRY_NAME = 'BEN Bénin'

large_categs = pd.read_csv(f"output/{table_column_name}_{COUNTRY_NAME}.csv")
large_categs = list(list(large_categs.set_index(['Unnamed: 0']).to_dict().values())[0].items())

## Update column in table

In [55]:
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import sessionmaker, declarative_base
# import logging

from postgres_connection import create_psql_engine

In [81]:

# # Configure logging
# logging.basicConfig(
#     level=logging.INFO,
#     filename='sqlalchemy_bulk_update.log',
#     filemode='a',
#     format='%(asctime)s - %(levelname)s - %(message)s'
# )

engine = create_psql_engine(db="Ferdi")

# Create a configured "Session" class
Session = sessionmaker(bind=engine)

# Create a Session
session = Session()

# Declare a mapping
Base = declarative_base()

class Textes(Base):
    __tablename__ = 'textes'
    
    TexteCode = Column("TexteCode", Integer, primary_key=True)
    PaysCode = Column("PaysCode", String, nullable=False)
    AnneeCodeDebut = Column("AnnéeCodeDébut", Integer)
    AnneeCodeFin = Column("AnnéeCodeFin", Integer)
    TexteLegalExactCode = Column("TexteLégalExactCode", String)
    TexteFiscExactCode = Column("TexteFiscExactCode", String)
    TexteCodeArborescence = Column("TexteCodeArborescence", String)
    TexteCourt = Column("TexteCourt", String)
    TexteComplet = Column("TexteComplet", String)

texts_table = Textes()

def bulk_update_textes(textes_to_update, column_name, texts_table=texts_table, COUNTRY_NAME=COUNTRY_NAME):
    """    
    :param employees_to_update: list of tuples (employee_id, new_annee_code_debut)
    :param column_name: valid column name from the textes table
    """
    for texte_code, new_col_value in textes_to_update:
        try:
            # Query the employee
            text_title = session.query(Textes).filter_by(TexteCode=texte_code, PaysCode=COUNTRY_NAME.split(" ")[0]).one_or_none()
            if text_title:
                # text_title.AnneeCodeDebut = new_code
                setattr(text_title, column_name, f"_{new_col_value}")
                session.commit()
                msg_ = f"{texte_code} column {column_name} updated to {new_col_value}."
                print(msg_)
                # logging.info(f"{texte_code} column {column_name} updated to {new_col_value}.")
            else:
                msg_ = f"{texte_code} not found."
                print(msg_)
                # logging.warning(f"{texte_code} not found.")
        except Exception as e:
            session.rollback()
            msg_ = f"Error updating {texte_code}: {e}"
            print(msg_)
            # logging.error(f"Error updating {texte_code}: {e}")


In [102]:
# bulk_update_employees(textes_to_update = large_categs[0:1], )

bulk_update_textes(
    textes_to_update = large_categs, 
    column_name = table_column_name, 
    texts_table=texts_table, 
    COUNTRY_NAME=COUNTRY_NAME)

# Close the session
session.close()

img_Circulaire n°2018-176 (05.03.2018) Application Loi de finances 2018 (DGI) column TexteFiscExactCode updated to Gen_LF_LFappli.
img_Loi n°2020-030 (28.10.2020) Loi de finances rectificative 2020 (SGG) column TexteFiscExactCode updated to Gen_LF_LFR.
Loi n°2007-033 (02.01.2008) Loi de finances 2008 (Droit-Afrique) column TexteFiscExactCode updated to Gen_LF_LF.
Loi n°2010-046 (30.12.2010) Loi de finances 2011 (Droit-Afrique) column TexteFiscExactCode updated to Gen_LF_LF.
img_Circulaire n°2017-118 (02.02.2017) Application Loi de finances 2017 (DGI) column TexteFiscExactCode updated to Gen_LF_LFappli.
Error updating img_Loi n°1997-014 (06.06.1997) Taxe sur les nuitées 1997 (LégiBénin): (psycopg2.errors.ForeignKeyViolation) insert or update on table "textes" violates foreign key constraint "textes_TexteFiscExactCode_fkey"
DETAIL:  Key (TexteFiscExactCode)=(_Gen_GST) is not present in table "textes_fiscaux_exacts".

[SQL: UPDATE textes SET "TexteFiscExactCode"=%(TexteFiscExactCode)s WHE

## Misc

In [None]:
set(large_categs)

{'_Gen_CGI',
 '_Gen_IT',
 '_Gen_LF',
 '_Gen_LF_',
 '_Gen_VAT',
 '_Min_CM',
 '_Pétrol_CP'}

In [None]:
df_fisc_exacts['TexteFiscStandardCode'].unique()

array(['_Gen_CGI', '_Gen_IT', '_Gen_VAT', '_Gen_LF', '_Min_CM',
       '_Pétrol_CP'], dtype=object)

In [None]:
print(f"{len(response.source_nodes)}")

for _node in response.source_nodes:
    print(f"{_node.score}\n{_node.text[:100]}")

5
0.7510572671890259
Mlle DIARRA  
PRESIDENCE DE LA REPUBLIQUE  
SECRETARIAT GENERAL  

REPUBLIQUE DU MALI  
Un Peuple - 
0.7502212524414062
le Contenu local.

CHAPITRE IV : DE LA MISE EN ŒUVRE ET DU SUIVI DU CONTENU LOCAL

Article 10 : Le C
0.7459142208099365
inclusion sur la plateforme d’appel à concurrence et l’interdiction de conclure des marchés liés aux
0.7431255578994751
les services de consultants et d’assistances techniques ;</li>
<li>h) la gestion des projets pour la
0.7271713614463806
<td>100</td>
  </tr>
  <tr>
    <td>16</td>
    <td>Forage hydraulique</td>
    <td>80</td>
    <td>


In [None]:
definitions2 = df_fisc_exacts[df_fisc_exacts['TexteFiscStandardCode']==str('_Gen_CGI')].set_index(['TexteFiscExactCode'])['TexteFiscExactComplet'].to_dict()

definitions2

{'_Gen_CGI_CGI': 'Code général des impôts',
 '_Gen_CGI_CGImodif': 'Modification du Code général des impôts',
 '_Gen_CGI_CGImodifModif': 'Modification de la Modification du Code général des impôts',
 '_Gen_CGI_CGIamendé': 'Code général des impôts amendé',
 '_Gen_CGI_CGIappli': 'Application du Code général des impôts',
 '_Gen_CGI_CGIappliModif': "Modification de l'Application du Code général des impôts",
 '_Gen_CGI_CGIsuppl': 'Supplément au Code général des impôts',
 '_Gen_CGI_CGIsupplAppli': 'Application du Supplément au Code général des impôts',
 '_Gen_CGI_CGIone': 'Code des impôts directs et indirects',
 '_Gen_CGI_CGIoneModif': 'Modification du Code des impôts directs et indirects',
 '_Gen_CGI_CGIoneAmendé': 'Code des impôts directs et indirects amendé',
 '_Gen_CGI_CGIbis': "Code de l'enregistrement, du timbre et de l'impôt sur le revenu des capitaux mobiliers",
 '_Gen_CGI_CGIbisModif': "Modification du Code de l'enregistrement, du timbre et de l'impôt sur le revenu des capitaux mobil

In [None]:
doc_name = 'Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP)'

vector_query_engine = create_vector_engine(doc_name)

response2 = vector_query_engine.query(
    f"""
    Consider the contents of the document with this title '{doc_name}'. Then, these definitions: {str(definitions2)} 
    In which category, is it part of ? Answer with a valid category, keep the category only. Do not write an introduction, summary, rewrite or anything after the category.
    """

    # The following represents a list of definitions, where we define categories, followed by '=' and then the definitions: {", ".join([f"{k}={v}" for k,v in definitions2.items()])}
    # Consider the contents of the document with this name '{doc_name}'.
    # In which category, is it part of ? Answer with a valid category, keep the category only. Do not write an introduction, summary, rewrite or anything after the category.

    #  Please reply in French
    # f"""
    # Ce qui suit représente une liste de définitions, où nous définissons les catégories, suivies de '=' puis des définitions : {", ".join([f"{k}={v}" for k,v in definitions2.items()])}
    # Considérez le contenu du document portant ce nom '{doc_name}'.
    # Dans laquelle de ces catégories : {', '.join(list(definitions2.keys()))}, cela en fait-il partie ? Répondez avec une catégorie valide, gardez uniquement la catégorie.
    # """
)

print(response2)

Token indices sequence length is longer than the specified maximum sequence length for this model (897 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (897) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
print(response2.source_nodes[1].text)

visibles pour les employés.
Art.27.- La demande d’autorisation de transfert Les titulaires de titres pétroliers et leurs sous-
suite à une cession ou d’amodiation doit être adres- traitants sont également tenus :
sée au Ministre dans les mêmes formes que celles • a) d’assurer aux travailleurs sur le site des
définies à l’Article 12 du présent décret. conditions d’hygiène et de salubrité conformes
à la législation en vigueur ;
Art.28.- Le transfert suite à une cession est accordé • b) de respecter la législation et les règlements
par décret pris en Conseil des Ministres, Il prend sanitaires ;
effet à compter de la date de signature du décret
• c) de respecter les conditions générales du tra-
d’autorisation de transfert.
vail relatives à la prévention et à la protection
contre les accidents de travail et les maladies
Art.29.- L’amodiation transfère à l’amodiataire
professionnelles ;
tous les droits et obligations de l’autorisation
• d) de contribuer à partir de la date de la pre-
d’explo

# Generate definitions from existing documents

In [None]:
laws_dict = {
    "img_Décret n°2016-520 (22.07.2016) ITIE 2016 (ITIE Mali)": ["Initiative pour la Transparence dans les Insdustries Extractives", "_Min_CM_CMitie"],
    "img_Ordonnance n°1999-032 (19.08.1999) Code minier 1999 (Ministère des Mines)": ["Code minier", "_Min_CM_CM"],
    "img_Décret n°2012-311 (21.06.2012) Application Code minier 2012 (Ministère des Mines)": ["Application du Code minier", "_Min_CM_CMappli"],
    "Décret n°2000-050 (10.02.2000) Modification Convention minière type 2000 (SGG)": ["Modification de la Convention minière type", "_Min_CM_CMconvModif"],
    "img_Loi n°2023-040 (29.08.2023) Code minier 2023 (LinkedIn)": ["Code minier", "_Min_CM_CM"],
    "img_Décret n°1999-256 (15.09.1999) Convention minière type 1999 (Ministère des Mines)": ["Convention minière type", "_Min_CM_CMconv"],
    "img_Loi n°2012-015 (27.02.2012) Code minier 2012 (Ministère des Mines)": ["Code minier", "_Min_CM_CM"],
    "img_Décret n°2000-050 (10.02.2000) Modification Convention minière type 2000 (Ministère des Mines)": ["Modification de la Convention minière type", "_Min_CM_CMconvModif"],
    "Décret n°1999-256 (15.09.1999) Convention minière type 1999 (SGG)": ["Convention minière type", "_Min_CM_CMconv"],
    "Décret n°1999-255 (15.09.1999) Application Code minier 1999 (SGG)": ["Application du Code minier", "_Min_CM_CMappli"],
    "img_Loi n°2023-041 (29.08.2023) Contenu local minier 2023 (LinkedIn)": ["Supplément au Code minier", "_Min_CM_CMsuppl"],
    "img_Décret n°2007-180 (06.06.2007) ITIE 2007 (ITIE Mali)": ["Initiative pour la Transparence dans les Insdustries Extractives", "_Min_CM_CMitie"],
    "img_Décret n°1999-255 (15.09.1999) Application Code minier 1999 (Ministère des Mines)": ["Application du Code minier", "_Min_CM_CMappli"],
    "img_Décret n°2018-685 (31.08.2018) ITIE 2018 (ITIE Mali)": ["Initiative pour la Transparence dans les Insdustries Extractives", "_Min_CM_CMitie"],
    "img_Ordonnance n°2000-013 (10.02.2000) Modification Code minier 2000 (Ministère des Mines)": ["Modification du Code minier", "_Min_CM_CMmodif"],
    "img_Ordonnance n°1991-065 (19.09.1991) Code minier 1991 (Ministère des Mines)": ["Code minier", "_Min_CM_CM"],
    "Ordonnance n°2000-013 (10.02.2000) Modification Code minier 2000 (SGG)": ["Modification du Code minier", "_Min_CM_CMmodif"],
    "img_Décret n°2019-006 (10.01.2019) ITIE 2019 (ITIE Mali)": ["Initiative pour la Transparence dans les Insdustries Extractives", "_Min_CM_CMitie"],
    "img_Décret n°2020-288 (08.12.2020) Convention minière type 2020 (ITIE Mali)": ["Convention minière type", "_Min_CM_CMconv"],
    "img_Loi n°2008-027 (23.07.2008) Modification Code pétrolier 2008 (Ministère des Mines)": ["Modification du Code pétrolier", "_Pétrol_CP_CPmodif"],
    "Décret n°2004-357 (08.09.2004) Application Code pétrolier 2004 (Ministère des Mines)": ["Application du Code pétrolier", "_Pétrol_CP_CPappli"],
    "img_Loi n°2015-035 (16.07.2015) Code pétrolier 2015 (Ministère des Mines)": ["Code pétrolier", "_Pétrol_CP_CP"],
    "Loi n°2015-035 (16.07.2015) Code pétrolier 2015 (Droit-Afrique)": ["Code pétrolier", "_Pétrol_CP_CP"],
    "Décret n°2004-357 (08.09.2004) Application Code pétrolier 2004 (Droit-Afrique)": ["Application du Code pétrolier", "_Pétrol_CP_CPappli"],
    "Contrat de partage de production type (Ministère des Mines)": ["Convention pétrolière type", "_Pétrol_CP_CPconv"],
    "Décret n°2016-272 (29.04.2016) Application Code pétrolier 2016 (ITIE Mali)": ["Application du Code pétrolier", "_Pétrol_CP_CPappli"],
    "img_Loi n°2004-037 (02.08.2004) Code pétrolier 2004 (Droit-Afrique)": ["Code pétrolier", "_Pétrol_CP_CP"],
    "img_Loi n°2017-073 (26.12.2017) Loi de finances 2018 (Ministère des Finances)": ["Loi de finances", "_Gen_LF_LF"],
    "img_Loi n°2008-021 (22.07.2008) Modification LPF 2008 (Droit-Afrique)": ["Modification du Livre de procédures fiscales", "_Gen_CGI_CGIprocedModif"],
    "img_Loi n°2012-063 (26.12.2012) Annexe fiscale Loi de finances 2013 (Droit-Afrique)": ["Loi de finances", "_Gen_LF_LF"],
    "Loi n°2016-056 (21.12.2016) Annexe fiscale Loi de finances 2017 (Ministère des Finances)": ["Loi de finances", "_Gen_LF_LF"],
    "Loi n°1996-052 (16.10.1996) Taxe touristique 1996 (SGG)": ["Supplément au Code général des impôts", "_Gen_CGI_CGIsuppl"],
    "Loi n°1996-021 (21.02.1966) Jeux de hasard 1996 (SGG)": ["Supplément au Code général des impôts", "_Gen_CGI_CGIsuppl"],
    "Loi n°2001-064 (09.07.2001) Modification CGI 2001 (SGG)": ["Modification du Code général des impôts", "_Gen_CGI_CGImodif"],
    "Décret n°1996-179 (19.06.1996) Application Office malien de l_habitat 1996 (SGG)": ["Application du Fonds logement", "_Gen_CGI_FondsLogementAppli"],
    "img_Loi n°1996-030 (12.06.1996) Office malien de l_habitat 1996 (Faolex)": ["Fonds logement", "_Gen_CGI_FondsLogement"],
    "img_Loi n°2006-068 (29.12.2006) Livre de procédures fiscales 2006": ["Livre de procédures fiscales", "_Gen_CGI_CGIproced"],
    "Loi n°2006-068 (29.12.2006) Livre de procédures fiscales 2013 (Ekladata)": ["Livre de procédures fiscales amendé", "_Gen_CGI_CGIprocedAmendé"],
    "img_Loi n°2008-009 (28.02.2008) Modification CGI 2008 (Droit-Afrique)": ["Modification du Code général des impôts", "_Gen_CGI_CGImodif"],
    "Ordonnance n°2020-001 (04.09.2020) Loi de finances rectificative 2020 (Ministère des Finances)": ["Loi de finances rectificative", "_Gen_LF_LFR"],
    "Ordonnance n°1970-006 (27.02.1970) Code général des impôts 1999 (CIPB)": ["Code général des impôts amendé", "_Gen_CGI_CGIamendé"],
    "img_Loi n°2006-067 (29.12.2006) Code général des impôts 2016 (DGI)": ["Code général des impôts amendé", "_Gen_CGI_CGIamendé"],
    "Décret n°1997-182 (02.06.1997) Application Jeux de hasard 1997 (SGG)": ["Application du Supplément au Code général des impôts", "_Gen_CGI_CGIsupplAppli"],
    "img_Loi n°2017-022 (12.06.2021) Cadre général des exonérations fiscales et douanières 2017 (DGI)": ["Supplément au Code général des impôts", "_Gen_CGI_CGIsuppl"],
    "img_Loi n°2008-020 (22.07.2008) Modification CGI 2008 (Droit-Afrique)": ["Modification du Code général des impôts", "_Gen_CGI_CGImodif"],
    "img_Loi n°2016-056 (21.12.2016) Loi de finances 2017 (Ministère des Finances)": ["Loi de finances", "_Gen_LF_LF"],
    "Loi n°2015-054 (22.12.2015) Annexe fiscale Loi de finances 2016 (SEAG Conseil)": ["Loi de finances", "_Gen_LF_LF"],
    "Loi n°2011-078 (23.12.2011) Annexe fiscale Loi de finances 2012 (SEAG Conseil)": ["Loi de finances", "_Gen_LF_LF"],
    "Loi n°2012-063 (26.12.2012) Annexe fiscale Loi de finances 2013 (DGI)": ["Loi de finances", "_Gen_LF_LF"],
    "Loi n°1997-014 (07.03.1997) Modification CGI 1997 (SGG)": ["Modification du Code général des impôts", "_Gen_CGI_CGImodif"],
    "img_Loi n°2010-060 (30.12.2010) Loi de finances 2011 (Droit-Afrique)": ["Loi de finances", "_Gen_LF_LF"],
    "Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP)": ["Code général des impôts", "_Gen_CGI_CGI"],
    "Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP) déverrouillé": ["Code général des impôts", "_Gen_CGI_CGI"],
    "Loi n°2019-070 (24.12.2019) Loi de finances 2020 (Ministère des Finances)": ["Loi de finances", "_Gen_LF_LF"],
    "Loi n°1994-034 (09.06.1994) Modification CGI 1994 (SGG)": ["Modification du Code général des impôts", "_Gen_CGI_CGImodif"],
    "img_Décret n°2018-595 (24.07.2018) Application Cadre général des exonérations fiscales et douanières 2018 (DGI)": ["Application du Supplément au Code général des impôts", "_Gen_CGI_CGIsupplAppli"],
    "Loi n°2021-071 (23.12.2021) Loi de finances 2022 (Ministère des Finances)": ["Loi de finances", "_Gen_LF_LF"],
    "Loi n°1997-024 (23.01.1997) Modification CGI 1997 (SGG)": ["Modification du Code général des impôts", "_Gen_CGI_CGImodif"],
    "Loi n°1996-030 (12.06.1996) Office malien de l_habitat 1996 (SGG)": ["Fonds logement", "_Gen_CGI_FondsLogement"],
    "Ordonnance n°2013-021 (03.12.2013) Annexe fiscale Loi de finances 2014 (Droit-Afrique)": ["Loi de finances", "_Gen_LF_LF"],
    "img_Loi n°2015-054 (22.12.2015) Loi de finances 2016 (Ministère des Finances)": ["Loi de finances", "_Gen_LF_LF"],
    "img_Loi n°2017-073 (26.12.2017) Annexe fiscale Loi de finances 2018 (Ministère des Finances)": ["Loi de finances", "_Gen_LF_LF"],
    "Décret n°2005-036 (27.01.2005) Impôt spécial sur certains produits 2005 (Droit-Afrique)": ["Impôt spécial sur certains produits", "_Gen_CGI_ISCP"],
    "img_Loi n°2011-078 (23.12.2011) Annexe fiscale Loi de finances 2012 (Investir-Afrique)": ["Loi de finances", "_Gen_LF_LF"],
    "img_Loi n°2014-056 (26.12.2014) Annexe fiscale Loi de finances 2015 (Droit-Afrique)": ["Loi de finances", "_Gen_LF_LF"],
    "Ordonnance n°2020-013 (21.12.2020) Loi de finances 2021 (Ministère des Finances)": ["Loi de finances", "_Gen_LF_LF"],
    "Loi n°2018-072 (21.12.2018) Loi de finances 2019 (Ministère des Finances)": ["Loi de finances", "_Gen_LF_LF"],
}


In [None]:
def generate_definitions(doc_name, definition):
        vector_query_engine = create_vector_engine(doc_name)

        x = vector_query_engine.query(f"""
                Use the contents and title of the document with this title '{doc_name}', to create a definition that will help identify similar documents,
                around this phrase: '{definition}', with the emphasis on the type of law, type of document and the functionality of the document.
                Skip the document name, country and the year of the document from the answer. Keep the summary only, without any rewrites, notes etc.""")

        return x.response


In [None]:
generated_definitions = {k: [generate_definitions(k, v[0]), v[0], v[1]] for (k,v) in laws_dict.items()}

In [None]:
generated_definitions

{'img_Décret n°2016-520 (22.07.2016) ITIE 2016 (ITIE Mali)': "This is a decree that establishes the institutional framework for the Initiative for Transparency in Extractive Industries (ITIE), a law related to mining rights. The document outlines the structure and functions of ITIE's committees and secretariat, ensuring transparency and accountability in extractive industries.",
 'img_Ordonnance n°1999-032 (19.08.1999) Code minier 1999 (Ministère des Mines)': '**Rewrite**\n\nThe Code Minier is a type of droit minier document that outlines regulations and procedures for mining activities, providing rules for exploration, exploitation, environmental protection, and guidelines for permits and licenses.',
 'img_Décret n°2012-311 (21.06.2012) Application Code minier 2012 (Ministère des Mines)': '**Rewrite**\n\nThis document outlines the conditions and modalities for applying the Code minier, detailing procedures for obtaining permits, authorizations, and concessions for mining activities, a

In [None]:
definitions = {
    "_Gen_LF_LOLF": "An organic law relating to finance laws (LOLF) is a legislative text that complements and clarifies the constitutional rules that govern the development, execution, and control of finance laws. It is thus a special law that explains how finance laws must be drafted and implemented. In French-speaking countries, the name of an organic law relating to finance laws generally includes the words 'portant loi organique relative aux lois de finances' or simply 'relative aux lois de finances'.",
    "_Gen_LF_LOLFmodif": "A modification of an organic law relating to finance laws is a legislative text that amends the constitutional rules governing the development, execution, and control of finance laws. It is thus a law that modifies the organic law relating to finance laws. In French-speaking countries, the name of a modification of an organic law relating to finance laws generally includes the words 'portant modification de la loi organique relative aux lois de finances' or 'modifiant la loi organique relative aux lois de finances'.",
    "_Gen_LF_LF": "This document is a fiscal law that outlines the financial regulations for the upcoming year, serving as a comprehensive guide for budgeting and expenditure management within the government. The emphasis on fiscal responsibility and transparency suggests a focus on prudent financial planning and accountability. It details various sections, programs, and articles, providing specific allocations for personnel, biens et services, transfers, and investments, ensuring effective allocation of resources and maintaining a stable economic environment.",
    "_Gen_LF_LFmodif": "A modification of a finance law is a legislative text (law or ordinance) aimed at amending a previously adopted finance law, without constituting a true rectifying finance law. In French-speaking countries, the name of a modification of a finance law generally includes the words 'portant modification de la loi de finances' or 'portant modification du budget général de l'État'. In English-speaking countries, a modification of a finance law is called a 'Finance (Amendment) Act'.",
    "_Gen_LF_LFamende": "An amended finance law is a modified version of a finance law up to a certain date. This amended version is based on the initial finance law and subsequently includes all subsequent modifications.",
    "_Gen_LF_LFratif": "The ratification of a finance law is a law passed to ratify an initial finance law adopted in the form of an ordinance. In French-speaking countries, the name of a ratification of a finance law generally includes the words 'ratifiant la loi de finances' or 'portant ratification de la loi de finances'.",
    "_Gen_LF_LFR": "A rectifying finance law is a legislative text (law or ordinance) adopted to replace an initial finance law. Its purpose is to revise the provisional budget during a year. The rectifying finance law may contain tax provisions that can modify the general tax code or any other tax law. In French-speaking countries, the name of a rectifying finance law generally includes the words 'portant loi de finances rectificative'.",
    "_Gen_LF_LFRmodif": "A modification of a rectifying finance law is a legislative text (law or ordinance) aimed at amending an already adopted rectifying finance law, without constituting a new rectifying finance law. In French-speaking countries, the name of a modification of a rectifying finance law generally includes the words 'portant modification de la loi de finances rectificative'.",
    "_Gen_LF_LFRratif": "The ratification of a rectifying finance law is a law passed to ratify a rectifying finance law adopted in the form of an ordinance. In French-speaking countries, the name of a ratification of a rectifying finance law generally includes the words 'ratifiant la loi de finances rectificative' or 'portant ratification de la loi de finances rectificative'.",
    "_Gen_LF_LFappli": "The application of a finance law is a regulatory text (decree or order) aimed at specifying the application methods of a finance law. In French-speaking countries, the name of the application of a finance law generally includes the words 'précisant les modalités d'application des dispositions fiscales de la loi de finances'.",
    "_Gen_LF_LFappliModif": "A modification of an application of a finance law is a regulatory text (decree or order) aimed at amending the application methods of a finance law already established in another regulatory text. In French-speaking countries, the name of a modification of the application of a finance law generally includes the words 'modifiant les modalités d'application des dispositions fiscales de la loi de finances'."
}

In [None]:
definitions

{'_Gen_LF_LOLF': "An organic law relating to finance laws (LOLF) is a legislative text that complements and clarifies the constitutional rules that govern the development, execution, and control of finance laws. It is thus a special law that explains how finance laws must be drafted and implemented. In French-speaking countries, the name of an organic law relating to finance laws generally includes the words 'portant loi organique relative aux lois de finances' or simply 'relative aux lois de finances'.",
 '_Gen_LF_LOLFmodif': "A modification of an organic law relating to finance laws is a legislative text that amends the constitutional rules governing the development, execution, and control of finance laws. It is thus a law that modifies the organic law relating to finance laws. In French-speaking countries, the name of a modification of an organic law relating to finance laws generally includes the words 'portant modification de la loi organique relative aux lois de finances' or 'mod

In [None]:
doc_name = "img_Loi n°2017-073 (26.12.2017) Loi de finances 2018 (Ministère des Finances)"

vector_query_engine = create_vector_engine(doc_name)

response = vector_query_engine.query(
    f"""
    Consider the contents of the document with this title '{doc_name}'. Then, these definitions: {str(definitions)} 
    In which of these categories : {', '.join(list(definitions.keys()))}, is it part of ? Answer with a valid category, keep the category only. Do not write an introduction, summary, rewrite or anything after the category.
    """
    # Answer with a valid category, keep the category only.
    # f"""
    # Considérez le contenu du document portant ce nom '{doc_name}'. Ensuite, ces définitions : {str(definitions)}
    # Dans laquelle de ces catégories : {', '.join(list(definitions.keys()))}, cela en fait-il partie ? Répondez avec une catégorie valide, gardez uniquement la catégorie.
    # """
)

# print(response)
print(doc_name, len(response.source_nodes), response.response)

img_Loi n°2017-073 (26.12.2017) Loi de finances 2018 (Ministère des Finances) 20 _Gen_LF_LF


In [None]:
response.response

'_Gen_LF_LOLF'

# View the document contents

In [None]:
print(f"Title: {'Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP) déverrouillé'}\nContent:\n{df[df['title']=='Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP) déverrouillé'].iloc[0,1]}")

Title: Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP) déverrouillé
Content:
PRESIDENCE DE LA REPUBLIQUE DU MALI
Un Peuple - Un But - Une Foi
-------------------

LOI N° 06 - 067 / DU 29 DEC. 2006
PORTANT CODE GENERAL DES IMPOTS

L'Assemblée Nationale a délibéré et adopté en sa séance du 1er décembre
2006 ;

Le Président de la République promulgue la Loi dont la teneur suit :

Titre 1 : Impôts directs

Chapitre 1 : Les impôts sur le revenu

Section I : L'impôt sur les traitements et salaires

Sous-section I : Revenus soumis à l'impôt

Article 1
Il est institué au profit du budget de l'État un Impôt sur les Traitements et
Salaires applicable à toutes les sommes payées dans l'année aux salariés par les
employeurs publics et privés, directement ou par l'entremise d'un tiers, en
contrepartie ou à l'occasion du travail, notamment à titre de traitements,
indemnités, émoluments, commissions, participations, primes, gratifications,
gages, pourboires et autres rétributions, quell

# Pipeline

In [None]:
from llama_index.core.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
    KeywordTableSimpleRetriever,
)

In [None]:
definitions = df_fisc_standards.set_index(['TexteFiscStandardCode'])['TexteFiscStandardComplet'].to_dict()
definitions.keys()

dict_keys(['_Gen_CGI', '_Gen_IT', '_Gen_VAT', '_Gen_NHI', '_Gen_LF', '_Invest_CI', '_Min_CM', '_Pétrol_CP', '_Gaz_CG', '_Forest_CF'])

In [None]:
doc_name = 'Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP) déverrouillé'

In [None]:
# setting up the llm
llm = Ollama(
    model="llama3.1", 
    temperature=0, 
    request_timeout=180.0,
    context_window=chunk_size) 

Settings.llm = llm

In [None]:
from llama_index.core.postprocessor import LLMRerank

from llama_index.postprocessor.colbert_rerank import ColbertRerank
from llama_index.core.response_synthesizers import TreeSummarize

In [None]:
from llama_index.core import get_response_synthesizer
from llama_index.core.response_synthesizers import ResponseMode
from llama_index.core.query_engine import RetrieverQueryEngine

vector_index = index

from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

def create_vector_engine(doc_name):
    filters = MetadataFilters(
        filters=[
            MetadataFilter(
                key="title", value=doc_name
            ),
        ]
    )

    reranker = LLMRerank(
                llm=llm,
                choice_batch_size=5,
                top_n=3,
            )

    colbert_reranker = ColbertRerank(
        top_n=5,
        model="colbert-ir/colbertv2.0",
        tokenizer="colbert-ir/colbertv2.0",
        keep_retrieval_score=True,
    )

    vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=10, filters=filters) #, alpha=0.5) #, embed_model=embed_model)
    response_synthesizer = get_response_synthesizer(llm=llm, response_mode=ResponseMode.COMPACT)

    vector_query_engine = RetrieverQueryEngine(
        retriever=vector_retriever,
        response_synthesizer=response_synthesizer,
        # node_postprocessors=[colbert_reranker],
        # node_postprocessors=[processor]
    )

    return vector_query_engine

In [None]:
filters = MetadataFilters(
        filters=[
            MetadataFilter(
                key="title", value=doc_name
            ),
        ]
    )

vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=10, filters=filters)

In [None]:
colbert_reranker = ColbertRerank(
        top_n=5,
        model="colbert-ir/colbertv2.0",
        tokenizer="colbert-ir/colbertv2.0",
        keep_retrieval_score=True,
    )

In [None]:
definitions2 = df_fisc_exacts[df_fisc_exacts['TexteFiscStandardCode']==str('_Gen_CGI')].set_index(['TexteFiscExactCode'])['TexteFiscExactComplet'].to_dict()

definitions2

{'_Gen_CGI_CGI': 'Code général des impôts',
 '_Gen_CGI_CGImodif': 'Modification du Code général des impôts',
 '_Gen_CGI_CGImodifModif': 'Modification de la Modification du Code général des impôts',
 '_Gen_CGI_CGIamendé': 'Code général des impôts amendé',
 '_Gen_CGI_CGIappli': 'Application du Code général des impôts',
 '_Gen_CGI_CGIappliModif': "Modification de l'Application du Code général des impôts",
 '_Gen_CGI_CGIsuppl': 'Supplément au Code général des impôts',
 '_Gen_CGI_CGIsupplAppli': 'Application du Supplément au Code général des impôts',
 '_Gen_CGI_CGIone': 'Code des impôts directs et indirects',
 '_Gen_CGI_CGIoneModif': 'Modification du Code des impôts directs et indirects',
 '_Gen_CGI_CGIoneAmendé': 'Code des impôts directs et indirects amendé',
 '_Gen_CGI_CGIbis': "Code de l'enregistrement, du timbre et de l'impôt sur le revenu des capitaux mobiliers",
 '_Gen_CGI_CGIbisModif': "Modification du Code de l'enregistrement, du timbre et de l'impôt sur le revenu des capitaux mobil

In [None]:
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.core.response_synthesizers import TreeSummarize
from llama_index.core import PromptTemplate

llm = Ollama(
    model="llama3.1", 
    temperature=0, 
    request_timeout=180.0,
    context_window=chunk_size,
    context_window=16000) 

Settings.llm = llm

# define modules
prompt_str =  """
    Consider the contents of the document with this title '{doc_name}'. Then, these definitions: """ + ", ".join([f"{k}={v}" for (k,v) in definitions.items()]) + """
    In which category, is it part of ? Answer with a valid category described in the definitions above, keep the category only. Do not write an introduction, summary, rewrite or anything after the category."""
prompt_tmpl = PromptTemplate(prompt_str)
retriever = index.as_retriever(similarity_top_k=10, filters=filters)
reranker = ColbertRerank(
        top_n=5,
        model="colbert-ir/colbertv2.0",
        tokenizer="colbert-ir/colbertv2.0",
        keep_retrieval_score=True,
    )
summarizer = TreeSummarize(llm=llm)

In [None]:
# define query pipeline
p = QueryPipeline(verbose=True)
p.add_modules(
    {
        "llm": llm,
        "prompt_tmpl": prompt_tmpl,
        "retriever": retriever,
        "summarizer": summarizer,
        "reranker": reranker,
    }
)

In [None]:
# p.add_link("prompt_tmpl", "llm")
# p.add_link("llm", "retriever")
# p.add_link("retriever", "reranker", dest_key="nodes")
# # p.add_link("llm", "reranker", dest_key="query_str")
# p.add_link("reranker", "summarizer", dest_key="nodes")
# # p.add_link("llm", "summarizer", dest_key="query_str")

p.add_link("input", "retriever")
p.add_link("input", "summarizer", dest_key="query_str")
p.add_link("retriever", "summarizer", dest_key="nodes")

# look at summarizer input keys
print(summarizer.as_query_component().input_keys)

ValueError: Module input does not exist in pipeline.

In [None]:
response = p.run(doc_name=doc_name)

[1;3;38;2;155;135;227m> Running module prompt_tmpl with input: 
doc_name: Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP) déverrouillé

[0m[1;3;38;2;155;135;227m> Running module llm with input: 
messages: 
    Consider the contents of the document with this title 'Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP) déverrouillé'. Then, these definitions: _Gen_CGI=Code général des impôts, _Ge...

[0m[1;3;38;2;155;135;227m> Running module retriever with input: 
input: assistant: _Gen_CGI

[0m[1;3;38;2;155;135;227m> Running module reranker with input: 
query_str: assistant: _Gen_CGI
nodes: [NodeWithScore(node=TextNode(id_='caad4a58-e526-4749-b7ea-f7ed11e35809', embedding=None, metadata={'title': 'Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP) déverrouillé', 'law type': '...

[0m[1;3;38;2;155;135;227m> Running module summarizer with input: 
query_str: assistant: _Gen_CGI
nodes: [NodeWithScore(node=TextNode(id_='95507a83-7fdb-4baf-b230-8722

In [None]:
response.source_nodes

[NodeWithScore(node=TextNode(id_='95507a83-7fdb-4baf-b230-872229a68b1f', embedding=None, metadata={'title': 'Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP) déverrouillé', 'law type': 'droit fiscal', 'retrieval_score': 0.2596357885960406}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='d04b67e4-5e4a-44e0-ac80-9f7ba31e79b9', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'title': 'Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP) déverrouillé', 'law type': 'droit fiscal'}, hash='ad04bfd8094d31bca34f68ba83f73414bcf164955c187796de428c6bf237b6a0'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='077a2a45-c0cf-44a1-b55b-daaf5ae95fdb', node_type=<ObjectType.TEXT: '1'>, metadata={'title': 'Loi n°2006-067 (29.12.2006) Code général des impôts 2006 (CNP) déverrouillé', 'law type': 'droit fiscal'}, hash='9057e4d40630f1506a4db2fd587d3a8f1cebdf5a971d2114d9a349485a404b52')

In [None]:
## create graph
from pyvis.network import Network

net = Network(notebook=True, cdn_resources="in_line", directed=True)
net.from_nx(p.dag)
net.show("rag_dag.html")

## another option using `pygraphviz`
# from networkx.drawing.nx_agraph import to_agraph
# from IPython.display import Image
# agraph = to_agraph(p.dag)
# agraph.layout(prog="dot")
# agraph.draw('rag_dag.png')
# display(Image('rag_dag.png'))

## Creating parent child embeddings

In [None]:
from llama_index.core.schema import IndexNode

node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
base_nodes = node_parser.get_nodes_from_documents(documents)
# set node ids to be a constant
for idx, node in enumerate(base_nodes):
    node.id_ = f"node-{idx}"

embed_model = embed_model

from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-4o-mini", temperature=0)

# os.environ["OPENAI_API_KEY"] = ""

Settings.llm = llm

# base_index = VectorStoreIndex(base_nodes, embed_model=embed_model)

sub_chunk_sizes = [128, 256, 512]
sub_node_parsers = [
    SentenceSplitter(chunk_size=c, chunk_overlap=20) for c in sub_chunk_sizes
]

all_nodes = []
for base_node in base_nodes:
    for n in sub_node_parsers:
        sub_nodes = n.get_nodes_from_documents([base_node])
        sub_inodes = [
            IndexNode.from_text_node(sn, base_node.node_id) for sn in sub_nodes
        ]
        all_nodes.extend(sub_inodes)

    # also add original node to node
    original_node = IndexNode.from_text_node(base_node, base_node.node_id)
    all_nodes.append(original_node)


all_nodes_dict = {n.node_id: n for n in all_nodes}


AttributeError: 'IndexNode' object has no attribute 'get_doc_id'

In [None]:
len(all_nodes)

164773

In [None]:
storage_name = "storage_MLI Mali_complete_1024_pa-ch_voyage-law"

if not os.path.exists(storage_name):
    vector_index_chunk = VectorStoreIndex(all_nodes, show_progress=True, embed_model=embed_model)
    # index = VectorStoreIndex.from_documents(all_nodes, embed_model=embed_model, show_progress=True)
    # save index to disk
    vector_index_chunk.set_index_id("vector_index")
    vector_index_chunk.storage_context.persist(f"./{storage_name}")
else:
    print('loading from local')
    # rebuild storage context
    storage_context = StorageContext.from_defaults(persist_dir=storage_name)
    # load index
    vector_index_chunk = load_index_from_storage(storage_context, index_id="vector_index")

vector_index = vector_index_chunk
vector_as_dict = vector_index.vector_store.to_dict()

loading from local
