In [1]:
with open("example_md_to_text.txt", "r", encoding="latin-1") as f: # TO DO: check proper encoding for .md files
    markdown_example = f.read()

with open("example.pdf", "rb") as f:
    pdf = f.read()

with open("HF_TOKEN.txt", "r") as f:
    hf_token = f.read()

with open("GROQ_KEY.txt", "r") as f:
    groq_token = f.read()

In [2]:
from huggingface_hub import HfFolder, whoami

HfFolder.save_token(hf_token)
print(whoami()["name"])

  from .autonotebook import tqdm as notebook_tqdm


alberto-lorente


In [3]:
import torch

device = "cpu"
if torch.cuda.is_available():
    print("Cuda available")
    device = torch.device('cuda')

Cuda available


In [4]:
pdf_path = "example.pdf"

base_prompt="""CONTEXTE
L'image suivante contient une page et un tableau. 

TÂCHE
Décrivez le tableau et le contenu de la page en accordant une attention particulière au contexte qui l'entoure, qu'il s'agisse de budgets, de dates, d'élections, d'agendas, de projets futurs ou de sujets connexes. 

FORMAT DE LA RÉPONSE
Votre réponse doit être aussi détaillée que possible. 
Votre résultat doit être la description du tableau directement.
La langue de votre réponse est le français"""


summary_prompt = """Il s'agit d'un texte concernant un projet de géothermie :
TEXTE: 
{}

TÂCHE:
Make a one page summary paying special attention to all the administrative matters like budgets, plans, actions to take in the future, organizational and hierarchical charts, announcements, meetings, contacts, elections, reports and all the related topics to these.

OUTPUT:
Publier directement le résumé.
"""


In [None]:
from council_rag.preprocessing import preprocess_markdown_text
from council_rag.data_transformations import process_tables, summarize_clusters
import time

start = time.time()

paragraphs_list, clusters_dict = preprocess_markdown_text(markdown_example,
                                                        model_id ="HIT-TMG/KaLM-embedding-multilingual-mini-v1", 
                                                        spacy_model="fr_core_news_sm", 
                                                        n_sents_per_para=10,
                                                        device=device)

processed_tables = process_tables(pdf_path, 
                                base_prompt, 
                                groq_token)

clusters_dict = summarize_clusters(clusters_dict, 
                                    summary_prompt, 
                                    groq_token, 
                                    model="gemma2-9b-it", 
                                    token_limit=14000, 
                                    sleep_time=60)

end = time.time()
print(end - start)

In [68]:
print((end - start)/60)

6.1801420529683435


The processing time so far is around 4 minutes and a half.

The variables we are going to query are:

In [69]:
# paragraphs_dict 
# processed_tables
# clusters_dict

## Indexing and querying

Deleting some info that we won't need and turning the cluster dict into a list to match the paragraphs list.

In [70]:
del paragraphs_list[0]["para_embedding"]

In [71]:
clusters_list = list(clusters_dict.values())
i = 0
while i < len(clusters_list):
    clusters_list[i]["cluster"] = i
    i += 1


Getting the embedding model and setting up the index and vector stores.

In [72]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [73]:
# checking the shape of the embedding to pass it to the index
shape_emb = embedding_model.encode("Hello World!")
emd_dims =  shape_emb.shape[0]

In [74]:
import faiss

# init the intex
index = faiss.IndexFlatL2(emd_dims)

In [75]:
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# creating the vector store
vector_store = FAISS(embedding_model, index, InMemoryDocstore({}), {})

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [76]:
clusters_list[0].keys()

dict_keys(['para_indexes', 'union_paras', 'cluster_summary', 'cluster'])

Processing all the information we have collected so far into Document objects that will be fed into the vector store.

In [77]:
cluster_paras = [] # to process into smaller chunks
cluster_summ_docs = [] # filled with the cluster summaries langchain doc type

for cluster in clusters_list:
    
    cluster_para = cluster["union_paras"]
    cluster_summ = cluster["cluster_summary"]
    
    cluster_paras.append(cluster_para)
    cluster_summ_docs.append(Document(page_content=cluster_summ, metadata={"cluster": cluster["cluster"],
                                                                            "type": "summary"}))

In [78]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=450, 
                                        chunk_overlap=35,
                                        length_function=len,
                                        is_separator_regex=False)

In [79]:
cluster_paras_docs = []
for cluster_union in cluster_paras:
    cluster_union_docs = splitter.split_text(cluster_union)
    cluster_paras_docs.append(cluster_union_docs)

In [80]:
len(cluster_paras_docs[0])

54

In [81]:
cluster_chunks = []
i = 0
while i < len(cluster_paras_docs):  
    curr_cluster = cluster_paras_docs[i]
    for cluster_chunk in curr_cluster:
        cluster_chunks.append(Document(page_content=cluster_chunk, metadata={"cluster": i,
                                                                                "type": "cluster_chunk"}))
    i += 1


In [82]:
augmented_table_chunks = []
table_descriptions_chunks = []
html_tables_chunks = []

for table in processed_tables:
    augmented_chunk = table["table_augmented_context"]
    augmented_table_chunks.append(Document(page_content=augmented_chunk, metadata={"type": "augmented_table"}))
    
    description_chunk = table["table_context"]
    table_descriptions_chunks.append(Document(page_content=description_chunk, metadata={"type": "description_table"}))
    
    html_chunk = table["table_html"]
    html_tables_chunks.append(Document(page_content=html_chunk, metadata={"type": "html_table"}))
    


ValidationError: 1 validation error for Document
page_content
  Input should be a valid string [type=string_type, input_value=["### **Page 1 sur 49**\n...du 1er septembre 2021."], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type

Overall we have:

- The individual text chunks.
- The clusters summaries chunks.
- The augmented table chunks.
- The table descriptions chunks.
- The html tables chunks.


In [45]:
cluster_chunks[0]

Document(metadata={'cluster': 0, 'type': 'cluster_chunk'}, page_content="PROCES VERBAL\n\n\n\n L'an deux mille vingt et un, le 28 octobre à 18 h 00, les conseillers communautaires de Roannais Agglomération, se sont réunis à l\x92Espace Chorum \x96 Halle Vacheresse \x96 Rue des Vernes à Roanne.\n\n\n\n La convocation de tous les conseillers a été faite le 22 octobre 2021, dans les formes et délais prescrits par la loi, par Yves Nicolin, Président.\n\n\n\n # Etaient présents :")

In [46]:
cluster_summ_docs[0]

Document(metadata={'cluster': 0, 'type': 'summary'}, page_content='The provided text describes the proceedings of a meeting of the Roannais Agglomeration (likely a French "agglomération"  is a type of metropolitan or urban agglomeration. **The documents appear to be minutes of a meeting of Roannais agglomération.  **\n\n\nThe document is a record of a meeting of the  of Roannais Agglomeration\n\n\nPlease provide the full text!\n\n\n## Summary of proceedings of  Roannais Agglomè\n\nThe document is a record of a meeting \n\nThe document is a record of a meeting\n\nThe document is a record of a \n\n\n## Summary of  \n\n### Budget \n\n\n## Administrative Matters\n\n\nHere is a summary of the  \n- Budget General\n\n\nHere\'\n\n\nThis document appears to be a record of a meeting \n___\n\nPlease provide the full text  \n\n\nPlease provide the full text\n\nThis document is a record of a meeting\n\n\nPlease provide the full text!\n\n\nPlease\n\n*   Report of a\n\n*\n\n\n##.  \n\nThe document is

In [None]:
augmented_table_chunks[0]

In [None]:
table_descriptions_chunks[0]

In [None]:
html_tables_chunks[0]