## imports

In [1]:
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
# from langchain_community.document_loaders import Docx2txtLoader
import os
import json

from langchain_community.retrievers import BM25Retriever
from langchain_core.prompts import ChatPromptTemplate
from langchain.docstore.document import Document

import requests

import pandas as pd

In [2]:


# Folder that holds the input documents; the extraction cell assigns the same value again.
folder_path = "Inputs GenAI BMS"

## helper functions

In [None]:
import os
import pdfplumber  # Extraction texte des PDFs
import pytesseract  # OCR pour images et PDFs scannés
import cv2
import pandas as pd
from pdf2image import convert_from_path  # Convertir PDF en images
from pptx import Presentation  # Extraction texte PowerPoint
from PIL import Image
import numpy as np

# Configuration
folder_path = "Inputs GenAI BMS"  # Change to your actual input folder
output_csv = "extracted_dataset.csv"
pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"  # Adjust to the local Tesseract installation

# List of extracted records (not referenced by the cells visible here; kept for compatibility)
dataset = []

# Make sure the input folder exists and really is a directory.
# An exception is raised instead of calling exit(): exit() is an interactive-shell
# helper, whereas an exception stops the cell with a normal traceback.
if not os.path.isdir(folder_path):
    raise FileNotFoundError(f"❌ Dossier introuvable : {folder_path}")

# Make sure there is something to process and report how many entries were found.
files = os.listdir(folder_path)
if not files:
    raise FileNotFoundError("❌ Aucun fichier trouvé dans le dossier.")
print(f"📂 {len(files)} fichiers trouvés dans {folder_path}")

# Extract the embedded text of a PDF with pdfplumber
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A list with one ``{"page_num": int, "text": str}`` dict per page,
        numbered from 1. A page for which no text is returned contributes an
        empty string. If anything raises, the error is printed and an empty
        list is returned.
    """
    try:
        with pdfplumber.open(pdf_path) as document:
            return [
                {"page_num": number, "text": (page.extract_text() or "").strip()}
                for number, page in enumerate(document.pages, start=1)
            ]
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch.
        print(f"❌ Erreur extraction PDF ({pdf_path}): {e}")
        return []

# OCR the rendered pages of a PDF
def extract_ocr_from_pdf(pdf_path, dpi=150, max_pages=5):
    """Run Tesseract OCR on the rendered pages of a PDF.

    Pages 1 to ``max_pages`` are rendered at ``dpi``. Renders narrower or
    shorter than 50 pixels are skipped. Each remaining render is converted to
    grayscale, binarised with the Otsu threshold flag and passed to Tesseract
    with ``lang="fra+eng"``.

    Args:
        pdf_path: Path of the PDF file to render.
        dpi: Rendering resolution handed to ``convert_from_path``.
        max_pages: Last page to render (rendering starts at page 1).

    Returns:
        A list of ``{"page_num": int, "ocr_text": str}`` dicts, one per render
        that produced non-empty text; ``page_num`` is the 1-based position of
        the render. If anything raises, the error is printed and an empty list
        is returned.
    """
    try:
        rendered_pages = convert_from_path(pdf_path, dpi=dpi, first_page=1, last_page=max_pages)
        results = []
        for position, rendered in enumerate(rendered_pages, start=1):
            bgr = cv2.cvtColor(np.array(rendered), cv2.COLOR_RGB2BGR)
            height, width, _ = bgr.shape
            if min(width, height) < 50:
                # Too small to contain readable text.
                continue

            grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
            _, binary = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            recognised = pytesseract.image_to_string(binary, lang="fra+eng").strip()
            if recognised:
                results.append({"page_num": position, "ocr_text": recognised})
        return results
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch.
        print(f"❌ Erreur OCR PDF ({pdf_path}): {e}")
        return []

# Extract the text of a PowerPoint deck
def extract_text_from_pptx(pptx_path):
    """Return the text of every slide of a PowerPoint file.

    Args:
        pptx_path: Path of the .pptx file to open.

    Returns:
        A list with one ``{"slide_num": int, "text": str}`` dict per slide,
        numbered from 1. The text is the stripped ``text`` of every shape that
        has such an attribute, joined with newlines. If anything raises, the
        error is printed and an empty list is returned.
    """
    try:
        deck = Presentation(pptx_path)
        slides_text = []
        for slide_num, slide in enumerate(deck.slides, start=1):
            fragments = [shape.text.strip() for shape in slide.shapes if hasattr(shape, "text")]
            slides_text.append({"slide_num": slide_num, "text": "\n".join(fragments).strip()})
        return slides_text
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch.
        print(f"❌ Erreur extraction PPTX ({pptx_path}): {e}")
        return []

# Process every supported file of the input folder.
# One record is produced per PDF page / PowerPoint slide with the keys
# "file", "text" and "ocr_text".
df_data = []
# os.listdir() returns entries in arbitrary order; sorting makes the CSV reproducible.
for file_name in sorted(files):
    file_path = os.path.join(folder_path, file_name)
    # Compare extensions case-insensitively so that e.g. "REPORT.PDF" is not silently skipped.
    lowered_name = file_name.lower()

    if lowered_name.endswith(".pdf"):
        print(f"📄 Extraction PDF : {file_name}")
        pdf_text = extract_text_from_pdf(file_path)
        pdf_ocr = extract_ocr_from_pdf(file_path, dpi=150, max_pages=5)

        # Index the OCR results by page number once instead of rescanning the
        # list for every page; setdefault keeps the first entry for a page.
        ocr_by_page = {}
        for ocr in pdf_ocr:
            ocr_by_page.setdefault(ocr["page_num"], ocr["ocr_text"])

        for page in pdf_text:
            df_data.append({
                "file": file_name,
                "text": page["text"],
                "ocr_text": ocr_by_page.get(page["page_num"], ""),
            })

    elif lowered_name.endswith(".pptx"):
        print(f"📊 Extraction PPTX : {file_name}")
        ppt_text = extract_text_from_pptx(file_path)
        for slide in ppt_text:
            # Slides are not OCR'd, so the OCR column stays empty.
            df_data.append({"file": file_name, "text": slide["text"], "ocr_text": ""})

# Only write the CSV when something was extracted
if df_data:
    df = pd.DataFrame(df_data)
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"✅ Extraction terminée ! Données enregistrées dans {output_csv}")
else:
    print("❌ Aucune donnée extraite, le fichier CSV ne sera pas généré.")


In [None]:
# Data cleaning
import pandas as pd
import re

# Load the extracted dataset written by the extraction cell.
# NOTE(review): empty CSV cells are presumably read back as NaN — confirm before relying on string methods.
df = pd.read_csv("extracted_dataset.csv", encoding="utf-8")

# Normalisation applied to the "text" and "ocr_text" columns
def clean_text(text):
    """Normalise a piece of extracted text.

    The text is lowercased, newlines become spaces, every character that is
    neither a word character nor whitespace is removed, runs of whitespace are
    collapsed to a single space and the result is trimmed.

    Args:
        text: The text to clean. ``None`` and float NaN yield an empty string;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    # Missing values: return an empty string instead of failing on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Lowercase
    text = str(text).lower()

    # Replace line breaks with spaces, then drop every special character
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)

    # Collapse repeated whitespace and trim LAST: removing punctuation can
    # expose new leading/trailing spaces (e.g. "! hello" -> " hello").
    return re.sub(r'\s+', ' ', text).strip()

# Clean the "text" and "ocr_text" columns.
# Missing values are replaced by "" first: passing NaN through str() would
# turn it into the literal string "nan" and pollute the dataset.
for column in ("text", "ocr_text"):
    df[column] = df[column].fillna("").apply(lambda x: clean_text(str(x)))

# Drop rows that carry no content at all. Rows are kept as soon as one of the
# two columns has text, because most rows legitimately have an empty OCR column.
df = df[(df["text"] != "") | (df["ocr_text"] != "")]

# Remove duplicates (rows whose two text columns are identical)
df = df.drop_duplicates(subset=["text", "ocr_text"])

# Report the size after cleaning
print(f"Data cleaned. Number of rows after cleaning: {len(df)}")

# Save the cleaned data to a new CSV file
df.to_csv("cleaned_dataset.csv", index=False, encoding="utf-8")
print("Cleaned dataset saved to cleaned_dataset.csv")


# Chunking method

In [52]:
# Module-level splitter configured for chunks of 800 with an overlap of 200.
# No separator is passed, so the splitter's own default separator applies.
text_splitter = CharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200,
)

# chunking

In [53]:
# Start from an empty chunk list; the chunking cell rebinds this name with the real chunks.
chunks_all = []

In [None]:
from langchain_text_splitters import CharacterTextSplitter

def split_dataset_into_chunks(df, chunk_size=800, chunk_overlap=200, separator=" "):
    """Split the text columns of a dataset into chunks with CharacterTextSplitter.

    Args:
        df: pandas DataFrame with the columns "file", "text" and "ocr_text".
        chunk_size: Maximum size of each chunk, in characters.
        chunk_overlap: Overlap between consecutive chunks, in characters.
        separator: String on which the text is split before the pieces are
            merged into chunks. It defaults to a single space because
            ``clean_text`` removes every line break, so the splitter's own
            default separator would never match and each row would come back
            as one oversized chunk. Pass ``"\\n\\n"`` for paragraph splitting.

    Returns:
        A list of dicts, one per chunk, with the keys "chunk" (the chunk
        text), "file" (the source file of the row) and "type" ("text" or
        "ocr_text", i.e. the column the chunk comes from). For every row the
        "text" chunks come before the "ocr_text" chunks.
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    chunks_all = []
    for _, row in df.iterrows():
        # The same treatment applies to both text columns; the column name
        # doubles as the chunk "type".
        for column in ("text", "ocr_text"):
            value = row[column]
            # Skip missing cells (NaN after a CSV round-trip) and blank text
            # instead of letting the splitter fail on a non-string value.
            if not isinstance(value, str) or not value.strip():
                continue
            for chunk in text_splitter.split_text(value):
                chunks_all.append({
                    'chunk': chunk,
                    'file': row['file'],
                    'type': column,
                })

    return chunks_all

# Usage: chunk the DataFrame currently bound to `df`
chunks_all = split_dataset_into_chunks(df) 

# Optionally persist the chunks to a CSV file
import pandas as pd
chunks_df = pd.DataFrame(chunks_all)
chunks_df.to_csv("chunks_dataset.csv", index=False, encoding="utf-8") 


inputs/Inputs GenAI BMS/Concept MIL High Level Testing.pdf
inputs/Inputs GenAI BMS/GrundlagenElektrotechnik_35006.pdf
inputs/Inputs GenAI BMS/How2CANalyzer.pdf
inputs/Inputs GenAI BMS/How2Controldesk2_Gen5.pdf
inputs/Inputs GenAI BMS/How2Ediabas.pdf
inputs/Inputs GenAI BMS/How2INCA.pdf
inputs/Inputs GenAI BMS/How2JIRA.pdf
inputs/Inputs GenAI BMS/Short_ISTQB.pdf
inputs/Inputs GenAI BMS/BMS Doc/140228_Ladestrategie_und_Regelungstechnik_Schulung.pdf
inputs/Inputs GenAI BMS/BMS Doc/applsci-12-10756-v3.pdf
inputs/Inputs GenAI BMS/BMS Doc/Arrow-Infineon-Battery-Management-Systems-BMS Whitepaper.pdf
inputs/Inputs GenAI BMS/BMS Doc/Fit4HV_Speicher_Version_2021.pdf
inputs/Inputs GenAI BMS/BMS Doc/Infineon-INF1197_ART_BMS_Whitepaper_d08-Whitepaper-v01_00-EN.pdf
inputs/Inputs GenAI BMS/BMS Doc/sustainability-14-15912.pdf
inputs/ISO 26262 methods/ISO-26262-1.pdf
inputs/ISO 26262 methods/ISO-26262-10.pdf
MuPDF error: syntax error: unknown keyword: 'l673.99'

inputs/ISO 26262 methods/ISO-26262-2.pdf

In [55]:
# Number of chunks currently held in chunks_all
len(chunks_all)

1118

In [None]:
import pandas as pd
from deep_translator import GoogleTranslator

# Load the chunk dataset (note: this rebinds the global name `df`, previously the cleaned dataset)
df = pd.read_csv("chunks_dataset.csv")

# Translate a column of texts to English
def translate_column(column, max_length=800, translator=None):
    """Translate every non-missing value of a column to English.

    Long values are cut into pieces of at most ``max_length`` characters.
    Cuts are made on whitespace whenever possible, so words are not split in
    the middle; only a single word longer than ``max_length`` is broken. The
    translated pieces are joined with a single space.

    Args:
        column: Iterable of values to translate (e.g. a pandas Series).
        max_length: Maximum number of characters sent per translation call.
        translator: Optional object exposing ``translate(text)``. When omitted
            a single ``GoogleTranslator(source='auto', target='en')`` is
            created and reused for the whole column.

    Returns:
        A list with one entry per input value: the translated string, or the
        original value unchanged when it is missing (``pd.notna`` is false).
    """
    import textwrap  # local import: textwrap is not part of the notebook's import cells

    # Build the translator once instead of once per piece.
    if translator is None:
        translator = GoogleTranslator(source='auto', target='en')

    translated_column = []
    for x in column:
        if not pd.notna(x):
            # Keep missing values as they are.
            translated_column.append(x)
            continue

        pieces = textwrap.wrap(
            str(x),
            width=max_length,
            break_long_words=True,
            break_on_hyphens=False,
            replace_whitespace=False,
        )

        translated_pieces = []
        for piece in pieces:
            result = translator.translate(piece)
            # If the translator hands back something that is not a string
            # (e.g. None), keep the untranslated piece rather than crashing.
            translated_pieces.append(result if isinstance(result, str) else piece)
        translated_column.append(" ".join(translated_pieces).strip())
    return translated_column


In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding function built on the paraphrase-multilingual-MiniLM-L12-v2 sentence-transformers model, run on CPU.
# NOTE(review): trust_remote_code=True is passed to the model loader — confirm it is actually required for this model.
model_kwargs = {"trust_remote_code": True, "device": "cpu"}
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", model_kwargs=model_kwargs)

In [None]:
from chromadb import Client
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document

# Embedding model used to vectorise the chunks
embedding_function = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

# Directory in which the Chroma database is persisted
DB_FOLDER = 'db_3'
print(DB_FOLDER)

# Wrap every chunk record of chunks_all in a Document: the chunk text becomes
# the page content, the source file and column type become the metadata.
documents = [
    Document(
        page_content=record['chunk'],
        metadata={'file': record['file'], 'type': record['type']},
    )
    for record in chunks_all
]

# Embed the documents and store them in Chroma under DB_FOLDER
db = Chroma.from_documents(documents, embedding_function, persist_directory=DB_FOLDER)


# embedding and loading into the main vectorDB

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding function for the main vector DB: paraphrase-multilingual-MiniLM-L12-v2 on CPU
# (same configuration as the earlier embedding cell).
# NOTE(review): trust_remote_code=True is passed to the model loader — confirm it is actually required for this model.
model_kwargs = {"trust_remote_code": True, "device": "cpu"}
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", model_kwargs=model_kwargs)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
# Run counter; the vector-store cell increments it each time it is executed.
db_n = 0

In [56]:
from langchain_chroma import Chroma
db_n = db_n + 1
# NOTE(review): db_n is incremented but the folder name is a constant —
# presumably f'db_{db_n}' was intended; confirm before changing the location.
DB_FOLDER = f'db_3'
print(DB_FOLDER)

# Chroma.from_documents needs Document objects, whereas split_dataset_into_chunks
# produces plain dicts ("chunk", "file", "type"). Convert the dicts and pass
# through anything that already is a Document.
docs_to_index = [
    item if isinstance(item, Document)
    else Document(
        page_content=item['chunk'],
        metadata={'file': item['file'], 'type': item['type']},
    )
    for item in chunks_all
]
db = Chroma.from_documents(docs_to_index, embedding_function, persist_directory=DB_FOLDER)

db_3
