# Parse pdf

In [2]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20250327->pdfplumber)
  Downloading cryptography-44.0.3-cp39-abi3-manylinux_2_34_x86_64.whl.metadata (5.7 kB)
Collecting cffi>=1.12 (from cryptography>=36.0.0->pdfminer.six==20250327->pdfplumber)
  Downloading cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting pycparser (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20250327->pdfplumber)
  Downloading pycparser-2.22-py3-none-any.whl.metadata (943 bytes)
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
Downloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   

In [None]:
import re
import pdfplumber
import csv

# Regex patterns
chapter_re = re.compile(r'^(\d+)\.\s+(.+)$')  # e.g., "5. Politiques..."
section_re = re.compile(r'^(\d+\.\d+(?:\.\d+)*)\.?\s+(.+)$')  # e.g., "5.3. Appréciation..."
control_re = re.compile(r'^([a-z])\)\s+(.+)$')  # e.g., "a) Le prestataire..."

# Initialize tracking variables
current_chapter = None
current_section = None
controls = []

with pdfplumber.open('secnumcloud-referentiel-exigences-v3.2.pdf') as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        lines = text.split('\n')
        
        for line in lines:
            line = line.strip()
            
            # Match chapter headers (e.g., "5. Politiques...")
            chapter_match = chapter_re.match(line)
            if chapter_match:
                current_chapter = chapter_match.group(1)
                current_section = None  # Reset section on new chapter
                continue
            
            # Match section headers (e.g., "5.3. Appréciation...")
            section_match = section_re.match(line)
            if section_match:
                current_section = section_match.group(1)
                continue
            
            # Match control items (e.g., "a) Le prestataire...")
            control_match = control_re.match(line)
            if control_match and current_section:
                control_letter = control_match.group(1)
                control_text = control_match.group(2)
                control_id = f"{current_section}.{control_letter}"
                controls.append({'ID': control_id, 'Description': control_text})
            elif controls and current_section:
                # Handle multi-line descriptions
                last_control = controls[-1]
                if not re.match(r'^[a-z]\)', line):  # Append if not a new control
                    last_control['Description'] += ' ' + line

# Write to CSV
with open('secnumcloud_controlsParsedFrance.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['ID', 'Description']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for control in controls:
        writer.writerow(control)

# Translate in english

In [5]:
!pip install deep_translator

[0mCollecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
Installing collected packages: deep_translator
Successfully installed deep_translator-1.11.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
import pandas as pd
from deep_translator import GoogleTranslator

# --- Configuration ---
INPUT_CSV_FILE = 'secnumcloud_controlsParsedFrance.csv'
OUTPUT_CSV_FILE = 'secnumcloud_controlsParsedAndTranslatedEnglish.csv'
COLUMN_TO_TRANSLATE = 'Description'
NEW_COLUMN_NAME = 'Description_EN'
SOURCE_LANG = 'fr'
TARGET_LANG = 'en'

# --- Translation Function ---
def translate_with_google(text):
    if pd.isna(text) or not str(text).strip():
        return text
    try:
        # You can also instantiate the translator outside for slight efficiency
        # translator = GoogleTranslator(source=SOURCE_LANG, target=TARGET_LANG)
        # return translator.translate(str(text))
        return GoogleTranslator(source=SOURCE_LANG, target=TARGET_LANG).translate(str(text))
    except Exception as e:
        print(f"Error translating '{str(text)[:50]}...': {e}")
        return str(text) # Return original text on error

# --- Main Script ---
try:
    df = pd.read_csv(INPUT_CSV_FILE)
except FileNotFoundError:
    print(f"Error: '{INPUT_CSV_FILE}' not found.")
    exit()

if COLUMN_TO_TRANSLATE not in df.columns:
    print(f"Error: Column '{COLUMN_TO_TRANSLATE}' not found in the CSV.")
    exit()

print(f"Starting translation of column '{COLUMN_TO_TRANSLATE}' using Google Translate via deep-translator...")

# Apply translation
# For very large datasets, translating text by text can be slow.
# deep-translator can translate a list of texts (batch):
# texts_to_translate = df[COLUMN_TO_TRANSLATE].astype(str).fillna('').tolist()
# # Filter out empty strings if necessary, or handle them if the API does
# valid_texts = [text for text in texts_to_translate if text.strip()]
# if valid_texts:
#    translator = GoogleTranslator(source=SOURCE_LANG, target=TARGET_LANG)
#    translated_batch = translator.translate_batch(valid_texts)
#    # Now you need to map these back carefully to the DataFrame,
#    # handling NaNs and empty strings that were skipped.
# else:
#    translated_batch = []
# For simplicity, applying row by row here:
df[NEW_COLUMN_NAME] = df[COLUMN_TO_TRANSLATE].apply(translate_with_google)

df.to_csv(OUTPUT_CSV_FILE, index=False)
print(f"Translation complete. Saved to '{OUTPUT_CSV_FILE}'")

Starting translation of column 'Description' using Google Translate via deep-translator...
Error translating 'Le prestataire doit informer formellement le comma...': Le prestataire doit informer formellement le commanditaire, et dans un délai d'un mois, de tout changement juridique, organisationnel ou technique pouvant avoir un impact sur la conformité de la prestation aux exigences du chapitre 19.6. Prestataires de services d'informatique en nuage (SecNumCloud) - référentiel d'exigences Version Date Critère de diffusion Page Annexe 1 Références documentaires I. Codes, textes législatifs et réglementaires Renvoi Document Loi du 6 janvier 1978 relative à l'informatique, aux fichiers et aux libertés. [LOI_IL] Disponible sur http://www.legifrance.gouv.fr Règlement (UE) 2016/679 du parlement européen et du conseil du 27 avril 2016 relatif à la protection des personnes physiques à l'égard du traitement [RGPD] des données à caractère personnel et à la libre circulation de ces données. Dispon