In [None]:
import pdfplumber
import pandas as pd
import re
from tqdm import tqdm

# Input PDF that is scanned for requirement rows.
pdf_path = "Other/EUCS PDF official file/EUCS – Cloud Service candidate cybersecurity certification scheme.pdf"
# CSV file the extracted rows are written to.
# NOTE(review): the file name repeats the folder name ("EUCS PDF official file" + "EUCS_AnnexA_...");
# this looks like an accidental concatenation — confirm. The later analysis cell reads this
# same literal path, so both must be changed together.
output_csv = "Other/EUCS PDF official file/EUCS PDF official fileEUCS_AnnexA_Extraction.csv"

# Bounds used to slice pdf.pages: 0-based start index (inclusive) and end index (exclusive).
START_PAGE = 30
END_PAGE = 200

def clean(t):
    """Normalise whitespace in a piece of extracted text.

    Non-breaking spaces (U+00A0) are turned into ordinary spaces, every run of
    whitespace is collapsed to a single space, and the result is trimmed at
    both ends. Falsy input (``None`` or an empty string) yields ``""``.
    """
    if t:
        without_nbsp = t.replace("\xa0", " ")
        collapsed = re.sub(r"\s+", " ", without_nbsp)
        return collapsed.strip()
    return ""

# Regex for requirement lines: "<ID><description><assurance level>" on a single line.
#   group 1 - requirement ID, e.g. "IAM-03.1" or "IAM-03.10"
#   group 2 - description text between the ID and the level (may carry leading/trailing spaces)
#   group 3 - assurance level, which must be the last token on the line
# The sub-number after the dot accepts one or more digits; with a single `\d`
# an ID such as "IAM-03.10" was truncated to "IAM-03.1" and the remaining "0"
# ended up at the start of the description.
REQ_REGEX = re.compile(r"^([A-Z]{2,4}-\d{2}\.\d+)(.*?)(Basic|Substantial|High)$")

data = []
unique_counter = {}  # occurrences seen so far per base ID; used to make controlId unique

# Column order of the output CSV. Passed explicitly to the DataFrame so that a
# run with no matches still writes a header row instead of a column-less file.
_OUTPUT_COLUMNS = ["controlId", "baseId", "description", "assurance_level"]


def _flush_requirement(records, counter, base_id, text, level):
    """Append the buffered requirement to ``records``; do nothing if no requirement is buffered.

    ``controlId`` is ``"<base_id>-<n>"`` where ``n`` is the current occurrence
    count of ``base_id`` in ``counter``.
    """
    if base_id is None:
        return
    records.append({
        "controlId": f"{base_id}-{counter[base_id]}",
        "baseId": base_id,
        "description": text.strip(),
        "assurance_level": level
    })


with pdfplumber.open(pdf_path) as pdf:
    for page in tqdm(pdf.pages[START_PAGE:END_PAGE], desc="Scanning pages"):
        text = page.extract_text()
        if not text:
            continue

        # Buffers are reset for every page, so a requirement is never carried
        # across a page break.
        # NOTE(review): text that continues on the next page before a new ID
        # appears is therefore dropped, and any non-matching line that follows
        # the last requirement on a page is appended to its description —
        # confirm this is acceptable for the source PDF.
        buffer_id = None
        buffer_text = ""
        buffer_level = None

        for line in text.split("\n"):
            line = clean(line)
            if not line:
                continue

            m = REQ_REGEX.match(line)

            if m is None:
                # No new ID on this line: it continues the current description.
                # Lines seen before the first requirement on a page are ignored.
                if buffer_id is not None:
                    buffer_text += " " + line
                continue

            # A new requirement starts here: store the previous one first, so
            # its controlId is built before the counter is touched again.
            _flush_requirement(data, unique_counter, buffer_id, buffer_text, buffer_level)

            buffer_id = m.group(1)
            buffer_text = clean(m.group(2))
            buffer_level = m.group(3)
            unique_counter[buffer_id] = unique_counter.get(buffer_id, 0) + 1

        # End of page: store the last buffered requirement, if any.
        _flush_requirement(data, unique_counter, buffer_id, buffer_text, buffer_level)

df = pd.DataFrame(data, columns=_OUTPUT_COLUMNS)
df.to_csv(output_csv, index=False)

print("Done. Saved:", output_csv)
print("Records:", len(df))


Scanning pages: 100%|██████████| 170/170 [00:37<00:00,  4.51it/s]

Done. Saved: Other/EUCS PDF official fileEUCS_AnnexA_Extraction.csv
Records: 534





In [None]:
import pandas as pd

# Reload the extraction result from disk so this cell can be run without
# repeating the PDF scan.
df = pd.read_csv("Other/EUCS PDF official file/EUCS PDF official fileEUCS_AnnexA_Extraction.csv")

print("Totale requisiti estratti:", len(df))
print("\nAssurance Level counts:")
print(df["assurance_level"].value_counts())

# Cap the sample size: DataFrame.sample raises when asked for more rows than exist.
print("\nEsempi random:")
print(df.sample(min(5, len(df)), random_state=42)[["controlId","baseId","assurance_level","description"]])

# Empty descriptions are read back from the CSV as NaN. Comparing a NaN length
# with 30 is False, so those rows would be missing from the check below even
# though they are the worst case. Normalise to strings and count them as length 0.
description_length = df["description"].astype("string").fillna("").str.len()
print("\nControlli con descrizione molto breve (possibile problema):")
print(df[description_length < 30].head())

print("\nControlli duplicati per baseId:")
print(df["baseId"].value_counts().head(10))


Totale requisiti estratti: 534

Assurance Level counts:
assurance_level
Basic          219
Substantial    187
High           128
Name: count, dtype: int64

Esempi random:
      controlId    baseId assurance_level  \
222  IAM-02.3-1  IAM-02.3     Substantial   
131   PS-04.8-1   PS-04.8           Basic   
149  OPS-05.1-1  OPS-05.1           Basic   
244  IAM-04.5-1  IAM-04.5            High   
84    AM-01.5-1   AM-01.5            High   

                                           description  
222  The CSP shall document, communicate and make a...  
131  The CSP shall use encryption on the removable ...  
149  The CSP shall deploy malware protection, if te...  
244  The CSP shall document the incompatibility bet...  
84   The information about assets shall be consider...  

Controlli con descrizione molto breve (possibile problema):
Empty DataFrame
Columns: [controlId, baseId, description, assurance_level]
Index: []

Controlli duplicati per baseId:
baseId
IAM-03.1    4
PS-02.1     2
CS