In [1]:
!pip install tqdm



In [2]:
import pdfplumber
import pandas as pd
import re
from tqdm import tqdm

# --- CONFIGURAZIONE ---
pdf_path = "Other/EUCS PDF official file/EUCS ‚Äì Cloud Service candidate cybersecurity certification scheme.pdf"
output_csv = "EUCS_Full_Extraction.csv"

START_PAGE = 30  
END_PAGE = 160   

def clean_text(text):
    if not text:
        return ""
    text = text.replace('\n', ' ').replace('\r', ' ').replace('\xa0', ' ')
    return re.sub(' +', ' ', text).strip()

def is_control_id(text):
    if not text:
        return False
    return bool(re.match(r'^[A-Z]{2,4}-\d{2}', text.strip()))

print(f"üîÑ Inizio estrazione da: {pdf_path}")
print("   (Questa operazione pu√≤ richiedere un paio di minuti...)")

extracted_data = []
current_control = None

with pdfplumber.open(pdf_path) as pdf:
    pages = pdf.pages[START_PAGE:END_PAGE]

    for page in tqdm(pages, desc="Estrazione pagine", unit="pagina"):
        tables = page.extract_tables()

        for table in tables:
            for row in table:
                cleaned_row = [clean_text(cell) if cell else "" for cell in row]

                if not any(cleaned_row):
                    continue
                if "Control ID" in cleaned_row[0] or "Requirement" in cleaned_row[0]:
                    continue

                col_id = cleaned_row[0]

                # --- CASE A: New control row ---
                if is_control_id(col_id):
                    if current_control:
                        extracted_data.append(current_control)

                    description = cleaned_row[1] if len(cleaned_row) > 1 else ""

                    # Estraggo eventuali colonne Ass. Level
                    assurance_basic = cleaned_row[2] if len(cleaned_row) > 2 else ""
                    assurance_substantial = cleaned_row[3] if len(cleaned_row) > 3 else ""
                    assurance_high = cleaned_row[4] if len(cleaned_row) > 4 else ""

                    current_control = {
                        "controlId": col_id,
                        "description": description,
                        "assurance_basic": assurance_basic,
                        "assurance_substantial": assurance_substantial,
                        "assurance_high": assurance_high
                    }

                # --- CASE B: continuation of description ---
                elif col_id == "" and current_control is not None:
                    extra_desc = cleaned_row[1] if len(cleaned_row) > 1 else ""
                    if extra_desc:
                        current_control["description"] += " " + extra_desc

# Add last item  
if current_control:
    extracted_data.append(current_control)

print("\n‚úÖ Estrazione completata.")

df = pd.DataFrame(extracted_data)

df = df.drop_duplicates(subset=["controlId"])
df = df[df["description"].str.len() > 10]

df.to_csv(output_csv, index=False)

print(f"üíæ File salvato: {output_csv}")
print(f"üìä Totale Controlli Estratti: {len(df)}")


üîÑ Inizio estrazione da: Other/EUCS PDF official file/EUCS ‚Äì Cloud Service candidate cybersecurity certification scheme.pdf
   (Questa operazione pu√≤ richiedere un paio di minuti...)


Estrazione pagine: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 130/130 [00:29<00:00,  4.35pagina/s]


‚úÖ Estrazione completata.
üíæ File salvato: EUCS_Full_Extraction.csv
üìä Totale Controlli Estratti: 58





In [3]:
import pandas as pd
import random

df = pd.read_csv("EUCS_Full_Extraction.csv")

print("üìä RIEPILOGO ESTRATTI")
print("----------------------")
print(f"Totale controlli estratti: {len(df)}\n")

print("üîç Controlli con valori Assurance Level:")
print(f"- Basic compilato: {df['assurance_basic'].astype(bool).sum()}")
print(f"- Substantial compilato: {df['assurance_substantial'].astype(bool).sum()}")
print(f"- High compilato: {df['assurance_high'].astype(bool).sum()}\n")

print("üß™ Esempio di 5 controlli a caso:")
samples = df.sample(5, random_state=42)
print(samples[["controlId","assurance_basic","assurance_substantial","assurance_high","description"]])

# Controlli con pattern strani negli assurance
weird = df[(df["assurance_basic"].str.len() > 10) |
           (df["assurance_substantial"].str.len() > 10) |
           (df["assurance_high"].str.len() > 10)]

print("\n‚ö†Ô∏è Possibili valori sospetti negli Assurance:")
print(weird[["controlId","assurance_basic","assurance_substantial","assurance_high"]].head(10))


üìä RIEPILOGO ESTRATTI
----------------------
Totale controlli estratti: 58

üîç Controlli con valori Assurance Level:
- Basic compilato: 58
- Substantial compilato: 58
- High compilato: 58

üß™ Esempio di 5 controlli a caso:
   controlId  assurance_basic  \
0   ISP-02.7              NaN   
5    HR-02.4              NaN   
34  CCM-04.3              NaN   
13   PS-05.6              NaN   
44   PM-04.8              NaN   

                                assurance_substantial  \
0   After an update of procedures and policies, th...   
5   The competency and integrity of internal and e...   
34  The CSP shall automatically monitor the approv...   
13                                                NaN   
44                                                NaN   

                                       assurance_high         description  
0                                                 NaN   Guidance elements  
5                                                 NaN   Guidance elements  
3

AttributeError: Can only use .str accessor with string values!