# Generate training data

In [5]:
import re
from pathlib import Path

import pandas as pd

# Cleaning helper: collapses every kind of whitespace into single spaces
def clean_text(text):
    """Return ``text`` as a single line with normalised whitespace.

    Every run of whitespace (spaces, newlines, carriage returns, tabs and
    other Unicode whitespace) is collapsed to one space, and leading and
    trailing whitespace is removed.

    Parameters
    ----------
    text : object
        The value to clean. Anything that is not a ``str`` (for example the
        float NaN that pandas uses for empty cells, or ``None``) is treated
        as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # str.split() with no argument splits on any whitespace run and drops
    # empty leading/trailing pieces, so join() yields the normalised string.
    return " ".join(text.split())

print("Librerie importate e funzioni definite.")

Librerie importate e funzioni definite.


In [6]:
# --- FILE LOADING ---
# Paths are relative: the CSV files are expected in a `Schemes/` folder under
# the directory the notebook kernel is running from.
df_ens = pd.read_csv('Schemes/SpanishENS.csv')
df_secnum = pd.read_csv('Schemes/secnumcloud_controlsParsedAndTranslatedEnglish.csv')


# --- LOOKUP DICTIONARIES ---
def build_lookup(df, id_col, desc_col):
    """Build a ``{control id: cleaned description}`` dictionary from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one control per row.
    id_col : str
        Name of the column with the control identifier.
    desc_col : str
        Name of the column with the control description.

    Returns
    -------
    dict
        Maps the stripped, stringified identifier to the description after
        ``clean_text``. Rows with a missing/blank identifier or an empty
        cleaned description are skipped. If an identifier appears more than
        once, the last row wins.
    """
    lookup = {}
    for raw_id, raw_desc in zip(df[id_col], df[desc_col]):
        # str(NaN) is the truthy string 'nan', so missing IDs must be
        # rejected before stringifying or they would become a bogus key.
        if pd.isna(raw_id):
            continue
        control_id = str(raw_id).strip()
        description = clean_text(raw_desc)
        if control_id and description:
            lookup[control_id] = description
    return lookup


# 1. Spanish ENS dictionary: 'Control ID' -> 'Description'
ens_lookup = build_lookup(df_ens, 'Control ID', 'Description')

# 2. SecNumCloud dictionary: 'ID' -> 'Description_EN'
#    (the English text is used to align it with the Cisco framework)
secnum_lookup = build_lookup(df_secnum, 'ID', 'Description_EN')

print("Dizionari creati.")
print(f"- ENS entries: {len(ens_lookup)}")
print(f"- SecNumCloud entries: {len(secnum_lookup)}")

Dizionari creati.
- ENS entries: 204
- SecNumCloud entries: 261


In [7]:
# Load the HUB file (Cisco)
df_cisco = pd.read_csv('Schemes/Cisco.csv')

# Columns of interest in the Cisco file
col_anchor = 'Control Wording'
cols_ens = ['Spanish ENS BASIC Control', 'Spanish ENS Medium Control', 'Spanish ENS High Control']
col_secnum = 'SecNumCloud Control'

# Quality filter applied to Spanish ENS positives only: descriptions shorter
# than this many characters, or containing this phrase, are discarded.
MIN_POSITIVE_LENGTH = 15
GENERIC_PHRASE = "shall be used"

# Column order of the resulting training frame (also used when no pair is found,
# so that the frame and the CSV written from it still carry a header).
PAIR_COLUMNS = ['anchor', 'positive', 'source_standard', 'control_id']


def split_control_ids(cell):
    """Split a mapping cell into a list of stripped, non-empty control IDs.

    IDs inside a cell are separated by commas and/or newlines. A missing cell
    (NaN/None) yields an empty list.
    """
    if pd.isna(cell):
        return []
    return [part.strip() for part in re.split(r'[,\n]', str(cell)) if part.strip()]


def is_low_quality(text):
    """Return True when ``text`` is too short or contains the generic phrase."""
    return len(text) < MIN_POSITIVE_LENGTH or GENERIC_PHRASE in text.lower()


def collect_pairs(anchor_text, cell, lookup, source_standard, reject=None):
    """Return the training pairs produced by one mapping cell.

    Parameters
    ----------
    anchor_text : str
        Cleaned Cisco control wording used as the anchor.
    cell : object
        Raw cell value holding zero or more control IDs.
    lookup : dict
        Maps control ID -> description for the target standard.
    source_standard : str
        Label stored in the ``source_standard`` field of every pair.
    reject : callable, optional
        Predicate on the description; pairs for which it returns True are
        dropped. ``None`` keeps every pair.

    Returns
    -------
    list of dict
        One record per ID that exists in ``lookup`` and passes ``reject``,
        in the order the IDs appear in the cell.
    """
    pairs = []
    for map_id in split_control_ids(cell):
        # IDs that are not present in the target scheme are silently ignored.
        if map_id not in lookup:
            continue
        positive_text = lookup[map_id]
        if reject is not None and reject(positive_text):
            continue
        pairs.append({
            'anchor': anchor_text,
            'positive': positive_text,
            'source_standard': source_standard,
            'control_id': map_id
        })
    return pairs


print("Inizio elaborazione righe Cisco...")

training_pairs = []
for _, row in df_cisco.iterrows():
    # 1. Extract the anchor (the generic wording of the control)
    anchor_text = clean_text(row[col_anchor])
    if not anchor_text:
        continue  # nothing to pair against

    # 2. Spanish ENS mappings (BASIC / Medium / High columns), quality-filtered
    for col in cols_ens:
        training_pairs.extend(
            collect_pairs(anchor_text, row[col], ens_lookup, 'Spanish ENS', reject=is_low_quality)
        )

    # 3. SecNumCloud mappings (no quality filter is applied here)
    training_pairs.extend(
        collect_pairs(anchor_text, row[col_secnum], secnum_lookup, 'SecNumCloud')
    )

# Convert to a DataFrame and drop exact duplicates (the same ID can be listed
# in more than one ENS column of the same row).
df_training = pd.DataFrame(training_pairs, columns=PAIR_COLUMNS).drop_duplicates()

print("Generazione completata!")
print(f"Totale coppie di training generate: {len(df_training)}")
df_training.head()

Inizio elaborazione righe Cisco...
Generazione completata!
Totale coppie di training generate: 574


Unnamed: 0,anchor,positive,source_standard,control_id
0,Independent Control self-assessments are perfo...,The service provider must document and impleme...,SecNumCloud,18.2.1.a
1,Independent Control self-assessments are perfo...,The service provider via the information secur...,SecNumCloud,18.3.a
2,Independent Control self-assessments are perfo...,The service provider must document and impleme...,SecNumCloud,18.4.a
3,"At least quarterly, [The Organization] reviews...",The service provider must document the operati...,SecNumCloud,12.1.a
4,A three-year audit program is in place which d...,The service provider must document and impleme...,SecNumCloud,18.2.1.a


In [None]:
# Save the training pairs to CSV
# NOTE(review): the stored output of this cell reports
# 'TrainAndTestData/training_data_compliance.csv' and the cell has no execution
# count, so that output comes from an earlier version of this cell — re-run to
# refresh it and confirm which filename is the intended one.
output_filename = 'TrainAndTestData/training.csv'
# to_csv does not create missing directories, so make sure the target folder exists.
Path(output_filename).parent.mkdir(parents=True, exist_ok=True)
df_training.to_csv(output_filename, index=False)
print(f"File salvato come: {output_filename}")



File salvato come: TrainAndTestData/training_data_compliance.csv
