In [None]:
import pickle

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import ndcg_score
from sklearn.metrics import f1_score, hamming_loss
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Data loading and preparation
def prepare_data(unified_df):
    """Turn the unified controls table into texts and binarized label matrices.

    Missing values in the two label columns are treated as "no label", and
    label tokens are stripped of surrounding whitespace, so ``"A, B"`` yields
    the codes ``A`` and ``B`` (not ``A`` and ``" B"``).

    Side effect (kept for backward compatibility): the columns
    ``combined_labels`` and ``full_text`` are added to ``unified_df``.

    Args:
        unified_df: DataFrame with the columns ``eucs_code`` (comma-separated
            codes), ``secNumCloud_id``, ``api_name``, ``description`` and
            ``cisco_title``.

    Returns:
        dict with the keys
            'texts': array of concatenated text fields, one per row;
            'eucs_labels': binary indicator matrix of EUCS codes;
            'secnum_labels': binary indicator matrix of SecNumCloud ids;
            'mlb_eucs', 'mlb_secnum': the fitted binarizers;
            'combined_labels': array of ``"<eucs_code>|<secNumCloud_id>"``
                strings, one per row.
    """
    # Normalize the label columns once: NaN -> '', everything as stripped str.
    # Without this, a missing eucs_code reaches .split() as a float and raises.
    eucs_codes = unified_df['eucs_code'].fillna('').astype(str).str.strip()
    secnum_ids = unified_df['secNumCloud_id'].fillna('').astype(str).str.strip()

    # One combined key per row
    unified_df['combined_labels'] = eucs_codes + '|' + secnum_ids

    # Build the text fed to the tokenizer
    unified_df['full_text'] = (
        unified_df['api_name'].fillna('') + ' ' +
        unified_df['description'].fillna('') + ' ' +
        unified_df['cisco_title'].fillna('')
    )

    # Label lists: EUCS may hold several comma-separated codes per row,
    # SecNumCloud holds at most one id per row. Empty tokens are dropped.
    eucs_labels = [
        [code.strip() for code in codes.split(',') if code.strip()]
        for codes in eucs_codes
    ]
    secnum_labels = [[sec_id] if sec_id else [] for sec_id in secnum_ids]

    # Fit the binarizers and encode the labels
    mlb_eucs = MultiLabelBinarizer()
    mlb_secnum = MultiLabelBinarizer()
    eucs_binary = mlb_eucs.fit_transform(eucs_labels)
    secnum_binary = mlb_secnum.fit_transform(secnum_labels)

    return {
        'texts': unified_df['full_text'].values,
        'eucs_labels': eucs_binary,
        'secnum_labels': secnum_binary,
        'mlb_eucs': mlb_eucs,
        'mlb_secnum': mlb_secnum,
        'combined_labels': unified_df['combined_labels'].values
    }

In [None]:
class SecurityControlModel(tf.keras.Model):
    """Multilingual BERT encoder with two sigmoid heads (EUCS and SecNumCloud).

    The pooled BERT output goes through dropout and is then fed to two
    independent dense layers with sigmoid activation, one per label space.

    Args:
        num_eucs_classes: number of output units of the EUCS head.
        num_secnum_classes: number of output units of the SecNumCloud head.
    """

    def __init__(self, num_eucs_classes, num_secnum_classes):
        super().__init__()
        self.bert = TFBertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = tf.keras.layers.Dropout(0.3)
        # Both heads apply a sigmoid, so they emit per-class probabilities,
        # not raw logits.
        self.eucs_classifier = tf.keras.layers.Dense(num_eucs_classes, activation='sigmoid')
        self.secnum_classifier = tf.keras.layers.Dense(num_secnum_classes, activation='sigmoid')

    def call(self, inputs, training=None):
        """Run a forward pass.

        Args:
            inputs: pair ``(input_ids, attention_mask)``.
            training: Keras training flag; forwarded explicitly to the encoder
                and to the dropout layer so train/inference behaviour does not
                depend on implicit propagation.

        Returns:
            Tuple ``(eucs_probs, secnum_probs)`` of sigmoid outputs.
        """
        input_ids, attention_mask = inputs
        encoder_outputs = self.bert(
            input_ids, attention_mask=attention_mask, training=training
        )
        pooled_output = self.dropout(encoder_outputs.pooler_output, training=training)

        eucs_probs = self.eucs_classifier(pooled_output)
        secnum_probs = self.secnum_classifier(pooled_output)

        return eucs_probs, secnum_probs

In [None]:
class SecurityDataset(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that tokenizes texts batch by batch.

    Each item is ``((input_ids, attention_mask), (eucs_labels, secnum_labels))``.

    Args:
        texts: indexable collection of strings.
        eucs_labels: array of EUCS label rows, indexable by an index array.
        secnum_labels: array of SecNumCloud label rows, same length as texts.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` (called with ``return_tensors='tf'``).
        batch_size: number of samples per batch.
        max_len: every text is padded/truncated to this many tokens.
        shuffle: if True (default, the historical behaviour) the sample order
            is reshuffled in ``on_epoch_end``. Pass False for evaluation or
            prediction datasets: once shuffled, batches no longer follow the
            order of the label arrays passed in, so predictions cannot be
            compared with those arrays directly.

    Raises:
        ValueError: if texts and label arrays differ in length.
    """

    def __init__(self, texts, eucs_labels, secnum_labels, tokenizer,
                 batch_size=8, max_len=256, shuffle=True):
        super().__init__()
        if not (len(texts) == len(eucs_labels) == len(secnum_labels)):
            raise ValueError(
                "texts, eucs_labels and secnum_labels must have the same length"
            )
        self.texts = texts
        self.eucs_labels = eucs_labels
        self.secnum_labels = secnum_labels
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_len = max_len
        self.shuffle = shuffle
        # Current sample order; batch idx covers indices[idx*bs:(idx+1)*bs].
        self.indices = np.arange(len(texts))

    def __len__(self):
        """Number of batches (the last one may be smaller than batch_size)."""
        return int(np.ceil(len(self.texts) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(inputs, targets)`` in the current order."""
        batch_indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]
        batch_texts = [self.texts[i] for i in batch_indices]

        encodings = self.tokenizer(
            batch_texts,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='tf'
        )

        batch_eucs = self.eucs_labels[batch_indices]
        batch_secnum = self.secnum_labels[batch_indices]

        return (encodings['input_ids'], encodings['attention_mask']), (batch_eucs, batch_secnum)

    def on_epoch_end(self):
        """Reshuffle the sample order, unless shuffling is disabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

In [None]:
def train_and_evaluate(data, num_folds=10, batch_size=8, epochs=15):
    """Train and score one model per fold of a stratified k-fold split.

    Args:
        data: dict with the keys 'texts', 'eucs_labels', 'secnum_labels' and
            'combined_labels' (the latter is used for stratification).
        num_folds: number of cross-validation folds.
        batch_size: batch size of the training and validation datasets.
        epochs: maximum number of epochs per fold (early stopping may end
            training sooner).

    Returns:
        List with one dict per fold holding 'fold', 'eucs_ndcg',
        'secnum_ndcg', 'eucs_auc', 'secnum_auc' and the Keras 'history' dict.
    """
    # Tokenizer is shared by all folds
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

    # Cross-validation setup
    kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(data['texts'], data['combined_labels'])):
        print(f"\n{'='*40}")
        print(f"Training Fold {fold+1}/{num_folds}")
        print(f"{'='*40}")

        # Reset Keras global state so that models built for earlier folds do
        # not pile up over the run.
        tf.keras.backend.clear_session()

        # Split the data
        train_texts = data['texts'][train_idx]
        val_texts = data['texts'][val_idx]

        train_eucs = data['eucs_labels'][train_idx]
        val_eucs = data['eucs_labels'][val_idx]

        train_secnum = data['secnum_labels'][train_idx]
        val_secnum = data['secnum_labels'][val_idx]

        # Build the datasets
        train_dataset = SecurityDataset(
            train_texts, train_eucs, train_secnum, tokenizer, batch_size=batch_size
        )

        val_dataset = SecurityDataset(
            val_texts, val_eucs, val_secnum, tokenizer, batch_size=batch_size
        )

        # Fresh model for this fold
        model = SecurityControlModel(
            num_eucs_classes=data['eucs_labels'].shape[1],
            num_secnum_classes=data['secnum_labels'].shape[1]
        )

        # Compile
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
            loss={
                'output_1': tf.keras.losses.BinaryCrossentropy(),
                'output_2': tf.keras.losses.BinaryCrossentropy()
            },
            loss_weights=[0.7, 0.3],  # EUCS vs SecNumCloud weighting
            metrics={
                'output_1': [
                    tf.keras.metrics.Precision(name='precision'),
                    tf.keras.metrics.Recall(name='recall'),
                    tf.keras.metrics.AUC(name='auc')
                ],
                'output_2': [
                    tf.keras.metrics.Precision(name='precision'),
                    tf.keras.metrics.Recall(name='recall'),
                    tf.keras.metrics.AUC(name='auc')
                ]
            }
        )

        # Callbacks
        callbacks = [
            tf.keras.callbacks.EarlyStopping(
                patience=3,
                restore_best_weights=True,
                monitor='val_output_1_auc',
                # AUC must be maximized; state it instead of relying on the
                # callback's name-based 'auto' inference.
                mode='max'
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.1,
                patience=2
            )
        ]

        # Training
        history = model.fit(
            train_dataset,
            validation_data=val_dataset,
            epochs=epochs,
            callbacks=callbacks,
            verbose=1
        )

        # Evaluation on the validation split.
        # SecurityDataset.on_epoch_end() reshuffles the dataset's sample order,
        # so after fit() val_dataset no longer follows the order of val_eucs /
        # val_secnum. Predict on a freshly built dataset (identity order) so
        # that row i of the predictions matches row i of the label arrays.
        eval_dataset = SecurityDataset(
            val_texts, val_eucs, val_secnum, tokenizer, batch_size=batch_size
        )
        val_preds = model.predict(eval_dataset)

        # NDCG for EUCS
        eucs_ndcg = float(ndcg_score(val_eucs, val_preds[0], k=10))

        # NDCG for SecNumCloud
        secnum_ndcg = float(ndcg_score(val_secnum, val_preds[1], k=10))

        # Precision-Recall AUC for EUCS
        eucs_auc = float(tf.keras.metrics.AUC(curve='PR')(
            val_eucs, val_preds[0]
        ).numpy())

        # Precision-Recall AUC for SecNumCloud
        secnum_auc = float(tf.keras.metrics.AUC(curve='PR')(
            val_secnum, val_preds[1]
        ).numpy())

        # Store the fold results
        fold_results.append({
            'fold': fold+1,
            'eucs_ndcg': eucs_ndcg,
            'secnum_ndcg': secnum_ndcg,
            'eucs_auc': eucs_auc,
            'secnum_auc': secnum_auc,
            'history': history.history
        })

        print(f"\nFold {fold+1} Results:")
        print(f"EUCS NDCG@10: {eucs_ndcg:.4f}")
        print(f"SecNumCloud NDCG@10: {secnum_ndcg:.4f}")
        print(f"EUCS PR-AUC: {eucs_auc:.4f}")
        print(f"SecNumCloud PR-AUC: {secnum_auc:.4f}")

    return fold_results

In [None]:
def evaluate_model(model, test_dataset, mlb_eucs, mlb_secnum, threshold=0.5):
    """Compute ranking and classification metrics of ``model`` on a dataset.

    Args:
        model: object whose ``predict(test_dataset)`` returns the pair
            ``(eucs_scores, secnum_scores)``.
        test_dataset: dataset exposing ``eucs_labels`` and ``secnum_labels``
            and, optionally, ``indices`` (its current sample order).
        mlb_eucs: unused; kept for interface compatibility.
        mlb_secnum: unused; kept for interface compatibility.
        threshold: scores strictly above this value count as positive for
            F1 and Hamming loss.

    Returns:
        dict with the keys 'ndcg', 'auc', 'f1', 'hamming_loss' and 'coverage'.
    """
    # Snapshot the sample order BEFORE predicting: a dataset that reshuffles
    # itself yields batches in `indices` order, so the true labels must be
    # reordered the same way to line up with the predictions.
    n_samples = len(test_dataset.eucs_labels)
    order = np.array(getattr(test_dataset, 'indices', np.arange(n_samples)), copy=True)

    # Predictions on the test set
    test_preds = model.predict(test_dataset)

    truths = {
        'eucs': np.asarray(test_dataset.eucs_labels)[order],
        'secnum': np.asarray(test_dataset.secnum_labels)[order],
    }
    scores = {'eucs': test_preds[0], 'secnum': test_preds[1]}

    # NDCG at several cut-offs
    ndcg_results = {}
    for k in [1, 3, 5, 10]:
        for task in ('eucs', 'secnum'):
            ndcg_results[f'{task}_ndcg@{k}'] = ndcg_score(truths[task], scores[task], k=k)

    auc = {}
    f1 = {}
    hamming = {}
    for task in ('eucs', 'secnum'):
        # Precision-Recall AUC
        auc[task] = tf.keras.metrics.AUC(curve='PR')(
            truths[task], scores[task]
        ).numpy()

        hard_preds = (scores[task] > threshold).astype(int)

        # Sample-averaged F1
        f1[task] = f1_score(truths[task], hard_preds, average='samples')

        # Hamming loss
        hamming[task] = hamming_loss(truths[task], hard_preds)

    # Ranking metrics: share of relevant labels found in the top-k scores
    coverage = {}
    for k in [3, 5, 10]:
        for task in ('eucs', 'secnum'):
            top_k_preds = np.argsort(scores[task], axis=1)[:, -k:]
            coverage[f'{task}_coverage@{k}'] = coverage_error(
                truths[task],
                scores[task],
                top_k=top_k_preds
            )

    return {
        'ndcg': ndcg_results,
        'auc': auc,
        'f1': f1,
        'hamming_loss': hamming,
        'coverage': coverage
    }

def coverage_error(y_true, y_pred, top_k):
    """
    Mean fraction of each sample's relevant labels that appear in its top-k.

    For every row of ``y_true`` the relevant labels are the positions equal
    to 1; the score of the row is the share of them contained in the matching
    row of ``top_k``. Rows without any relevant label score 0. ``y_pred`` is
    accepted but not used.
    """
    def _row_score(row):
        relevant = set(np.where(y_true[row] == 1)[0])
        if len(relevant) > 0:
            retrieved = set(top_k[row])
            return len(relevant & retrieved) / len(relevant)
        return 0

    per_sample = [_row_score(row) for row in range(len(y_true))]
    return np.mean(per_sample)

In [None]:
def full_training_pipeline(unified_df, num_folds=10, final_epochs=10):
    """Cross-validate, then train and save a final model on all the data.

    Writes the model to ``security_control_model`` and the two fitted label
    binarizers to ``mlb_eucs.pkl`` / ``mlb_secnum.pkl`` (plain pickle files).

    Args:
        unified_df: DataFrame handed to ``prepare_data``.
        num_folds: number of cross-validation folds.
        final_epochs: number of epochs used to train the final model.

    Returns:
        Tuple ``(final_model, data, fold_results, avg_metrics)``.
    """
    # Data preparation
    data = prepare_data(unified_df)

    # Cross-validated training
    fold_results = train_and_evaluate(data, num_folds=num_folds)

    # Average the per-fold metrics
    avg_metrics = {
        metric: np.mean([r[metric] for r in fold_results])
        for metric in ('eucs_ndcg', 'secnum_ndcg', 'eucs_auc', 'secnum_auc')
    }

    print("\nFinal Average Metrics:")
    print(f"EUCS NDCG@10: {avg_metrics['eucs_ndcg']:.4f}")
    print(f"SecNumCloud NDCG@10: {avg_metrics['secnum_ndcg']:.4f}")
    print(f"EUCS PR-AUC: {avg_metrics['eucs_auc']:.4f}")
    print(f"SecNumCloud PR-AUC: {avg_metrics['secnum_auc']:.4f}")

    # Train the final model on the full dataset
    print("\nTraining final model on full dataset...")
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    full_dataset = SecurityDataset(
        data['texts'],
        data['eucs_labels'],
        data['secnum_labels'],
        tokenizer
    )

    final_model = SecurityControlModel(
        num_eucs_classes=data['eucs_labels'].shape[1],
        num_secnum_classes=data['secnum_labels'].shape[1]
    )

    final_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss=tf.keras.losses.BinaryCrossentropy(),
        # Same EUCS / SecNumCloud weighting as the cross-validated models, so
        # the averaged metrics describe the configuration that is shipped.
        loss_weights=[0.7, 0.3],
        metrics=[
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
            tf.keras.metrics.AUC(name='auc')
        ]
    )

    final_model.fit(
        full_dataset,
        epochs=final_epochs,
        verbose=1
    )

    # Persist the final model and the label binarizers.
    # MultiLabelBinarizer has no dump() method, so serialize with pickle.
    final_model.save('security_control_model')
    for binarizer_key, file_name in (('mlb_eucs', 'mlb_eucs.pkl'),
                                     ('mlb_secnum', 'mlb_secnum.pkl')):
        with open(file_name, 'wb') as handle:
            pickle.dump(data[binarizer_key], handle)

    print("Final model saved successfully!")

    return final_model, data, fold_results, avg_metrics

In [None]:
class SecurityControlPredictor:
    """Loads a saved model plus its label binarizers and ranks codes for text.

    Args:
        model_path: path handed to ``tf.keras.models.load_model``.
        mlb_eucs_path: pickle file holding the fitted EUCS binarizer.
        mlb_secnum_path: pickle file holding the fitted SecNumCloud binarizer.
        max_len: token length every text is padded/truncated to.
    """

    def __init__(self, model_path, mlb_eucs_path, mlb_secnum_path, max_len=256):
        self.model = tf.keras.models.load_model(model_path)
        self.mlb_eucs = self._load_pickle(mlb_eucs_path)
        self.mlb_secnum = self._load_pickle(mlb_secnum_path)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.max_len = max_len

    @staticmethod
    def _load_pickle(path):
        """Load one pickled object from ``path``.

        SECURITY: unpickling can execute arbitrary code. Only load files you
        produced yourself or otherwise trust.
        """
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def _predict_proba(self, texts, **predict_kwargs):
        """Tokenize ``texts`` (a string or list of strings) and run the model.

        Returns the pair ``(eucs_probs, secnum_probs)``, one row per text.
        """
        encoding = self.tokenizer(
            texts,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='tf'
        )
        eucs_probs, secnum_probs = self.model.predict([
            encoding['input_ids'],
            encoding['attention_mask']
        ], **predict_kwargs)
        return eucs_probs, secnum_probs

    @staticmethod
    def _top_k(probs, classes, top_k):
        """Return the ``top_k`` highest-scoring classes, best first."""
        best = np.argsort(probs)[::-1][:top_k]
        return [{'code': classes[idx], 'probability': float(probs[idx])} for idx in best]

    def predict(self, text, top_k=5):
        """Rank EUCS and SecNumCloud codes for a single text.

        Returns:
            dict with 'eucs_predictions' and 'secnum_predictions' (lists of
            ``{'code', 'probability'}`` sorted by decreasing probability) and
            'combined_confidence'.
        """
        eucs_probs, secnum_probs = self._predict_proba(text)

        return {
            'eucs_predictions': self._top_k(eucs_probs[0], self.mlb_eucs.classes_, top_k),
            'secnum_predictions': self._top_k(secnum_probs[0], self.mlb_secnum.classes_, top_k),
            'combined_confidence': self._calculate_confidence(eucs_probs, secnum_probs)
        }

    def _calculate_confidence(self, eucs_probs, secnum_probs):
        """Combined confidence: mean of the two tasks' maximum probabilities."""
        eucs_max = np.max(eucs_probs)
        secnum_max = np.max(secnum_probs)
        return float((eucs_max + secnum_max) / 2)

    def evaluate_ndcg(self, test_data, k=10, batch_size=32):
        """Compute NDCG@k for both tasks on a test set.

        Texts are scored in batches of ``batch_size`` instead of one
        ``model.predict`` call per sample.

        Args:
            test_data: dict with the keys 'texts', 'eucs_labels' and
                'secnum_labels'.
            k: NDCG cut-off.
            batch_size: number of texts tokenized and scored per model call.

        Returns:
            dict with 'eucs_ndcg', 'secnum_ndcg' and 'average_ndcg'.

        Raises:
            ValueError: if ``test_data['texts']`` is empty.
        """
        test_texts = [str(text) for text in test_data['texts']]
        if not test_texts:
            raise ValueError("test_data['texts'] is empty")
        true_eucs = test_data['eucs_labels']
        true_secnum = test_data['secnum_labels']

        pred_eucs = []
        pred_secnum = []

        # Score the test samples batch by batch; tqdm reports progress, so the
        # per-call Keras progress bar is silenced.
        for start in tqdm(range(0, len(test_texts), batch_size)):
            eucs_prob, secnum_prob = self._predict_proba(
                test_texts[start:start + batch_size],
                batch_size=batch_size,
                verbose=0
            )
            pred_eucs.append(np.asarray(eucs_prob))
            pred_secnum.append(np.asarray(secnum_prob))

        # NDCG
        eucs_ndcg = ndcg_score(true_eucs, np.concatenate(pred_eucs, axis=0), k=k)
        secnum_ndcg = ndcg_score(true_secnum, np.concatenate(pred_secnum, axis=0), k=k)

        return {
            'eucs_ndcg': eucs_ndcg,
            'secnum_ndcg': secnum_ndcg,
            'average_ndcg': (eucs_ndcg + secnum_ndcg) / 2
        }

In [None]:
# Load the unified dataset
unified_df = pd.read_csv('Output/unified_data.csv')

# Run the pipeline: cross-validation, final training, artifact export
final_model, prepared_data, fold_results, avg_metrics = full_training_pipeline(unified_df)

# Build a predictor from the artifacts written by the pipeline
predictor = SecurityControlPredictor(
    model_path='security_control_model',
    mlb_eucs_path='mlb_eucs.pkl',
    mlb_secnum_path='mlb_secnum.pkl',
)

# Usage example on a single, previously unseen control description
new_control = "Gestione sicura degli account utente e controllo degli accessi"
prediction = predictor.predict(new_control)

print("\nPrediction Results:")
for heading, result_key in (
    ("EUCS Codes:", 'eucs_predictions'),
    ("\nSecNumCloud Codes:", 'secnum_predictions'),
):
    print(heading)
    for item in prediction[result_key]:
        print(f"  {item['code']}: {item['probability']:.4f}")

# NDCG evaluation on a test set, only when one has been defined
if 'test_data' in locals():
    ndcg_eval = predictor.evaluate_ndcg(test_data)
    print(f"\nNDCG Evaluation: EUCS={ndcg_eval['eucs_ndcg']:.4f}, SecNumCloud={ndcg_eval['secnum_ndcg']:.4f}")