# Verifica qualità delle etichette con Cleanlab

In [None]:
# Se sei su Colab, esegui:
# !pip install cleanlab


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from cleanlab.classification import CleanLearning
from cleanlab.filter import find_label_issues

# Load the dataset (either the weighted or the base variant can be used here).
df = pd.read_csv(filepath_or_buffer="dataset_weighted.csv")

# Keep every column whose name contains the "Symptom_" marker.
symptom_cols = list(filter(lambda name: "Symptom_" in name, df.columns))


In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Fixed seed so the same samples are corrupted on every run.
np.random.seed(42)

# `y` must exist before it can be encoded. It is (re)defined here because the
# cell that assigns X and y comes later in the notebook, so running the cells
# top to bottom would otherwise fail with a NameError on `y`.
y = df["Disease"].values

# Encode the labels as integers.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Untouched copy of the encoded labels, taken before any corruption, so the
# injected errors can be compared against the ground truth afterwards.
original_labels = y_encoded.copy()

# Inject random label errors into 10 samples (fewer if the dataset is smaller).
n_errors = min(10, len(y_encoded))
idx_to_corrupt = np.random.choice(len(y_encoded), n_errors, replace=False)

# The candidate classes come from the encoder, computed once, rather than from
# a set rebuilt over the (already partially corrupted) label array each time.
all_classes = np.arange(len(le.classes_))
for idx in idx_to_corrupt:
    # Any class except the sample's true one is a valid wrong label.
    choices = all_classes[all_classes != original_labels[idx]]
    y_encoded[idx] = np.random.choice(choices)

print("Etichette corrotte in posizione:", idx_to_corrupt)


In [None]:
# Feature matrix built from the symptom columns (presumably numeric weights --
# confirm against the CSV); the target is the "Disease" column.
X = df.loc[:, symptom_cols].values
y = df.loc[:, "Disease"].values


In [None]:
# Base classifier, plus a CleanLearning wrapper around it (the wrapper is only
# constructed here; it is not used further in this cell).
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clean_model = CleanLearning(clf, seed=42)

# Cross-validated class probabilities, computed against the integer-encoded
# labels. `y_encoded` (not the raw string array `y`) is used on purpose:
# it is the array that carries the deliberately injected label errors, and
# cleanlab documents `labels` as integers in 0..K-1.
pred_probs = cross_val_predict(clf, X, y_encoded, method="predict_proba")

# Indices of the suspicious samples, ranked by self-confidence.
issues = find_label_issues(
    labels=y_encoded,
    pred_probs=pred_probs,
    return_indices_ranked_by="self_confidence",
)

# Show the flagged rows (disease label plus symptom columns).
df_issues = df.iloc[issues][["Disease"] + symptom_cols]
df_issues.head()


In [None]:
# Persist the flagged rows (without the index column) and report how many
# potential labelling errors were found.
df_issues.to_csv(path_or_buf="label_issues_detected.csv", index=False)
print(f"Trovati {len(df_issues)} potenziali errori di etichettatura.")
