# Cleanlab: reporte de problemas en los datos

[Cleanlab](https://docs.cleanlab.ai/stable/tutorials/datalab/tabular.html) nos permite detectar problemas en los datos y corregirlos.

In [None]:
import pandas as pd
# Location of the course's diabetes training CSV (fetched over HTTPS on every run).
DATA_URL = 'https://raw.githubusercontent.com/amiune/freecodingtour/main/cursos/espanol/datascience/data/diabetes/diabetes_train_procesado.csv'

df_train = pd.read_csv(DATA_URL)
# Last expression of the cell, so the notebook renders the first rows as a preview.
df_train.head()

In [None]:
# Split the frame into features (every column except the target) and the target itself.
TARGET_COLUMN = "diabetes"
feature_mask = df_train.columns != TARGET_COLUMN

X_train = df_train.loc[:, feature_mask]
y_train = df_train[TARGET_COLUMN]

In [None]:
import random
import numpy as np

from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import NearestNeighbors

# One shared seed for both global generators (standard library and NumPy),
# so code that draws from them behaves the same way on every fresh run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Pin the estimator's own random state to the notebook seed. Relying only on the
# global NumPy seed makes results depend on how many times (and in which order)
# the cells were executed; an explicit random_state keeps every fit repeatable.
clf = HistGradientBoostingClassifier(random_state=SEED)

In [None]:
# Out-of-sample class probabilities obtained through k-fold cross-validation:
# method="predict_proba" makes cross_val_predict return probabilities instead of labels.
num_crossval_folds = 5
pred_probs = cross_val_predict(
    estimator=clf, X=X_train, y=y_train, cv=num_crossval_folds, method="predict_proba"
)

In [None]:
# Fit a Euclidean nearest-neighbour index on the raw feature values
# (fit returns the estimator itself, so construction and fitting can be chained).
KNN = NearestNeighbors(metric='euclidean').fit(X_train.values)

# Neighbour graph of the training points, with edges weighted by distance.
knn_graph = KNN.kneighbors_graph(mode="distance")

In [None]:
!pip install 'cleanlab[datalab]' -U -q

In [None]:
from cleanlab import Datalab

In [None]:
# Bundle the feature values and the labels under the keys Datalab is told about below.
data = {"X": X_train.values, "y": y_train}
lab = Datalab(data, label_name="y")
# Hand Datalab the precomputed model outputs and neighbour graph for its checks.
lab.find_issues(knn_graph=knn_graph, pred_probs=pred_probs)

In [None]:
# Ask Datalab for its report on the dataset analysed by find_issues in the previous cell.
lab.report()

# Fin: [Volver al contenido del curso](https://www.freecodingtour.com/cursos/espanol/datascience/datascience.html)