In [1]:
# Clean environment install
!pip install --upgrade --force-reinstall numpy==1.23.5 pandas scikit-learn cleanlab[datalab]


Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting cleanlab[datalab]
  Downloading cleanlab-2.7.1-py3-none-any.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-dateutil>=2.8.2 (from pandas)
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from cleanlab.classification import CleanLearning

In [2]:
# Load the Iris dataset
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target
print(df.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


In [3]:
# Inject anomalies into the 'petal length (cm)' column
np.random.seed(42)
anomaly_indices = np.random.choice(df.index, size=10, replace=False)
df.loc[anomaly_indices, 'petal length (cm)'] = np.random.uniform(5, 7, size=10)


In [4]:
X = df[iris.feature_names].values
y = df["target"].values

clf = CleanLearning()
clf.fit(X, y)

# Find potential label issues (used as a proxy for anomalies here)
label_issues = clf.find_label_issues(X, y)
anomalies = np.where(label_issues["is_label_issue"])[0]

print("Anomalies detected at indices:", anomalies)
print("Suspected anomaly values:")
print(X[anomalies])


Anomalies detected at indices: [ 18  31  68  82 106 119]
Suspected anomaly values:
[[5.7        3.8        5.82076585 0.3       ]
 [5.4        3.4        5.57950291 0.4       ]
 [6.2        2.2        6.61624076 1.5       ]
 [5.8        2.7        6.26680751 1.2       ]
 [4.9        2.5        4.5        1.7       ]
 [6.         2.2        5.         1.5       ]]


In [5]:
# Create a list to store suspected anomalous dataframes
suspect_dfs = []
flower_species = {0: "Setosa", 1: "Versicolor", 2: "Virginica"}

for idx in anomalies:
    df_suspect = pd.DataFrame([df.iloc[idx][iris.feature_names].values], columns=iris.feature_names)
    df_suspect.insert(0, "Index", idx)
    df_suspect["True Label"] = df.iloc[idx]["target"]
    df_suspect["Species"] = flower_species[df.iloc[idx]["target"]]
    suspect_dfs.append(df_suspect)

df_all_suspects = pd.concat(suspect_dfs, ignore_index=True)

print("\nSuspected Anomalous Data Points")
print("-----------------------------------------------------------")
print(df_all_suspects.to_string(index=False))



Suspected Anomalous Data Points
-----------------------------------------------------------
 Index  sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  True Label    Species
    18                5.7               3.8           5.820766               0.3         0.0     Setosa
    31                5.4               3.4           5.579503               0.4         0.0     Setosa
    68                6.2               2.2           6.616241               1.5         1.0 Versicolor
    82                5.8               2.7           6.266808               1.2         1.0 Versicolor
   106                4.9               2.5           4.500000               1.7         2.0  Virginica
   119                6.0               2.2           5.000000               1.5         2.0  Virginica
