# In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Step 2: Build a small demo dataset that deliberately contains bad records.
# Every column is a five-element list; row i of the frame is made of the i-th
# entry of each list.
data = dict(
    # third entry is missing (NaN); the last entry, 1000, is an outlier
    age=[25, 30, np.nan, 40, 1000],
    # fourth entry is missing (None)
    income=[50000, 60000, 55000, None, 70000],
    # last entry is the 'Unknown' placeholder rather than 'M'/'F'
    gender=['M', 'F', 'F', 'M', 'Unknown'],
)

df = pd.DataFrame(data=data)

# Step 3: Label rows with data quality issues (target variable)
# If any missing value, outlier or unknown in a row → label as 1 (issue), else 0 (clean)
def label_issues(row, *, min_age=0, max_age=120):
    """Return 1 if ``row`` has a data quality issue, else 0.

    A row is flagged when any of the following holds:

    * ``age``, ``income`` or ``gender`` is missing (``None``/``NaN``);
    * ``age`` lies outside the closed range ``[min_age, max_age]``;
    * ``gender`` is the placeholder string ``'Unknown'``.

    Args:
        row: Mapping-like record with ``'age'``, ``'income'`` and ``'gender'``
            entries, e.g. the ``pandas.Series`` that
            ``DataFrame.apply(..., axis=1)`` passes for each row.
        min_age: Smallest age accepted as plausible (inclusive).
        max_age: Largest age accepted as plausible (inclusive).

    Returns:
        int: ``1`` for a row with an issue, ``0`` for a clean row.
    """
    age, income, gender = row['age'], row['income'], row['gender']

    # Missing-value checks run first so the range comparison below is never
    # evaluated against None/NaN.
    if pd.isnull(age) or pd.isnull(income) or pd.isnull(gender):
        return 1
    # Out-of-range ages (negative or above max_age) are treated as outliers.
    if age < min_age or age > max_age:
        return 1
    if gender == 'Unknown':
        return 1
    return 0

# Run the labeller once per row (axis="columns" is the named form of axis=1)
# and store the result as the target column.
row_flags = df.apply(label_issues, axis="columns")
df['issue'] = row_flags

# Step 4: Preprocess
# Work on a copy so the raw frame is left untouched.
df_clean = df.copy()

# Encode gender numerically. Series.map turns every value that is not a key
# of the mapping (the 'Unknown' placeholder as well as real missing values)
# into NaN, so no separate replace step is needed.
df_clean['gender'] = df_clean['gender'].map({'M': 0, 'F': 1})

# Fill the remaining gaps with the column median rather than the mean: the
# sample data contains an age of 1000, which drags the mean age to 273.75 and
# would fill the missing age with a value that is itself an outlier.
df_clean = df_clean.fillna(df_clean.median(numeric_only=True))

# Step 5: Train ML model
# Features are the three preprocessed columns; the target is the issue label.
X = df_clean[['age', 'income', 'gender']]
y = df_clean['issue']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest too: without random_state its bootstrap samples and feature
# choices differ on every run, so results would not be reproducible even
# though the split above is.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 6: Predict and evaluate
y_pred = model.predict(X_test)

# With a very small hold-out set a class may have no true or no predicted
# samples, which leaves its precision/recall undefined. zero_division=0
# reports 0 for those cells without raising UndefinedMetricWarning (the
# default "warn" reports the same 0 but emits the warning).
report = classification_report(y_test, y_pred, zero_division=0)
print("Classification Report:\n", report)