In [2]:
import pandas as pd
import numpy as np

In [3]:
# Step 1: load the raw file.
# `names=` is passed without `header=`, so pandas treats the first line of the
# file as data and labels the fields with the names below, in order.
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income",
]

df = pd.read_csv(
    "adult.data",
    names=columns,
    skipinitialspace=True,  # skip the spaces that follow each delimiter
    na_values="?",          # cells holding "?" are read as missing (NaN)
)

# Step 2: type coercion
# Columns expected to hold numbers.
numeric_columns = [
    "age",
    "fnlwgt",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

# Columns holding text labels.
categorical_columns = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native_country",
    "income",
]

# errors="coerce": entries that cannot be parsed as numbers become NaN
# instead of raising.
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Normalise labels: trim surrounding whitespace, then lower-case.
for col in categorical_columns:
    df[col] = (
        df[col]
        .str.strip()
        .str.lower()
    )

# Step 3: missing values
# Numeric gaps are filled with each column's own median. The three medians are
# taken in one pass before any filling; each depends only on its own column.
numeric_medians = df[["age", "education_num", "hours_per_week"]].median()
df["age"] = df["age"].fillna(numeric_medians["age"])
df["education_num"] = df["education_num"].fillna(numeric_medians["education_num"])
df["hours_per_week"] = df["hours_per_week"].fillna(numeric_medians["hours_per_week"])

# workclass gaps take the first entry returned by mode() (the most frequent label).
workclass_mode = df["workclass"].mode()[0]
df["workclass"] = df["workclass"].fillna(workclass_mode)

# occupation / native_country gaps get an explicit "unknown" label instead.
df["occupation"] = df["occupation"].fillna("unknown")
df["native_country"] = df["native_country"].fillna("unknown")


In [4]:
def detect_outliers_iqr(series, k=1.5):
    """Flag values lying outside the IQR fences of ``series``.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to screen.
    k : float, default 1.5
        Fence multiplier. The default reproduces the usual 1.5 * IQR rule;
        a larger value flags only more extreme points.

    Returns
    -------
    pandas.Series
        Boolean mask aligned with ``series``; ``True`` marks a value strictly
        below the lower fence or strictly above the upper fence.

    Notes
    -----
    * NaN compares False against both fences, so missing values are never
      flagged.
    * When ``Q1 == Q3`` the IQR is 0 and both fences collapse onto that value,
      so every entry different from it is flagged.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr
    return (series < lower) | (series > upper)

In [5]:
# Columns that are screened with detect_outliers_iqr in the cells below.
outlier_candidates = [
    "age",
    "education_num",
    "hours_per_week",
]

In [6]:
# Maps each screened column to the fraction of its rows flagged as outliers.
outlier_flags = {}

for col in outlier_candidates:
    mask = detect_outliers_iqr(df[col])
    # Report the flagged count and its share of all rows.
    print(f"{col}: {mask.sum()} outliers ({mask.mean():.2%})")
    outlier_flags[col] = mask.mean()

age: 143 outliers (0.44%)
education_num: 1198 outliers (3.68%)
hours_per_week: 9008 outliers (27.66%)


In [7]:
# Store a boolean "<column>_outlier" flag on df for every screened column.
for col in outlier_candidates:
    flag_name = f"{col}_outlier"
    df[flag_name] = detect_outliers_iqr(df[col])

In [8]:
# Fraction of rows where capital_gain / capital_loss is exactly 0.
df["capital_gain"].eq(0).mean(), df["capital_loss"].eq(0).mean()

(0.9167101747489328, 0.9533490986149074)