In [1]:
import pandas as pd
import numpy as np

In [2]:
# Step 1: load the raw file
# Column labels, listed in the order they are assigned to the file's fields.
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income",
]

# `names=` is supplied and `header=` is not, so pandas treats the first line
# of the file as data rather than as a header row.  A bare "?" field is read
# as NaN, and spaces following each delimiter are skipped.
df = pd.read_csv("adult.data", names=columns, skipinitialspace=True, na_values="?")

# Step 2: type coercion
numeric_columns = [
    "age", "fnlwgt", "education_num",
    "capital_gain", "capital_loss", "hours_per_week",
]

# Column-wise conversion; any value that cannot be parsed as a number
# becomes NaN instead of raising.
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors="coerce")

categorical_columns = [
    "workclass", "education", "marital_status", "occupation",
    "relationship", "race", "sex", "native_country", "income",
]

# Normalise the text labels: trim surrounding whitespace, then lowercase.
df[categorical_columns] = df[categorical_columns].apply(
    lambda labels: labels.str.strip().str.lower()
)

# Step 3: missing values
# Numeric gaps are filled with the median of the same column.
for col in ("age", "education_num", "hours_per_week"):
    df[col] = df[col].fillna(df[col].median())

# workclass gaps are filled with the first value returned by mode().
most_common_workclass = df["workclass"].mode()[0]
df["workclass"] = df["workclass"].fillna(most_common_workclass)

# occupation / native_country gaps get an explicit "unknown" label.
for col in ("occupation", "native_country"):
    df[col] = df[col].fillna("unknown")

# Step 4: outlier flags
def detect_outliers_iqr(series, k=1.5):
    """Flag values that fall outside the IQR fences of ``series``.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of ``series`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to screen.
    k : float, default 1.5
        Fence width expressed as a multiple of the IQR.  The default keeps
        the behaviour of the original hard-coded 1.5 multiplier.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``series``; ``True`` where the value is
        strictly below the lower fence or strictly above the upper fence.
        NaN entries compare ``False`` on both sides and are never flagged.

    Raises
    ------
    ValueError
        If ``k`` is negative (the fences would cross each other).
    """
    if k < 0:
        raise ValueError("k must be non-negative")

    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr
    # Two explicit comparisons (rather than ~between) so that NaN stays False.
    return (series < lower) | (series > upper)

# Add one boolean "<column>_outlier" flag column per screened numeric column.
for col in ("age", "education_num", "hours_per_week"):
    flag_column = f"{col}_outlier"
    df[flag_column] = df[col].pipe(detect_outliers_iqr)

In [3]:
# Row count before de-duplication, kept for the before/after comparison.
initial_rows = df.shape[0]
initial_rows

32561

In [4]:
# Columns that together form the key used to decide whether two rows are
# duplicates of each other.
# NOTE(review): compared with `columns`, this key leaves out "fnlwgt",
# "capital_gain" and "capital_loss" (and the *_outlier flags), so rows that
# differ only in those fields are treated as duplicates — confirm that this
# is intended.
dedup_columns = [
    "age", "workclass", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "hours_per_week", "native_country", "income",
]

In [7]:
# True for every row whose dedup_columns values repeat an earlier row;
# the first occurrence of each combination stays False.
duplicate_mask = df[dedup_columns].duplicated(keep="first")
duplicate_mask

0        False
1        False
2        False
3        False
4        False
         ...  
32556    False
32557     True
32558    False
32559     True
32560    False
Length: 32561, dtype: bool

In [8]:
# Preview the first five rows flagged as repeats.
duplicate_rows = df.loc[duplicate_mask]
duplicate_rows.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,age_outlier,education_num_outlier,hours_per_week_outlier
352,33,private,111363,bachelors,13,married-civ-spouse,exec-managerial,husband,white,male,0,0,40,united-states,>50k,False,False,False
391,27,private,301302,bachelors,13,never-married,craft-repair,not-in-family,white,male,0,0,50,united-states,<=50k,False,False,False
526,42,private,221172,bachelors,13,married-civ-spouse,exec-managerial,husband,white,male,0,0,40,united-states,>50k,False,False,False
563,24,private,82804,hs-grad,9,never-married,handlers-cleaners,unmarried,black,female,0,0,40,united-states,<=50k,False,False,False
570,24,private,354351,hs-grad,9,never-married,craft-repair,own-child,white,male,0,0,40,united-states,<=50k,False,False,False


In [10]:
# Keep only the first occurrence of each dedup_columns combination; rows are
# compared on those columns alone, and the original index labels are retained.
df_deduped = df.loc[~df.duplicated(subset=dedup_columns, keep="first")]

In [11]:
final_rows = df_deduped.shape[0]

# (rows before, rows after, rows removed)
(initial_rows, final_rows, initial_rows - final_rows)

(32561, 28318, 4243)