In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column labels handed to read_csv through `names`, listed in file order.
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income",
]

# "?" entries are parsed as NaN; skipinitialspace drops the spaces that
# follow each delimiter.
df = pd.read_csv("adult.data", names=columns, na_values="?", skipinitialspace=True)

In [3]:
# Columns that are re-parsed as numbers below.
numeric_columns = [
    "age",
    "fnlwgt",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

# Convert one column at a time; errors="coerce" turns any value that
# cannot be parsed into NaN instead of raising.
for col in numeric_columns:
    df[col] = df[col].pipe(pd.to_numeric, errors="coerce")

# Text columns whose values are normalized below.
categorical_columns = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native_country",
    "income",
]

# Trim surrounding whitespace first, then lower-case, so differently
# spaced or cased spellings of one label collapse to a single value.
for col in categorical_columns:
    stripped = df[col].str.strip()
    df[col] = stripped.str.lower()

In [4]:
# Fraction of missing values per column, largest first.
df.isna().mean().sort_values(ascending=False)

occupation        0.056601
workclass         0.056386
native_country    0.017905
age               0.000000
fnlwgt            0.000000
education         0.000000
education_num     0.000000
marital_status    0.000000
relationship      0.000000
race              0.000000
sex               0.000000
capital_gain      0.000000
capital_loss      0.000000
hours_per_week    0.000000
income            0.000000
dtype: float64

In [5]:
# Numeric columns whose missing entries are replaced by the column median.
numeric_impute_cols = [
    "age",
    "education_num",
    "hours_per_week",
]

for col in numeric_impute_cols:
    # Compute the median before filling so it is based on the
    # column's current values.
    median_value = df[col].median()
    df[col] = df[col].fillna(value=median_value)

In [6]:
# Columns whose missing entries are replaced by the most frequent value.
low_card_cat_cols = [
    "workclass",
]

for col in low_card_cat_cols:
    # mode() returns a Series of the most frequent value(s); entry 0 is used.
    modes = df[col].mode()
    mode_value = modes[0]
    df[col] = df[col].fillna(value=mode_value)


In [7]:
# Columns whose missing entries get the placeholder label "unknown".
high_card_cat_cols = [
    "occupation",
    "native_country",
]

for col in high_card_cat_cols:
    filled = df[col].fillna(value="unknown")
    df[col] = filled

In [8]:
# Count of missing values left in each column after the fills above.
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64