In [1]:
import pandas as pd
import numpy as np

In [2]:
# Step 1: load the raw file
# Column labels are supplied explicitly through `names=`.
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income",
]

# "?" cells are read as missing; whitespace following each delimiter is skipped.
df = pd.read_csv("adult.data", names=columns, na_values="?", skipinitialspace=True)

# Step 2: type coercion
numeric_columns = [
    "age",
    "fnlwgt",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

categorical_columns = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native_country",
    "income",
]

# Numeric fields: values that cannot be parsed become NaN instead of raising.
df = df.assign(
    **{name: pd.to_numeric(df[name], errors="coerce") for name in numeric_columns}
)

# Text fields: trim surrounding whitespace and lower-case every label.
df = df.assign(
    **{name: df[name].str.strip().str.lower() for name in categorical_columns}
)

# Step 3: missing values
# Gaps in these numeric columns are filled with the column's own median.
for col in ("age", "education_num", "hours_per_week"):
    df[col] = df[col].fillna(df[col].median())

# workclass gaps take the column's most frequent label (first entry of mode()).
df["workclass"] = df["workclass"].fillna(df["workclass"].mode()[0])

# occupation / native_country gaps get an explicit placeholder label instead.
for col in ("occupation", "native_country"):
    df[col] = df[col].fillna("unknown")

# Step 4: outlier flags
def detect_outliers_iqr(series, factor=1.5):
    """Flag the values of ``series`` that fall outside its IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles of ``series`` and
    ``IQR = Q3 - Q1``. Values exactly on a fence are not flagged.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to inspect.
    factor : float, default 1.5
        Multiplier applied to the IQR when placing the fences. The default
        reproduces the previously hard-coded behaviour.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``series``; ``True`` marks an outlier.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    margin = factor * (q3 - q1)
    # Strict comparisons: a value sitting exactly on a fence is kept as an inlier.
    return (series < q1 - margin) | (series > q3 + margin)

# Append one boolean "<column>_outlier" flag column per inspected numeric column.
df = df.assign(
    **{
        f"{col}_outlier": detect_outliers_iqr(df[col])
        for col in ("age", "education_num", "hours_per_week")
    }
)

# Step 5: deduplication
# Two rows count as duplicates when they agree on every field listed below;
# the first occurrence is kept.
# NOTE(review): fnlwgt, capital_gain and capital_loss are not part of the key,
# so rows that differ only in those fields are collapsed into one — confirm
# that this is intended.
dedup_columns = [
    "age",
    "workclass",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "hours_per_week",
    "native_country",
    "income",
]

df = df[~df.duplicated(subset=dedup_columns, keep="first")]


In [3]:
# Sanity checks on the cleaned frame's shape.
assert len(df) > 0, "Dataset is empty"

# The frame holds the source columns plus the derived *_outlier flags, so a
# count threshold alone could hide the loss of a source column; also require
# every name in `columns` to still be present.
assert df.shape[1] >= 15 and set(columns).issubset(df.columns), "Unexpected column loss"

In [4]:
# Per-column count of missing cells.
df.isnull().sum()

age                       0
workclass                 0
fnlwgt                    0
education                 0
education_num             0
marital_status            0
occupation                0
relationship              0
race                      0
sex                       0
capital_gain              0
capital_loss              0
hours_per_week            0
native_country            0
income                    0
age_outlier               0
education_num_outlier     0
hours_per_week_outlier    0
dtype: int64

In [5]:
# No cell anywhere in the frame may still be missing at this point.
assert not df.isnull().any(axis=None), "Unexpected missing values remain"

In [6]:
# Display the dtype of every column in the cleaned frame.
df.dtypes

age                        int64
workclass                 object
fnlwgt                     int64
education                 object
education_num              int64
marital_status            object
occupation                object
relationship              object
race                      object
sex                       object
capital_gain               int64
capital_loss               int64
hours_per_week             int64
native_country            object
income                    object
age_outlier                 bool
education_num_outlier       bool
hours_per_week_outlier      bool
dtype: object

In [7]:
# Each of these columns must hold a numeric dtype.
assert all(
    pd.api.types.is_numeric_dtype(df[name])
    for name in ("age", "education_num", "hours_per_week")
)

In [8]:
# Plausibility bounds: age must lie in [0, 120].
assert ((df["age"] >= 0) & (df["age"] <= 120)).all()

# hours_per_week must lie in [0, 168]; 168 is the number of hours in a week.
assert ((df["hours_per_week"] >= 0) & (df["hours_per_week"] <= 168)).all()


In [9]:
# Share of rows per income label (normalize=True yields proportions, not counts).
df["income"].value_counts(normalize=True)

income
<=50k    0.762271
>50k     0.237729
Name: proportion, dtype: float64

In [10]:
# Compact overview of the cleaned frame: its size, plus the column names
# selected by the "number" and "object" dtype filters.
summary = dict(
    rows=len(df),
    columns=len(df.columns),
    numeric_columns=list(df.select_dtypes(include="number").columns),
    categorical_columns=list(df.select_dtypes(include="object").columns),
)

summary

{'rows': 28318,
 'columns': 18,
 'numeric_columns': ['age',
  'fnlwgt',
  'education_num',
  'capital_gain',
  'capital_loss',
  'hours_per_week'],
 'categorical_columns': ['workclass',
  'education',
  'marital_status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'native_country',
  'income']}

In [11]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df.to_csv("adult_cleaned.csv", index=False)