In [49]:
from sklearn.datasets import fetch_openml
import pandas as pd

# Fetch the OpenML "adult" dataset (version 2) with pandas-backed containers.
data = fetch_openml(name="adult", version=2, as_frame=True)

# The full table: predictor columns together with the `class` target.
df = data["frame"]

# Preview the first five rows.
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [50]:
# Treat any literal "?" placeholder as a missing value.
df = df.replace(to_replace="?", value=pd.NA)


In [51]:
# List the dtype of every column in the frame.
df.dtypes


age                  int64
workclass         category
fnlwgt               int64
education         category
education-num        int64
marital-status    category
occupation        category
relationship      category
race              category
sex               category
capital-gain         int64
capital-loss         int64
hours-per-week       int64
native-country    category
class             category
dtype: object

In [52]:
# Count the missing entries in each column.
df.isnull().sum()


age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
class                0
dtype: int64

In [53]:
# Separate the `class` target from the predictor columns.
y = df["class"]
X = df.drop(columns=["class"])


In [54]:
# Group the predictor names by dtype family so each group can get its own
# preprocessing later on.
numerical_cols = list(X.select_dtypes(include=["number"]).columns)
categorical_cols = list(X.select_dtypes(include=["category"]).columns)


In [55]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [56]:
# Per-dtype preprocessing: numeric columns get median imputation, categorical
# columns are one-hot encoded with handle_unknown="ignore" so categories not
# seen during fit do not raise at transform time.
num_step = ("num", SimpleImputer(strategy="median"), numerical_cols)
cat_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
preprocess = ColumnTransformer(transformers=[num_step, cat_step])


In [57]:
# Chain the preprocessing and a seeded random forest into a single estimator.
forest = RandomForestClassifier(random_state=42)
model = Pipeline(steps=[("preprocess", preprocess), ("clf", forest)])

In [58]:
# Hold out 20% of the rows; stratify on y so both parts keep the class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Fit the whole pipeline; the fitted estimator is the cell's displayed value.
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [59]:
# Fitted one-hot encoder, looked up by pipeline step name and transformer name.
ohe = model["preprocess"].named_transformers_["cat"]

In [60]:
# Report the category values the encoder learned for each categorical column.
print("\n=== CATEGORICAL COLUMNS ===")
for pair in zip(categorical_cols, ohe.categories_):
    col, cats = pair
    print(f"{col}: {cats}")


=== CATEGORICAL COLUMNS ===
workclass: ['Federal-gov' 'Local-gov' 'Never-worked' 'Private' 'Self-emp-inc'
 'Self-emp-not-inc' 'State-gov' 'Without-pay' nan]
education: ['10th' '11th' '12th' '1st-4th' '5th-6th' '7th-8th' '9th' 'Assoc-acdm'
 'Assoc-voc' 'Bachelors' 'Doctorate' 'HS-grad' 'Masters' 'Preschool'
 'Prof-school' 'Some-college']
marital-status: ['Divorced' 'Married-AF-spouse' 'Married-civ-spouse'
 'Married-spouse-absent' 'Never-married' 'Separated' 'Widowed']
occupation: ['Adm-clerical' 'Armed-Forces' 'Craft-repair' 'Exec-managerial'
 'Farming-fishing' 'Handlers-cleaners' 'Machine-op-inspct' 'Other-service'
 'Priv-house-serv' 'Prof-specialty' 'Protective-serv' 'Sales'
 'Tech-support' 'Transport-moving' nan]
relationship: ['Husband' 'Not-in-family' 'Other-relative' 'Own-child' 'Unmarried' 'Wife']
race: ['Amer-Indian-Eskimo' 'Asian-Pac-Islander' 'Black' 'Other' 'White']
sex: ['Female' 'Male']
native-country: ['Cambodia' 'Canada' 'China' 'Columbia' 'Cuba' 'Dominican-Republic'
 'E

In [61]:
# Leftover debugging aid: dump the fitted sub-transformers keyed by name.
print("=== DEBUG: What transformers exist inside preprocess? ===")
fitted_transformers = model["preprocess"].named_transformers_
print(fitted_transformers)


=== DEBUG: What transformers exist inside preprocess? ===
{'num': SimpleImputer(strategy='median'), 'cat': OneHotEncoder(handle_unknown='ignore')}


In [62]:
# Show which predictor names landed in each dtype group.
for label, cols in (
    ("Categorical columns:", categorical_cols),
    ("Numerical columns:", numerical_cols),
):
    print(label, cols)


Categorical columns: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
Numerical columns: ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [63]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier


# 1. Load dataset: OpenML "adult" (version 2) with pandas-backed containers
data = fetch_openml(name="adult", version=2, as_frame=True)

# 2. Clean missing values: any literal "?" becomes pd.NA
df = data["frame"].replace(to_replace="?", value=pd.NA)

# 3. Split the `class` target (y) from the predictors (X)
y = df["class"]
X = df.drop(columns=["class"])

# 4. Column names grouped by dtype family
numerical_cols = list(X.select_dtypes(include=["number"]).columns)
categorical_cols = list(X.select_dtypes(include=["category"]).columns)

# 5. Preprocessing: median-impute numeric columns, one-hot encode categorical
#    ones (handle_unknown="ignore" keeps unseen categories from raising)
num_step = ("num", SimpleImputer(strategy="median"), numerical_cols)
cat_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
preprocess = ColumnTransformer(transformers=[num_step, cat_step])

# 6. Pipeline with RandomForest (seeded via random_state)
forest = RandomForestClassifier(random_state=42)
model = Pipeline(steps=[("preprocess", preprocess), ("clf", forest)])

# 7. Train model on an 80/20 split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Fitting the pipeline also fits the encoder inside the preprocessor
model.fit(X_train, y_train)

# 8. Extract the OneHotEncoder (the fitted transformers exist only AFTER FIT)
ohe = model["preprocess"].named_transformers_["cat"]

# 9. Print the categories learned for each categorical column
print("\n=== CATEGORICAL COLUMNS ===")
for pair in zip(categorical_cols, ohe.categories_):
    col, cats = pair
    print(f"{col}: {cats}")

# 10. Print the output feature names produced by the fitted preprocessor
fitted_preprocess = model["preprocess"]
feature_names = fitted_preprocess.get_feature_names_out()
print("\n=== FEATURE NAMES AFTER PREPROCESSING ===")
print(feature_names)

# 11. Transform a small sample to inspect the numeric matrix the classifier sees
preprocess_step = model.named_steps["preprocess"]
X_sample = preprocess_step.transform(X_train.head())

# ColumnTransformer returns a SciPy sparse matrix only when the stacked output
# is sparse enough (governed by its `sparse_threshold`); otherwise the result is
# already a dense ndarray, which has no `.toarray()`. Densify only when needed
# so this cell works for either return type.
X_sample_dense = X_sample.toarray() if hasattr(X_sample, "toarray") else X_sample

# Convert to DataFrame for human readability, labelling each column with the
# matching output feature name.
X_sample_df = pd.DataFrame(
    X_sample_dense,
    columns=feature_names
)

print("\n=== TRANSFORMED SAMPLE (NUMERIC MATRIX) ===")
print(X_sample_df.head())



=== CATEGORICAL COLUMNS ===
workclass: ['Federal-gov' 'Local-gov' 'Never-worked' 'Private' 'Self-emp-inc'
 'Self-emp-not-inc' 'State-gov' 'Without-pay' nan]
education: ['10th' '11th' '12th' '1st-4th' '5th-6th' '7th-8th' '9th' 'Assoc-acdm'
 'Assoc-voc' 'Bachelors' 'Doctorate' 'HS-grad' 'Masters' 'Preschool'
 'Prof-school' 'Some-college']
marital-status: ['Divorced' 'Married-AF-spouse' 'Married-civ-spouse'
 'Married-spouse-absent' 'Never-married' 'Separated' 'Widowed']
occupation: ['Adm-clerical' 'Armed-Forces' 'Craft-repair' 'Exec-managerial'
 'Farming-fishing' 'Handlers-cleaners' 'Machine-op-inspct' 'Other-service'
 'Priv-house-serv' 'Prof-specialty' 'Protective-serv' 'Sales'
 'Tech-support' 'Transport-moving' nan]
relationship: ['Husband' 'Not-in-family' 'Other-relative' 'Own-child' 'Unmarried' 'Wife']
race: ['Amer-Indian-Eskimo' 'Asian-Pac-Islander' 'Black' 'Other' 'White']
sex: ['Female' 'Male']
native-country: ['Cambodia' 'Canada' 'China' 'Columbia' 'Cuba' 'Dominican-Republic'
 'E

In [64]:
from sklearn.metrics import accuracy_score

# Predict the held-out rows, then report plain accuracy as the cell's value.
preds = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=preds)


0.8570989865902344