In [None]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Load dataset (e.g., Adult Income)
data = fetch_openml(name="adult", version=2, as_frame=True)
X = data.data.select_dtypes(include=["number"])  # use numerical only for now
y = (data.target == ">50K").astype(int)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

from sklearn.preprocessing import KBinsDiscretizer

# Apply equal-frequency binning
binning = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="quantile")
X_train_binned = binning.fit_transform(X_train)
X_test_binned = binning.transform(X_test)

# Convert to integer tensors
import torch

X_train_binned_tensor = torch.tensor(X_train_binned, dtype=torch.long)
X_test_binned_tensor = torch.tensor(X_test_binned, dtype=torch.long)



In [5]:
X

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,25,226802,7,0,0,40
1,38,89814,9,0,0,50
2,28,336951,12,0,0,40
3,44,160323,10,7688,0,40
4,18,103497,10,0,0,30
...,...,...,...,...,...,...
48837,27,257302,12,0,0,38
48838,40,154374,9,0,0,40
48839,58,151910,9,0,0,40
48840,22,201490,9,0,0,20


In [2]:
import torch.nn as nn


# Raw MLP
class RawMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(nn.Linear(input_dim, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, x):
        return self.model(x)


# Binned MLP with embeddings
class BinnedMLP(nn.Module):
    def __init__(self, num_bins, num_features, emb_dim=4):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(num_bins, emb_dim) for _ in range(num_features)])
        self.fc = nn.Sequential(nn.Linear(num_features * emb_dim, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, x_cat):
        x = torch.cat([emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)], dim=1)
        return self.fc(x)

In [3]:
from sklearn.metrics import accuracy_score
import torch.optim as optim


def train_model(model, X_train, y_train, X_test, y_test, epochs=10):
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        model.train()
        y_pred = model(X_train).squeeze()
        loss = criterion(y_pred, y_train)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate
    model.eval()
    with torch.no_grad():
        pred = torch.sigmoid(model(X_test)).squeeze() > 0.5
        acc = accuracy_score(y_test, pred)
    return acc

In [None]:
# Convert raw data
scaler = StandardScaler()
X_train_scaled = torch.tensor(scaler.fit_transform(X_train), dtype=torch.float32)
X_test_scaled = torch.tensor(scaler.transform(X_test), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Train raw model
raw_model = RawMLP(input_dim=X_train.shape[1])
raw_acc = train_model(raw_model, X_train_scaled, y_train_tensor, X_test_scaled, y_test_tensor)

# Train binned model
binned_model = BinnedMLP(num_bins=10, num_features=X_train.shape[1])
binned_acc = train_model(binned_model, X_train_binned_tensor, y_train_tensor, X_test_binned_tensor, y_test_tensor)

# Train Random Forest
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(X_train, y_train)
rf_acc = accuracy_score(y_test, rf_model.predict(X_test))

print(f"Raw MLP Accuracy: {raw_acc:.3f}")
print(f"Binned MLP Accuracy: {binned_acc:.3f}")
print(f"RF Accuracy: {rf_acc:.3f}")

Raw MLP Accuracy: 0.540
Binned MLP Accuracy: 0.761
