In [6]:
#Notebook for quickly exploring the data
import pandas as pd
from ml.data import process_data

# Read the census CSV and report its dimensions and column index.
df = pd.read_csv("data/census.csv")
print(df.shape)
print(df.columns)

# Columns passed to process_data as categorical features.
categorical_columns = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# training=True: the call hands back an encoder and lb alongside X and y.
X, y, encoder, lb = process_data(
    df,
    categorical_features=categorical_columns,
    label="salary",
    training=True,
)

# Shape of the processed feature array, then the distinct label values.
print(X.shape)
print(set(y))


(32561, 15)
Index(['age', 'workclass', 'fnlgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')
(32561, 108)
{np.int64(0), np.int64(1)}


In [7]:
from ml.data import process_data

# Categorical column names, picked from df.columns.
cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# Call process_data on the full frame with training=True; it returns the
# processed features and labels together with the encoder and lb objects.
X, y, encoder, lb = process_data(
    df, categorical_features=cat_features, label="salary", training=True
)

# Report the processed feature shape and the distinct target values.
print("Transformed X shape:", X.shape)
print("Target class values:", set(y))


Transformed X shape: (32561, 108)
Target class values: {np.int64(0), np.int64(1)}


In [8]:
# Ask the encoder for the output column names it derives from cat_features,
# then show only the first ten as a quick sanity check.
encoded_features = encoder.get_feature_names_out(cat_features)
first_ten = encoded_features[:10]
print(first_ten)


['workclass_?' 'workclass_Federal-gov' 'workclass_Local-gov'
 'workclass_Never-worked' 'workclass_Private' 'workclass_Self-emp-inc'
 'workclass_Self-emp-not-inc' 'workclass_State-gov'
 'workclass_Without-pay' 'education_10th']


--Testing ml/model.py functions--

In [9]:
from ml.model import (
    train_model,
    performance_on_categorical_slice
)
from ml.data import process_data
from sklearn.model_selection import train_test_split

In [10]:
# Categorical columns handed to process_data.
cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Process only the training rows (training=True); the returned encoder and lb
# replace any objects of the same name from earlier cells.
X_train, y_train, encoder, lb = process_data(
    train_df, categorical_features=cat_features, label="salary", training=True
)

# Train the model on the processed training data.
model = train_model(X_train, y_train)

In [11]:
# Score the trained model on the held-out rows where education == "Bachelors",
# passing the encoder and lb obtained when the training data was processed.
slice_metrics = performance_on_categorical_slice(
    test_df,
    column_name="education",
    slice_value="Bachelors",
    categorical_features=cat_features,
    label="salary",
    encoder=encoder,
    lb=lb,
    model=model,
)
precision, recall, f1 = slice_metrics

# One-line summary of the three metrics, four decimal places each.
print(f"education=Bachelors | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")

education=Bachelors | Precision: 0.7523 | Recall: 0.7289 | F1: 0.7404


-- --