# Load Data
The `.names` file is a convention used by the UCI Machine Learning Repository to provide essential metadata about the datasets they host.

In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, balanced_accuracy_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.ensemble import RandomForestClassifier

import time

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

In [3]:
def parse_metadata():
    result = {}

    # The .names file doesn't follow a strict format.
    with open('../data/census/census-income.names', 'r') as file:
        metadata = file.read()

    # Just take lines that define attributes.
    etch = list(filter(lambda x: not (x.startswith('|') or x.startswith('-') or x.strip() == ''), metadata.splitlines()))
    for line in etch:
        key, value = map(lambda x: x.rstrip(".").strip(), line.split(':'))

        if value == "continuous":
            result[key] = {"type": "continuous"}
        else:
            values = list(map(lambda x: x.strip(), value.split(",")))

            # Try converting to an integer.
            try:
                values = list(map(int, values))
            except:
                pass
            result[key] = {"type": "nominal", "values": values}

    return result

In [4]:
# I'm getting names this way because I was interested in looking for unexpected values as well.
codebook = parse_metadata()

# The target is the last column in the dataset.
column_names = list(codebook.keys()) + ["income"]

In [5]:
train_df = pd.read_csv('../data/census/census-income.data', names=column_names, index_col=False)
train_df.shape

(199523, 42)

In [6]:
test_df = pd.read_csv('../data/census/census-income.test', names=column_names, index_col=False)
test_df.shape

(99762, 42)

# Data Exploration

In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199523 entries, 0 to 199522
Data columns (total 42 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   age                                         199523 non-null  int64  
 1   class of worker                             199523 non-null  object 
 2   detailed industry recode                    199523 non-null  int64  
 3   detailed occupation recode                  199523 non-null  int64  
 4   education                                   199523 non-null  object 
 5   wage per hour                               199523 non-null  int64  
 6   enroll in edu inst last wk                  199523 non-null  object 
 7   marital stat                                199523 non-null  object 
 8   major industry code                         199523 non-null  object 
 9   major occupation code                       199523 non-null  object 
 

In [9]:
def print_unexpected_values(df, codebook):
    df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

    for key, value in codebook.items():
        if value["type"] != "nominal":
            continue

        expected_values = set(value["values"])
        actual_values = set(df[key].unique())

        unexpected_values = actual_values - expected_values

        if unexpected_values:
            print(f"{key}: {unexpected_values}")

print_unexpected_values(train_df, codebook)

state of previous residence: {'?'}
migration code-change in msa: {'?'}
migration code-change in reg: {'?'}
migration code-move within reg: {'?'}
migration prev res in sunbelt: {'?'}
country of birth father: {'?'}
country of birth mother: {'?'}
country of birth self: {'?'}


In [10]:
def calculate_missingness(df):
    for column in df.columns:
        if df[column].dtype != 'object':
            continue
        missing_count = (df[column].str.strip() == "?").sum()
        total_count = len(df[column])
        missing_percentage = (missing_count / total_count) * 100
        if missing_count > 0:
            print(f"{column}: {missing_percentage:.2f}% missing")

calculate_missingness(train_df)

state of previous residence: 0.35% missing
migration code-change in msa: 49.97% missing
migration code-change in reg: 49.97% missing
migration code-move within reg: 49.97% missing
migration prev res in sunbelt: 49.97% missing
country of birth father: 3.36% missing
country of birth mother: 3.07% missing
country of birth self: 1.70% missing


In [11]:
# Maybe these values are capped.
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,199523.0,34.494199,22.310895,0.0,15.0,33.0,50.0,90.0
detailed industry recode,199523.0,15.35232,18.067129,0.0,0.0,0.0,33.0,51.0
detailed occupation recode,199523.0,11.306556,14.454204,0.0,0.0,0.0,26.0,46.0
wage per hour,199523.0,55.426908,274.896454,0.0,0.0,0.0,0.0,9999.0
capital gains,199523.0,434.71899,4697.53128,0.0,0.0,0.0,0.0,99999.0
capital losses,199523.0,37.313788,271.896428,0.0,0.0,0.0,0.0,4608.0
dividends from stocks,199523.0,197.529533,1984.163658,0.0,0.0,0.0,0.0,99999.0
instance weight,199523.0,1740.380269,993.768156,37.87,1061.615,1618.31,2188.61,18656.3
num persons worked for employer,199523.0,1.95618,2.365126,0.0,0.0,1.0,4.0,6.0
own business or self employed,199523.0,0.175438,0.553694,0.0,0.0,0.0,0.0,2.0


# Binary Classification Task

The **Census-Income (KDD)** dataset is derived from the **1994 and 1995 Current Population Surveys (CPS)** conducted by the U.S. Census Bureau.

It includes **199,523 training instances** and **99,762 test instances**.

The goal is to predict the income level based on the data provided in each row. Incomes have been binned at the $50K level to create a binary classification problem.

In [13]:
nominal = list(filter(lambda x: codebook[x]["type"] == "nominal", codebook.keys()))
continuous = list(filter(lambda x: codebook[x]["type"] == "continuous" and x != "instance weight", codebook.keys()))

In [14]:
# Encode categorical features.
encoder = OneHotEncoder(drop='first', sparse_output=False)
label_encoder = LabelEncoder()

# Fit the encoders to the training data.
encoder.fit(train_df[nominal])
label_encoder.fit(train_df["income"])

# Feature names
one_hot_feature_names = encoder.get_feature_names_out(nominal)

In [15]:
# Encode training data.
X = pd.concat([pd.DataFrame(encoder.transform(train_df[nominal]), columns=one_hot_feature_names), train_df[continuous]], axis=1)
y = label_encoder.transform(train_df["income"])


In [16]:
# Encode testing data.
X_test = pd.concat([pd.DataFrame(encoder.transform(test_df[nominal]), columns=one_hot_feature_names), test_df[continuous]], axis=1)
y_test = label_encoder.transform(test_df["income"])


In [17]:
# Fit with training data.
model = RandomForestClassifier(random_state=42)
model.fit(X, y);


In [18]:
# Predict with testing data.
predictions = model.predict(X_test)

In [19]:
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98     96413
           1       0.40      0.74      0.52      3349

    accuracy                           0.95     99762
   macro avg       0.70      0.85      0.75     99762
weighted avg       0.97      0.95      0.96     99762



In [20]:
print(balanced_accuracy_score(predictions, y_test))

0.8496500202793771


In [21]:
pred_probas = model.predict_proba(X_test)
pred_probas_firsts = [prob[1] for prob in pred_probas]
print(roc_auc_score(y_test, pred_probas_firsts))

0.9377514134927171


## Imbalance

In [23]:
def display_class_imbalance(target):
    class_0_count = np.count_nonzero(target == 0)
    class_1_count = np.count_nonzero(target == 1)
    total_count = len(target)

    # Calculate percentages
    class_0_percentage = (class_0_count / total_count) * 100
    class_1_percentage = (class_1_count / total_count) * 100

    print(f"Class 0: {class_0_count} instances, {class_0_percentage}% of the total")
    print(f"Class 1: {class_1_count} instances, {class_1_percentage}% of the total")

In [24]:
display_class_imbalance(y_test)

Class 0: 93576 instances, 93.7992421964275% of the total
Class 1: 6186 instances, 6.200757803572503% of the total


In [25]:
def resample(X, y, sampler_class, random_state=42):
    start_time = time.time()
    sampler = sampler_class(random_state=random_state)
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"{elapsed_time:.2f} seconds")
    return X_resampled, y_resampled

#### Random Oversampling

In [27]:
X_ros, y_ros = resample(X, y, RandomOverSampler)

0.62 seconds


#### Random Undersampling

In [29]:
X_rus, y_rus = resample(X, y, RandomUnderSampler)

0.06 seconds


#### Cluster Centroids

In [31]:
X_cc, y_cc = resample(X, y, ClusterCentroids)

677.62 seconds


#### Synthetic Minority Oversampling Technique (SMOTE)

In [33]:
X_smote, y_smote = resample(X, y, SMOTE)

2.29 seconds


#### SMOTE and Edited Nearest Neighbors (SMOTEENN)

In [35]:
X_smoteenn, y_smoteenn = resample(X, y, SMOTEENN)

375.13 seconds


### Evaluate Sampling

In [36]:
model_ros = RandomForestClassifier(random_state=42)
model_ros.fit(X_ros, y_ros);

In [38]:
model_rus = RandomForestClassifier(random_state=42)
model_rus.fit(X_rus, y_rus);

In [40]:
model_cc = RandomForestClassifier(random_state=42)
model_cc.fit(X_cc, y_cc);

In [42]:
model_smote = RandomForestClassifier(random_state=42)
model_smote.fit(X_smote, y_smote);

In [44]:
model_smoteenn = RandomForestClassifier(random_state=42)
model_smoteenn.fit(X_smoteenn, y_smoteenn);

In [105]:
models = [
    (model_ros, "Random Undersampling"),
    (model_rus, "Random Oversampling"),
    (model_cc, "Cluster Centroids"),
    (model_smote, "SMOTE"),
    (model_smoteenn, "SMOTEENN")
]

for model, text in models:
    print(f"{text}: ")
    predictions = model.predict(X_test)
    print(classification_report(predictions, y_test))
    print(f"Balanced Accuracy: {balanced_accuracy_score(predictions, y_test)}")
    print("")

Random Undersampling: 
              precision    recall  f1-score   support

           0       0.98      0.97      0.97     94827
           1       0.49      0.62      0.55      4935

    accuracy                           0.95     99762
   macro avg       0.74      0.79      0.76     99762
weighted avg       0.96      0.95      0.95     99762

Balanced Accuracy: 0.7937609275544271

Random Oversampling: 
              precision    recall  f1-score   support

           0       0.85      0.99      0.91     79770
           1       0.90      0.28      0.42     19992

    accuracy                           0.85     99762
   macro avg       0.87      0.63      0.67     99762
weighted avg       0.86      0.85      0.81     99762

Balanced Accuracy: 0.6347565050841698

Cluster Centroids: 
              precision    recall  f1-score   support

           0       0.53      0.99      0.69     50143
           1       0.95      0.12      0.21     49619

    accuracy                           