<a href="https://colab.research.google.com/github/beta-cancri/beta-cancri/blob/main/fairness_income_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install \
    tensorflow==2.15 \
    tensorflow-model-remediation \
    fairness-indicators==0.46.0 \
    tensorflow-model-analysis==0.46.0 \
    pandas



In [6]:
import pandas as pd
import tensorflow as tf
import tensorflow_model_analysis as tfma
from google.protobuf import text_format
from tensorflow_model_remediation import min_diff

# Step 1: Load Dataset
# The CSV is fetched over HTTP every time this cell runs.
url = "https://download.mlcc.google.com/mledu-datasets/acsincome_raw_2018.csv"
acs_df = pd.read_csv(url)

# Step 2: Convert target to binary
# The label column is overwritten in place: 1 when the value is strictly above
# LABEL_THRESHOLD, 0 otherwise. The comparison is vectorised instead of calling
# a Python lambda once per row; NaN compares False and therefore still maps to
# 0, and the resulting dtype is int64 exactly as before.
LABEL_KEY = 'PINCP'
LABEL_THRESHOLD = 50000.0
acs_df[LABEL_KEY] = (acs_df[LABEL_KEY] > LABEL_THRESHOLD).astype('int64')

# Step 3: Define the model
# One scalar Keras input per feature column (every column except the label),
# keyed by column name so the model can consume dict-of-columns datasets.
feature_names = list(acs_df.columns.drop(LABEL_KEY))
inputs = {
    name: tf.keras.Input(shape=(1,), name=name, dtype=tf.float64)
    for name in feature_names
}

# Join the (batch, 1) inputs side by side into one (batch, n_features) tensor.
# `tf.stack(..., axis=-1)` would instead add a new trailing axis and produce
# (batch, 1, n_features), carrying a stray length-1 axis through every layer
# and feeding the normalizer a different rank than it was adapted on.
x = tf.concat(list(inputs.values()), axis=-1)

# Learn per-feature mean/variance; column order matches the order of `inputs`.
# NOTE(review): the statistics are computed over all of acs_df, including rows
# that later end up in the test split - consider adapting on training rows only.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(acs_df[feature_names].to_numpy(dtype='float64'))

x = normalizer(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dense(32, activation='relu')(x)

# Sigmoid head: one probability per example, shape (batch, 1). The trailing
# axis is kept on purpose - it is the same output shape the model had before,
# and Keras uses a last dimension of 1 to resolve the 'accuracy' shorthand
# below to binary accuracy.
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

base_model = tf.keras.Model(inputs=inputs, outputs=outputs)
base_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    # An explicit name keeps the logged metric key stable ('auc') instead of
    # drifting to 'auc_1', 'auc_2', ... each time this cell is re-run.
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)

# Step 4: Convert pandas to tf.data.Dataset
def dataframe_to_dataset(df):
    """Build an unbatched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    The caller's DataFrame is not modified. The ``LABEL_KEY`` column supplies
    the labels; every remaining column becomes an entry of the feature dict,
    keyed by its column name.
    """
    label_column = df[LABEL_KEY]
    feature_columns = df.drop(columns=LABEL_KEY)
    return tf.data.Dataset.from_tensor_slices(
        (dict(feature_columns), label_column)
    )

# Step 5: Split and train
# Seeded 80/20 split: sample the training rows, then keep every row whose
# index label was not sampled as the held-out test set.
train_df = acs_df.sample(frac=0.8, random_state=42)
test_df = acs_df.drop(index=train_df.index)

# Both splits go through the same conversion and are batched identically.
train_ds, test_ds = (
    dataframe_to_dataset(split_df).batch(32)
    for split_df in (train_df, test_df)
)

base_model.fit(train_ds, epochs=10)

# Step 6: Predict and add to test_df
preds = base_model.predict(test_ds)

# `test_ds` is built from `test_df` in row order, so there must be exactly one
# prediction per row; fail loudly here rather than mis-align scores silently.
assert len(preds) == len(test_df), "prediction count does not match test rows"

# Flatten before assigning: a column is one-dimensional, so hand pandas an
# explicit 1-D vector instead of relying on it to accept a 2-D single-column
# array. `ravel()` is a no-op when the predictions are already 1-D.
test_df['PRED'] = preds.ravel()

# Swap the numeric SEX codes for readable labels (1.0 -> Male, 2.0 -> Female);
# any other value is left unchanged by `replace`.
test_df['SEX'] = test_df['SEX'].replace({1.0: "Male", 2.0: "Female"})

# Step 7: Fairness Indicators config
# Column names the evaluation reads from the DataFrame: the model score and
# the attribute used to slice the metrics.
PREDICTION_KEY = 'PRED'
SENSITIVE_ATTRIBUTE_KEY = 'SEX'

# EvalConfig written in protobuf text format. The three %s placeholders are
# filled, in order, with the prediction column, the label column and the
# slicing column. The request covers example count, binary accuracy, AUC and
# Fairness Indicators at a 0.5 threshold, sliced by the sensitive attribute.
eval_config_pbtxt = """
model_specs {
  prediction_key: "%s"
  label_key: "%s"
}
metrics_specs {
  metrics { class_name: "ExampleCount" }
  metrics { class_name: "BinaryAccuracy" }
  metrics { class_name: "AUC" }
  metrics { class_name: "FairnessIndicators" config: '{ "thresholds": [0.5] }' }
}
slicing_specs { feature_keys: "%s" }
""" % (PREDICTION_KEY, LABEL_KEY, SENSITIVE_ATTRIBUTE_KEY)

eval_config = text_format.Parse(eval_config_pbtxt, tfma.EvalConfig())

# Step 8: Run fairness evaluation
# Evaluates directly on the DataFrame, which already holds the prediction,
# label and slicing columns named in the config.
fairness_result = tfma.analyze_raw_data(test_df, eval_config)

# The status message previously contained mojibake (the UTF-8 bytes of the
# check-mark emoji decoded as cp1252); the intended character is restored.
print("✅ Fairness evaluation complete.")


Epoch 1/10
 1695/41613 [>.............................] - ETA: 3:03 - loss: 0.4559 - accuracy: 0.7770 - auc_2: 0.8504

KeyboardInterrupt: 

In [7]:
# Boolean row masks over the training split. SEX is still numeric here
# (2.0 = Female, 1.0 = Male); a label of 1 means income above the threshold.
positive_label_mask = train_df[LABEL_KEY] == 1
female_mask = train_df[SENSITIVE_ATTRIBUTE_KEY] == 2.0
male_mask = train_df[SENSITIVE_ATTRIBUTE_KEY] == 1.0

# Female group, positively labeled (earned more than 50K)
sensitive_group_pos = train_df.loc[female_mask & positive_label_mask]

# Male group, positively labeled (earned more than 50K)
non_sensitive_group_pos = train_df.loc[male_mask & positive_label_mask]

# Optional: report the size of each group
print(len(sensitive_group_pos), 'positively labeled Female examples')
print(len(non_sensitive_group_pos), 'positively labeled Male examples')


184882 positively labeled Female examples
306264 positively labeled Male examples


In [9]:
MIN_DIFF_BATCH_SIZE = 50

# Turn each positively labeled group into an unbatched tf.data.Dataset.
sensitive_group_ds, non_sensitive_group_ds = map(
    dataframe_to_dataset,
    (sensitive_group_pos, non_sensitive_group_pos),
)

# Batch both groups with the same fixed size; drop_remainder=True discards a
# trailing partial batch so every batch has exactly MIN_DIFF_BATCH_SIZE rows.
sensitive_group_batches, non_sensitive_group_batches = (
    group_ds.batch(MIN_DIFF_BATCH_SIZE, drop_remainder=True)
    for group_ds in (sensitive_group_ds, non_sensitive_group_ds)
)
