In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from bedrock_client.bedrock.analyzer.model_analyzer import ModelAnalyzer
from bedrock_client.bedrock.analyzer import ModelTypes
from bedrock_client.bedrock.api import BedrockApi
from bedrock_client.bedrock.metrics.service import ModelMonitoringService
import logging

pip install 'aif360[AdversarialDebiasing]'


In [2]:
def load_dataset(filepath, target):
    """Read a CSV file and split it into a feature frame and a label array.

    A binary ``large_rings`` column is added (1 where ``Rings`` is greater
    than 10, otherwise 0). Rows containing any missing value are then
    removed, and a warning with the number of removed rows is printed when
    that happens. Finally the ``target`` column is separated from the rest.

    Args:
        filepath: Location of the CSV file, handed to ``pd.read_csv``.
        target: Name of the column that holds the labels.

    Returns:
        A ``(features, labels)`` tuple: the DataFrame without the ``target``
        column, and the values of the ``target`` column as an array.
    """
    raw = pd.read_csv(filepath)
    raw['large_rings'] = (raw['Rings'] > 10).astype(int)

    # Keep only fully populated rows and report how many were discarded
    complete = raw.dropna(how="any", axis=0)
    num_rows_dropped = len(raw) - len(complete)
    if num_rows_dropped > 0:
        print(f"Warning - dropped {num_rows_dropped} rows with NA data.")

    labels = complete[target].values
    features = complete.drop(columns=target)

    return features, labels

In [3]:
def train_log_reg_model(X, y, seed=0, C=1, verbose=False):
    """Fit a standardize-then-logistic-regression pipeline on ``X`` and ``y``.

    The scaler is fitted on ``X``, the classifier is fitted on the scaled
    features, and the two fitted steps are then wrapped in a ``Pipeline``
    under the step names ``'scaling'`` and ``'model'``.

    Args:
        X: Training features.
        y: Training labels.
        seed: Passed to ``LogisticRegression`` as ``random_state``.
        C: Passed to ``LogisticRegression`` as ``C``.
        verbose: When truthy, progress messages are printed.

    Returns:
        The ``Pipeline`` holding the fitted scaler and classifier.
    """
    def report(*parts):
        # Progress output is emitted only when the caller asked for it
        if verbose:
            print(*parts)

    report('\nTraining\nScaling...')
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    report('Fitting...')
    report('C:', C)
    classifier = LogisticRegression(random_state=seed, C=C, max_iter=4000)
    classifier.fit(X_scaled, y)

    report('Chaining pipeline...')
    fitted_pipeline = Pipeline([('scaling', scaler), ('model', classifier)])

    report('Training Done.')

    return fitted_pipeline

In [4]:
def compute_log_metrics(pipe,
                        x_test,
                        y_test,
                        y_test_onehot):
    """Score ``pipe`` on the test data, print the scores and log them to Bedrock.

    Accuracy and macro-averaged precision, recall and F1 are computed from
    the predicted labels; macro ROC AUC (one-vs-rest) and macro average
    precision are computed from the predicted probabilities against
    ``y_test_onehot``. All six values are printed and sent to Bedrock with
    ``log_metric``; the flattened indicator labels and probabilities are
    sent with ``log_chart_data``.

    Args:
        pipe: Fitted estimator exposing ``predict`` and ``predict_proba``.
        x_test: Test features.
        y_test: Test labels, compared against ``pipe.predict(x_test)``.
        y_test_onehot: Label-indicator array for the test labels
            (presumably shaped (n_samples, n_classes) - confirm with caller).

    Returns:
        A ``(probabilities, predictions)`` tuple as returned by
        ``pipe.predict_proba`` and ``pipe.predict``.
    """
    probabilities = pipe.predict_proba(x_test)
    predictions = pipe.predict(x_test)

    accuracy = metrics.accuracy_score(y_test, predictions)
    macro_precision = metrics.precision_score(y_test, predictions, average='macro')
    macro_recall = metrics.recall_score(y_test, predictions, average='macro')
    macro_f1 = metrics.f1_score(y_test, predictions, average='macro')
    macro_roc_auc = metrics.roc_auc_score(
        y_test_onehot, probabilities, average='macro', multi_class='ovr')
    macro_avg_precision = metrics.average_precision_score(
        y_test_onehot, probabilities, average='macro')
    print("\nEvaluation\n"
          f"\tAccuracy                  = {accuracy:.4f}\n"
          f"\tPrecision (macro)         = {macro_precision:.4f}\n"
          f"\tRecall (macro)            = {macro_recall:.4f}\n"
          f"\tF1 score (macro)          = {macro_f1:.4f}\n"
          f"\tROC AUC (macro)           = {macro_roc_auc:.4f}\n"
          f"\tAverage precision (macro) = {macro_avg_precision:.4f}")

    # Bedrock Logger: captures model metrics
    bedrock = BedrockApi(logging.getLogger(__name__))

    # `log_chart_data` assumes binary classification.
    # For multiclass labels a "micro-average" is used instead: every
    # (sample, class) pair is scored jointly by flattening both arrays, so
    # the same `log_chart_data` method can be reused.
    # See https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html
    flat_labels = y_test_onehot.ravel().astype(int).tolist()  # list of int
    flat_scores = probabilities.ravel().astype(float).tolist()  # list of float
    bedrock.log_chart_data(flat_labels, flat_scores)

    # Logged in the same order as they are printed above
    named_scores = (
        ("Accuracy", accuracy),
        ("Precision (macro)", macro_precision),
        ("Recall (macro)", macro_recall),
        ("F1 Score (macro)", macro_f1),
        ("ROC AUC (macro)", macro_roc_auc),
        ("Avg precision (macro)", macro_avg_precision),
    )
    for metric_name, metric_value in named_scores:
        bedrock.log_metric(metric_name, metric_value)

    return probabilities, predictions

In [5]:
# Load the train and test splits; the "Type" column is split off as the labels.
x_train, y_train = load_dataset(filepath="data/abalone_train.csv", target="Type")
x_test, y_test = load_dataset(filepath="data/abalone_test.csv", target="Type")

In [6]:
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
# sklearn `roc_auc_score` and `average_precision_score` expect
# binary label indicators with shape (n_samples, n_classes)
y_train_onehot = enc.fit_transform(y_train.reshape(-1, 1))
# Encode the test labels with the categories learned from the training
# labels. Re-fitting on the test labels would renumber the classes whenever
# the test split does not contain exactly the same label set, so the test
# columns would no longer line up with the classes the model was trained on.
# NOTE: with handle_unknown='ignore', a test label unseen during training is
# encoded as an all-zero row, which the argmax below maps to class 0.
y_test_onehot = enc.transform(y_test.reshape(-1, 1))

# Convert target variable to numeric values
# ModelMonitoringService.export_text expects both features
# and inference to be numeric values
y_train = np.argmax(y_train_onehot, axis=1)
y_test = np.argmax(y_test_onehot, axis=1)

In [7]:
# Show which integer each original label was mapped to
label_names = enc.categories_[0]
for code in range(len(label_names)):
    print(f'{label_names[code]} : {code}')

F : 0
I : 1
M : 2


In [8]:
# Fit the scaling + logistic regression pipeline, printing progress messages
pipe = train_log_reg_model(x_train, y_train, seed=0, C=0.1, verbose=True)


Training
Scaling...
Fitting...
C: 0.1
Chaining pipeline...
Training Done.


In [9]:
# The ERROR lines logged by this call can be ignored; this is for testing purposes
test_prob, test_pred = compute_log_metrics(pipe, x_test, y_test, y_test_onehot)

ERROR:__main__:BEDROCK API TOKEN not found
ERROR:__main__:BEDROCK API TOKEN not found
ERROR:__main__:BEDROCK API TOKEN not found
ERROR:__main__:BEDROCK API TOKEN not found
ERROR:__main__:BEDROCK API TOKEN not found
ERROR:__main__:BEDROCK API TOKEN not found
ERROR:__main__:BEDROCK API TOKEN not found



Evaluation
	Accuracy                  = 0.5450
	Precision (macro)         = 0.5315
	Recall (macro)            = 0.5447
	F1 score (macro)          = 0.5305
	ROC AUC (macro)           = 0.7617
	Average precision (macro) = 0.6046


In [10]:
# Fairness configuration, keyed by the feature column that defines the groups.
large_rings_groups = {
    # privileged group: rows where large_rings == 1
    'privileged_attribute_values': [1],
    'privileged_group_name': 'Large',
    # unprivileged group: rows where large_rings == 0
    'unprivileged_attribute_values': [0],
    'unprivileged_group_name': 'Small',
}
CONFIG_FAI = {'large_rings': large_rings_groups}

# Train Shap model and calculate xafai metrics
# NOTE(review): `pipe[1]` is only the classifier step, which was fitted on
# standardized features, while `x_train` / `x_test` are passed here as
# loaded (unscaled) - confirm this is what ModelAnalyzer expects.
analyzer = ModelAnalyzer(pipe[1],
                         model_name='logistic',
                         model_type=ModelTypes.LINEAR)
analyzer = analyzer.train_features(x_train)
analyzer = analyzer.test_features(x_test)
analyzer = analyzer.fairness_config(CONFIG_FAI)
analyzer = analyzer.test_labels(y_test)
analyzer = analyzer.test_inference(test_pred)
analyzer.analyze()

([array([[ 0.01169004, -0.00898631, -0.00083427, ..., -0.00414216,
          -0.357427  , -0.00378622],
         [ 0.04107236, -0.02472033, -0.00801725, ..., -0.01115385,
          -0.72562784, -0.00378622],
         [ 0.01436116, -0.01003524, -0.00083427, ..., -0.00479139,
           0.01077384, -0.00378622],
         ...,
         [-0.00700781,  0.00255198, -0.00173214, ..., -0.0041097 ,
          -0.357427  , -0.00378622],
         [ 0.05709909, -0.04045435, -0.01160875, ..., -0.01316647,
          -0.54152742, -0.00378622],
         [ 0.01436116, -0.00688844, -0.00173214, ..., -0.0028437 ,
           0.56307511,  0.00715674]]),
  array([[-2.99918879e-02,  1.36817245e-03,  1.00029796e-03, ...,
           4.84883631e-03,  7.86112813e-01, -1.16156767e-02],
         [-1.05375002e-01,  3.76369031e-03,  9.61280818e-03, ...,
           1.30567701e-02,  1.59592125e+00, -1.16156767e-02],
         [-3.68448983e-02,  1.52787364e-03,  1.00029796e-03, ...,
           5.60883018e-03, -2.36956268

In [11]:
# Hand the training feature columns and the test predictions to the
# Bedrock monitoring service.
# `DataFrame.items()` is used instead of `iteritems()`, which is deprecated
# and was removed in pandas 2.0; both yield (column name, Series) pairs.
# NOTE(review): features come from the training split while inference comes
# from the test split - confirm this pairing is intended.
ModelMonitoringService.export_text(
    features=x_train.items(),  # assumes numeric values
    inference=test_pred.tolist(),  # assumes numeric values
)

In [12]:
# Inspect the (column name, Series) pairs that are passed to `export_text`.
# `DataFrame.items()` replaces `iteritems()`, which is deprecated and was
# removed in pandas 2.0; the yielded pairs are the same.
for item in x_train.items():
    print(item)

('LongestShell', 0       0.245
1       0.620
2       0.455
3       0.480
4       0.430
        ...  
2917    0.455
2918    0.665
2919    0.440
2920    0.505
2921    0.580
Name: LongestShell, Length: 2922, dtype: float64)
('Diameter', 0       0.195
1       0.510
2       0.345
3       0.355
4       0.325
        ...  
2917    0.350
2918    0.535
2919    0.350
2920    0.405
2921    0.450
Name: Diameter, Length: 2922, dtype: float64)
('Height', 0       0.060
1       0.180
2       0.105
3       0.115
4       0.115
        ...  
2917    0.140
2918    0.225
2919    0.135
2920    0.140
2921    0.120
Name: Height, Length: 2922, dtype: float64)
('WholeWeight', 0       0.0950
1       1.3315
2       0.4005
3       0.4725
4       0.3865
         ...  
2917    0.5725
2918    2.1835
2919    0.4350
2920    0.8750
2921    0.8685
Name: WholeWeight, Length: 2922, dtype: float64)
('ShuckedWeight', 0       0.0445
1       0.5940
2       0.1640
3       0.2065
4       0.1475
         ...  
2917    0.1965
2918