In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import zipfile

In [2]:
from packaging import version
from raiutils.dataset import fetch_dataset
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# OneHotEncoder takes its dense-output switch under a different keyword
# depending on the installed scikit-learn: `sparse` before 1.2 and
# `sparse_output` from 1.2 onwards.
_dense_flag = (
    "sparse"
    if version.parse(sklearn.__version__) < version.parse('1.2')
    else "sparse_output"
)
ohe_params = {_dense_flag: False}

def split_label(dataset, target_feature):
    """Separate a DataFrame into feature columns and the label column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains the label column alongside the features.
    target_feature : str
        Name of the label column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``dataset`` without the label column and
        ``y`` is a single-column DataFrame holding only the label.
    """
    label_frame = dataset[[target_feature]]
    feature_frame = dataset.drop(columns=[target_feature])
    return feature_frame, label_frame

def clean_data(X, y, target_feature):
    """Fit the preprocessing transformer on ``X`` and return the encoded data.

    Numeric columns are median-imputed. Object-dtype columns are imputed with
    the constant ``'?'`` and then one-hot encoded (unknown categories seen at
    transform time are ignored). Columns of any other dtype are not listed in
    the ColumnTransformer and are therefore left out of the result
    (ColumnTransformer's default ``remainder='drop'``).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame used to fit the transformer.
    y : pandas.DataFrame
        Frame containing the label column ``target_feature``.
    target_feature : str
        Name of the label column in ``y``.

    Returns
    -------
    tuple
        ``(X_transformed, feat_pipe, features, classes)``: the transformed
        feature matrix, the fitted ColumnTransformer, the original column
        names of ``X``, and the distinct label values found in ``y``.
    """
    features = X.columns.values.tolist()
    classes = y[target_feature].unique().tolist()
    pipe_cfg = {
        # Select every numeric dtype rather than only int64. An int64 column
        # can never contain NaN, so with an int64-only selection the median
        # imputer had nothing to impute, and float columns (including integer
        # columns that pandas read as float because of missing values) were
        # silently dropped from the model input.
        'num_cols': X.select_dtypes(include='number').columns.tolist(),
        'cat_cols': X.dtypes[X.dtypes == 'object'].index.values.tolist(),
    }
    num_pipe = Pipeline([
        ('num_imputer', SimpleImputer(strategy='median'))
    ])
    cat_pipe = Pipeline([
        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='?')),
        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', **ohe_params))
    ])
    feat_pipe = ColumnTransformer([
        ('num_pipe', num_pipe, pipe_cfg['num_cols']),
        ('cat_pipe', cat_pipe, pipe_cfg['cat_cols'])
    ])
    X = feat_pipe.fit_transform(X)
    # Echo the columns that were treated as categorical.
    print(pipe_cfg['cat_cols'])
    return X, feat_pipe, features, classes

# --- Configuration ---------------------------------------------------------
target_feature = 'Sold_HigherThan_Median'
categorical_features = []

outdirname = 'responsibleai.12.28.21'
zipfilename = outdirname + '.zip'

# --- Download the archive and unpack it into ./datasets ---------------------
fetch_dataset(
    'https://publictestdatasets.blob.core.windows.net/data/' + zipfilename,
    zipfilename,
)
with zipfile.ZipFile(zipfilename, 'r') as unzip:
    unzip.extractall('datasets')

# --- Load the training CSV and separate features from the label -------------
# NOTE(review): SalePrice / SalePriceK are removed before modelling,
# presumably because the label is derived from the sale price -- confirm.
all_data = pd.read_csv('datasets/apartments-train.csv').drop(
    columns=['SalePrice', 'SalePriceK']
)
X, y = split_label(all_data, target_feature)

# --- Stratified 75/25 split with a fixed seed -------------------------------
X_train_original, X_test_original, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=7,
    stratify=y,
)

# --- Fit preprocessing on the training split only, then reuse it on test ----
X_train, feat_pipe, features, classes = clean_data(
    X_train_original, y_train, target_feature
)
X_test = feat_pipe.transform(X_test_original)

# Labels as flat numpy arrays.
y_train = y_train[target_feature].to_numpy()
y_test = y_test[target_feature].to_numpy()

# Untransformed frames with the label column attached again.
train_data = X_train_original.assign(**{target_feature: y_train})
test_data = X_test_original.assign(**{target_feature: y_test})

Dataset download attempt 1 of 4
[]


In [3]:
# Report the shape of each transformed matrix: test split first, then train.
for split_matrix in (X_test, X_train):
    print(split_matrix.shape)

(365, 32)
(1095, 32)


In [4]:
# Train a LightGBM classifier on the preprocessed training matrix; the seed is
# pinned via random_state.
clf = LGBMClassifier(random_state=0)
# NOTE(review): `model` is whatever fit() returns -- presumably the fitted
# `clf` object itself; confirm before treating the two names as distinct.
model = clf.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 546, number of negative: 549
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000581 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2377
[LightGBM] [Info] Number of data points in the train set: 1095, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498630 -> initscore=-0.005479
[LightGBM] [Info] Start training from score -0.005479


In [5]:
from raiwidgets import ResponsibleAIDashboard
from responsibleai import RAIInsights

In [6]:
from responsibleai.feature_metadata import FeatureMetadata
# Feature metadata for RAIInsights: reuses the module-level
# `categorical_features` list (empty in this notebook) and drops no features.
feature_metadata = FeatureMetadata(categorical_features=categorical_features, dropped_features=[])

In [7]:
from sklearn.pipeline import Pipeline

# Chain the fitted preprocessing transformer and the trained model into one
# pipeline, and hand it to RAIInsights together with the untransformed
# train/test frames.
pipeline_steps = [('preprocess', feat_pipe), ('model', model)]
dashboard_pipeline = Pipeline(steps=pipeline_steps)

# Display names passed for the two classes.
class_labels = ['Less than median', 'More than median']
rai_insights = RAIInsights(
    dashboard_pipeline,
    train_data,
    test_data,
    target_feature,
    'classification',
    feature_metadata=feature_metadata,
    classes=class_labels,
)

In [8]:
# Interpretability
rai_insights.explainer.add()
# Error Analysis
rai_insights.error_analysis.add()
# Counterfactuals: total_CFs is the number of counterfactuals to generate and
# desired_class is the label they should have ('opposite' here).
rai_insights.counterfactual.add(total_CFs=10, desired_class='opposite')

In [9]:
# Compute the components registered on rai_insights. In the recorded output
# the counterfactual step dominates the runtime (about 1.5 minutes for 365
# samples); the other steps finish in under a second.
rai_insights.compute()

Causal Effects
Current Status: Generating Causal Effects.
Current Status: Finished generating causal effects.
Time taken: 0.0 min 3.650004509836435e-05 sec
Counterfactual
Current Status: Generating 10 counterfactuals for 365 samples


100%|██████████| 365/365 [01:31<00:00,  3.99it/s]


Current Status: Generated 10 counterfactuals for 365 samples.
Time taken: 1.0 min 36.79977929999586 sec
Error Analysis
Current Status: Generating error analysis reports.
Current Status: Finished generating error analysis reports.
Time taken: 0.0 min 0.11606820003362373 sec
Explanations
Current Status: Explaining 32 features
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000426 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2377
[LightGBM] [Info] Number of data points in the train set: 1095, number of used features: 30
[LightGBM] [Info] Start training from score -0.020103
Current Status: Explained 32 features.
Time taken: 0.0 min 0.32461419998435304 sec


In [11]:
import asyncio
import sys

import nest_asyncio

# Patch asyncio so the event loop already running in the notebook kernel can
# be re-entered.
nest_asyncio.apply()

# WindowsSelectorEventLoopPolicy exists only on Windows. Referencing it on any
# other platform raises AttributeError before the dashboard can start, so the
# policy switch is applied only there.
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Start the dashboard for the computed insights.
# NOTE(review): the recorded run of this cell ended with
# "ValueError: Invalid async_mode specified". That looks like a problem with
# the installed server/async packages rather than with the event-loop setup
# above -- confirm the environment before relying on this workaround.
ResponsibleAIDashboard(rai_insights)


ValueError: Invalid async_mode specified