In [1]:
import os
import json
import argparse
from random import Random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


from lightgbm import LGBMClassifier

import shap

In [3]:
config_path = 'config.json'
label = 1
split_ratio = 0.8
seed = 22

# load config file
with open(config_path, "r") as config_file:
    config = json.load(config_file)

# forest_preprocessing = config["forest_preprocessing"] == 1

# Get drug AST label
with open(config['label_path'], 'rb') as label_file:
    labels = pickle.load(label_file)

# labels = ['EUCASTv11_P/TZ', 'EUCASTv11_TOL/TZ']

label_index  = int(label) - 1
label = labels[label_index]

# Get feature list
with open(config['feature_path'], 'rb') as feature_file:
    features = pickle.load(feature_file)


df = pd.read_csv(config["input_path"])
X = df[features]
y = df[label]

mask = y.notnull()

# filter out isolates without AST label
y = y[mask].reset_index(drop=True)
X = X[mask].reset_index(drop=True)

Columns (17498,17504) have mixed types. Specify dtype option on import or set low_memory=False.


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    stratify=y, 
                                                    train_size=split_ratio,
                                                    random_state=seed)


print(f"X train: {X_train.shape}\nX test: {X_test.shape}\nY train: {y_train.shape}\nY test: {y_test.shape}")

X train: (684, 17485)
X test: (171, 17485)
Y train: (684,)
Y test: (171,)


In [28]:
gbm_clf = LGBMClassifier(min_data_in_leaf=3)
gbm_clf.fit(X_train, y_train)

explainer_gbm = shap.TreeExplainer(gbm_clf, l1_reg=False)
shap_values = explainer_gbm.shap_values(X_test)

LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


In [29]:
shap_values

[array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])]

In [20]:
feature_imp = pd.DataFrame(sorted(zip(gbm_clf.feature_importances_, X_train.columns)), columns=['Value','Feature']).sort_values(by='Value', ascending=False)


In [24]:
most_30_features = feature_imp.head(30)

In [27]:
most_30_features['Feature'] = most_30_features.apply(lambda row: row['Feature'].split('~')[-1], axis=1)

most_30_features


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Value,Feature
17484,33,group_5931
17483,31,folP_1
17482,27,group_4146
17481,27,group_23088
17480,27,group_11179
17479,25,group_16087
17478,24,hcpA_1
17477,24,group_12865
17476,23,group_12425
17475,21,group_19078
