In [3]:
import shap
from sklearn.tree import DecisionTreeRegressor, plot_tree, _tree
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.model_selection import  train_test_split
import pickle
import xgboost
import numpy as np
from sklearn import preprocessing

In [4]:
def get_df(path="./data/titanic.csv"):
    """Load the Titanic CSV and return a numeric modelling frame without NaNs.

    Processing steps, in order:
      1. keep only the modelling columns (features plus ``Survived``);
      2. replace ``Cabin`` by its first character, or by ``"No Data"`` when
         the entry is not a string;
      3. drop every row that still contains a missing value and reset the index;
      4. label-encode the ``Sex``, ``Cabin`` and ``Embarked`` columns.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the path this notebook has
        always used, so existing ``get_df()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The cleaned and encoded frame.
    """
    columns = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Cabin", "Embarked", "Survived"]
    # Take an explicit copy: assigning a column on a frame obtained by column
    # selection is the chained-assignment pattern pandas warns about.
    df = pd.read_csv(path)[columns].copy()
    df["Cabin"] = [x[0] if isinstance(x, str) else "No Data" for x in df["Cabin"].values]
    df = df.dropna().reset_index(drop=True)

    for column in ["Sex", "Cabin", "Embarked"]:
        # A fresh encoder per column; fit and transform use the same values.
        df[column] = preprocessing.LabelEncoder().fit_transform(df[column].values)

    return df

In [6]:
def get_X_y(df):
    """Split ``df`` into a feature frame and the ``Survived`` target column.

    Returns a ``(X, y)`` tuple: ``X`` is ``df`` without the ``Survived``
    column, ``y`` is the ``Survived`` column itself. ``df`` is not modified.
    """
    target_column = "Survived"
    features = df.drop(columns=[target_column])
    target = df[target_column]
    return features, target

def get_model(X, y):
    """Fit an XGBoost classifier on the training part of an 80/20 split.

    The data is split with ``train_test_split(test_size=0.2, random_state=42)``;
    only the training portion is used and the held-out portion is discarded.
    Returns the fitted ``XGBClassifier`` (100 estimators, max depth 2).
    """
    split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
    # train_test_split yields (X_train, X_test, y_train, y_test).
    train_features, train_target = split_parts[0], split_parts[2]
    classifier = xgboost.XGBClassifier(n_estimators=100, max_depth=2)
    return classifier.fit(train_features, train_target)

In [27]:
def fit_decition_tree(X, y, depth):
    """Fit a ``DecisionTreeRegressor`` limited to ``depth`` levels on ``(X, y)``.

    Returns the fitted regressor.
    """
    regressor = DecisionTreeRegressor(max_depth=depth)
    return regressor.fit(X, y)

def get_shap_values_for_tree(model, X, y):
    """Compute SHAP values for ``X`` with a ``shap.TreeExplainer`` built on ``model``.

    ``y`` is forwarded to ``shap_values`` as the ``y`` keyword argument.
    Returns whatever ``TreeExplainer.shap_values`` returns.
    """
    return shap.TreeExplainer(model).shap_values(X, y=y)

def get_y_proba_for_tree(model, X):
    """Return column 1 of ``model.predict_proba(X)``.

    # presumably column 1 is the probability of the positive class — this
    # depends on the model's class ordering; verify against the fitted model.
    """
    probabilities = model.predict_proba(X)
    return probabilities[:, 1]

def get_cleaned_data_for_shap(df):
    """Drop rows with missing values, then split into features and target.

    Returns ``(X, y)`` where ``X`` is the NaN-free frame without the
    ``Survived`` column and ``y`` is its ``Survived`` column. The original
    row index is kept (no reset), and ``df`` itself is not modified.
    """
    complete_rows = df.dropna()
    features = complete_rows.drop(columns=["Survived"])
    target = complete_rows["Survived"]
    return features, target

def get_feature_index(feature_name, X):
    """Return the positional index of ``feature_name`` among ``X.columns``.

    Raises ``ValueError`` when the name is not one of the columns; when a
    name occurs more than once, the first position is returned.
    """
    column_names = list(X.columns)
    return column_names.index(feature_name)

def get_feature_shap_vals(shap_values, feature_index):
    """Select column ``feature_index`` from the 2-D indexable ``shap_values``.

    Returns ``shap_values[:, feature_index]``, i.e. one value per row.
    """
    feature_column = shap_values[:, feature_index]
    return feature_column

def plot_fig_for_tree(clf, feature_names):
    """Draw the fitted tree ``clf`` on a new 15x5 matplotlib figure.

    Nodes are colour-filled and labelled with ``feature_names``. Nothing is
    returned; the figure is left as the current pyplot figure.
    """
    plt.figure(figsize=(15, 5))
    plot_tree(clf, filled=True, feature_names=feature_names)
    

def get_decition_tree_for_feature(model, df, feature_name, depth, should_remove_original_feature=False):
    """Fit and plot a surrogate tree that explains one feature's SHAP values.

    The tree is fitted by ``get_decition_tree_text_for_feature`` (same
    arguments, same semantics) and then drawn with ``plot_fig_for_tree``.
    Delegating keeps the plotting and the text variants from drifting apart;
    previously this function carried a copy of that fitting code.

    Parameters
    ----------
    model : fitted tree model accepted by ``shap.TreeExplainer``
    df : pandas.DataFrame containing the features and a ``Survived`` column
    feature_name : str
        Column whose SHAP values become the regression target.
    depth : int
        Maximum depth of the surrogate tree.
    should_remove_original_feature : bool, optional
        When true, ``feature_name`` itself is excluded from the tree inputs.

    Returns
    -------
    None
    """
    clf, X = get_decition_tree_text_for_feature(
        model, df, feature_name, depth, should_remove_original_feature
    )
    plot_fig_for_tree(clf, X.columns)

def get_decition_tree_text_for_feature(model, df, feature_name, depth, should_remove_original_feature=False):
    """Fit a surrogate regression tree on the SHAP values of ``feature_name``.

    Rows of ``df`` with missing values are dropped, SHAP values are computed
    for ``model`` on the remaining rows, and a tree of at most ``depth``
    levels is fitted to predict the SHAP values of ``feature_name`` from the
    features. With ``should_remove_original_feature`` set, ``feature_name``
    is excluded from the tree inputs.

    Returns ``(clf, X)``: the fitted tree and the feature frame it was fitted on.
    """
    features, target = get_cleaned_data_for_shap(df)
    all_shap_values = get_shap_values_for_tree(model, features, target)

    # The column position must be resolved before the column is optionally dropped.
    column_position = get_feature_index(feature_name, features)
    shap_target = get_feature_shap_vals(all_shap_values, column_position)

    tree_inputs = features.drop(feature_name, axis=1) if should_remove_original_feature else features
    surrogate = fit_decition_tree(tree_inputs, shap_target, depth)
    return surrogate, tree_inputs

def get_decition_tree_model(model, df, depth):
    """Fit a surrogate regression tree on the model's predicted probabilities.

    Rows of ``df`` with missing values are dropped, ``model``'s probability
    output (via ``get_y_proba_for_tree``) is computed for the remaining rows,
    and a tree of at most ``depth`` levels is fitted to reproduce it.

    Returns ``(clf, X)``: the fitted tree and the feature frame it was fitted on.
    """
    features, _ = get_cleaned_data_for_shap(df)
    predicted_proba = get_y_proba_for_tree(model, features)
    surrogate = fit_decition_tree(features, predicted_proba, depth)
    return surrogate, features




In [66]:
def get_rules(tree, feature_names, index, res_name="Важность"):
    """Turn a fitted decision tree into one text rule per leaf.

    Every root-to-leaf path becomes a rule of the form
    ``"<res_name>=<leaf value * 100, rounded> (Если <cond> И <cond> ...)"``.
    When ``index`` is given, ``" Тогда (<res_name>_<index>=<leaf value>)"``
    (leaf value rounded to 3 decimals) is appended. Rules are ordered by the
    number of training samples in the leaf, largest first.

    Parameters
    ----------
    tree : fitted estimator exposing ``tree_``
        Only ``tree_.feature``, ``threshold``, ``children_left``,
        ``children_right``, ``value`` and ``n_node_samples`` are read.
    feature_names : sequence
        Name of each input feature, indexable by feature id.
    index : int or None
        Tag for the rule's result name; ``None`` omits the "Тогда" suffix.
        ``0`` is a valid tag (previously it was treated like ``None``).
    res_name : str, optional
        Label used for the result value in the rule text.

    Returns
    -------
    list of str
    """
    tree_ = tree.tree_
    # Leaves carry the TREE_UNDEFINED sentinel instead of a feature id; do not
    # look it up in feature_names (it would select an unrelated name, or raise
    # for short name lists).
    node_feature_names = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else None
        for i in tree_.feature
    ]

    paths = []

    def recurse(node, path, paths):
        """Depth-first walk collecting the conditions leading to every leaf."""
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = node_feature_names[node]
            threshold = np.round(tree_.threshold[node], 3)
            recurse(tree_.children_left[node], path + [f"({name} <= {threshold})"], paths)
            recurse(tree_.children_right[node], path + [f"({name} > {threshold})"], paths)
        else:
            # The last path element is the leaf payload: (value, sample count).
            paths.append(path + [(tree_.value[node], tree_.n_node_samples[node])])

    recurse(0, [], paths)

    # Largest leaves first.
    samples_count = [p[-1][1] for p in paths]
    order = list(np.argsort(samples_count))
    paths = [paths[i] for i in reversed(order)]

    rules = []
    for path in paths:
        conditions = path[:-1]
        leaf_value = path[-1][0][0][0]

        percent = int(np.round(leaf_value * 100, 0))
        rule = f"{res_name}={percent} (Если {' И '.join(conditions)})"

        # Compare against None explicitly so that index 0 keeps its suffix.
        if index is not None:
            rule += f" Тогда ({res_name}_{index}={np.round(leaf_value, 3)})"
        rules.append(rule)

    return rules

def get_rules_for_features(model_xgb, df, features, depth):
    """Collect SHAP-surrogate rules for several features into one flat list.

    For each name in ``features`` a surrogate tree of at most ``depth``
    levels is fitted on that feature's SHAP values and converted to text
    rules; the feature's position in ``features`` is passed to ``get_rules``
    as the rule index.
    """
    collected = []
    for position, feature in enumerate(features):
        surrogate, inputs = get_decition_tree_text_for_feature(model_xgb, df, feature, depth)
        collected.extend(get_rules(surrogate, inputs.columns, position))
    return collected

def get_rules_for_y_proba(model_xgb, df, depth):
    """Return text rules of a surrogate tree fitted on the model's probabilities.

    The surrogate comes from ``get_decition_tree_model``; rules are produced
    by ``get_rules`` with no index and ``"Proba"`` as the result label.
    """
    surrogate, inputs = get_decition_tree_model(model_xgb, df, depth)
    return get_rules(surrogate, inputs.columns, None, "Proba")

def format_rules_as_str(rules):
    """Join the rule strings into one text block, one rule per line."""
    line_break = "\n"
    return line_break.join(rules)

def fix_comparison(rules_str):
    """Replace encoded threshold comparisons in ``rules_str`` with readable labels.

    The substitutions are plain substring replacements applied in the order
    listed below; text that matches none of them is returned unchanged.
    """
    substitutions = (
        ("Sex > 0.5", "Sex = M"),
        ("Sex <= 0.5", "Sex = F"),
        ("Pclass > 2.5", "Pclass = High"),
        ("Pclass <= 2.5", "Pclass = Low"),
        ("Вероятность_0", "Вероятность"),
    )
    readable = rules_str
    for encoded, label in substitutions:
        readable = readable.replace(encoded, label)
    return readable

In [9]:
# Load the encoded Titanic frame, split it into features and target, and fit
# the XGBoost model that the surrogate-tree cells below explain.
df = get_df()
X, y = get_X_y(df)
model = get_model(X, y)

In [67]:
# Text rules from a depth-3 surrogate tree fitted on the model's predicted
# probabilities, with encoded thresholds rewritten as readable labels.
rules = get_rules_for_y_proba(model, df, 3)
rules_str = format_rules_as_str(rules)
rules_str = fix_comparison(rules_str)
print(rules_str)

Proba=12 (Если (Sex = M) И (Cabin > 6.0) И (Age > 3.5))
Proba=41 (Если (Sex = M) И (Cabin <= 6.0) И (Age > 14.0))
Proba=88 (Если (Sex = F) И (Pclass = Low) И (Fare <= 51.74))
Proba=56 (Если (Sex = F) И (Pclass = High) И (Fare <= 20.8))
Proba=97 (Если (Sex = F) И (Pclass = Low) И (Fare > 51.74))
Proba=23 (Если (Sex = F) И (Pclass = High) И (Fare > 20.8))
Proba=58 (Если (Sex = M) И (Cabin > 6.0) И (Age <= 3.5))
Proba=94 (Если (Sex = M) И (Cabin <= 6.0) И (Age <= 14.0))


In [34]:
# Text rules from depth-2 surrogate trees fitted on the SHAP values of Sex,
# Age and Pclass; each feature's rules are tagged with its list position.
rules = get_rules_for_features(model, df, ['Sex', 'Age', 'Pclass'], 2)
rules_str = format_rules_as_str(rules)
rules_str = fix_comparison(rules_str)
print(rules_str)

Если (Sex = M) И (Pclass = High) Тогда Важность_0=-0.921
Если (Sex = M) И (Pclass = Low) Тогда Важность_0=-1.273
Если (Sex = F) И (Pclass = Low) Тогда Важность_0=2.294
Если (Sex = F) И (Pclass = High) Тогда Важность_0=1.495
Если (Age > 8.5) И (Age <= 52.5) Тогда Важность_1=-0.172
Если (Age > 8.5) И (Age > 52.5) Тогда Важность_1=-1.182
Если (Age <= 8.5) И (SibSp <= 2.5) Тогда Важность_1=3.143
Если (Age <= 8.5) И (SibSp > 2.5) Тогда Важность_1=1.459
Если (Pclass = High) И (Sex = M) Тогда Важность_2=-0.466
Если (Pclass = Low) И (Pclass <= 1.5) Тогда Важность_2=1.018
Если (Pclass = Low) И (Pclass > 1.5) Тогда Важность_2=0.194
Если (Pclass = High) И (Sex = F) Тогда Важность_2=-1.042


Proba=12 (Если (Sex = M) И (Cabin > 6.0) И (Age > 3.5))
Proba=41 (Если (Sex = M) И (Cabin <= 6.0) И (Age > 14.0))
Proba=88 (Если (Sex = F) И (Pclass = Low) И (Fare <= 51.74))
Proba=56 (Если (Sex = F) И (Pclass = High) И (Fare <= 20.8))
Proba=97 (Если (Sex = F) И (Pclass = Low) И (Fare > 51.74))
Proba=23 (Если (Sex = F) И (Pclass = High) И (Fare > 20.8))
Proba=58 (Если (Sex = M) И (Cabin > 6.0) И (Age <= 3.5))
Proba=94 (Если (Sex = M) И (Cabin <= 6.0) И (Age <= 14.0))

In [None]:
Proba = 96% Если ((Sex = F) И (Pclass = Low) И (Fare > 51.74));
Proba = 93% Если ((Sex = M) И (Cabin <= 6.0) И (Age <= 14.0));
Proba = 88% Если ((Sex = F) И (Pclass = Low) И (Fare <= 51.74));
Proba = 58% Если ((Sex = M) И (Cabin > 6.0) И (Age <= 3.5));
Proba = 56% Если ((Sex = F) И (Pclass = High) И (Fare <= 20.8));
Proba = 12% Если ((Sex = M) И (Cabin > 6.0) И (Age > 3.5));
Proba = 41% Если ((Sex = M) И (Cabin <= 6.0) И (Age > 14.0));
Proba = 23% Если ((Sex = F) И (Pclass = High) И (Fare > 20.8));
