## SHAP VALUES

In [56]:
import shap
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
import pickle

In [57]:
from sklearn.base import BaseEstimator, TransformerMixin

# Calcula el número de prestamos por wallet_address
class borrow_times(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.borrow_per_cli = None

    def fit(self, X, y=None):
        data = X.copy()
        object_var = data.select_dtypes(include='object').columns.to_list()
        vars = object_var + ['borrow_timestamp']

        self.borrow_per_cli = data[vars].groupby(*object_var).count()
        self.borrow_per_cli.rename(columns={'borrow_timestamp': 'borrow_times'}, inplace=True)
        self.borrow_per_cli.reset_index(inplace=True)
        return self

    def transform(self, X, y=None):
        data = X.copy()
        new_X = pd.merge(data, self.borrow_per_cli, on='wallet_address', how='left').fillna(0)
        new_X = new_X.sort_index(axis=1)
        return new_X

    def set_output(self,transform='default'):
        #No modificar este método
        return self
class tx_diff(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ts_diff_tx = None

    def fit(self, X, y=None):
        data = X.copy()
        self.data = data
        return self

    def transform(self, X, y=None):
        data = X.copy()
        data['ts_diff_tx'] = data['last_tx_timestamp'] - data['first_tx_timestamp']
        data.rename(columns={'risky_first_last_tx_timestamp_diff':'ts_diff_risky_tx'}, inplace=True)
        data.drop(columns=['last_tx_timestamp',
                           'first_tx_timestamp',
                           'risky_last_tx_timestamp',
                           'risky_first_tx_timestamp',
                        #    'borrow_timestamp'
                           ], inplace=True)

        new_data = data.sort_index(axis=1)
        return new_data

    def set_output(self,transform='default'):
        return self
class search_binary(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.binary_cols = []

    def fit(self, X, y=None):
        data = X.copy()
        for col in data.columns:
            diff_values = len(data[col].value_counts())
            is_binary = diff_values == 2
            if is_binary:
                self.binary_cols.append(col)
        return self

    def transform(self, X, y=None):
        data = X.copy()
        if self.binary_cols:
            binary_col = self.binary_cols[0] if isinstance(self.binary_cols, list) else self.binary_cols
            data[binary_col] = data[binary_col].astype('category')
        return data

    def set_output(self, transform='default'):
        return self
class time_tranf(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.time_cols = []

    def fit(self, X, y=None):
        data = X.copy()
        tcols = [col for col in data.columns if "timestamp" in col]
        self.time_cols = tcols
        return self

    def transform(self, X, y=None):
        data = X.copy()
        for col in self.time_cols:
            min_ms = data[col].min()
            data[col] = data[col] - min_ms
        return data

    def set_output(self,transform='default'):
        return self
class CategoryToInt(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.astype(np.int64)

    def set_output(self,transform='default'):
        return self

In [58]:
X_t0 = pd.read_parquet('X_t0.parquet')
X_t0 = X_t0[sorted(X_t0.columns)]
X_t0

Unnamed: 0,avg_gas_paid_per_tx_eth,avg_risk_factor,avg_weighted_risk_factor,borrow_amount_avg_eth,borrow_amount_sum_eth,borrow_block_number,borrow_count,borrow_repay_diff_eth,borrow_timestamp,deposit_amount_sum_eth,...,total_balance_eth,total_collateral_avg_eth,total_collateral_eth,total_gas_paid_eth,unique_borrow_protocol_count,unique_lending_protocol_count,wallet_address,wallet_age,withdraw_amount_sum_eth,withdraw_deposit_diff_if_positive_eth
0,0.000981,0.000000,0.000000e+00,0.000000,0.000000,7711117,0,0.000000,1.557197e+09,44.410991,...,58.317987,0.000000,44.479139,0.397391,0,1,0x502cb8985b2c92a8d4bf309cdaa89de9be442708,19973049.0,0.000000,0.0
1,0.000983,0.000001,0.000000e+00,0.157110,0.157110,7711123,1,0.157110,1.557197e+09,44.410991,...,58.317987,44.479139,44.479139,0.399063,1,1,0x502cb8985b2c92a8d4bf309cdaa89de9be442708,19973188.0,0.000000,0.0
2,0.000985,0.002477,3.873783e-04,0.271637,0.543275,7711126,2,0.543275,1.557197e+09,44.410991,...,58.317987,44.479139,44.479139,0.400895,1,1,0x502cb8985b2c92a8d4bf309cdaa89de9be442708,19973238.0,0.000000,0.0
3,0.010786,0.000000,0.000000e+00,0.000000,0.000000,7711672,0,0.000000,1.557205e+09,0.000772,...,0.000000,0.000000,0.000767,0.604001,0,1,0xa7ff0d561cd15ed525e31bbe0af3fe34ac2059f6,13922.0,0.000000,0.0
4,0.001003,0.000000,0.000000e+00,0.000000,0.000000,7712572,0,0.000000,1.557217e+09,0.010000,...,0.000000,0.000000,0.010000,0.003008,0,1,0xbd9ed130a53cfafcf81502e4d35329a6c4d53410,264.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44291,0.005283,0.593777,2.731692e+02,119.897815,1079.080336,10336599,9,127.117317,1.593110e+09,1807.190939,...,0.132790,675.530612,721.151850,0.280005,1,1,0xa821dee44fe91e79535762a466f6a09fc6727891,315154.0,1102.842264,0.0
44292,0.009445,0.442679,1.802430e+02,166.350258,8151.162652,10336621,49,415.398208,1.593110e+09,12857.632963,...,0.666812,527.826927,990.611586,5.723699,1,1,0xc25c5e0495287cc3a380703b2b665da5964d35ea,3290294.0,12522.010996,0.0
44293,0.005540,0.398174,8.364824e+02,440.746450,2203.732249,10336624,5,2203.732249,1.593110e+09,8361.107430,...,7073.097271,4761.150203,8201.693929,2.343277,1,1,0xa0f75491720835b36edc92d06ddc468d201e9b73,24797983.0,110.309024,0.0
44294,0.006901,0.000002,4.354998e-14,0.004003,0.004003,10336627,1,0.000000,1.593110e+09,0.021024,...,36.836826,0.021536,0.021359,0.331227,1,1,0x07582f51171839586e42a46d4f68c70a5eb72f93,31451728.0,0.000000,0.0


In [59]:
X_t0 = X_t0.drop(columns='wallet_address')

In [60]:
with open(".//xgb_best_pipe2.pkl", 'rb') as file:
    xgb_pipe = pickle.load(file)

xgb_step = xgb_pipe.named_steps['xgb_clf']
xgb_step.enable_categorical = True

In [61]:
explainer = shap.TreeExplainer(xgb_step)
shap_values = explainer(X_t0)

XGBoostError: [00:28:30] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0c55ff5f71b100e98-1\xgboost\xgboost-ci-windows\src\c_api\c_api_utils.h:129: Check failed: std::accumulate(shape.cbegin(), shape.cend(), static_cast<bst_ulong>(1), std::multiplies<>{}) == chunksize * rows (3410792 vs. 3322200) : 