In [1]:
import os
import sys
from dotenv import load_dotenv
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

scripts_path = os.getcwd()
env_file_path = os.path.join(scripts_path, 'env.txt')
load_dotenv(env_file_path)
main_path = os.getenv("MAIN_PATH")

sys.path.append(main_path)
data_path = os.path.join(main_path, 'data')

In [2]:
samples_file = os.path.join(data_path, 'EW_BusinessXai_v2.xlsx')
samples = pd.read_excel(samples_file, sheet_name = 'EW_MODULI_SCORE_CORPORATE')
predictions = pd.read_excel(samples_file, sheet_name = 'EW_SCORE_CORPORATE')
kpi_map = pd.read_excel(os.path.join(data_path, 'mapping descrizioni KPI_v6.1.xlsx'))
kpi_map = kpi_map.rename(columns={'ID':'KPI'})
kpi_map = kpi_map[kpi_map.NOME_COLONNA_OUTPUT=='VALORE'].drop(['TIPOLOGIA_OUTPUT','COL_TO_TRANSPOSE','VALUE','CONSTANT_TO_ADD','TO_DROP','DESCRIZIONE','DATA_INIZIO_VALIDITA','DATA_FINE_VALIDITA'], axis = 1)

weights = pd.read_excel(os.path.join(data_path, 'logit_weights.xlsx'), sheet_name='weights')
weights = weights.set_index('Variabile').join(kpi_map[(kpi_map.KPI.isin(samples.KPI_CD.unique()))].set_index('CAMPO')).reset_index().sort_values(['MODELLO','MODULO_DS']).set_index('KPI')

data = samples.pivot(index = 'ID', columns = 'KPI_CD', values='VALORE_QT')

In [3]:
logit = lambda coefs, intercept, samples: 1/(1+np.exp(-1*(np.matmul(coefs,samples.T)+intercept)))
log_odds_ratio = lambda coefs, intercept, samples: np.log(logit(coefs, intercept, samples)/(1-logit(coefs, intercept, samples)))
contrib = lambda coefs, samples: coefs * samples

In [205]:
model = 'CORPORATE' 
module = 'Andamentale Interno'

module_weights = weights[(weights.MODELLO == model) & (weights.MODULO_DS == module)]
intercept = module_weights[module_weights.Variabile == 'intercept'].Coefficiente.values[0]*0.5
coefs = module_weights[module_weights.Variabile!='intercept'].Coefficiente.sort_index()

input_data = data.loc[:,coefs.index]
prediction_ai = log_odds_ratio(coefs.values.reshape(1,-1), intercept, input_data.values)
contribs_ai = contrib(coefs, input_data)
prediction_ai

lr_ai = LogisticRegression()
lr_ai.fit(input_data, np.array([0,1]))
lr_ai.coef_ = coefs.values.reshape(1,-1)
lr_ai.intercept_ = np.array([intercept])

lr_ai.predict_proba(input_data), prediction_ai

(array([[0.5201366 , 0.4798634 ],
        [0.46080023, 0.53919977]]),
 array([[-0.08058999,  0.15712153]]))

In [183]:
distr_2543 = [0.2805,  -0.1635]
distr_2544 = [ -0.5660]
distr_2545 = [0.4860, -0.5930, -0.8117, -0.8117]
distr_2546 = [0.2830, -0.7847, -0.7847]
distr_2547 = [0.3227, -0.6792, -0.6792]
distr_2548 = [1.8174, 0.4985, -0.2373, -0.5157,  -0.5157]
distr_2549 = [0.9022, 0.1788, -0.1544,  -0.8492, -0.8492]
distr_2550 = [0.7308, 0.3218, -0.3269, -0.9101,  -0.9101, -0.9101]

In [206]:
ai_distr = np.array([distr_2543, distr_2544, distr_2545, distr_2546, distr_2547, distr_2548, distr_2549, distr_2550])
xx, yy = create_dataset(ai_distr, 200)
sum(lr_ai.predict(xx))

Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


78

In [212]:
kpi_map.set_index('KPI').loc[coefs.index,'CAMPO'].values.tolist()

['binned_ID45_UTIL_SUM2_SU_ACC_SUM2',
 'binned_ID204_MAX_GG_SCONF_INC_AVGW6',
 'binned_ID40_IMP_TOT_UTIL_SU_GIACE_MIN_AVGM2',
 'binned_ID140_IMP_EFF_PROT_RICH_SU_SCAD_SUM3',
 'binned_ID58_GG_DISP_CC_AVG3',
 'binned_ID223_IMP_INSOLUTI_SU_SCADUTI_TS_SUM3',
 'binned_ID35_FLAG_MAX_IMP_SCONF_GRT100_M1_RESPLIM',
 'binned_ID180_NUM_RATE_IMPAGATE_MAX3_altro']

In [213]:
from omnixai.data.tabular import Tabular
from omnixai.explainers.tabular import TabularExplainer
from omnixai.preprocessing.tabular import TabularTransform
from omnixai.explainers.tabular import GPTExplainer

concat_data= pd.concat([pd.DataFrame(xx,columns = kpi_map.set_index('KPI').loc[coefs.index,'CAMPO'].values.tolist()), 
                        pd.Series(yy).to_frame('label')], axis = 1)

tabular_concat_data = Tabular(
    concat_data,
    categorical_columns=[],
    target_column='label'
)

tabular_xx = tabular_concat_data.remove_target_column()

transformer = TabularTransform().fit(tabular_concat_data)
class_names = transformer.class_names

explainers = TabularExplainer(
   explainers=["shap", "mace"], # The explainers to apply
   mode="classification",                             # The task type
   data = tabular_xx,                                   # The data for initializing the explainers
   model = lr_ai,                                   # The ML model to explain
   preprocess=lambda z: transformer.transform(z),     # Converts raw features into the model inputs
)

# analysis = explainers.explain(tabular_xx)

Feature 1 is constant and will be replaced with 0.


In [217]:
lr_ai.predict_proba(transformer.transform(tabular_xx))

array([[0.55837021, 0.44162979],
       [0.51037284, 0.48962716],
       [0.79030226, 0.20969774],
       [0.55019718, 0.44980282],
       [0.29602912, 0.70397088],
       [0.67109172, 0.32890828],
       [0.3597023 , 0.6402977 ],
       [0.67346857, 0.32653143],
       [0.71904906, 0.28095094],
       [0.43752009, 0.56247991],
       [0.46364717, 0.53635283],
       [0.77754952, 0.22245048],
       [0.59680407, 0.40319593],
       [0.78806844, 0.21193156],
       [0.86175285, 0.13824715],
       [0.63837642, 0.36162358],
       [0.39149055, 0.60850945],
       [0.6393787 , 0.3606213 ],
       [0.93417653, 0.06582347],
       [0.22128617, 0.77871383],
       [0.28513922, 0.71486078],
       [0.67346857, 0.32653143],
       [0.40040796, 0.59959204],
       [0.90299388, 0.09700612],
       [0.96244926, 0.03755074],
       [0.42527772, 0.57472228],
       [0.50852741, 0.49147259],
       [0.39511996, 0.60488004],
       [0.75398049, 0.24601951],
       [0.41467054, 0.58532946],
       [0.

In [222]:
np.random.rand(1,2)

array([[0.40763509, 0.824896  ]])

In [230]:
explainer = GPTExplainer(
    training_data=tabular_xx,
    # predict_function=lambda x: lr_ai.predict_proba(transformer.transform(x)),
    predict_function=lambda x: np.random.rand(transformer.transform(x).shape[0],2),
    apikey=""
)

explanations = explainer.explain(tabular_xx[11])
print(explanations.get_explanations(index=0)["text"])

Using 150 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
Feature 1 is constant and will be replaced with 0.


100%|██████████| 1/1 [00:00<00:00,  8.23it/s]


Based on the provided feature importance scores, the example is classified as label_1 primarily due to the following factors:

1. "binned_ID223_IMP_INSOLUTI_SU_SCADUTI_TS_SUM3 = 0.4985": This feature has the highest positive importance score, indicating a strong positive influence on the prediction towards label_1.

2. "binned_ID140_IMP_EFF_PROT_RICH_SU_SCAD_SUM3 = 0.283": This feature also has a positive importance score, contributing to the prediction towards label_1.

To change the predicted label, you can modify the values of the following features:

1. To change the predicted label to label_0, set "binned_ID40_IMP_TOT_UTIL_SU_GIACE_MIN_AVGM2" to a value lower than -0.68868125.

2. To change the predicted label to label_0, set "binned_ID35_FLAG_MAX_IMP_SCONF_GRT100_M1_RESPLIM" to a value lower than -0.24125000000000002.


In [None]:

from omnixai.data.tabular import Tabular

import sklearn
import xgboost

transformer = TabularTransform().fit(tabular_data)
class_names = transformer.class_names
x = transformer.transform(tabular_data)

train, test, train_labels, test_labels = \
    sklearn.model_selection.train_test_split(x[:, :-1], x[:, -1], train_size=0.80)

mm = xgboost.XGBClassifier(n_estimators=300, max_depth=5)
mm.fit(train, train_labels)

train_data = transformer.invert(train)
test_data = transformer.invert(test)
datas = transformer.invert(input_data.values)

# Initialize a TabularExplainer
explainers = TabularExplainer(
   explainers=["shap","mace"], # The explainers to apply
   mode="classification",                             # The task type
   data=train_data,                                   # The data for initializing the explainers
   model=lr_legacy,                                   # The ML model to explain
   preprocess=lambda z: transformer.transform(z),     # Converts raw features into the model inputs
)

In [None]:
analysis = explainers.explain(train_data)

In [None]:
explainer = GPTExplainer(
    training_data=tabular_data,
    predict_function=lambda x: mm.predict_proba(transformer.transform(x)),
    apikey="sk-"
)

explanations = explainer.explain(train_data[38])
print(explanations.get_explanations(index=0)["text"])

In [5]:
module = 'Centrale Rischi'

module_weights = weights[(weights.MODELLO == model) & (weights.MODULO_DS == module)]
intercept = module_weights[module_weights.Variabile == 'Intercept'].Coefficiente.values[0]
coefs = module_weights[module_weights.Variabile!='Intercept'].Coefficiente.sort_index()

input_data = data.loc[:,coefs.index]
prediction_cr = log_odds_ratio(coefs.values.reshape(1,-1), intercept, input_data.values)
contribs_cr = contrib(coefs, input_data)
prediction_cr

lr_cr = LogisticRegression()
lr_cr.fit(input_data, np.array([0,1]))
lr_cr.coef_ = coefs.values.reshape(1,-1)
lr_cr.intercept_ = np.array([intercept])

lr_cr.predict_proba(input_data), prediction_cr

(array([[0.85981482, 0.14018518],
        [0.83281633, 0.16718367]]),
 array([[-1.81375277, -1.60572007]]))

In [6]:
module = 'Bilanci'
module_data = samples[(samples.MODELLO == model) & (samples.MODULO_DS == module)].pivot(index = 'ID', columns = 'KPI_CD', values='VALORE_QT')

module_weights = weights[(weights.MODELLO == model) & (weights.MODULO_DS == module)]
intercept = module_weights[module_weights.Variabile == 'Intercept'].Coefficiente.values[0]
coefs = module_weights[module_weights.Variabile!='Intercept'].Coefficiente.sort_index()

input_data = data.loc[:,coefs.index]
prediction_bil = log_odds_ratio(coefs.values.reshape(1,-1), intercept, input_data.values)
contribs_bil = contrib(coefs, input_data)
prediction_bil

lr_bil = LogisticRegression()
lr_bil.fit(input_data, np.array([0,1]))
lr_bil.coef_ = coefs.values.reshape(1,-1)
lr_bil.intercept_ = np.array([intercept])

lr_bil.predict_proba(input_data), prediction_bil

(array([[0.91001194, 0.08998806],
        [0.9280303 , 0.0719697 ]]),
 array([[-2.31378076, -2.55681914]]))

In [7]:
prediction_trans = np.array([0.732320845, 0.643310368])

## LEGACY

In [8]:
path_transformation = os.path.join(data_path,'integrazione_transformation.xlsx')
transf = pd.read_excel(path_transformation)

In [9]:
def apply_binning(vals, table):
    tt = list(table.itertuples(index = False, name = None))
    bins_mapper = {pd.IntervalIndex.from_tuples([(x[0], x[1])], closed='left')[0]: x[2] for x in tt}
    bins = pd.IntervalIndex.from_tuples([(x[0], x[1]) for x in tt], closed = 'left')
    return pd.Series(pd.cut(vals.reshape(-1), bins)).replace(bins_mapper).astype(float).values.reshape(-1,1)

table_ai = transf[(transf.Variabile == 'score_AI')& (transf.Bin != 'Missing')].loc[:, ['LB', 'UB', 'WoE']]
bin_score_ai = apply_binning(prediction_ai, table_ai)

table_cr = transf[(transf.Variabile == 'score_cr')& (transf.Bin != 'Missing')].loc[:, ['LB', 'UB', 'WoE']]
bin_score_cr = apply_binning(prediction_cr,table_cr)

table_bil = transf[(transf.Variabile == 'score_bil') & (transf.Bin != 'Missing')].loc[:, ['LB', 'UB', 'WoE']]
bin_score_bil = apply_binning(prediction_bil,table_bil)

table_trans = transf[(transf.Variabile == 'score_trans') & (transf.Bin != 'Missing')].loc[:, ['LB', 'UB', 'WoE']]
bin_score_trans = apply_binning(prediction_trans,table_trans)

In [63]:
module = 'Integrazione'
legacy_data = pd.DataFrame(np.concatenate([bin_score_ai, bin_score_cr, bin_score_bil, bin_score_trans], axis = 1), columns = ['binned_score_AI','binned_score_cr','binned_score_bil', 'binned_score_trans'])

module_weights = weights[(weights.MODELLO == model) & (weights.MODULO_DS == module)]
intercept = module_weights[module_weights.Variabile == 'Intercept'].Coefficiente.values[0]
coefs = module_weights[module_weights.Variabile!='Intercept'].set_index('Variabile').Coefficiente

input_data = legacy_data.loc[:,coefs.index]
prediction_legacy = log_odds_ratio(coefs.values.reshape(1,-1), intercept, input_data.values)
contribs_legacy = contrib(coefs, input_data)
prediction_legacy, contribs_legacy

lr_legacy = LogisticRegression()
lr_legacy.fit(input_data.values, np.array([0,1]))
lr_legacy.coef_ = coefs.values.reshape(1,-1)
lr_legacy.intercept_ = np.array([intercept])

In [56]:
ai_values = [2.0646, 1.2387, 0.3632, 0.0000, -0.4019, -0.9806, -1.4795]
bil_values = [1.3922, 0.7261, 0.0777, -0.1502, -0.7039]
cr_values = [1.7992, 1.0283, 0.3965, 0.1738, 0.0055, -0.5626, -1.0893, -1.5130]
trans_values = [1.7341, 0.9211, 0.2307, -0.2502, -0.6687, -0.6836]

In [60]:
import numpy as np

def create_dataset(distr, size):
    """
    Crea un dataset basato sui valori possibili per n variabili.

    Args:
    - distr (list of lists): Una lista di liste, dove ogni lista interna contiene 
      i valori possibili per una feature.
    - size (int): La dimensione del dataset di output.

    Returns:
    - dataset (ndarray): Il dataset generato.
    """

    # Verifica che distr sia una lista di liste
    if not all(isinstance(i, list) for i in distr):
        raise ValueError("Ogni elemento di distr deve essere una lista")

    # Estrazione casuale dei valori per ciascuna feature
    features = [np.random.choice(feature_values, size) for feature_values in distr]

    # Combinazione delle features per formare il dataset
    dataset = np.column_stack(features)

    labels = [np.random.choice([0,1]) for x in range(size)]

    return dataset, labels

vv, yy = create_dataset([ai_values, bil_values, cr_values, trans_values],100)

In [17]:
from sklearn.datasets import make_classification
upsampled_data, target = make_classification(n_samples=100, n_features=4)