In [20]:
import pandas as pd
import numpy as np

#Methods
from sklearn.model_selection import train_test_split
from interpret.glassbox import ExplainableBoostingClassifier
# Pipeline preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [2]:
#import breast dataset
X = pd.read_csv("./datasets/cleaned/breast_X.csv")
X = X.drop("Unnamed: 0", axis=1)
y = pd.read_csv("./datasets/cleaned/breast_y.csv")
y = y.drop("Unnamed: 0", axis=1)
X.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
y.head()

Unnamed: 0,diagnosis
0,1
1,1
2,1
3,1
4,1


In [5]:
y = y[y.columns[0]]

In [6]:
y

0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int64

In [12]:
features_types_df = pd.read_csv(f"datasets/cleaned/datatypes/breast.csv")

feature_inidices = list(map(int, list(features_types_df)))
features_names = pd.Series(list(features_types_df.T[0]))
features_types = pd.Series(list(map(int, list(features_types_df.T[1]))))

In [13]:
feature_inidices

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29]

In [15]:
features_names

0                 radius_mean
1                texture_mean
2              perimeter_mean
3                   area_mean
4             smoothness_mean
5            compactness_mean
6              concavity_mean
7         concave points_mean
8               symmetry_mean
9      fractal_dimension_mean
10                  radius_se
11                 texture_se
12               perimeter_se
13                    area_se
14              smoothness_se
15             compactness_se
16               concavity_se
17          concave points_se
18                symmetry_se
19       fractal_dimension_se
20               radius_worst
21              texture_worst
22            perimeter_worst
23                 area_worst
24           smoothness_worst
25          compactness_worst
26            concavity_worst
27       concave points_worst
28             symmetry_worst
29    fractal_dimension_worst
dtype: object

In [16]:
features_types

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
dtype: int64

In [17]:
#define data containers for features
#input_queue: indices of features to be check
#output_queue: indices of checked features (order is coresponding to given loss)
#data_losses: data container for each step losses storing
input_queue = pd.Series(feature_inidices, dtype=int)
output_queue = pd.Series([], dtype=int)
run_losses = pd.Series([], dtype=float)

In [18]:
def get_output_col_name(output_queue, col_names):
  output_features_names = pd.Series(dtype="string")
  for i in range(len(output_queue)):
    output_features_names = pd.concat([output_features_names, pd.Series([col_names[output_queue].iloc[i]])])
  return output_features_names

# return preprocesing for num features only
def num_feat_preprocessing(num_names):
  preprocess = make_column_transformer(
      (MinMaxScaler(), num_names)
  )
  return preprocess

# return preprocesing for cat features only
def cat_feat_preprocessing(cat_names):
  preprocess = make_column_transformer(
      (OneHotEncoder(), cat_names)
  )
  return preprocess

# return preprocesing for all features
def feat_preprocessing(num_names, cat_names):
  preprocess = make_column_transformer(
      (OneHotEncoder(), cat_names),
      (MinMaxScaler(), num_names)
  )
  return preprocess

def select_preprocessing_for_single_feat(init_index, col_names, col_types):
  #tested
  cat_feat = []
  num_feat = []

  if col_types[int(init_index)] == 0:
    num_feat.append(col_names[int(init_index)])
    #run StandardScaler function
    preprocess = num_feat_preprocessing(num_feat)
  else:
    cat_feat.append(col_names[int(init_index)])
    preprocess = cat_feat_preprocessing(cat_feat)
  return preprocess

def select_preprocessing_for_many_feat(output_col_names, col_types, col_names):
  cat_feat = []
  num_feat = []

  for feat_index in output_col_names:
    if col_types[feat_index] == 0:
      num_feat.append(col_names[feat_index])
    else:
      cat_feat.append(col_names[feat_index])
  
  print(cat_feat)
  print(num_feat)
  
  #select preprocesing
  if len(cat_feat) == 0 and len(num_feat) != 0:
    preprocess = num_feat_preprocessing(num_feat)
    print("Jestem tu!!!")
  if len(cat_feat) != 0 and len(num_feat) == 0:
    preprocess = cat_feat_preprocessing(cat_feat)
  else:
    preprocess = feat_preprocessing(num_feat, cat_feat)
  return preprocess

def create_data_frame_for_feat(output_col_names, dataset_df):
  # if len(output_col_names) == 1:
  #   return pd.DataFrame(dataset_df[output_col_names], columns=[output_col_names])
  # else:
    return dataset_df[output_col_names]

def calculate_loss_for_single_feat(X_df, y_lab, init_index):
  X = X_df
  y = y_lab

  preprocess = select_preprocessing_for_single_feat(init_index=int(init_index),
                                                  col_names=features_names,
                                                  col_types=features_types)

    # Split beetwen three dataset (test, train, val)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1337, shuffle=True)
  X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=1337)

  adult_ebm = make_pipeline(
      preprocess,
      ExplainableBoostingClassifier()
  )

  adult_ebm.fit(X_train, y_train)

  #Prediction
  y_preds = adult_ebm.predict(X_test)

  #Calculate logloss
  p = np.clip(y_preds, 1e-12, 1. - 1e-12)
  result= np.mean(y_test * -np.log(p) + (1. - y_test) * (-np.log(1. - p)))

  return(result, X.columns[0])

def calculate_loss_for_multi_feat(X_df, y_lab, output_with_to_pred_feat):
  print(X_df)
  print(y_lab)
  X = X_df
  y = y_lab

  preprocess = select_preprocessing_for_many_feat(output_col_names=output_with_to_pred_feat,
                                                  col_types=features_types,
                                                  col_names=features_names)
  print(preprocess)
    # Split beetwen three dataset (test, train, val)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1337, shuffle=True)
  X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=1337)

  adult_ebm = make_pipeline(
      preprocess,
      ExplainableBoostingClassifier()
  )

  print(f"X_train shape: {X_train.shape, y_train.shape}")
  print(f"types: {type(X_train), type(y_train)}")
  print(f"head: {X_train.head(), y_train.head()}")
  
  adult_ebm.fit(X_train, y_train)


  #Prediction
  y_preds = adult_ebm.predict(X_test)

  #Calculate logloss
  p = np.clip(y_preds, 1e-12, 1. - 1e-12)
  result= np.mean(y_test * -np.log(p) + (1. - y_test) * (-np.log(1. - p)))

  return(result, X.columns[0])

In [21]:
#define data variable for initial loss
# initial_index = [np.random.choice(input_queue)]
# initial_index = np.random.choice(input_queue)
initial_index = 0
test_df = create_data_frame_for_feat(get_output_col_name([initial_index], features_names), X)
result, name = calculate_loss_for_single_feat(test_df, y, initial_index)

In [22]:
initial_error = result
initial_name = name

In [29]:
losses_vector = np.zeros(len(input_queue))
for index in feature_inidices:
  test_df = create_data_frame_for_feat(get_output_col_name([index], features_names), X)
  result, name = calculate_loss_for_single_feat(test_df, y, index)
  losses_vector[index] = result
  print(name)


radius_mean
texture_mean
perimeter_mean
area_mean
smoothness_mean
compactness_mean
concavity_mean
concave points_mean
symmetry_mean
fractal_dimension_mean
radius_se
texture_se
perimeter_se
area_se
smoothness_se
compactness_se
concavity_se
concave points_se
symmetry_se
fractal_dimension_se
radius_worst
texture_worst
perimeter_worst
area_worst
smoothness_worst
compactness_worst
concavity_worst
concave points_worst
symmetry_worst
fractal_dimension_worst


In [30]:
run_losses[0] = losses_vector
# get index of smallest loses feature
feature_selected_index = input_queue.iloc[run_losses[0].argmin()]
#pop index from input queue
input_queue.pop(feature_selected_index)
#add selected index to output_queue
output_queue = pd.concat([output_queue, pd.Series(feature_selected_index)])

In [32]:
def concat_data_indices(output_queue, input_queue):
  temp_list = list(output_queue)
  indices_list = []
  for index_input in input_queue:
    temp_list.append(index_input)
    indices_list.append(temp_list)
    temp_list = list(output_queue)
  return indices_list

In [34]:
for i in range(len(input_queue)):
  losses_vector = np.zeros(len(input_queue))
  lista_test = concat_data_indices(output_queue, input_queue)

  for j in range(len(input_queue)):
    test_df = create_data_frame_for_feat(get_output_col_name(list(lista_test[j]), features_names), X)
    result, name = calculate_loss_for_multi_feat(test_df, y, list(lista_test[j]))
    losses_vector[j] = result
  
  run_losses[i+1] = losses_vector
  # get index of smallest loses feature
  feature_selected_index = input_queue.iloc[run_losses[i+1].argmin()]
  input_queue.pop(feature_selected_index)
  #add selected index to output_queue
  output_queue = pd.concat([output_queue, pd.Series(feature_selected_index)])

     radius_worst  radius_mean
0          25.380        17.99
1          24.990        20.57
2          23.570        19.69
3          14.910        11.42
4          22.540        20.29
..            ...          ...
564        25.450        21.56
565        23.690        20.13
566        18.980        16.60
567        25.740        20.60
568         9.456         7.76

[569 rows x 2 columns]
0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int64
[]
['radius_worst', 'radius_mean']
Jestem tu!!!
ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(), []),
                                ('standardscaler', StandardScaler(),
                                 ['radius_worst', 'radius_mean'])])
X_train shape: ((398, 2), (398,))
types: (<class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.series.Series'>)
head: (     radius_worst  radius_mean
376         10.85        10.57
383         14.18 

In [35]:
output_queue

0    20
0    24
0     1
0     9
0     4
0     5
0     0
0     2
0     3
0     6
0     8
0    10
0     7
0    12
0    13
0    14
0    11
0    16
0    15
0    17
0    18
0    19
0    21
0    22
0    23
0    25
0    28
0    29
0    26
0    27
dtype: int64

In [36]:
initial_error

1.9504250199488233

In [37]:
features_names[output_queue]

20               radius_worst
24           smoothness_worst
1                texture_mean
9      fractal_dimension_mean
4             smoothness_mean
5            compactness_mean
0                 radius_mean
2              perimeter_mean
3                   area_mean
6              concavity_mean
8               symmetry_mean
10                  radius_se
7         concave points_mean
12               perimeter_se
13                    area_se
14              smoothness_se
11                 texture_se
16               concavity_se
15             compactness_se
17          concave points_se
18                symmetry_se
19       fractal_dimension_se
21              texture_worst
22            perimeter_worst
23                 area_worst
25          compactness_worst
28             symmetry_worst
29    fractal_dimension_worst
26            concavity_worst
27       concave points_worst
dtype: object

In [64]:
sorted_results = np.zeros(len(run_losses))
# sorted_results[0] = initial_error
for i in range(len(run_losses)):
    print(run_losses[i].min())
    sorted_results[i] = run_losses[i].min()

1.625354183290852
0.6501416733169433
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.3250708366589699
0.9752127702333225
0.9752127702333225


In [54]:
pd.DataFrame(["Initial Error", initial_error])

Unnamed: 0,0
0,Initial Error
1,1.950425


In [71]:
output_queue.iloc[0],features_names[output_queue].iloc[0], sorted_results[0]

(20, 'radius_worst', 1.625354183290852)

In [76]:
final_results = []
initial_result = [initial_index, "initial_error", initial_error]
final_results.append(initial_result)

for i in range(len(output_queue)):
    temp_result = [output_queue.iloc[i],features_names[output_queue].iloc[i], sorted_results[i]]
    final_results.append(temp_result)

In [79]:
pd.DataFrame(final_results).to_csv(index=False, path_or_buf="./test_results/feature_density/ebm_breast.csv")