#### Mount Drive

In [50]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#### Installations

In [2]:
!pip install graphviz



In [3]:
pip install ydata-synthetic

Collecting ydata-synthetic
  Downloading ydata_synthetic-0.5.0-py2.py3-none-any.whl (43 kB)
[?25l[K     |███████▌                        | 10 kB 17.1 MB/s eta 0:00:01[K     |███████████████                 | 20 kB 13.2 MB/s eta 0:00:01[K     |██████████████████████▍         | 30 kB 9.8 MB/s eta 0:00:01[K     |█████████████████████████████▉  | 40 kB 8.5 MB/s eta 0:00:01[K     |████████████████████████████████| 43 kB 1.6 MB/s 
Collecting pmlb==1.0.*
  Downloading pmlb-1.0.1.post3-py3-none-any.whl (19 kB)
Collecting tensorflow==2.4.*
  Downloading tensorflow-2.4.4-cp37-cp37m-manylinux2010_x86_64.whl (394.5 MB)
[K     |████████████████████████████████| 394.5 MB 36 kB/s 
Collecting matplotlib==3.3.2
  Downloading matplotlib-3.3.2-cp37-cp37m-manylinux1_x86_64.whl (11.6 MB)
[K     |████████████████████████████████| 11.6 MB 10.6 MB/s 
[?25hCollecting pandas==1.2.*
  Downloading pandas-1.2.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (9.9 MB)
[K     |██████████████████

In [4]:
pip install pymc3



In [5]:
!pip install pyyaml==5.4.1

Collecting pyyaml==5.4.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 8.2 MB/s 
[?25hInstalling collected packages: pyyaml
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 6.0
    Uninstalling PyYAML-6.0:
      Successfully uninstalled PyYAML-6.0
Successfully installed pyyaml-5.4.1


#### Import

In [51]:
#Importing required libraries
import pandas as pd
import numpy as np
import pymc3
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn import tree
from ydata_synthetic.synthesizers.regular import WGAN_GP
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
import statsmodels.formula.api as smf

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
import graphviz
from scipy.stats import norm
%matplotlib inline

#### Data Processing Pipeline

In [52]:
# initalize data-set object 
def initalize_data_set(target_column, predicted_column, categorical_threshold = 0.001, csv_file_path = None , df = None):
  if csv_file_path is None and df is None:
    raise Exception("expected csv file path or data frame object")
  if df is None:
    df = pd.read_csv(csv_file_path)
  else:
    if not isinstance(df, pd.DataFrame):
      raise Exception("data frame object must be of type 'pandas.core.frame.DataFrame'")
      
  if predicted_column == "":
    X = df.drop([target_column], axis=1)
    Y = df[target_column]
  else:
    X = df.drop([target_column, predicted_column], axis=1)
    Y = (df[predicted_column] == df[target_column])

  categorical_features = {}
  for feature in X.columns:
      categorical_features[feature] = 1.*X[feature].nunique()/X[feature].count() < categorical_threshold or X[feature].dtype == "object"

  X_encoded = pd.get_dummies(X, columns= [key for (key, value) in categorical_features.items() if value ])

  return X_encoded, Y, categorical_features

####HDR

In [53]:
def hdr(df_, Y, column, eps = 0.05, threshold = 0.01):
  df = df_.copy()
  df['is_correct_pred'] = Y
  
  range_diff_history = []
  start, end = pymc3.stats.hdi(df[column].values, hdi_prob=0.5)

  data_interval = df[df[column].between(start, end)]
  error_rate = (data_interval['is_correct_pred'] == False).sum() / df.shape[0]

  while (df[df[column].between(start, end)].shape[0] / df.shape[0] > 0.1):

    prev_start = start
    prev_end = end

    start = start * (1 + eps)
    end = end * (1 - eps)

    data_interval = df[df[column].between(start, end)]
    new_error_rate = (data_interval['is_correct_pred'] == False).sum() / df.shape[0]

    if (error_rate - new_error_rate > threshold):
      range_diff = {
          'error_rate' : error_rate - new_error_rate,
          'range_start' : (prev_start, start),
          'range_end': (end, prev_end)
      }
      range_diff_history.append(range_diff)
    error_rate = new_error_rate
  N = int(0.25 * len(range_diff_history))
  n_largest_diffs = sorted(range_diff_history, key=lambda t: t['error_rate'], reverse=True)[:N]

  return n_largest_diffs


In [54]:
def hdr_ranges_to_slice(df, column, ranges):
  df_slices = []
  for range in ranges:
    df_range = df[df[column].between(range['range_start'][0], range['range_start'][1]) | df[column].between(range['range_end'][0], range['range_end'][1])]
    df_slices.append(df_range)
  return pd.concat(df_slices)

#### Decision Tree

In [55]:
def get_lineage(tree, feature_names):
     left = tree.tree_.children_left
     right     = tree.tree_.children_right
     threshold = tree.tree_.threshold
     features  = [feature_names[i] if i != -2 else -5 for i in tree.tree_.feature]

     # get ids of child nodes
     idx = np.argwhere(left == -1)[:,0]     

     def recurse(left, right, child, lineage=None):          
          if lineage is None:
               lineage = [child]
          if child in left:
               parent = np.where(left == child)[0].item()
               split = 'l'
          else:
               parent = np.where(right == child)[0].item()
               split = 'r'

          lineage.append((parent, split, threshold[parent], features[parent]))

          if parent == 0:
               lineage.reverse()
               return lineage
          else:
               return recurse(left, right, parent, lineage)
    
     childs = {}

     for child in idx:
          child_rules = []
          for node in recurse(left, right, child):
               child_rules.append(node)
          childs[child] = child_rules
     return childs
    
               

In [56]:
def get_leaves_props(leaves, tree_clf):
  leaves_range_dict = {}
  for key,value in leaves.items():
    range_min_max_dict = {}
    for range_tuple in value:
      if type(range_tuple) == tuple:
        if range_tuple[3] not in range_min_max_dict:
          range_min_max_dict[range_tuple[3]] = {}
        if (range_tuple[1] == 'r'):
          if 'min' not in range_min_max_dict[range_tuple[3]]:
            range_min_max_dict[range_tuple[3]]['min'] = range_tuple[2]
          else:
            if range_tuple[2] < range_min_max_dict[range_tuple[3]]['min']:
              range_min_max_dict[range_tuple[3]]['min'] = range_tuple[2]
        elif (range_tuple[1] == 'l'):
          if 'max' not in range_min_max_dict[range_tuple[3]]:
            range_min_max_dict[range_tuple[3]]['max'] = range_tuple[2]
          else:
            if range_tuple[2] > range_min_max_dict[range_tuple[3]]['max']:
              range_min_max_dict[range_tuple[3]]['max'] = range_tuple[2]
    leaves_range_dict[key] = {}
    leaves_range_dict[key]['range'] = range_min_max_dict
    leaves_range_dict[key]['relative_error_rate'] = tree_clf.tree_.value[key][0][0] / tree_clf.tree_.value[0][0][0]
    leaves_range_dict[key]['error_rate'] = tree_clf.tree_.value[key][0][0] / (tree_clf.tree_.value[key][0][0] + tree_clf.tree_.value[key][0][1])
  return leaves_range_dict


In [57]:
def get_stat_important_leaves(leaves, tree_clf):
  return dict(filter(lambda elem: elem[1]['relative_error_rate'] * tree_clf.tree_.value[0][0][1] > max(2, 0.05 * tree_clf.tree_.value[0][0][1]), leaves.items()))

In [58]:
def get_decision_tree_slices(X, Y, cols):
  tree_clf = tree.DecisionTreeClassifier()
  tree_clf = tree_clf.fit(X[cols], Y)
  dot_data = tree.export_graphviz(tree_clf, out_file=None, 
                      filled=True, rounded=True,  
                      special_characters=True)  
  graph = graphviz.Source(dot_data)  
  leaves = get_lineage(tree_clf, X[cols].columns)
  slices = get_stat_important_leaves(get_leaves_props(leaves, tree_clf), tree_clf)
  slices_list = []
  for key in slices.keys():
    slices_list.append(slices[key])
  return slices_list, graph

In [59]:
def get_slices_by_range(df, range):
  vec = np.ones(df.shape[0], dtype=bool)
  for key in range.keys():
    vec = vec & df[key].between(range[key].get('min', float('-inf')), range[key].get('max', float('inf')))
  return df[vec]

def tree_indexes_by_slices(df, slices):
  if len(slices) == 0:
    return []
  df_slices = []
  for slice_ in slices:
    for inner_slice in slice_:
      df_range = get_slices_by_range(df, inner_slice['range'])
      df_slices.append(df_range)
  return pd.concat(df_slices)

#### Apply Heuristics Pipeline

In [95]:
from heapq import nlargest

def apply_heuristics(X, Y, features, options = {}):
  high_rate_columns = [column for column in df.columns if df[column].value_counts().max() / df.shape[0] > 0.7 ]

  categorical_features = [key for (key, value) in features.items() if (value and (key not in high_rate_columns))]
  continious_features = [key for (key, value) in features.items() if ((value == False) and (key not in high_rate_columns))]
  #categorical_features
  categorical_features_error_rates_single = {}
  categorical_features_slices_single = {}
  for feature in categorical_features:
    cols = [c for c in X.columns if f'{feature}_' in c]
    slices, graph = get_decision_tree_slices(X, Y, cols)
    if len(slices) > 0:
      categorical_features_error_rates_single[feature] = np.mean([slice_dict['error_rate'] for slice_dict in slices])
      categorical_features_slices_single[feature] = slices
    
  #continious_features
  continious_feature_error_rates_single = {}
  continious_feature_slices_single = {}
  for feature in continious_features:
    slices = hdr(X, Y, feature, options.get('eps', 0.05), options.get('hdr_threshold', 0.001)) 
    if len(slices) > 0:
      continious_feature_error_rates_single[feature] = np.mean([slice_dict['error_rate'] for slice_dict in slices])
      continious_feature_slices_single[feature] = slices

  #combined top quarter
  combined_features_slices_single = categorical_features_slices_single.copy()
  combined_features_slices_single.update(continious_feature_slices_single)
  combined_features_error_rates_single = categorical_features_error_rates_single.copy()
  combined_features_error_rates_single.update(continious_feature_error_rates_single)
  N = int(0.25 * len(combined_features_error_rates_single.keys()))
  largest_feature_error_rates_single = nlargest(N, combined_features_error_rates_single, key = combined_features_error_rates_single.get)
  
  #feature pairs
  filtered_features = [x for x in features if (x not in set(largest_feature_error_rates_single) and x not in high_rate_columns)]
  feature_error_rates_pairs = {}
  feature_slices_pairs = {}
  for feature_largest in largest_feature_error_rates_single:
    for feature in filtered_features:
      cols = [c for c in X.columns if f'{feature_largest}_' in c or f'{feature}_' in c]
      slices, graph = get_decision_tree_slices(X, Y, cols)

      if len([slice_dict['error_rate'] for slice_dict in slices])!=0:
        feature_error_rates_pairs[f'{feature_largest}_{feature}'] = np.mean([slice_dict['error_rate'] for slice_dict in slices])
        feature_slices_pairs[f'{feature_largest}_{feature}'] = slices

  N = int(0.25 * len(feature_error_rates_pairs.keys()))
  largest_feature_error_rates_pairs = nlargest(N, feature_error_rates_pairs, key = feature_error_rates_pairs.get)

  combined_features_slices_single.update(feature_slices_pairs)
  combined_features_error_rates_single.update(feature_error_rates_pairs)
  N = int(0.25 * len(combined_features_slices_single.keys()))
  largest_feature_error_rates_combined = nlargest(N, combined_features_error_rates_single, key = combined_features_error_rates_single.get)

  top_features_slices = {field:max(slice_list, key=lambda x:x['error_rate']) for (field,slice_list) in combined_features_slices_single.items()}

  return top_features_slices, largest_feature_error_rates_combined

### Solutions

####reweighting


In [61]:
def get_all_indexes_from_all_slices(df, slices):
  categorical_slices = []
  continious_slices = []
  for key in slices.keys():
    # for categorical
    if 'range' in slices[key][0]:
      categorical_slices.append(slices[key])
    else:
      continious_slices.append(hdr_ranges_to_slice(df, key, slices[key]))

  df_categorical = tree_indexes_by_slices(df, categorical_slices)
  df_continious = None
  if len(continious_slices) > 0:
    df_continious = pd.concat(continious_slices)
  if len(continious_slices) > 0 and len(df_categorical) > 0:
    return pd.concat([df_categorical, df_continious]).index.unique()
  elif len(df_categorical) == 0:
    return df_continious.index.unique()
  else:
    return df_categorical.index.unique()

In [62]:
def reweighting(train_df, target_column, indexes, weight = 5):
  sample_weights = np.ones(train_df.shape[0]) 
  sample_weights[indexes] = weight

  clf = XGBClassifier()
  clf.fit(train_df.drop(columns = [target_column], axis = 0), train_df[target_column], sample_weight = sample_weights)

  return clf

####Synthesized Data


In [63]:
def synthesized_data(train_samples, generator_sample_size):

  #config
  noise_dim = 32
  dim = 128
  batch_size = 256 if 256 <= train_samples.shape[0] else train_samples.shape[0]
  
  #train config
  log_step = 20
  epochs = 5+1
  learning_rate = 5e-4
  beta_1 = 0.5
  beta_2 = 0.9

  gan_args = ModelParameters(batch_size=batch_size,
                           lr=learning_rate,
                           betas=(beta_1, beta_2),
                           noise_dim=noise_dim,
                           n_cols=train_samples.shape[1],
                           layers_dim=dim)
  train_args = TrainParameters(epochs=epochs,
                             sample_interval=log_step)
  # Train GAN
  model = WGAN_GP
  synthesizer = model(gan_args, n_critic=5)
  synthesizer.train(train_samples, train_args)

  # Generate records based on random noise
  generator = synthesizer.generator
  rand_noise = np.random.normal(size=(generator_sample_size, noise_dim))
  generated_samples = generator.predict(rand_noise)
  return generated_samples

In [64]:
def apply_synthesized_data(train_df, target_column, indexes, options, data = False):
  train_df_drop_target = train_df.drop(columns = [target_column])
  train_samples = train_df[train_df.index.isin(indexes)]

  train_samples_positive = train_samples[train_samples[target_column] == 1].drop(columns = [target_column])
  train_samples_negative = train_samples[train_samples[target_column] == 0].drop(columns = [target_column])

  synthesized_positive = synthesized_data(train_samples_positive, options.get('generator_sample_size' ,train_samples_positive.shape[0] * 2))
  synthesized_negative = synthesized_data(train_samples_negative, options.get('generator_sample_size' ,train_samples_negative.shape[0] * 2))

  synthesized_positive_df = pd.DataFrame(synthesized_positive, columns = train_df_drop_target.columns)
  synthesized_negative_df = pd.DataFrame(synthesized_negative, columns = train_df_drop_target.columns)

  synthesized_positive_df[target_column] = np.ones(synthesized_positive_df.shape[0])
  synthesized_negative_df[target_column] = np.zeros(synthesized_negative_df.shape[0])

  if data:
    return pd.concat([train_samples, synthesized_positive_df, synthesized_negative_df])

  df_combined = pd.concat([train_df, synthesized_positive_df, synthesized_negative_df])

  clf = XGBClassifier()
  clf.fit(df_combined.drop(columns = [target_column]), df_combined[target_column])

  return clf

####Ad Hoc Model

In [65]:
def ad_hoc_model(train_df, target_column, indexes, options):

  train_samples = apply_synthesized_data(train_df, target_column, indexes, options, True)

  clf = XGBClassifier()
  clf.fit(train_samples.drop(columns = [target_column]), train_samples[target_column])

  return clf

### Pipeline Class 

In [66]:
class CustomFreyaAI:
  def __init__(self, df, target_column = 'y'):
    if not isinstance(df, pd.DataFrame):
      raise Exception("data frame object must be of type 'pandas.core.frame.DataFrame'")
    if not target_column in df:
      raise Exception(f"The specified target column '{target_column}', was not found in the data frame")
    self.df = df.copy()
    self.target_column = target_column

  def get_slices_report(self, options = {}):
    X_encoded, Y, categorical_features = initalize_data_set(target_column = self.target_column, predicted_column = '',df = self.df, categorical_threshold = options.get('categorical_threshold', 0.001))

    X_train, X_test, y_train, y_test = train_test_split(X_encoded, Y, test_size=0.2, random_state=2, stratify=Y)

    self.X_train, self.X_test, self.y_train, self.y_test = X_train, X_test, y_train, y_test

    clf = XGBClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    feature_slices, largest_feature_error_rates  = apply_heuristics(X_test, y_pred == y_test, categorical_features, options)

    self.feature_slices = feature_slices
    return feature_slices, largest_feature_error_rates

  def apply_solution(self, feature_name, solution_name, options = {}):
    if not feature_name in self.feature_slices:
        raise Exception(f"The specified feature name '{feature_name}', was not found among the features")
    if not solution_name in ["reweighting", "data_synthesizer", "Ad_Hoc"]:
        raise Exception(f"The specified solution name '{solution_name}', was not found among the available solutions")

    report = {}

    X_train_copy = self.X_train.copy()
    X_train_copy[self.target_column] = self.y_train
    X_train_copy.reset_index(inplace = True)
    X_train_copy.drop(columns = ['index'], inplace = True)

    train_indexes = get_all_indexes_from_all_slices(X_train_copy, {feature_name:[self.feature_slices[feature_name]]})
    test_indexes = get_all_indexes_from_all_slices(self.X_test, {feature_name:[feature_slices[feature_name]]})

    clf = XGBClassifier()
    clf.fit(self.X_train, self.y_train)
    pred = clf.predict(self.X_test[self.X_test.index.isin(test_indexes)])
    before_slice_score = metrics.accuracy_score(self.y_test[self.y_test.index.isin(test_indexes)], pred)
    report['before_slice_score'] = before_slice_score

    pred = clf.predict(self.X_test)
    before_overall_score = metrics.accuracy_score(self.y_test, pred)
    report['before_overall_score'] = before_overall_score

    if solution_name == 'reweighting':
      clf = reweighting(X_train_copy, self.target_column, train_indexes, options.get('weight', 5))
    elif solution_name == 'data_synthesizer':
      clf = apply_synthesized_data(X_train_copy, self.target_column, train_indexes, options)
    elif solution_name == 'Ad_Hoc':
      clf = ad_hoc_model(X_train_copy, self.target_column, train_indexes, options)

    pred = clf.predict(self.X_test[self.X_test.index.isin(test_indexes)])
    after_slice_score = metrics.accuracy_score(self.y_test[self.y_test.index.isin(test_indexes)], pred)
    report['after_slice_score'] = after_slice_score

    after_overall_score = None
    
    if solution_name == 'Ad_Hoc':
      after_overall_score = before_overall_score
    else:
      pred = clf.predict(self.X_test)
      after_overall_score = metrics.accuracy_score(self.y_test, pred)

    report['after_overall_score'] = after_overall_score

    report['after_to_before_slice_performance_ratio'] = after_slice_score / before_slice_score
    report['after_to_before_overall_performance_ratio'] = after_overall_score / before_overall_score

    return clf, report
    


### Demo 

####German Credit Risk Data-set

In [74]:
PATH = "{PATH}"
columns = ['Account Balance', 'Duration of Credit (month)',
       'Payment Status of Previous Credit', 'Purpose', 'Credit Amount',
       'Value Savings/Stocks', 'Length of current employment',
       'Instalment per cent', 'Sex & Marital Status', 'Guarantors',
       'Duration in Current address', 'Most valuable available asset',
       'Age (years)', 'Concurrent Credits', 'Type of apartment',
       'No of Credits at this Bank', 'Occupation', 'No of dependents',
       'Telephone', 'Foreign Worker', 'Creditability']
df = pd.read_csv(PATH + 'german.data', header = None, delimiter= ' ', names = columns)
target = 'Creditability'
df[target].replace((2), (0), inplace=True)

In [75]:
cf = CustomFreyaAI(df, target)

In [84]:
feature_slices, largest_feature_error_rates = cf.get_slices_report({'categorical_threshold': 0.001, 'eps': 0.05, 'hdr_threshold': 0.001})

In [88]:
field = "Sex & Marital Status_Value Savings/Stocks"

In [89]:
clf, report = cf.apply_solution(field, "reweighting", {'weight':50})
report

{'after_overall_score': 0.69,
 'after_slice_score': 0.5,
 'after_to_before_overall_performance_ratio': 0.9718309859154929,
 'after_to_before_slice_performance_ratio': 1.5,
 'before_overall_score': 0.71,
 'before_slice_score': 0.3333333333333333}

In [90]:
clf, report = cf.apply_solution(field, "data_synthesizer", {'generator_sample_size': 500})
report

 33%|███▎      | 2/6 [00:05<00:08,  2.15s/it]

Epoch: 0 | disc_loss: -1428.65966796875 | gen_loss: 0.21422988176345825
Epoch: 1 | disc_loss: -4095.9697265625 | gen_loss: -0.047602329403162


 67%|██████▋   | 4/6 [00:05<00:01,  1.17it/s]

Epoch: 2 | disc_loss: -8701.75390625 | gen_loss: -3.314283609390259
Epoch: 3 | disc_loss: -14647.568359375 | gen_loss: -11.792234420776367
Epoch: 4 | disc_loss: -19999.69140625 | gen_loss: -29.263248443603516


100%|██████████| 6/6 [00:05<00:00,  1.10it/s]


Epoch: 5 | disc_loss: -22676.404296875 | gen_loss: -61.65106964111328


 67%|██████▋   | 4/6 [00:05<00:01,  1.00it/s]

Epoch: 0 | disc_loss: -2414.059814453125 | gen_loss: -0.0750674232840538
Epoch: 1 | disc_loss: -7389.974609375 | gen_loss: -0.27321910858154297
Epoch: 2 | disc_loss: -17127.05859375 | gen_loss: -1.9202475547790527
Epoch: 3 | disc_loss: -33216.64453125 | gen_loss: -10.759723663330078


100%|██████████| 6/6 [00:05<00:00,  1.13it/s]


Epoch: 4 | disc_loss: -56368.76171875 | gen_loss: -22.741825103759766
Epoch: 5 | disc_loss: -86963.71875 | gen_loss: -34.859527587890625


{'after_overall_score': 0.695,
 'after_slice_score': 0.3333333333333333,
 'after_to_before_overall_performance_ratio': 0.9788732394366197,
 'after_to_before_slice_performance_ratio': 1.0,
 'before_overall_score': 0.71,
 'before_slice_score': 0.3333333333333333}

In [91]:
clf, report = cf.apply_solution(field, "Ad_Hoc", {'generator_sample_size': 500})
report

 50%|█████     | 3/6 [00:05<00:04,  1.41s/it]

Epoch: 0 | disc_loss: -1757.570068359375 | gen_loss: 0.031981322914361954
Epoch: 1 | disc_loss: -4800.7314453125 | gen_loss: -1.0744318962097168
Epoch: 2 | disc_loss: -9859.59375 | gen_loss: -5.260913372039795


100%|██████████| 6/6 [00:05<00:00,  1.06it/s]

Epoch: 3 | disc_loss: -15936.8515625 | gen_loss: -17.3005428314209
Epoch: 4 | disc_loss: -20901.7265625 | gen_loss: -46.81269454956055
Epoch: 5 | disc_loss: -22853.3359375 | gen_loss: -80.16205596923828



 17%|█▋        | 1/6 [00:05<00:27,  5.43s/it]

Epoch: 0 | disc_loss: -1714.7637939453125 | gen_loss: 0.15792030096054077
Epoch: 1 | disc_loss: -5164.39501953125 | gen_loss: -0.00025924481451511383
Epoch: 2 | disc_loss: -11539.7314453125 | gen_loss: -0.008925974369049072


100%|██████████| 6/6 [00:05<00:00,  1.06it/s]

Epoch: 3 | disc_loss: -21830.009765625 | gen_loss: -2.721468210220337
Epoch: 4 | disc_loss: -36664.66015625 | gen_loss: -11.077077865600586
Epoch: 5 | disc_loss: -56529.50390625 | gen_loss: -35.97401809692383





{'after_overall_score': 0.71,
 'after_slice_score': 0.5,
 'after_to_before_overall_performance_ratio': 1.0,
 'after_to_before_slice_performance_ratio': 1.5,
 'before_overall_score': 0.71,
 'before_slice_score': 0.3333333333333333}

####Bank Marketing Campaign Data-set

In [92]:
PATH = "{PATH}"
df = pd.read_csv(PATH + 'bank-additional-full.csv' ,delimiter=';')
target = 'y'
df[target].replace(('yes', 'no'), (1, 0), inplace=True)

In [93]:
cf = CustomFreyaAI(df, target)

In [96]:
feature_slices, largest_feature_error_rates = cf.get_slices_report({'categorical_threshold': 0.001, 'eps': 0.05, 'hdr_threshold': 0.001})

In [103]:
field = "emp.var.rate_housing"

In [104]:
clf, report = cf.apply_solution(field, "reweighting", {'weight':50})
report

{'after_overall_score': 0.9118718135469774,
 'after_slice_score': 0.6929824561403509,
 'after_to_before_overall_performance_ratio': 0.9929940515532056,
 'after_to_before_slice_performance_ratio': 1.025974025974026,
 'before_overall_score': 0.9183054139354212,
 'before_slice_score': 0.6754385964912281}

In [105]:
clf, report = cf.apply_solution(field, "data_synthesizer", {'generator_sample_size': 500})
report

 17%|█▋        | 1/6 [00:03<00:15,  3.15s/it]

Epoch: 0 | disc_loss: 3.400144577026367 | gen_loss: 0.09514334052801132


 33%|███▎      | 2/6 [00:03<00:05,  1.47s/it]

Epoch: 1 | disc_loss: -9.223896026611328 | gen_loss: 0.026753904297947884


 50%|█████     | 3/6 [00:03<00:02,  1.07it/s]

Epoch: 2 | disc_loss: -17.060903549194336 | gen_loss: -0.010062932036817074


 67%|██████▋   | 4/6 [00:04<00:01,  1.46it/s]

Epoch: 3 | disc_loss: -16.846355438232422 | gen_loss: 0.08231322467327118


 83%|████████▎ | 5/6 [00:04<00:00,  1.82it/s]

Epoch: 4 | disc_loss: -27.385746002197266 | gen_loss: 0.06361714750528336


100%|██████████| 6/6 [00:04<00:00,  1.29it/s]

Epoch: 5 | disc_loss: -17.930011749267578 | gen_loss: -0.08807382732629776



 17%|█▋        | 1/6 [00:03<00:16,  3.23s/it]

Epoch: 0 | disc_loss: 10.523080825805664 | gen_loss: 0.029022052884101868


 33%|███▎      | 2/6 [00:03<00:06,  1.55s/it]

Epoch: 1 | disc_loss: 0.5212745666503906 | gen_loss: 0.012305536307394505


 50%|█████     | 3/6 [00:03<00:03,  1.01s/it]

Epoch: 2 | disc_loss: -2.7973155975341797 | gen_loss: -0.017641814425587654


 67%|██████▋   | 4/6 [00:04<00:01,  1.31it/s]

Epoch: 3 | disc_loss: -5.115392684936523 | gen_loss: -0.032688841223716736


 83%|████████▎ | 5/6 [00:04<00:00,  1.62it/s]

Epoch: 4 | disc_loss: -8.934596061706543 | gen_loss: -0.04363608360290527


100%|██████████| 6/6 [00:05<00:00,  1.18it/s]

Epoch: 5 | disc_loss: -13.629565238952637 | gen_loss: -0.14978353679180145





{'after_overall_score': 0.9197620781743141,
 'after_slice_score': 0.7017543859649122,
 'after_to_before_overall_performance_ratio': 1.0015862524785195,
 'after_to_before_slice_performance_ratio': 1.0389610389610389,
 'before_overall_score': 0.9183054139354212,
 'before_slice_score': 0.6754385964912281}

In [110]:
clf, report = cf.apply_solution(field, "Ad_Hoc", {'generator_sample_size': 200})
report

 17%|█▋        | 1/6 [00:03<00:15,  3.08s/it]

Epoch: 0 | disc_loss: -2.6936187744140625 | gen_loss: 0.11010019481182098


 33%|███▎      | 2/6 [00:03<00:05,  1.44s/it]

Epoch: 1 | disc_loss: -19.116252899169922 | gen_loss: -0.0023951518815010786


 50%|█████     | 3/6 [00:03<00:02,  1.10it/s]

Epoch: 2 | disc_loss: -16.98705291748047 | gen_loss: -0.04965997859835625


 67%|██████▋   | 4/6 [00:03<00:01,  1.50it/s]

Epoch: 3 | disc_loss: -27.71735382080078 | gen_loss: -0.10684904456138611


 83%|████████▎ | 5/6 [00:04<00:00,  1.86it/s]

Epoch: 4 | disc_loss: -28.418479919433594 | gen_loss: -0.18876804411411285


100%|██████████| 6/6 [00:04<00:00,  1.31it/s]

Epoch: 5 | disc_loss: -32.51194381713867 | gen_loss: -0.2880309522151947



 17%|█▋        | 1/6 [00:03<00:16,  3.22s/it]

Epoch: 0 | disc_loss: 14.807206153869629 | gen_loss: -0.022317200899124146


 33%|███▎      | 2/6 [00:03<00:06,  1.55s/it]

Epoch: 1 | disc_loss: -2.113308906555176 | gen_loss: -0.03788308426737785


 50%|█████     | 3/6 [00:03<00:03,  1.00s/it]

Epoch: 2 | disc_loss: -0.7218551635742188 | gen_loss: -0.04836206138134003


 67%|██████▋   | 4/6 [00:04<00:01,  1.34it/s]

Epoch: 3 | disc_loss: -12.450512886047363 | gen_loss: -0.10760579258203506


 83%|████████▎ | 5/6 [00:04<00:00,  1.61it/s]

Epoch: 4 | disc_loss: -11.416218757629395 | gen_loss: -0.13973498344421387


100%|██████████| 6/6 [00:05<00:00,  1.19it/s]

Epoch: 5 | disc_loss: -14.059602737426758 | gen_loss: -0.2286074161529541





{'after_overall_score': 0.9183054139354212,
 'after_slice_score': 0.6929824561403509,
 'after_to_before_overall_performance_ratio': 1.0,
 'after_to_before_slice_performance_ratio': 1.025974025974026,
 'before_overall_score': 0.9183054139354212,
 'before_slice_score': 0.6754385964912281}