### 1. Fetch dataset

In [None]:
import pandas as pd
import sklearn
from sklearn.datasets import load_breast_cancer

# Features and labels come back as a DataFrame / Series pair
df_data, df_target = load_breast_cancer(return_X_y=True, as_frame=True)

# The raw target indexes target_names == ['malignant', 'benign'], i.e. 0 means malignant.
# Invert it so that 1 marks a malignant case and 0 a benign one.
df_target = (1 - df_target).astype(int)

### 2. Train AI network + 3. Full inference

In [None]:
# One fixed seed per ensemble member, for reproducibility.
# Each seed drives both the K-fold shuffling and the random forest of that member.
random_states = [
    1234,
    2345,
    3456,
    4567,
]

Notes:
* no demographic information
* no stratification
* data split train/validation: 75% / 25% (4-fold cross-validation, so every record is scored by a model that was not trained on it)

In [None]:
from sklearn import model_selection, ensemble
import numpy as np

# Out-of-fold malignancy scores: one row per record, one column per ensemble member
df_predictions = pd.DataFrame(index=df_target.index)

for model_id, random_state in enumerate(random_states):
    print(f'model_id: {model_id}')
    kf = model_selection.KFold(n_splits=4, shuffle=True, random_state=random_state)
    rfc = ensemble.RandomForestClassifier(random_state=random_state)

    # Float placeholder filled with NaN. Starting from a copy of the integer ground truth
    # would (a) force an int -> float upcast on assignment, which recent pandas versions
    # warn about or reject, and (b) silently leave the true label in place as the
    # "prediction" for any row that was never scored.
    df_prediction = pd.Series(np.nan, index=df_target.index, dtype=float)

    # For each fold: train on 3/4 of the data, score the held-out 1/4 and store the scores
    for idx, (train_index, validation_index) in enumerate(kf.split(df_data)):
        rfc.fit(df_data.iloc[train_index], df_target.iloc[train_index])
        prediction_class = rfc.predict(df_data.iloc[validation_index])
        # Column 1 of predict_proba is the probability of class 1 (malignant)
        prediction_score = rfc.predict_proba(df_data.iloc[validation_index])[:, 1]

        # Store the held-out scores at their original row positions
        df_prediction.iloc[validation_index] = prediction_score

        # Calculate and print sensitivity and specificity for fold
        tn, fp, fn, tp = sklearn.metrics.confusion_matrix(df_target.iloc[validation_index], prediction_class, labels=[0, 1]).ravel()
        sensitivity = tp / (tp + fn)
        specificity = tn / (tn + fp)

        print(f'fold {idx} - sensitivity: {sensitivity} / specificity: {specificity}')

    # The validation folds partition the data set, so every record must have a score by now
    assert df_prediction.notna().all(), f'model_{model_id}: some records were never scored'

    df_predictions[f'model_{model_id}'] = df_prediction
    print()

In [None]:
# Sensitivity and specificity of each ensemble member, thresholding its score at 0.5
for model_id, column in enumerate(df_predictions.columns):
    predicted_malignant = df_predictions[column] > 0.5
    confusion = sklearn.metrics.confusion_matrix(df_target, predicted_malignant, labels=[0, 1])
    # Rows are the true class, columns the predicted class (0 = benign, 1 = malignant)
    (tn, fp), (fn, tp) = confusion
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    print(f'model {model_id} - sensitivity: {sensitivity} / specificity: {specificity}')

In [None]:
# Sensitivity and specificity of the ensemble: average the member scores, threshold at 0.5
ensemble_predicted_malignant = df_predictions.mean(axis=1) > 0.5
confusion = sklearn.metrics.confusion_matrix(df_target, ensemble_predicted_malignant, labels=[0, 1])
# Rows are the true class, columns the predicted class (0 = benign, 1 = malignant)
(tn, fp), (fn, tp) = confusion
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
print(f'Ensemble model - sensitivity: {sensitivity} / specificity: {specificity}')

In [None]:
# Assemble the export table in one go: features, ground truth, per-model scores and
# the ensemble score (mean over the models), indexed by 'record_id'
df_output = pd.concat(
    [
        df_data,
        df_target.rename('gt'),
        df_predictions.add_prefix('prediction_'),
        df_predictions.mean(axis=1).rename('malignancy_score'),
    ],
    axis=1,
).rename_axis('record_id')

df_output.to_csv('dataset_example_BCWD.csv')

### 4. Config file

Check the config files ```settings/settings_data_slicing_pipeline_example_BCWD_full.py``` and ```settings/settings_data_slicing_pipeline_example_BCWD_minimal.py```

##### Dataframe structure
```
'dataframe': {
    'index': 'record_id',
    # Additional columns to export from the original DataFrame
    'export': [
        'record_id',
        'malignancy_score'
    ],
    'formatting': {
        'binary_classification': {
            'pred_label_input': 'malignancy_score',
            'pred_label_output': 'malignancy_label_pred',
            'gt_label_input': 'gt',
            'gt_label_output': 'malignancy_label_gt',
            'classification_threshold': 0.5,
            'na_strict': False,
            'na_fill': 'unknown',

        },
        'object_detection': {},
        'segmentation': {},
        'description_separator': '  '
    }
}
```
##### Slicing options
```
'slicing': {
    'data_slicing_minimum_samples': 20,
    # Format: 'column name as defined in dataframe': 'short column name to be used in the viewer'
    'meta_data_fields_of_interest': {
        'mean_radius': 'mean_radius',
        'mean_texture': 'mean_texture',
        'mean_perimeter': 'mean_perimeter',
        'mean_area': 'mean_area',
        'mean_smoothness': 'mean_smoothness',
        'mean_compactness': 'mean_compactness',
        'mean_concavity': 'mean_concavity',
        'mean_concave_points': 'mean_concave_points',
        'mean_symmetry': 'mean_symmetry',
        'mean_fractal_dimension': 'mean_fractal_dimension'
    }
}
```

##### Additional metrics
```
'additional_metrics': {
    MalignancyScore: {'probabilites_columns': 'malignancy_score'},
    ConfidenceScore: {'probabilites_columns': list([f'prediction_model_{model_id}' for model_id in range(4)])},
    OutlierScore: {'probabilites_columns': 'malignancy_score'}
}
```

### 5. Categorize values in dataset

In [None]:
def _data_parsing_BCWD(df: pd.DataFrame) -> pd.DataFrame:
    import numpy as np
    # fix incorrectly saved file if necessary
    if 'level_0' in df.columns:
        df = df.drop(columns='level_0')

    # Rename column names to avoid spaces
    df = df.rename(columns={c: c.replace(' ', '_') for c in df.columns})

    # Categorize values in very_small, small, medium, large and very_large

    metadata_columns = list(df.columns)
    metadata_columns = list(c for c in metadata_columns if c not in ['gt', 'record_id', 'malignancy_score'])
    metadata_columns = list(c for c in metadata_columns if 'prediction' not in c)

    df_categorized = df.copy(deep=True)
    categories = {'q00-20': [0, 20],
                  'q20-40': [20, 40],
                  'q40-60': [40, 60],
                  'q60-80': [60, 80],
                  'q80-100': [80, 100]}
    for m in metadata_columns:
        for c_label, c_percentiles in categories.items():
            p = np.percentile(df[m], c_percentiles)
            df_categorized.loc[(df[m] >= p[0]) & (df[m] < p[1]), m] = c_label
    df = df_categorized
    return df

### 6. Automated analysis pipeline

In [None]:
# Launch analysis pipeline
from backend_launcher import run_data_slicing_experiment
import pandas as pd
import settings.settings_data_slicing_pipeline_example_BCWD_full as config_full
import settings.settings_data_slicing_pipeline_example_BCWD_minimal as config_minimal

# Leave FULL_ANALYSIS at False to only prepare the dataframe and rely on the analysis
# results shipped with the repo; switch it to True to run a full analysis from scratch.
FULL_ANALYSIS = False

# Output name -> config module; the experiments differ only in these two settings
experiment_configs = {
    'example_BCWD_minimal': config_minimal,
    'example_BCWD_full': config_full,
}

# The CSV is read once per experiment so that each run gets its own DataFrame
experiments_to_run = [
    {
        'config': config,
        'df': pd.read_csv('dataset_example_BCWD.csv', low_memory=False),
        'output_name': output_name,
        'optional_preprocessing_steps': [_data_parsing_BCWD],
        'degrees': [0, 1, 2, 3],
        'stop_after_df_preparation': not FULL_ANALYSIS,
    }
    for output_name, config in experiment_configs.items()
]

for experiment in experiments_to_run:
    run_data_slicing_experiment(**experiment)

### 7. Dashboard

In [None]:
# Execute dashboard.py; the -i flag runs it inside this notebook's namespace instead of
# an empty one, so the script can see the variables defined in the cells above.
%run -i dashboard.py