### 1. Fetch dataset

In [None]:
import pandas as pd
import sklearn
import numpy

In [None]:
# "Data for this study was obtained from the BCSC: http://bcsc-research.org/."
"""
The following must be cited when using this dataset:
"Data collection and sharing was supported by the National Cancer Institute-funded Breast Cancer Surveillance Consortium (HHSN261201100031C). You can learn more about the BCSC at: http://www.bcsc-research.org/."
"""

!curl "https://www.bcsc-research.org/application/files/2815/4697/9928/risk_dataset.zip" --output BCSC_risk_dataset.zip
!curl "https://www.bcsc-research.org/application/files/6315/4697/9929/risk_dataset_v2.zip" --output BCSC_risk_dataset_v2.zip
# NOTE: if the above fails, download the "Risk Estimation Dataset" from https://www.bcsc-research.org/data or https://www.bcsc-research.org/data/rfdataset

In [None]:
# Unpack both archives into the current working directory.
# The standard-library zipfile module is used instead of `tar -xf`, because only
# bsdtar can read .zip archives; GNU tar (the default on most Linux systems) cannot.
import zipfile

for archive_name in ('BCSC_risk_dataset.zip', 'BCSC_risk_dataset_v2.zip'):
    with zipfile.ZipFile(archive_name) as archive:
        archive.extractall()

In [None]:
# Column layout shared by both data files
column_names = ['menopaus', 'agegrp', 'density', 'race', 'Hispanic', 'bmi', 'agefirst', 'nrelbc', 'brstproc', 'lastmamm', 'surgmeno', 'hrt', 'invasive', 'cancer', 'training', 'count']
df_risk = pd.DataFrame(numpy.loadtxt('risk.txt', dtype=int), columns=column_names)
df_risk_v2 = pd.DataFrame(numpy.loadtxt('risk_rand.txt', dtype=int), columns=column_names)
# ignore_index=True gives the combined frame a fresh, unique index. Without it both
# files contribute the same labels (0, 1, 2, ...), and any later label-based selection
# (.loc) would match rows from both files at once.
df_risk = pd.concat([df_risk, df_risk_v2], ignore_index=True)

In [None]:
# To avoid complicating the training, we limit the dataset to complete records (i.e. no blank or unknown fields, marked with the value 9)
# Columns in which the value 9 is NOT an "unknown" marker are excluded from the check:
# * agegrp: the age is always known (9 is a regular age group)
# * surgmeno: 9 can relate to premenopausal
# * invasive, cancer, training, count: outcome / bookkeeping columns; in particular a count of 9 is a valid group size
# NOTE(review): hrt == 9 is labelled 'unknown_or_premenopausal' in the category mapping of this notebook,
#               yet such rows are dropped here - confirm this is intended.
columns_without_unknown_marker = {'agegrp', 'surgmeno', 'invasive', 'cancer', 'training', 'count'}
columns_possibly_unknown = [c for c in df_risk.columns if c not in columns_without_unknown_marker]
df_risk = df_risk[~(df_risk[columns_possibly_unknown] == 9).any(axis=1)]

In [None]:
# The predefined train/test flag is not needed: all records are used and K-fold sharding is applied later on.
df_risk = df_risk.drop(columns=['training'])

In [None]:
# Undo the aggregation: every row is repeated as many times as its count column says.
# Rows are selected by position (iloc) rather than by index label. With label-based
# selection, duplicate index labels make .loc return every row sharing a label,
# which silently inflates the repeat counts.
row_positions = numpy.arange(len(df_risk)).repeat(df_risk['count'].to_numpy())
df_risk = df_risk.iloc[row_positions].reset_index(drop=True)
df_risk = df_risk.drop(columns=['count'])

In [None]:
# Separate the features from the label: 'cancer' is the target,
# and both outcome columns ('cancer', 'invasive') are removed from the features.
is_outcome_column = df_risk.columns.isin(['cancer', 'invasive'])
df_data = df_risk.loc[:, ~is_outcome_column]
df_target = df_risk.loc[:, 'cancer']

### 2. Train AI network + 3. Full inference

In [None]:
# Create ensemble of networks with fixed random states for reproducibility.
# One model is trained per entry; the value seeds both that model's K-fold split and its random forest.
random_states = [1234, 2345, 3456, 4567]

Notes:
* no demographic information
* no stratification
* data split train/validation: 75% / 25%

In [None]:
from sklearn import model_selection, ensemble, metrics
import numpy as np

# Out-of-fold malignancy scores, one column per ensemble member
df_predictions = pd.DataFrame()

# Ratio of all samples to positive samples, truncated to an integer.
# It depends neither on the model nor on the fold, so it is computed once.
class_balancing_ratio = int(df_target.count() / df_target.sum())

for model_id, random_state in enumerate(random_states):
    print(f'model_id: {model_id}')
    kf = model_selection.KFold(n_splits=4, shuffle=True, random_state=random_state)
    rfc = ensemble.RandomForestClassifier(random_state=random_state)

    # Float placeholder for the scores of this model. Every record is part of exactly
    # one validation fold, so all entries are overwritten below. A float Series is used
    # because writing float scores into a copy of the integer target relies on an
    # implicit dtype upcast that pandas has deprecated.
    df_prediction = pd.Series(np.nan, index=df_target.index, name=df_target.name, dtype=float)

    # For each fold: train, predict and store results
    for idx, (train_index, validation_index) in enumerate(kf.split(df_data)):

        # class balancing; additional factor 3 to move the point on the ROC curve towards higher sensitivity
        # (negatives get weight 1, positives get weight 3 * (class_balancing_ratio - 2) + 1)
        sample_weight = 3 * (class_balancing_ratio - 2) * df_target.iloc[train_index] + 1

        rfc.fit(df_data.iloc[train_index], df_target.iloc[train_index], sample_weight=sample_weight)
        prediction_class = rfc.predict(df_data.iloc[validation_index])
        prediction_score = rfc.predict_proba(df_data.iloc[validation_index])[:, 1]

        # Store the out-of-fold scores in df_prediction
        df_prediction.iloc[validation_index] = prediction_score

        # Calculate and print sensitivity and specificity for fold
        tn, fp, fn, tp = metrics.confusion_matrix(df_target.iloc[validation_index], prediction_class, labels=[0, 1]).ravel()
        sensitivity = tp / (tp + fn)
        specificity = tn / (tn + fp)

        print(f'fold {idx} - sensitivity: {sensitivity} / specificity: {specificity}')

    df_predictions[f'model_{model_id}'] = df_prediction
    print()

In [None]:
# Sensitivity and specificity of each ensemble member, using a 0.5 threshold on its scores
for model_id, column in enumerate(df_predictions.columns):
    predicted_positive = df_predictions[column] > 0.5
    confusion = sklearn.metrics.confusion_matrix(df_target, predicted_positive, labels=[0, 1])
    tn, fp, fn, tp = confusion.ravel()
    sensitivity = tp / (fn + tp)
    specificity = tn / (fp + tn)
    print(f'model {model_id} - sensitivity: {sensitivity} / specificity: {specificity}')

In [None]:
# Sensitivity and specificity of the ensemble: the per-record median of the model scores, thresholded at 0.5
ensemble_positive = df_predictions.median(axis=1) > 0.5
confusion = sklearn.metrics.confusion_matrix(df_target, ensemble_positive, labels=[0, 1])
tn, fp, fn, tp = confusion.ravel()
sensitivity = tp / (fn + tp)
specificity = tn / (fp + tn)
print(f'Ensemble model - sensitivity: {sensitivity} / specificity: {specificity}')

In [None]:
# Assemble the export table: features, ground truth ('gt'), one 'prediction_<model>' column
# per ensemble member, and the ensemble score (median over the models) as 'malignancy_score'.
# The index is written to the CSV as the 'record_id' column.
df_output = (
    df_data
    .assign(gt=df_target)
    .join(df_predictions.add_prefix('prediction_'))
    .assign(malignancy_score=df_predictions.median(axis=1))
    .rename_axis('record_id')
)

df_output.to_csv('dataset_example_BCSC.csv')

### 4. Config file

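Check the config files used in section 6: ```settings/settings_data_slicing_pipeline_example_BCSC_minimal.py``` and ```settings/settings_data_slicing_pipeline_example_BCSC_full.py```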

##### Dataframe structure
```
'dataframe': {
    'index': 'record_id',
    # Additional columns to export from the original DataFrame
    'export': [
        'record_id',
        'malignancy_score'
    ],
    'formatting': {
        'binary_classification': {
            'pred_label_input': 'malignancy_score',
            'pred_label_output': 'malignancy_label_pred',
            'gt_label_input': 'gt',
            'gt_label_output': 'malignancy_label_gt',
            'classification_threshold': 0.5,
            'na_strict': False,
            'na_fill': 'unknown',

        },
        'object_detection': {},
        'segmentation': {},
        'description_separator': '  '
    }
}
```
##### Slicing options
```
'slicing': {
    'data_slicing_minimum_samples': 100,
    # Format: 'column name as defined in dataframe': 'short column name to be used in the viewer'
    'meta_data_fields_of_interest': {
        'menopaus': 'menopause',
        'agegrp': 'age_group',
        'density': 'density',
        'race': 'ethnicity',
        'Hispanic': 'Hispanic',
        'bmi': 'bmi',
        'agefirst': 'age_first_child',
        'nrelbc': 'nrelbc',
        'brstproc': 'brstproc',
        'lastmamm': 'lastmamm',
        'surgmeno': 'surgmeno',
        'hrt': 'hrt'
    }
}
```

##### Additional metrics
```
'additional_metrics': {
    MalignancyScore: {'probabilites_columns': 'malignancy_score'},
    ConfidenceScore: {'probabilites_columns': list([f'prediction_model_{model_id}' for model_id in range(4)])},
    OutlierScore: {'probabilites_columns': 'malignancy_score'}
}
```

### 5. Categorize values in dataset

In [None]:
import pandas as pd
def _data_parsing_BCSC(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with tidy column names and readable category labels.

    Steps:
    * drop a stray 'level_0' column if present,
    * replace spaces in column names by underscores,
    * replace the integer codes of the categorical columns by descriptive strings.

    Codes without an entry in the mapping are left unchanged. The input frame
    is not modified. A KeyError is raised if one of the mapped columns is missing.
    """
    # fix incorrectly saved file if necessary
    if 'level_0' in df.columns:
        df = df.drop(columns='level_0')

    # Rename column names to avoid spaces (this also yields a new frame, so the caller's frame stays untouched)
    df = df.rename(columns={c: c.replace(' ', '_') for c in df.columns})

    # rename categories
    category_labels = {
        'menopaus': {0: 'premenopausal', 1: 'postmenopausal_or_age_>=_55'},
        'agegrp': {1: '35-39', 2: '40-44', 3: '45-49', 4: '50-54', 5: '55-59', 6: '60-64', 7: '65-69', 8: '70-74', 9: '75-79', 10: '80-84'},
        # 'density': {},
        'race': {1: 'white', 2: 'Asian/Pacific_Islander', 3: 'black', 4: 'Native_American', 5: 'other/mixed'},
        'Hispanic': {0: 'no', 1: 'yes'},
        # NOTE(review): confirm against the BCSC data dictionary that the bmi codes start at 0 and not at 1
        'bmi': {0: '[10,25[', 1: '[25,30[', 2: '[30,35[', 3: '>=35'},
        'agefirst': {0: '<30', 1: '>=30', 2: 'Nulliparous'},
        'nrelbc': {0: '0', 1: '1', 2: '>=2'},
        'brstproc': {0: 'no', 1: 'yes'},
        'lastmamm': {0: 'negative', 1: 'false_positive'},
        # NOTE(review): no label for code 9 here, so a surgmeno value of 9 would stay a bare number - confirm whether it can occur
        'surgmeno': {0: 'no', 1: 'yes'},
        'hrt': {0: 'no', 1: 'yes', 9: 'unknown_or_premenopausal'}
    }
    for c, d_rename in category_labels.items():
        df[c] = df[c].replace(d_rename)
    return df


def _subsample_BCSC(df: pd.DataFrame, frac=0.1) -> pd.DataFrame:
    """Return a random fraction *frac* of the rows of *df* (optional step, as the dataframe is large).

    The random state is fixed to 0, so the same rows are drawn on every call.
    """
    sampling_options = {'frac': frac, 'random_state': 0}
    return df.sample(**sampling_options)

### 6. Automated analysis pipeline

In [None]:
# Launch analysis pipeline
from backend_launcher import run_data_slicing_experiment
import pandas as pd
import settings.settings_data_slicing_pipeline_example_BCSC_full as config_full
import settings.settings_data_slicing_pipeline_example_BCSC_minimal as config_minimal

# FULL_ANALYSIS = False: only prepare the dataframe and rely on the analysis results provided with the repo.
# FULL_ANALYSIS = True: run a full analysis from scratch.
FULL_ANALYSIS = False

# One experiment per (subsample fraction, config) combination, in this order:
# minimal 0.1, full 0.1, minimal 0.5, full 0.5.
# Each experiment reads its own copy of the exported dataset.
# The fraction is bound as a lambda default so that every step keeps its own value.
experiments_to_run = [
    {
        'config': config,
        'df': pd.read_csv('dataset_example_BCSC.csv', low_memory=False),
        'output_name': f'example_BCSC_{variant}_fraction_{fraction}',
        'optional_preprocessing_steps': [_data_parsing_BCSC, lambda x, frac=fraction: _subsample_BCSC(x, frac)],
        'degrees': [0, 1, 2, 3],
        'stop_after_df_preparation': not FULL_ANALYSIS
    }
    for fraction in (0.1, 0.5)
    for variant, config in (('minimal', config_minimal), ('full', config_full))
]

for experiment in experiments_to_run:
    run_data_slicing_experiment(**experiment)

### 7. Dashboard

In [None]:
# Start the dashboard script; the -i option of %run executes it in this notebook's namespace
%run -i dashboard.py