# Downsampling

This notebook downsamples the 2D MIPs that we feed into our ML models. The goal is to find the point at which adding more data stops giving a proportional improvement in results, and to check whether we have already reached that point with our current dataset (~500 positives and ~500 negatives). This will determine whether it is worthwhile to get more DICOM files from the hospital.

### First, we define helper functions to load, clean, plot, and save the data from `/home/lzhu7/elvo-analysis/data/processed/`

In [None]:
# %load bluenop.py
import pathlib
import typing

import numpy as np
import os
import pandas as pd
from matplotlib import pyplot as plt


def load_arrays(data_dir: str) -> typing.Dict[str, np.ndarray]:
    """
    Loads every .npy file in data_dir into a dictionary.

    Entries that are not .npy files (for example hidden files or
    checkpoint directories) are skipped instead of being handed
    to np.load.

    :param data_dir: path of the directory containing the .npy files
    :return: a dictionary mapping each filename, without its .npy
        extension, to the array loaded from that file
    """
    data_dict = {}
    for filename in os.listdir(data_dir):
        path = pathlib.Path(data_dir) / filename
        if path.suffix != '.npy' or not path.is_file():
            print(f'Skipping non-array entry {filename}')
            continue
        print(f'Loading file {filename}')
        # The stem is the filename with the .npy extension removed
        data_dict[path.stem] = np.load(path)
    return data_dict


def load_compressed_arrays(data_dir: str) -> typing.Dict[str, np.ndarray]:
    """
    Loads every archive in data_dir and merges the arrays they
    contain into a single dictionary.

    If the same key appears in more than one archive, the value from
    the archive read last wins. Archives are read in the order
    returned by os.listdir.

    :param data_dir: path of the directory containing the archives
    :return: a dictionary mapping each key stored in the archives to
        its array
    """
    data = dict()
    for filename in os.listdir(data_dir):
        print(f'Loading file {filename}')
        # The archive keeps its file open until it is closed, so read
        # every array inside the with block and release the handle
        # before moving on to the next file.
        with np.load(pathlib.Path(data_dir) / filename) as archive:
            data.update(archive)
    return data


def load_labels(labels_dir: str) -> pd.DataFrame:
    """
    Reads positives.csv and negatives.csv from labels_dir and
    stacks them into one dataframe indexed by 'Anon ID'.

    An 'occlusion_exists' column is added: 1 for rows read from
    positives.csv and 0 for rows read from negatives.csv.

    :param labels_dir: directory containing the two CSV files
    :return: the positives followed by the negatives
    """
    root = pathlib.Path(labels_dir)
    frames = []
    for csv_name, flag in (('positives.csv', 1), ('negatives.csv', 0)):
        frame: pd.DataFrame = pd.read_csv(root / csv_name,
                                          index_col='Anon ID')
        frame['occlusion_exists'] = flag
        frames.append(frame)
    return pd.concat(frames)


def load_downsampled_labels(
        labels_dir: str,
        percent: float,
        random_state: typing.Optional[int] = None) -> pd.DataFrame:
    """
    Reads positives.csv and negatives.csv from labels_dir and keeps
    a random sample of each, using the same fraction for both.

    An 'occlusion_exists' column is added: 1 for rows read from
    positives.csv and 0 for rows read from negatives.csv.

    :param labels_dir: directory containing the two CSV files
    :param percent: fraction of the rows of each file to keep; it is
        passed to DataFrame.sample as frac
    :param random_state: optional seed forwarded to DataFrame.sample
        so that a run can be reproduced; the default of None leaves
        the sampling unseeded
    :return: the sampled positives followed by the sampled negatives,
        indexed by 'Anon ID'
    """
    sampled_frames = []
    for csv_name, flag in (('positives.csv', 1), ('negatives.csv', 0)):
        frame: pd.DataFrame = pd.read_csv(
            pathlib.Path(labels_dir) / csv_name,
            index_col='Anon ID')
        frame['occlusion_exists'] = flag
        # Sample each class separately so both shrink by the same fraction
        sampled_frames.append(
            frame.sample(frac=percent, random_state=random_state))
    return pd.concat(sampled_frames)


def clean_data(arrays: typing.Dict[str, np.ndarray],
               labels: pd.DataFrame) -> \
        typing.Tuple[typing.Dict[str, np.ndarray], pd.DataFrame]:
    """
    Handles duplicates in the dataframe and removes
    missing labels/arrays.

    Arrays without a label are dropped, duplicated label ids are
    reduced to their first occurrence, and labels without an array
    are dropped. Neither input is modified. The output dictionary
    and dataframe will have the same length.

    :param arrays: dictionary mapping patient ids to arrays
    :param labels: dataframe whose index holds the patient ids
    :return: the filtered arrays and the filtered labels
    """
    # Build the id set once so each membership test is a hash lookup
    # rather than a scan over the whole index.
    label_ids = set(labels.index)
    filtered_arrays = {}
    for patient_id, array in arrays.items():
        if patient_id in label_ids:
            filtered_arrays[patient_id] = array
        else:
            print(f'{patient_id} in arrays, but not in labels. Dropping')

    is_duplicate = labels.index.duplicated()
    print('Removing duplicate ids in labels:',
          labels[is_duplicate].index)
    filtered_labels = labels[~is_duplicate]

    # One boolean mask replaces a DataFrame.drop call per missing id
    has_array = filtered_labels.index.isin(list(arrays))
    for patient_id in filtered_labels.index[~has_array]:
        print(f'{patient_id} in labels, but not in arrays. Dropping')
    filtered_labels = filtered_labels[has_array]

    assert len(filtered_arrays) == len(filtered_labels)
    return filtered_arrays, filtered_labels


def plot_images(data: typing.Dict[str, np.ndarray],
                labels: pd.DataFrame,
                num_cols=5,
                limit=20,
                offset=0):
    """
    Draws up to limit images from data as a grid of subplots in
    one new 10x10 figure.

    Each subplot is titled with the first four characters of the
    patient id and captioned 'positive' or 'negative' according to
    the 'occlusion_exists' value of that patient in labels.

    :param data: dictionary mapping patient ids to images
    :param labels: dataframe indexed by patient id with an
        'occlusion_exists' column
    :param num_cols: the number of subplot columns in the grid
    :param limit: the number of images to plot
    :param offset: the number of leading entries of data to skip
    :return:
    """
    # Integer ceiling of (images to show) / num_cols
    shown = min(len(data), limit)
    num_rows = (shown + num_cols - 1) // num_cols
    fig = plt.figure(figsize=(10, 10))
    end = offset + limit
    for position, patient_id in enumerate(data):
        if position >= offset:
            if position >= end:
                break
            # Subplot numbers are 1-based and restart for every window
            ax = fig.add_subplot(num_rows, num_cols, position - offset + 1)
            ax.set_title(f'patient: {patient_id[:4]}...')
            has_occlusion = labels.loc[patient_id]['occlusion_exists']
            if has_occlusion:
                label = 'positive'
            else:
                label = 'negative'
            ax.set_xlabel(f'label: {label}')
            plt.imshow(data[patient_id])
    fig.tight_layout()
    plt.plot()


def save_plots(arrays, labels, dirpath: str):
    """
    Saves the images in arrays as a series of plots of 20 images
    each inside a newly created directory.

    Each plot is named after the range of image positions it covers,
    for example '0-19' and '20-39'.

    :param arrays: dictionary mapping patient ids to images
    :param labels: dataframe of labels, passed on to plot_images
    :param dirpath: directory to create and save the plots in;
        os.mkdir raises if it already exists
    :return:
    """
    images_per_plot = 20
    os.mkdir(dirpath)
    # Integer ceiling of len(arrays) / images_per_plot
    num_plots = (len(arrays) + images_per_plot - 1) // images_per_plot
    for i in range(num_plots):
        print(f'saving plot number {i}')
        start = images_per_plot * i
        plot_images(arrays, labels, 5, limit=images_per_plot, offset=start)
        try:
            plt.savefig(f'{dirpath}/{start}-{start + images_per_plot - 1}')
        finally:
            # plot_images opens a new figure on every call; close it
            # once saved so the figures do not pile up in memory.
            plt.close()


def save_data(arrays: typing.Dict[str, np.ndarray],
              labels: pd.DataFrame,
              dirpath: str,
              with_plots=True):
    """
    Saves the arrays and labels in the given dirpath.

    Arrays go to dirpath/arrays/<id>.npy, the labels to
    dirpath/labels.csv and, optionally, the plots to dirpath/plots.
    Arrays whose id is not in the index of labels are skipped, both
    when saving and when plotting.

    :param arrays: dictionary mapping patient ids to arrays
    :param labels: dataframe whose index holds the ids to keep
    :param dirpath: output directory; os.makedirs raises if
        dirpath/arrays already exists
    :param with_plots: whether to also save plots of the kept arrays
    :return:
    """
    out_dir = pathlib.Path(dirpath)
    arrays_dir = out_dir / 'arrays'
    # noinspection PyTypeChecker
    os.makedirs(arrays_dir)

    # Select the arrays in the downsampled set once, so that the saved
    # arrays and the plots cover the same ids. Plotting an id with
    # no label would fail when its label is looked up.
    label_ids = set(labels.index)
    kept_arrays = {id_: arr for id_, arr in arrays.items()
                   if id_ in label_ids}

    for id_, arr in kept_arrays.items():
        print(f'saving {id_}')
        # noinspection PyTypeChecker
        np.save(arrays_dir / f'{id_}.npy', arr)
    labels.to_csv(out_dir / 'labels.csv')
    if with_plots:
        save_plots(kept_arrays, labels, str(out_dir / 'plots'))

### Next, we define a function that takes a config dictionary containing a fraction between 0 and 1, the directories holding the arrays and labels, and an output directory. It takes a random subsample of the positives and negatives and saves them into the new directory, together with the matching labels and plots.

In [None]:
def downsample(args: dict):
    """
    Runs one downsampling job described by args.

    Loads all arrays, loads a sampled subset of the labels, keeps
    only the ids present in both and saves the result.

    :param args: The config dictionary for the job, with the keys
        'arrays_dir', 'labels_dir', 'percent' and 'downsampled_dir'
    :return:
    """
    # Arguments are evaluated left to right: arrays first, then labels
    arrays, labels = clean_data(
        load_arrays(args['arrays_dir']),
        load_downsampled_labels(args['labels_dir'], args['percent']),
    )
    save_data(arrays, labels, args['downsampled_dir'])

Change this value to set the fraction of the data to keep (between 0 and 1).

In [None]:
# Fraction (between 0 and 1) of the positives and of the negatives to keep.
# It is passed to the downsampling job as 'percent' and is also embedded in
# the name of the output directory.
percent_to_keep = 0.7

In [None]:
if __name__ == '__main__':
    # Configuration for one downsampling run; the output directory name
    # carries the fraction that was kept.
    arguments = dict(
        arrays_dir='/home/lzhu7/elvo-analysis/data/processed/arrays/',
        labels_dir='/home/lzhu7/elvo-analysis/data/metadata/',
        downsampled_dir=(
            f'/home/lzhu7/elvo-analysis/data/processed-{percent_to_keep}/'
        ),
        percent=percent_to_keep,
    )
    downsample(arguments)

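Change the values below to match `percent_to_keep`, then uncomment the lines and run them as a single command to sync the downsampled directory to the `gs://elvos` bucket.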

In [None]:
#!gsutil rsync -r /home/lzhu7/elvo-analysis/data/processed-0.7 
# gs://elvos/processed/processed-0.7