# Notebook Initialization

In [50]:
# %load nb_init.py

from pathlib import Path
import pandas as pd

base_dir = Path.cwd().parent
config_dir = base_dir / "config"
data_dir = base_dir / "data"
models_dir = base_dir / "models"
logs_dir = base_dir / "logs"
images_input_dir = data_dir / "COVID19"
preprocessed_dir = data_dir / "preprocessed"
output_dir = data_dir / "output"

# Directories used to train the CNN (image by image) 
cnn_data_dir = data_dir / "modelling" / "cnn"
cnn_train_dir = cnn_data_dir / "train"
cnn_test_dir = cnn_data_dir / "test"

metadata_file = images_input_dir / "metadata.csv"
labels_file = images_input_dir / "unzip_filenames.csv"
preprocessed_labels_file = preprocessed_dir / "labels.parquet"

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

config_file = config_dir / "tfg.conf"

from pyhocon import ConfigFactory
config = ConfigFactory.parse_file(config_file)

import sys

if str(base_dir / "src") not in sys.path:
    sys.path.append(str(base_dir / "src"))

%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
config.as_plain_ordered_dict()

OrderedDict([('tfg',
              OrderedDict([('seed', 42),
                           ('eda',
                            OrderedDict([('csv_options',
                                          OrderedDict([('header', 'true'),
                                                       ('sep', ','),
                                                       ('inferSchema',
                                                        'true')]))])),
                           ('training',
                            OrderedDict([('test_fraction', 0.1),
                                         ('val_fraction', 0.1),
                                         ('images_per_clip', 70),
                                         ('batch_size', 32)]))])),
             ('data',
              OrderedDict([('train_files_df',
                            OrderedDict([('path',
                                          'preprocessed/training_files_df.csv')])),
                           ('test_files_df',
           

In [3]:
spark

In [4]:
from tfg import DataRepository

repo = DataRepository(config=config, base_data_path=data_dir)

# Load labels

In [11]:
labels = spark.read.parquet(str(preprocessed_labels_file))

labels.show(5)

+----------+-------+-------+-----+---------+
|patient_id|scan_id|n_slice|label|num_clips|
+----------+-------+-------+-----+---------+
|         0|   3131|    285|   CP|        5|
|         0|   3132|     42|   CP|        1|
|         0|   3133|    290|   CP|        5|
|         0|   3134|     37|   CP|        1|
|         0|   3135|    269|   CP|        4|
+----------+-------+-------+-----+---------+
only showing top 5 rows



In [12]:
num_labels = labels.count()
num_labels

4178

In [13]:
labels\
    .groupBy("label")\
    .agg(
        F.count("*").alias("count"),
        F.round((F.count("*") / num_labels), 4).alias("pct"),
    )\
    .show()

+------+-----+------+
| label|count|   pct|
+------+-----+------+
|    CP| 1556|0.3724|
|   NCP| 1544|0.3696|
|Normal| 1078| 0.258|
+------+-----+------+



In [14]:
labels.select("patient_id").distinct().count()

2742

# Split patient ids into train / test

In [53]:
seed = config.get_int("tfg.seed")
test_fraction = config.get_float("tfg.training.test_fraction")
train_fraction = 1 - test_fraction

In [17]:
patient_ids = labels.select("patient_id").distinct()

train_test_id_dfs = patient_ids.randomSplit([train_fraction, test_fraction], seed=seed)
train_ids = train_test_id_dfs[0]
test_ids =  train_test_id_dfs[1]

train_labels = labels.join(train_ids, ["patient_id"], "inner")
test_labels = labels.join(test_ids, ["patient_id"], "inner")

train_count = train_labels.count()
test_count = test_labels.count()

print(f"Training rows: {train_count}")
print(f"Test rows: {test_count}")

train_labels.show(10)
test_labels.show(10)

Training rows: 3752
Test rows: 426
+----------+-------+-------+-----+---------+
|patient_id|scan_id|n_slice|label|num_clips|
+----------+-------+-------+-----+---------+
|         0|   3131|    285|   CP|        5|
|         0|   3132|     42|   CP|        1|
|         0|   3133|    290|   CP|        5|
|         0|   3134|     37|   CP|        1|
|         0|   3135|    269|   CP|        4|
|         0|   3136|    290|   CP|        5|
|         0|   3137|     37|   CP|        1|
|         0|   3138|    245|   CP|        4|
|         0|   3139|     39|   CP|        1|
|         0|   3140|    269|   CP|        4|
+----------+-------+-------+-----+---------+
only showing top 10 rows

+----------+-------+-------+-----+---------+
|patient_id|scan_id|n_slice|label|num_clips|
+----------+-------+-------+-----+---------+
|         1|   3143|    300|   CP|        5|
|         1|   3144|    248|   CP|        4|
|         1|   3145|    248|   CP|        4|
|         1|   3146|     70|   CP|     

In [18]:
print("Train counts and percentages")
train_labels\
    .groupBy("label")\
    .agg(
        F.count("*").alias("count"),
        F.round((F.count("*") / train_count), 2).alias("pct"),
    )\
    .show()
print("Test counts and labels")
test_labels\
    .groupBy("label")\
    .agg(
        F.count("*").alias("count"),
        F.round((F.count("*") / test_count), 2).alias("pct"),
    )\
    .show()

Train counts and percentages
+------+-----+----+
| label|count| pct|
+------+-----+----+
|    CP| 1381|0.37|
|   NCP| 1387|0.37|
|Normal|  984|0.26|
+------+-----+----+

Test counts and labels
+------+-----+----+
| label|count| pct|
+------+-----+----+
|    CP|  175|0.41|
|   NCP|  157|0.37|
|Normal|   94|0.22|
+------+-----+----+



# Create datasets for the CNN

We'll put all the images in our train and test sets into the directory for the cnn modelling. The directory structure will be of this form:

- ```data/modelling/cnn```
    - ```train|test```
        - ```CP|NCP|Normal```
            - ```[PATIENT_ID]_[SCAN_ID]_[SEQUENCE_NUMBER].png```

The input data is in the form ```[LABEL]/[PATIENT_ID]/[SCAN_ID]/[SEQ_NUMBER].png```.
The data files for the training will be in the form ```[TRAIN|TEST]/[LABEL]/[PATIENT_ID]_[SCAN_ID]_[SEQ_NUMBER].png```.

In [19]:
from dataclasses import dataclass

from pyspark.sql import Row


@dataclass
class Observation:
    patient_id: int
    scan_id: int
    n_slice: int
    num_clips: int
    label: str

    @staticmethod
    def from_row(r: Row) -> "Observation":
        return Observation(
            patient_id=r.patient_id,
            scan_id=r.scan_id,
            n_slice=r.n_slice,
            num_clips=r.num_clips,
            label=r.label,
        )

In [256]:
from tqdm.notebook import tqdm
import pandas as pd
from pathlib import Path

def create_df(config, labels):
    raw_images = {
        "label": [],
        "file": [],
        "patient_id": [],
        "scan_id": [],
        "n_slice": [],
        "num_clips": [],
        "seq_num": [],
        "clip_num": [],
    }

    IMAGES_PER_CLIP = config.get_int("tfg.training.images_per_clip")

    for r in labels.collect():
        obs = Observation.from_row(r)
        scan_path = Path(f"{obs.label}/{obs.patient_id}/{obs.scan_id}")
        absolute_scan_path = images_input_dir / scan_path
        # different directories have different formats
        img_ext = next(absolute_scan_path.glob("*.*")).suffix

        scan_images = []
        seq_nums = []
        clip_nums = []

        seq_num = 0
        images_input_dir_str = str(images_input_dir)
        path_offset = len(images_input_dir_str) + 1
        for seq_num, img in enumerate(absolute_scan_path.glob(f"*{img_ext}")):
#        for seq_num in range(0, obs.n_slice):
            #img = scan_path / f"{seq_num:04d}{img_ext}"
            img_to_append = str(img)[path_offset:]
            scan_images.append(img_to_append)
            seq_nums.append(seq_num)
            clip_nums.append(seq_num // IMAGES_PER_CLIP)
            seq_num = seq_num + 1
        
#         print(seq_num, scan_images[-1])
        
        raw_images["label"].extend([obs.label] * obs.n_slice)
        raw_images["file"].extend(scan_images)
        raw_images["patient_id"].extend([obs.patient_id] * obs.n_slice)
        raw_images["scan_id"].extend([obs.scan_id] * obs.n_slice)
        raw_images["n_slice"].extend([obs.n_slice] * obs.n_slice)
        raw_images["num_clips"].extend([obs.patient_id] * obs.n_slice)
        raw_images["seq_num"].extend(seq_nums)
        raw_images["clip_num"].extend(clip_nums)
        
    return pd.DataFrame(raw_images)

In [257]:
train_df = create_df(config, train_labels)
print(f"{train_df.shape}")
train_df

(371114, 8)


Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num
0,CP,CP/0/3131/0275.png,0,3131,285,0,0,0
1,CP,CP/0/3131/0064.png,0,3131,285,0,1,0
2,CP,CP/0/3131/0083.png,0,3131,285,0,2,0
3,CP,CP/0/3131/0160.png,0,3131,285,0,3,0
4,CP,CP/0/3131/0127.png,0,3131,285,0,4,0
...,...,...,...,...,...,...,...,...
371109,Normal,Normal/1924/379/0014.png,1924,379,98,1924,93,1
371110,Normal,Normal/1924/379/0001.png,1924,379,98,1924,94,1
371111,Normal,Normal/1924/379/0027.png,1924,379,98,1924,95,1
371112,Normal,Normal/1924/379/0031.png,1924,379,98,1924,96,1


In [258]:
repo.save("train_files_df", train_df)

True

In [259]:
test_df = create_df(config, test_labels)
print(f"{test_df.shape}")
test_df

(40415, 8)


Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num
0,CP,CP/1/3143/0275.png,1,3143,300,1,0,0
1,CP,CP/1/3143/0064.png,1,3143,300,1,1,0
2,CP,CP/1/3143/0083.png,1,3143,300,1,2,0
3,CP,CP/1/3143/0160.png,1,3143,300,1,3,0
4,CP,CP/1/3143/0286.png,1,3143,300,1,4,0
...,...,...,...,...,...,...,...,...
40410,Normal,Normal/1917/372/0014.png,1917,372,96,1917,91,1
40411,Normal,Normal/1917/372/0001.png,1917,372,96,1917,92,1
40412,Normal,Normal/1917/372/0027.png,1917,372,96,1917,93,1
40413,Normal,Normal/1917/372/0031.png,1917,372,96,1917,94,1


In [260]:
repo.save("test_files_df", test_df)

True

# Data Generators

In [5]:
train_df = repo.load("train_files_df")
train_df.head(5)

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num
0,CP,CP/0/3131/0275.png,0,3131,285,0,0,0
1,CP,CP/0/3131/0064.png,0,3131,285,0,1,0
2,CP,CP/0/3131/0083.png,0,3131,285,0,2,0
3,CP,CP/0/3131/0160.png,0,3131,285,0,3,0
4,CP,CP/0/3131/0127.png,0,3131,285,0,4,0


In [263]:
img_sizes = set()

for r in tqdm(train_df.drop_duplicates(["patient_id", "scan_id"]).iterrows()):
    img_path = images_input_dir / r[1]["file"]
    img_sizes.add(load_img(img_path).size)

img_sizes

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [229]:
# # Some directories needed fixing, images did not start at 0000...

# from tqdm.notebook import trange
# label = "NCP"
# patient_id = "843"
# scan_id = "2358"
# path = Path(f"/data/projects/tfg/data/COVID19/{label}/{patient_id}/{scan_id}/")

# start = 243
# end = 279
# offset = 1#start
# ext = "JPG"
# for i in trange(start, end + 1):
#     src_filename = path / f"{i:04d}.{ext}"
#     target_filename = path / f"{i-offset:04d}.{ext}"
# #     print(f"src: {src_filename}")
# #     print(f"target: {target_filename}")
# #     break
#     !mv {src_filename} {target_filename}

# train_labels.filter(f"patient_id = {patient_id} and scan_id = {scan_id}").show()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=37.0), HTML(value='')))


+----------+-------+-------+-----+---------+
|patient_id|scan_id|n_slice|label|num_clips|
+----------+-------+-------+-----+---------+
|       843|   2358|    279|  NCP|        4|
+----------+-------+-------+-----+---------+



In [265]:
missing_images = []

for idx, r in tqdm(train_df.iterrows()):
    img_path = images_input_dir / r["file"]
    if not img_path.exists():
        missing_images.append(img_path)


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [267]:
assert len(missing_images) == 0, "There are missing images in the dataframe!"

In [80]:
from keras_preprocessing.image import ImageDataGenerator

seed = config.get_int("tfg.seed")
val_fraction = config.get_float("tfg.training.val_fraction")
datagen = ImageDataGenerator(rescale=1./255.,validation_split=val_fraction)
batch_size = config.get_int("tfg.training.batch_size")

train_generator = datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=str(images_input_dir),
    x_col="file",
    y_col="label",
    subset="training",
    batch_size=batch_size,
    seed=seed,
    shuffle=True,
    class_mode="categorical",
    target_size=(512, 512))

valid_generator = datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=str(images_input_dir),
    x_col="file",
    y_col="label",
    subset="validation",
    batch_size=batch_size,
    seed=seed,
    shuffle=True,
    class_mode="categorical",
    target_size=(512, 512))

Found 334003 validated image filenames belonging to 3 classes.
Found 37111 validated image filenames belonging to 3 classes.


# Create model

In [71]:
import tensorflow as tf

tf.config.list_physical_devices("GPU")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [72]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers as reg
from tensorflow.keras import callbacks

In [85]:
input_images = keras.Input(shape=(512, 512, 3), name="input_images")

In [86]:
filters = 16
initial_dense_features = 256
num_features = 64
num_classes = 3
num_layers = 0
cnn_net = input_images

cnn_net = layers.Conv2D(filters=filters, kernel_size=3, padding="same", activation="relu", name=f"conv2d_{num_layers+1:02d}")(cnn_net)
cnn_net = layers.MaxPool2D(2, name=f"maxpool2d_{num_layers+1:02d}")(cnn_net)
filters = filters * 2
num_layers = num_layers + 1

cnn_net = layers.Conv2D(filters=filters, kernel_size=3, padding="same", activation="relu", name=f"conv2d_{num_layers+1:02d}")(cnn_net)
cnn_net = layers.MaxPool2D(2, name=f"maxpool2d_{num_layers+1:02d}")(cnn_net)
filters = filters * 2
num_layers = num_layers + 1

cnn_net = layers.Conv2D(filters=filters, kernel_size=3, padding="same", activation="relu", name=f"conv2d_{num_layers+1:02d}")(cnn_net)
cnn_net = layers.MaxPool2D(2, name=f"maxpool2d_{num_layers+1:02d}")(cnn_net)
filters = filters * 2
num_layers = num_layers + 1

cnn_net = layers.Conv2D(filters=filters, kernel_size=3, padding="same", activation="relu", name=f"conv2d_{num_layers+1:02d}")(cnn_net)
cnn_net = layers.MaxPool2D(2, name=f"maxpool2d_{num_layers+1:02d}")(cnn_net)
filters = filters * 2
num_layers = num_layers + 1

cnn_net = layers.Conv2D(filters=filters, kernel_size=3, padding="same", activation="relu", name=f"conv2d_{num_layers+1:02d}")(cnn_net)
cnn_net = layers.MaxPool2D(2, name=f"maxpool2d_{num_layers+1:02d}")(cnn_net)
filters = filters * 2
num_layers = num_layers + 1

cnn_net = layers.Conv2D(filters=filters, kernel_size=3, padding="same", activation="relu", name=f"conv2d_{num_layers+1:02d}", kernel_regularizer=reg.l2(l=0.01))(cnn_net)
cnn_net = layers.MaxPool2D(2, name=f"maxpool2d_{num_layers+1:02d}")(cnn_net)
filters = filters * 2
num_layers = num_layers + 1

cnn_net = layers.Conv2D(filters=filters, kernel_size=3, padding="same", activation="relu", name=f"conv2d_{num_layers+1:02d}", kernel_regularizer=reg.l2(l=0.01))(cnn_net)
cnn_net = layers.MaxPool2D(2, name=f"maxpool2d_{num_layers+1:02d}")(cnn_net)
num_layers = num_layers + 1

cnn_net = layers.Flatten(name=f"flatten")(cnn_net)

cnn_net = layers.Dense(initial_dense_features, activation="relu", name="dense_01")(cnn_net)
cnn_net = layers.Dropout(0.2, name="dropout_01")(cnn_net)

cnn_net = layers.Dense(initial_dense_features // 2, activation="relu", name="dense_02")(cnn_net)
cnn_net = layers.Dropout(0.2, name="dropout_02")(cnn_net)


cnn_net = layers.Dense(num_features, activation="relu", name="dense_03")(cnn_net)
cnn_net = layers.Dropout(0.2, name="dropout_03")(cnn_net)

output_layer = layers.Dense(num_classes, activation="softmax", name="output")
outputs = output_layer(cnn_net)

model = keras.Model(inputs=input_images, outputs=outputs, name="covid_classifier")

In [87]:
model.summary()

Model: "covid_classifier"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_images (InputLayer)    [(None, 512, 512, 3)]     0         
_________________________________________________________________
conv2d_01 (Conv2D)           (None, 512, 512, 16)      448       
_________________________________________________________________
maxpool2d_01 (MaxPooling2D)  (None, 256, 256, 16)      0         
_________________________________________________________________
conv2d_02 (Conv2D)           (None, 256, 256, 32)      4640      
_________________________________________________________________
maxpool2d_02 (MaxPooling2D)  (None, 128, 128, 32)      0         
_________________________________________________________________
conv2d_03 (Conv2D)           (None, 128, 128, 64)      18496     
_________________________________________________________________
maxpool2d_03 (MaxPooling2D)  (None, 64, 64, 64)   

In [88]:
model.compile(optimizer='adam',
              loss="categorical_crossentropy",
              metrics=['categorical_accuracy'])

In [67]:
train_generator.n // train_generator.batch_size

10437

In [89]:
cnn_model_dir = models_dir / "cnn"
cnn_checkpoint_dir = cnn_model_dir / "checkpoint"
cnn_checkpoint_dir.mkdir(exist_ok=True)

cnn_logs_dir = logs_dir / "cnn"
cnn_logs_dir.mkdir(exist_ok=True)

In [90]:
checkpointer = callbacks.ModelCheckpoint(
    str(cnn_checkpoint_dir),
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode="auto",
    save_freq="epoch"
)

tensorboard_logger = callbacks.TensorBoard(
    log_dir=str(cnn_logs_dir),
    histogram_freq=1,
    write_graph=False,
    write_images=False,
    update_freq='epoch',
    profile_batch=2,
    embeddings_freq=0,
    embeddings_metadata=None
)

model_callbacks = [checkpointer, tensorboard_logger]

In [None]:
STEP_SIZE_TRAIN=train_generator.n//train_generator.batch_size
STEP_SIZE_VALID=valid_generator.n//valid_generator.batch_size
#STEP_SIZE_TEST=test_generator.n//test_generator.batch_size

fit_history = model.fit_generator(generator=train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=5,
                    callbacks=model_callbacks
)

Epoch 1/5
   50/10437 [..............................] - ETA: 2:04:56 - loss: 3.1830 - categorical_accuracy: 0.4225

In [281]:
filters = 16
num_features = 100
num_classes = 3
num_layers = 0
cnn_net = input_images
while filters <= 256:
    conv_layer = layers.Conv2D(filters=filters, kernel_size=3, padding="same", activation="relu", name=f"conv2d_{num_layers+1:02d}")
    cnn_net = conv_layer(cnn_net)
    if filters < 256:
        max_pool_layer = layers.MaxPool2D(2, name=f"maxpool2d_{num_layers+1:02d}")
        cnn_net = max_pool_layer(cnn_net)
        filters = filters * 2
    num_layers += 1

flatten_layer = layers.Flatten(name="flatten")
cnn_net = flatten_layer(cnn_net)

dense_layer = layers.Dense(filters, activation="relu", name="dense_01")
cnn_net = dense_layer(cnn_net)

dense_layer2 = layers.Dense(num_features, activation="relu", name="dense_02")
cnn_net = dense_layer2(cnn_net)
output_layer = layers.Dense(num_classes, activation="softmax", name="output")
outputs = output_layer(cnn_net)

model = keras.Model(inputs=input_images, outputs=outputs, name="covid_classifier")

KeyboardInterrupt: 

In [278]:
cnn_net

<tf.Tensor 'maxpool2d_05/Identity:0' shape=(None, 14, 14, 256) dtype=float32>

In [273]:
conv2d(input_images)

<tf.Tensor 'conv2d/Identity:0' shape=(None, 512, 512, 6) dtype=float32>

In [None]:
images_input_dir = data_dir / "COVID19"
# Directories used to train the CNN (image by image) 
cnn_data_dir = data_dir / "modelling" / "cnn"
cnn_train_dir = cnn_data_dir / "train"
cnn_test_dir = cnn_data_dir / "test"


In [75]:
from tqdm.notebook import tqdm

In [106]:
next(scan_input_dir.glob("*.*")).suffix.lstrip(".")

'jpg'

In [95]:
import shutil
from tensorflow.keras.preprocessing.image import load_img, save_img, img_to_array

formats_to_check = ["png", "PNG", "jpg", "JPG", "jpeg", "JPEG", "bmp", "BMP"]
image_sizes = set()

for r in tqdm(train_labels.collect(), desc="Scans processed"):
    obs = Observation.from_row(r)
    image_output_dir = cnn_train_dir / obs.label
    scan_input_dir = images_input_dir / obs.label / str(obs.patient_id) / str(obs.scan_id)
    for img_format in formats_to_check:
        scan_images = list(scan_input_dir.glob(f"*.{img_format}"))
        if any(scan_images):
            break
    
    # If not, check for jpg
    if not any(scan_images):
        scan_images = list(scan_input_dir.glob("*.jpg"))
    
    assert any(scan_images), \
        f"Couldn't load images from {scan_input_dir}"

    for image in scan_images:
        # Pick the file name
        suffix = image.suffix
        base_name = image.name[:-len(suffix)]
        target_file = image_output_dir / f"{obs.patient_id}_{obs.scan_id}_{base_name}.png"

        if not target_file.exists():
            img = load_img(str(image))
            img_array = img_to_array(img)
            image_sizes.add(img_array.shape)
            save_img(str(target_file), img_array)


HBox(children=(HTML(value='Scans processed'), FloatProgress(value=0.0, max=3752.0), HTML(value='')))




KeyboardInterrupt: 

In [93]:
image_sizes

{(512, 512, 3)}

In [84]:
image.name[:-len(image.suffix)]

'0032'

In [None]:
import shutil

for r in tqdm(test_labels.collect(), desc="Scans processed"):
    obs = Observation.from_row(r)
    image_output_dir = cnn_test_dir / obs.label
    scan_input_dir = images_input_dir / obs.label / str(obs.patient_id) / str(obs.scan_id)
    # First, check for png images
    scan_images = list(scan_input_dir.glob("*.png"))
    
    # If not, check for jpg
    if not any(scan_images):
        scan_images = list(scan_input_dir.glob("*.jpg"))
    
    assert any(scan_images), \
        f"Couldn't load images from {scan_input_dir}"

    for image in scan_images:
        # Pick the file name
        base_name = image.name
        target_file = image_output_dir / f"{obs.patient_id}_{obs.scan_id}_{base_name}"
        shutil.copy(image, target_file)

In [66]:
image = list(patient_input_dir.glob("*.png"))[0]

In [53]:
Label.from_row(train_labels.first())

Label(patient_id=0, scan_id=3131, n_slice=285, num_clips=5, label='CP')

In [20]:
TODO:
- Get unique patient ids into pandas
- Split the ids into train/test
- Divide labels into train_labels and test_labels

from sklearn.model_selection import train_test_split

train_test_split(labels, test_size=.1, random_state=config.get_int("tfg.seed"))

TypeError: Expected sequence or array-like, got <class 'pyspark.sql.dataframe.DataFrame'>

# Demographics
## Metadata file

In [56]:
!head -n2 {metadata_file}

patient_id,scan_id,Age,Sex(Male1/Female2),Critical_illness,Liver_function,Lung_function,Progression (Days)
1399,127,57,1,1,5,2,0.08


In [100]:
metadata = spark.read\
    .options(**csv_options)\
    .csv(str(metadata_file))

metadata.show(5)

+----------+-------+---+------------------+----------------+--------------+-------------+------------------+
|patient_id|scan_id|Age|Sex(Male1/Female2)|Critical_illness|Liver_function|Lung_function|Progression (Days)|
+----------+-------+---+------------------+----------------+--------------+-------------+------------------+
|      1399|    127| 57|                 1|               1|             5|            2|              0.08|
|      1297|     82| 55|                 1|               1|             3|            2|              0.88|
|      2255|    549|  3|                 1|               1|          null|         null|              0.02|
|      1184|     26|  5|                 2|               1|             0|            2|              0.02|
|      1186|     27|  2|                 2|               1|             2|            2|              0.02|
+----------+-------+---+------------------+----------------+--------------+-------------+------------------+
only showing top 5 

In [101]:
metadata.printSchema()

root
 |-- patient_id: integer (nullable = true)
 |-- scan_id: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Sex(Male1/Female2): integer (nullable = true)
 |-- Critical_illness: integer (nullable = true)
 |-- Liver_function: integer (nullable = true)
 |-- Lung_function: integer (nullable = true)
 |-- Progression (Days): double (nullable = true)



In [102]:
id_expr = "CONCAT(patient_id, '::', scan_id) AS id"
metadata_exprs = [
    id_expr,
    "patient_id AS patient_id",
    "scan_id AS scan_id",
    "Age AS age",
    "`Sex(Male1/Female2)` AS gender",
    "Critical_illness AS critical_illness",
    "Liver_function AS liver_function",
    "Lung_function AS lung_function",
    "`Progression (Days)` AS progression_days",
]

metadata = metadata.selectExpr(*metadata_exprs)

metadata.show(5)

+---------+----------+-------+---+------+----------------+--------------+-------------+----------------+
|       id|patient_id|scan_id|age|gender|critical_illness|liver_function|lung_function|progression_days|
+---------+----------+-------+---+------+----------------+--------------+-------------+----------------+
|1399::127|      1399|    127| 57|     1|               1|             5|            2|            0.08|
| 1297::82|      1297|     82| 55|     1|               1|             3|            2|            0.88|
|2255::549|      2255|    549|  3|     1|               1|          null|         null|            0.02|
| 1184::26|      1184|     26|  5|     2|               1|             0|            2|            0.02|
| 1186::27|      1186|     27|  2|     2|               1|             2|            2|            0.02|
+---------+----------+-------+---+------+----------------+--------------+-------------+----------------+
only showing top 5 rows



In [81]:
metadata.count()

408

## Labels

In [103]:
labels = spark.read.options(**csv_options).csv(str(labels_file))

labels.show(5)

+--------+-----+----------+-------+-------+
|zip_file|label|patient_id|scan_id|n_slice|
+--------+-----+----------+-------+-------+
|CP-1.zip|   CP|         0|   3131|    285|
|CP-1.zip|   CP|         0|   3132|     42|
|CP-1.zip|   CP|         0|   3133|    290|
|CP-1.zip|   CP|         0|   3134|     37|
|CP-1.zip|   CP|         0|   3135|    269|
+--------+-----+----------+-------+-------+
only showing top 5 rows



In [104]:
labels.printSchema()

root
 |-- zip_file: string (nullable = true)
 |-- label: string (nullable = true)
 |-- patient_id: integer (nullable = true)
 |-- scan_id: integer (nullable = true)
 |-- n_slice: integer (nullable = true)



In [105]:
labels_expr = [
    id_expr,
    "patient_id AS patient_id",
    "scan_id AS scan_id",
    "n_slice AS n_slice",
    "label",
]

labels = labels.selectExpr(*labels_expr)

labels.show(5)

+-------+----------+-------+-------+-----+
|     id|patient_id|scan_id|n_slice|label|
+-------+----------+-------+-------+-----+
|0::3131|         0|   3131|    285|   CP|
|0::3132|         0|   3132|     42|   CP|
|0::3133|         0|   3133|    290|   CP|
|0::3134|         0|   3134|     37|   CP|
|0::3135|         0|   3135|    269|   CP|
+-------+----------+-------+-------+-----+
only showing top 5 rows



In [80]:
labels.count()

4178

## Check overlap between labels and metadata

Do we have demographics for patients for which we have data?

In [145]:
total_labels = labels.count()
total_labels_with_demo = labels.join(metadata, ["patient_id"], "left_semi").count()

print(f"We have demographics for {total_labels_with_demo} / {total_labels} observations ({100 * total_labels_with_demo / total_labels:.2f}%)")

We have demographics for 378 / 4178 observations (9.05%)


In [146]:
labels_with_metadata = labels.join(metadata, ["patient_id"], "left_semi")

labels_with_metadata.count()

378

In [147]:
labels_with_metadata.groupBy("label").count().show()

+------+-----+
| label|count|
+------+-----+
|    CP|  170|
|   NCP|   13|
|Normal|  195|
+------+-----+



Is there also overlap on patient_id AND scan_id level?

In [142]:
labels.join(metadata, ["patient_id", "scan_id"], "left_semi").groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
+-----+-----+



In [107]:
labels_pd = labels.toPandas()

labels_pd

Unnamed: 0,id,patient_id,scan_id,n_slice,label
0,0::3131,0,3131,285,CP
1,0::3132,0,3132,42,CP
2,0::3133,0,3133,290,CP
3,0::3134,0,3134,37,CP
4,0::3135,0,3135,269,CP
...,...,...,...,...,...
4173,1919::374,1919,374,99,Normal
4174,1920::375,1920,375,100,Normal
4175,1921::376,1921,376,80,Normal
4176,1922::377,1922,377,87,Normal


In [117]:
labels.select("patient_id").count(), labels.select("patient_id").distinct().count()

(4178, 2742)

## Do any patient_ids have more than 1 label?

In [118]:
from pyspark.sql import functions as F

In [120]:
labels\
    .groupBy("patient_id")\
    .agg(F.countDistinct("label").alias("num_labels"))\
    .filter("num_labels > 1")\
    .count()

0

## Check number labels with / without unique patient ids

In [138]:
total_labels = labels.count()
total_slices = labels.selectExpr("sum(n_slice) AS total").first().total

total_labels, total_slices

(4178, 411529)

In [139]:
labels\
    .groupBy("label")\
    .agg(
        F.count("*").alias("count"),
        F.sum("n_slice").alias("n_slice")
    )\
    .withColumn("count_pct", F.expr(f"ROUND(count / {total_labels}, 4)"))\
    .withColumn("slice_pct", F.expr(f"ROUND(n_slice / {total_slices}, 4)"))\
    .orderBy("label")\
    .show()

+------+-----+-------+---------+---------+
| label|count|n_slice|count_pct|slice_pct|
+------+-----+-------+---------+---------+
|    CP| 1556| 159702|   0.3724|   0.3881|
|   NCP| 1544| 156071|   0.3696|   0.3792|
|Normal| 1078|  95756|    0.258|   0.2327|
+------+-----+-------+---------+---------+



In [149]:
unique_patients = labels.select("patient_id").distinct().count()
labels\
    .dropDuplicates(["patient_id"])\
    .groupBy("label")\
    .count()\
    .withColumn("pct", F.expr(f"ROUND(count / {unique_patients}, 4)"))\
    .orderBy("label")\
    .show()

+------+-----+------+
| label|count|   pct|
+------+-----+------+
|    CP|  964|0.3516|
|   NCP|  929|0.3388|
|Normal|  849|0.3096|
+------+-----+------+



## Check labels with metadata only

In [151]:
total_labels_with_metadata = labels_with_metadata.count()
total_slices_with_metadata = labels_with_metadata.selectExpr("sum(n_slice) AS total").first().total

print(f"Total labels with metadata: {total_labels_with_metadata}")
print(f"Total slices with metadata: {total_slices_with_metadata}")

labels_with_metadata\
    .groupBy("label")\
    .agg(
        F.count("*").alias("count"),
        F.sum("n_slice").alias("n_slice")
    )\
    .withColumn("count_pct", F.expr(f"ROUND(count / {total_labels_with_metadata}, 4)"))\
    .withColumn("slice_pct", F.expr(f"ROUND(n_slice / {total_slices_with_metadata}, 4)"))\
    .orderBy("label")\
    .show()

Total labels with metadata: 378
Total slices with metadata: 31616
+------+-----+-------+---------+---------+
| label|count|n_slice|count_pct|slice_pct|
+------+-----+-------+---------+---------+
|    CP|  170|  16084|   0.4497|   0.5087|
|   NCP|   13|    661|   0.0344|   0.0209|
|Normal|  195|  14871|   0.5159|   0.4704|
+------+-----+-------+---------+---------+



In [152]:
unique_patients_with_metadata = labels_with_metadata.select("patient_id").distinct().count()

print(f"There are {unique_patients_with_metadata} unique patients with metadata")
labels_with_metadata\
    .dropDuplicates(["patient_id"])\
    .groupBy("label")\
    .count()\
    .withColumn("pct", F.expr(f"ROUND(count / {unique_patients_with_metadata}, 4)"))\
    .orderBy("label")\
    .show()

There are 276 unique patients with metadata
+------+-----+------+
| label|count|   pct|
+------+-----+------+
|    CP|   99|0.3587|
|   NCP|   13|0.0471|
|Normal|  164|0.5942|
+------+-----+------+



## Conclusion

There is almost no metadata for patients with NCP (there's metadata only for 13). It could be usable if we only want to consider e.g. CP VS Normal, but won't be useful for NCP.

In [29]:
import sweetviz as sv

In [108]:
report = sv.analyze(labels_pd)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=6.0), HTML(value='')), la…




HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=5.0), HTML(value='')), la…




HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=1.0), HTML(value='')), la…




In [109]:
report.show_html()

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
