# Notebook Initialization

In [18]:
# %load nb_init.py

from pathlib import Path
import pandas as pd

# Project root: the parent of the current working directory.
base_dir = Path.cwd().parent

# Top-level project folders.
config_dir = base_dir / "config"
data_dir = base_dir / "data"
docs_dir = base_dir / "docs"
figures_dir = docs_dir / "figures"
models_dir = base_dir / "models"
logs_dir = base_dir / "logs"

# Sub-folders of the data directory.
images_input_dir = data_dir / "COVID19"
preprocessed_dir = data_dir / "preprocessed"
output_dir = data_dir / "output"

# Directories used to train the CNN (image by image)
cnn_data_dir = data_dir / "modelling" / "cnn"
cnn_train_dir = cnn_data_dir / "train"
cnn_test_dir = cnn_data_dir / "test"

# Individual files located inside the folders defined above.
metadata_file = images_input_dir / "metadata.csv"
labels_file = images_input_dir / "unzip_filenames.csv"
preprocessed_labels_file = preprocessed_dir / "labels.parquet"

feature_extractor_model_file = models_dir / "feature_extractor.tf"

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

config_file = config_dir / "tfg.conf"

from pyhocon import ConfigFactory
config = None

def load_config():
    """Parse the file at ``config_file`` with pyhocon and return the parsed config."""
    parsed_config = ConfigFactory.parse_file(config_file)
    return parsed_config

config = load_config()
    
import sys

if str(base_dir / "src") not in sys.path:
    sys.path.append(str(base_dir / "src"))

%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
spark

In [21]:
from tfg import DataRepository

repo = DataRepository(config=config, base_data_path=data_dir)

# Load model

In [4]:
from tensorflow.keras.models import load_model
cnn_model_suffix = "07"
model_file = models_dir / f"feature_extractor_{cnn_model_suffix}.tf"
model = load_model(str(model_file))

model.summary()

Model: "covid_classifier"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_images (InputLayer)    [(None, 64, 64, 3)]       0         
_________________________________________________________________
conv2d_01 (Conv2D)           (None, 64, 64, 6)         168       
_________________________________________________________________
maxpool2d_01 (MaxPooling2D)  (None, 32, 32, 6)         0         
_________________________________________________________________
conv2d_02 (Conv2D)           (None, 32, 32, 12)        660       
_________________________________________________________________
maxpool2d_02 (MaxPooling2D)  (None, 16, 16, 12)        0         
_________________________________________________________________
conv2d_03 (Conv2D)           (None, 16, 16, 18)        1962      
_________________________________________________________________
maxpool2d_03 (MaxPooling2D)  (None, 8, 8, 18)     

In [5]:
import tensorflow as tf

new_model = tf.keras.Sequential()

In [6]:
model.layers[:-2]

[<tensorflow.python.keras.engine.input_layer.InputLayer at 0x7fac18590c70>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fac18541460>,
 <tensorflow.python.keras.layers.pooling.MaxPooling2D at 0x7fac184ff190>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fac184ff6a0>,
 <tensorflow.python.keras.layers.pooling.MaxPooling2D at 0x7fac185072e0>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fac185077f0>,
 <tensorflow.python.keras.layers.pooling.MaxPooling2D at 0x7fac1850a430>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fac1850a940>,
 <tensorflow.python.keras.layers.pooling.MaxPooling2D at 0x7fac1850d580>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fac1850da90>,
 <tensorflow.python.keras.layers.core.Dropout at 0x7fac18516820>,
 <tensorflow.python.keras.layers.pooling.MaxPooling2D at 0x7fac18516b50>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fac181a0100>,
 <tensorflow.python.keras.layers.core.Dropou

In [7]:
# Rebuild the network in ``new_model`` from every layer of ``model`` except the
# last one, leaving out the Dropout layers.
for layer in model.layers[:-1]:
    # Check against the public Keras class: the private ``tf.python.keras``
    # module path is not part of the supported API and may not match the
    # class of the loaded layers on other TensorFlow versions.
    if isinstance(layer, tf.keras.layers.Dropout):
        continue
    new_model.add(layer)

In [8]:
new_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_01 (Conv2D)           (None, 64, 64, 6)         168       
_________________________________________________________________
maxpool2d_01 (MaxPooling2D)  (None, 32, 32, 6)         0         
_________________________________________________________________
conv2d_02 (Conv2D)           (None, 32, 32, 12)        660       
_________________________________________________________________
maxpool2d_02 (MaxPooling2D)  (None, 16, 16, 12)        0         
_________________________________________________________________
conv2d_03 (Conv2D)           (None, 16, 16, 18)        1962      
_________________________________________________________________
maxpool2d_03 (MaxPooling2D)  (None, 8, 8, 18)          0         
_________________________________________________________________
conv2d_04 (Conv2D)           (None, 8, 8, 24)          3

# Load datasets

In [10]:
train_df = repo.load("train_df")
train_df.head(5)

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num
0,NCP,NCP/880/2415/0000.jpg,880,2415,312,5,0,0
1,NCP,NCP/880/2415/0001.jpg,880,2415,312,5,1,0
2,NCP,NCP/880/2415/0002.jpg,880,2415,312,5,2,0
3,NCP,NCP/880/2415/0003.jpg,880,2415,312,5,3,0
4,NCP,NCP/880/2415/0004.jpg,880,2415,312,5,4,0


It seems that the `num_clips`, `seq_num` and `clip_num` columns are not correct, so we recompute them below.

In [49]:
images_per_clip = config.get_int("tfg.training.images_per_clip")
print(f"{images_per_clip = }")

images_per_clip = 70


In [50]:
from pyspark.sql import Window

# Clip length (number of images per clip), read from the project configuration.
images_per_clip = config.get_int("tfg.training.images_per_clip")
print(f"{images_per_clip = }")

# Window covering the rows of a single scan, ordered by file path.
scan_window = (
    Window
    .partitionBy("patient_id", "scan_id")
    .orderBy("file")
)

images_per_clip = 70


In [51]:
def fix_df(pandas_df):
    """Recompute the ``seq_num``, ``num_clips`` and ``clip_num`` columns.

    The pandas frame is converted to a Spark DataFrame, the three columns are
    (re)written, and the result is converted back to pandas.

    * ``seq_num``   -- zero-based row number inside the module-level
      ``scan_window``.
    * ``num_clips`` -- ``ceil(n_slice / images_per_clip)``.
    * ``clip_num``  -- ``floor(seq_num / images_per_clip)``.

    Args:
        pandas_df: pandas DataFrame holding at least the columns used by
            ``scan_window`` plus ``n_slice``.

    Returns:
        A new pandas DataFrame with the three columns recomputed.
    """
    spark_df = spark.createDataFrame(pandas_df)

    # Zero-based position of each row within its window partition.
    spark_df = spark_df.withColumn(
        "seq_num", F.row_number().over(scan_window) - 1
    )
    # Number of clips required to cover all slices of the scan.
    spark_df = spark_df.withColumn(
        "num_clips", F.ceil(F.col("n_slice") / images_per_clip)
    )
    # Clip index of the row, derived from the seq_num written just above.
    spark_df = spark_df.withColumn(
        "clip_num", F.floor(F.col("seq_num") / images_per_clip)
    )

    return spark_df.toPandas()

In [13]:
train_df.head()

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num
0,NCP,NCP/880/2415/0000.jpg,880,2415,312,5,0,0
1,NCP,NCP/880/2415/0001.jpg,880,2415,312,5,1,0
2,NCP,NCP/880/2415/0002.jpg,880,2415,312,5,2,0
3,NCP,NCP/880/2415/0003.jpg,880,2415,312,5,3,0
4,NCP,NCP/880/2415/0004.jpg,880,2415,312,5,4,0


In [101]:
train_df = repo.load("train_df")
train_df = fix_df(train_df)
repo.save("train_df", train_df)
train_df.head()

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num
0,NCP,NCP/880/2415/0000.jpg,880,2415,312,5,0,0
1,NCP,NCP/880/2415/0001.jpg,880,2415,312,5,1,0
2,NCP,NCP/880/2415/0002.jpg,880,2415,312,5,2,0
3,NCP,NCP/880/2415/0003.jpg,880,2415,312,5,3,0
4,NCP,NCP/880/2415/0004.jpg,880,2415,312,5,4,0


In [52]:
val_df = repo.load("val_df")
val_df = fix_df(val_df)
repo.save("val_df", val_df)
val_df.head()

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num
0,NCP,NCP/450/2055/0000.png,450,2055,34,1,0,0
1,NCP,NCP/450/2055/0001.png,450,2055,34,1,1,0
2,NCP,NCP/450/2055/0002.png,450,2055,34,1,2,0
3,NCP,NCP/450/2055/0003.png,450,2055,34,1,3,0
4,NCP,NCP/450/2055/0004.png,450,2055,34,1,4,0


In [53]:
test_df = repo.load("test_df")
test_df = fix_df(test_df)
repo.save("test_df", test_df)
test_df.head()

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num
0,NCP,NCP/86/1305/0000.png,86,1305,117,2,0,0
1,NCP,NCP/86/1305/0001.png,86,1305,117,2,1,0
2,NCP,NCP/86/1305/0002.png,86,1305,117,2,2,0
3,NCP,NCP/86/1305/0003.png,86,1305,117,2,3,0
4,NCP,NCP/86/1305/0004.png,86,1305,117,2,4,0


# Convert images into features

In [11]:
# Image side length in pixels. The value is hard-coded to 64; the lookup of the
# "tfg.training.img_size" config key is commented out on the same line.
# NOTE(review): presumably 64 is used to match the loaded model's 64x64 input - confirm.
img_size = 64#config.get_int("tfg.training.img_size")
print(f"{img_size = }")

# (height, width) tuple later passed as ``target_size`` to the image generators.
image_target_size = (img_size, img_size)
print(f"{image_target_size = }")

# Seed forwarded to the image generators.
seed = config.get_int("tfg.seed")
print(f"{seed = }")

# Number of images per generator batch.
batch_size = config.get_int("tfg.training.batch_size")
print(f"{batch_size = }")


img_size = 64
image_target_size = (64, 64)
seed = 42
batch_size = 32


In [12]:
from keras_preprocessing.image import ImageDataGenerator

test_datagen = ImageDataGenerator(
    rescale=1./255.,
)

## Train

In [13]:
train_df = repo.load("train_df")
# The rescale-only ``test_datagen`` is reused for the training images as well.
# shuffle=False matters here: the predictions are later attached to
# ``train_df`` as a new column, which assumes the generator yields images in
# dataframe row order.
train_generator = test_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=str(images_input_dir),
    x_col="file",
    y_col="label",
    batch_size=batch_size,
    seed=seed,
    shuffle=False,
    class_mode="categorical",
    target_size=image_target_size)

Found 331286 validated image filenames belonging to 3 classes.


In [14]:
train_generator.class_indices

{'CP': 0, 'NCP': 1, 'Normal': 2}

In [15]:
class_to_idx = train_generator.class_indices
idx_to_class = { v: k for k, v in class_to_idx.items() }

In [16]:
# Number of batches needed to cover every training image exactly once
# (ceiling division). A plain ``n // batch_size + 1`` would request one batch
# too many whenever n is an exact multiple of the batch size.
NUM_TRAIN_BATCHES = (
    train_generator.n + train_generator.batch_size - 1
) // train_generator.batch_size

In [17]:
raw_train_feats = new_model.predict(train_generator, steps=NUM_TRAIN_BATCHES, verbose=1)
raw_train_feats.shape



(331286, 16)

In [19]:
train_df["img_features"] = raw_train_feats.tolist()

In [22]:
repo.save("train_features_final", train_df)

True

In [23]:
train_feats = repo.load("train_features_final")
train_feats.head(5)

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num,img_features
0,NCP,NCP/880/2415/0000.jpg,880,2415,312,5,0,0,"[0.0, 0.0, 0.0, 0.0, 2.978132724761963, 0.0, 0..."
1,NCP,NCP/880/2415/0001.jpg,880,2415,312,5,1,0,"[0.0, 0.0, 0.0, 0.0, 2.8504579067230225, 0.0, ..."
2,NCP,NCP/880/2415/0002.jpg,880,2415,312,5,2,0,"[0.0, 0.0, 0.0, 0.0, 3.0596957206726074, 0.0, ..."
3,NCP,NCP/880/2415/0003.jpg,880,2415,312,5,3,0,"[0.0, 0.0, 0.0, 0.0, 3.2996292114257812, 0.0, ..."
4,NCP,NCP/880/2415/0004.jpg,880,2415,312,5,4,0,"[0.0, 0.0, 0.0, 0.0, 3.1869380474090576, 0.0, ..."


In [24]:
# train_generator.reset()
# raw_train_probs = model.predict(train_generator, steps=NUM_TRAIN_BATCHES, verbose=1)
# raw_train_probs.shape

In [25]:
# train_feats["img_probs"] = raw_train_probs.tolist()
# train_feats.head(5)

In [26]:
# repo.save("train_features", train_feats)

## Validation

In [54]:
val_df = repo.load("val_df")

# shuffle=False keeps the batches in dataframe row order, which the later
# row-wise assignment of the predicted features to ``val_df`` relies on.
val_generator = test_datagen.flow_from_dataframe(
    dataframe=val_df,
    directory=str(images_input_dir),
    x_col="file",
    y_col="label",
    batch_size=batch_size,
    seed=seed,
    shuffle=False,
    class_mode="categorical",
    target_size=image_target_size)

# Number of batches needed to cover every validation image exactly once
# (ceiling division). A plain ``n // batch_size + 1`` would request one batch
# too many whenever n is an exact multiple of the batch size.
NUM_VAL_BATCHES = (
    val_generator.n + val_generator.batch_size - 1
) // val_generator.batch_size

Found 39828 validated image filenames belonging to 3 classes.


In [55]:
raw_val_feats = new_model.predict(val_generator, steps=NUM_VAL_BATCHES, verbose=1)
raw_val_feats.shape



(39828, 16)

In [56]:
val_df["img_features"] = raw_val_feats.tolist()
val_df.head()

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num,img_features
0,NCP,NCP/450/2055/0000.png,450,2055,34,1,0,0,"[1.778397798538208, 2.1663880348205566, 0.0, 1..."
1,NCP,NCP/450/2055/0001.png,450,2055,34,1,1,0,"[8.283327102661133, 9.311873435974121, 0.0, 7...."
2,NCP,NCP/450/2055/0002.png,450,2055,34,1,2,0,"[1.7244181632995605, 1.9697574377059937, 0.0, ..."
3,NCP,NCP/450/2055/0003.png,450,2055,34,1,3,0,"[13.70626163482666, 15.29365062713623, 0.0, 12..."
4,NCP,NCP/450/2055/0004.png,450,2055,34,1,4,0,"[1.9659833908081055, 2.2421464920043945, 0.0, ..."


In [57]:
repo.save("val_features_final", val_df)

True

In [58]:
val_feats = repo.load("val_features_final")
val_feats.head(5)

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num,img_features
0,NCP,NCP/450/2055/0000.png,450,2055,34,1,0,0,"[1.778397798538208, 2.1663880348205566, 0.0, 1..."
1,NCP,NCP/450/2055/0001.png,450,2055,34,1,1,0,"[8.283327102661133, 9.311873435974121, 0.0, 7...."
2,NCP,NCP/450/2055/0002.png,450,2055,34,1,2,0,"[1.7244181632995605, 1.9697574377059937, 0.0, ..."
3,NCP,NCP/450/2055/0003.png,450,2055,34,1,3,0,"[13.70626163482666, 15.29365062713623, 0.0, 12..."
4,NCP,NCP/450/2055/0004.png,450,2055,34,1,4,0,"[1.9659833908081055, 2.2421464920043945, 0.0, ..."


In [32]:
# val_generator.reset()
# raw_val_probs = model.predict(val_generator, steps=NUM_VAL_BATCHES, verbose=1)
# raw_val_probs.shape

In [33]:
# val_feats["img_probs"] = raw_val_probs.tolist()
# val_feats.head(5)

In [34]:
# repo.save("val_features", val_feats)

## Test

In [59]:
test_df = repo.load("test_df")

# shuffle=False keeps the batches in dataframe row order, which the later
# row-wise assignment of the predicted features to ``test_df`` relies on.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=str(images_input_dir),
    x_col="file",
    y_col="label",
    batch_size=batch_size,
    seed=seed,
    shuffle=False,
    class_mode="categorical",
    target_size=image_target_size)

# Number of batches needed to cover every test image exactly once
# (ceiling division). A plain ``n // batch_size + 1`` would request one batch
# too many whenever n is an exact multiple of the batch size.
NUM_TEST_BATCHES = (
    test_generator.n + test_generator.batch_size - 1
) // test_generator.batch_size

Found 40415 validated image filenames belonging to 3 classes.


In [60]:
raw_test_feats = new_model.predict(test_generator, steps=NUM_TEST_BATCHES, verbose=1)
raw_test_feats.shape



(40415, 16)

In [61]:
test_df["img_features"] = raw_test_feats.tolist()
repo.save("test_features_final", test_df)

True

In [62]:
test_feats = repo.load("test_features_final")
test_feats.head(5)

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num,img_features
0,NCP,NCP/86/1305/0000.png,86,1305,117,2,0,0,"[7.487580299377441, 8.385154724121094, 0.0, 6...."
1,NCP,NCP/86/1305/0001.png,86,1305,117,2,1,0,"[10.073173522949219, 11.256339073181152, 0.0, ..."
2,NCP,NCP/86/1305/0002.png,86,1305,117,2,2,0,"[14.774097442626953, 16.47423553466797, 0.0, 1..."
3,NCP,NCP/86/1305/0003.png,86,1305,117,2,3,0,"[8.890345573425293, 9.93864917755127, 0.0, 7.9..."
4,NCP,NCP/86/1305/0004.png,86,1305,117,2,4,0,"[9.855095863342285, 11.00594711303711, 0.0, 8...."


In [41]:
# test_generator.reset()
# raw_test_probs = model.predict(test_generator, steps=NUM_TEST_BATCHES, verbose=1)
# raw_test_probs.shape

In [42]:
# test_feats["img_probs"] = raw_test_probs.tolist()
# test_feats.head(5)

In [43]:
# repo.save("test_features", test_feats)

# Group features into clips

In [63]:
train_feats = repo.load("train_features_final")
val_feats = repo.load("val_features_final")
test_feats = repo.load("test_features_final")

In [64]:
train_feats.head(5)

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num,img_features
0,NCP,NCP/880/2415/0000.jpg,880,2415,312,5,0,0,"[0.0, 0.0, 0.0, 0.0, 2.978132724761963, 0.0, 0..."
1,NCP,NCP/880/2415/0001.jpg,880,2415,312,5,1,0,"[0.0, 0.0, 0.0, 0.0, 2.8504579067230225, 0.0, ..."
2,NCP,NCP/880/2415/0002.jpg,880,2415,312,5,2,0,"[0.0, 0.0, 0.0, 0.0, 3.0596957206726074, 0.0, ..."
3,NCP,NCP/880/2415/0003.jpg,880,2415,312,5,3,0,"[0.0, 0.0, 0.0, 0.0, 3.2996292114257812, 0.0, ..."
4,NCP,NCP/880/2415/0004.jpg,880,2415,312,5,4,0,"[0.0, 0.0, 0.0, 0.0, 3.1869380474090576, 0.0, ..."


In [65]:
val_feats.head(5)

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num,img_features
0,NCP,NCP/450/2055/0000.png,450,2055,34,1,0,0,"[1.778397798538208, 2.1663880348205566, 0.0, 1..."
1,NCP,NCP/450/2055/0001.png,450,2055,34,1,1,0,"[8.283327102661133, 9.311873435974121, 0.0, 7...."
2,NCP,NCP/450/2055/0002.png,450,2055,34,1,2,0,"[1.7244181632995605, 1.9697574377059937, 0.0, ..."
3,NCP,NCP/450/2055/0003.png,450,2055,34,1,3,0,"[13.70626163482666, 15.29365062713623, 0.0, 12..."
4,NCP,NCP/450/2055/0004.png,450,2055,34,1,4,0,"[1.9659833908081055, 2.2421464920043945, 0.0, ..."


In [66]:
test_feats.head(5)

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num,img_features
0,NCP,NCP/86/1305/0000.png,86,1305,117,2,0,0,"[7.487580299377441, 8.385154724121094, 0.0, 6...."
1,NCP,NCP/86/1305/0001.png,86,1305,117,2,1,0,"[10.073173522949219, 11.256339073181152, 0.0, ..."
2,NCP,NCP/86/1305/0002.png,86,1305,117,2,2,0,"[14.774097442626953, 16.47423553466797, 0.0, 1..."
3,NCP,NCP/86/1305/0003.png,86,1305,117,2,3,0,"[8.890345573425293, 9.93864917755127, 0.0, 7.9..."
4,NCP,NCP/86/1305/0004.png,86,1305,117,2,4,0,"[9.855095863342285, 11.00594711303711, 0.0, 8...."


In [48]:
def agg_features(series):
    """Return the second element of each pair, ordered by the first element.

    Args:
        series: iterable of indexable pairs, e.g. ``(seq_num, features)``.

    Returns:
        A list with the second element of every pair, sorted ascending by the
        first element. Pairs with equal first elements keep their input order.
    """
    ordered_pairs = list(series)
    # Sort on the first element only, so the payloads are never compared.
    ordered_pairs.sort(key=lambda pair: pair[0])
    return list(map(lambda pair: pair[1], ordered_pairs))

def agg_probs(series):
    """Return the second element of each pair, ordered by the first element.

    Args:
        series: iterable of indexable pairs, e.g. ``(seq_num, probs)``.

    Returns:
        A list with the second element of every pair, sorted ascending by the
        first element. Pairs with equal first elements keep their input order.
    """
    ordered_pairs = list(series)
    # Sort on the first element only, so the payloads are never compared.
    ordered_pairs.sort(key=lambda pair: pair[0])
    return list(map(lambda pair: pair[1], ordered_pairs))

In [69]:
def make_clips(df):
    """Collapse a per-image feature frame into one row per clip.

    Rows are grouped by ``patient_id``, ``scan_id``, ``label``, ``n_slice`` and
    ``clip_num``. Each output row carries the smallest and largest ``file``
    value of its group (``clip_start_file`` / ``clip_end_file``) and the
    group's ``img_features`` entries ordered by ``seq_num`` (``seq_features``).

    The input frame is not modified: the helper column of
    ``(seq_num, img_features)`` pairs is added to a copy of ``df``.

    Args:
        df: pandas DataFrame with the grouping columns plus ``file``,
            ``seq_num`` and ``img_features``.

    Returns:
        A new pandas DataFrame with one row per clip.
    """
    # Pair every feature vector with its sequence number so the aggregation
    # can order the vectors within a clip. ``assign`` returns a new frame, so
    # the caller's DataFrame does not gain a stray ``seq_features`` column.
    paired = df.assign(seq_features=list(zip(df.seq_num, df.img_features)))

    df_clips = (
        paired
        .groupby(by=["patient_id", "scan_id", "label", "n_slice", "clip_num"])
        .agg({
            "file": ["min", "max"],
            "seq_features": [agg_features],
        })
        .reset_index()
    )

    # Flatten the two-level column index left by the aggregation, e.g.
    # ("file", "min") -> "file_min" and ("patient_id", "") -> "patient_id".
    df_clips.columns = df_clips.columns.map(lambda t: '_'.join(t).rstrip("_"))

    return df_clips.rename(columns={
        "seq_features_agg_features": "seq_features",
        "file_min": "clip_start_file",
        "file_max": "clip_end_file",
    })

In [70]:
# train_feats = repo.load("train_features")
train_clips = make_clips(train_feats)
train_clips.head()

Unnamed: 0,patient_id,scan_id,label,n_slice,clip_num,clip_start_file,clip_end_file,seq_features
0,0,3131,CP,285,0,CP/0/3131/0000.png,CP/0/3131/0069.png,"[[0.6799156665802002, 0.7049319744110107, 0.28..."
1,0,3131,CP,285,1,CP/0/3131/0070.png,CP/0/3131/0139.png,"[[0.6799156665802002, 0.7049319744110107, 0.28..."
2,0,3131,CP,285,2,CP/0/3131/0140.png,CP/0/3131/0209.png,"[[0.46724724769592285, 0.48773252964019775, 0...."
3,0,3131,CP,285,3,CP/0/3131/0210.png,CP/0/3131/0279.png,"[[0.722501277923584, 0.7558450698852539, 0.082..."
4,0,3131,CP,285,4,CP/0/3131/0280.png,CP/0/3131/0284.png,"[[0.6799156665802002, 0.7049319744110107, 0.28..."


In [12]:
train_clips.shape

(6117, 9)

In [71]:
repo.save("train_clips_final", train_clips)

True

In [72]:
#val_feats = repo.load("val_features")
val_clips = make_clips(val_feats)
val_clips.head()

Unnamed: 0,patient_id,scan_id,label,n_slice,clip_num,clip_start_file,clip_end_file,seq_features
0,4,3505,CP,298,0,CP/4/3505/0000.png,CP/4/3505/0069.png,"[[0.662039041519165, 0.6864839792251587, 0.292..."
1,4,3505,CP,298,1,CP/4/3505/0070.png,CP/4/3505/0139.png,"[[0.6640641689300537, 0.6885329484939575, 0.29..."
2,4,3505,CP,298,2,CP/4/3505/0140.png,CP/4/3505/0209.png,"[[0.6527369022369385, 0.6770719289779663, 0.29..."
3,4,3505,CP,298,3,CP/4/3505/0210.png,CP/4/3505/0279.png,"[[0.6671477556228638, 0.6916530132293701, 0.29..."
4,4,3505,CP,298,4,CP/4/3505/0280.png,CP/4/3505/0297.png,"[[0.66166752576828, 0.6861080527305603, 0.2922..."


In [15]:
val_clips.shape

(744, 9)

In [73]:
repo.save("val_clips_final", val_clips)

True

In [74]:
# test_feats = repo.load("test_features_final")
test_clips = make_clips(test_feats)
test_clips.head()

Unnamed: 0,patient_id,scan_id,label,n_slice,clip_num,clip_start_file,clip_end_file,seq_features
0,1,3143,CP,300,0,CP/1/3143/0000.png,CP/1/3143/0069.png,"[[0.6799156665802002, 0.7049319744110107, 0.28..."
1,1,3143,CP,300,1,CP/1/3143/0070.png,CP/1/3143/0139.png,"[[0.6799156665802002, 0.7049319744110107, 0.28..."
2,1,3143,CP,300,2,CP/1/3143/0140.png,CP/1/3143/0209.png,"[[0.6799156665802002, 0.7049319744110107, 0.28..."
3,1,3143,CP,300,3,CP/1/3143/0210.png,CP/1/3143/0279.png,"[[0.6889759302139282, 0.7156689167022705, 0.26..."
4,1,3143,CP,300,4,CP/1/3143/0280.png,CP/1/3143/0299.png,"[[0.6758857369422913, 0.7005356550216675, 0.28..."


In [18]:
test_clips.shape

(752, 9)

In [75]:
repo.save("test_clips_final", test_clips)

True