# Notebook Initialization

In [1]:
# %load nb_init.py

from pathlib import Path
import pandas as pd

# Project root is taken to be the parent of the current working directory.
base_dir = Path.cwd().parent

# Top-level project folders
config_dir = base_dir.joinpath("config")
data_dir = base_dir.joinpath("data")
docs_dir = base_dir.joinpath("docs")
figures_dir = docs_dir.joinpath("figures")
models_dir = base_dir.joinpath("models")
logs_dir = base_dir.joinpath("logs")

# Data sub-folders
images_input_dir = data_dir.joinpath("COVID19")
preprocessed_dir = data_dir.joinpath("preprocessed")
output_dir = data_dir.joinpath("output")

# Directories used to train the CNN (image by image)
cnn_data_dir = data_dir.joinpath("modelling", "cnn")
cnn_train_dir = cnn_data_dir.joinpath("train")
cnn_test_dir = cnn_data_dir.joinpath("test")

# Input and intermediate files
metadata_file = images_input_dir.joinpath("metadata.csv")
labels_file = images_input_dir.joinpath("unzip_filenames.csv")
preprocessed_labels_file = preprocessed_dir.joinpath("labels.parquet")

feature_extractor_model_file = models_dir.joinpath("feature_extractor.tf")

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

config_file = config_dir / "tfg.conf"

from pyhocon import ConfigFactory
config = None

def load_config():
    """Parse the file at ``config_file`` with pyhocon and return the result."""
    parsed_config = ConfigFactory.parse_file(config_file)
    return parsed_config

config = load_config()
    
import sys

# Make the project's "src" folder importable, without registering it twice
_src_path = str(base_dir / "src")
if _src_path not in sys.path:
    sys.path.append(_src_path)

%load_ext autoreload

%autoreload 2

In [2]:
spark

In [3]:
from tfg import DataRepository

repo = DataRepository(config=config, base_data_path=data_dir)

# Load model

In [5]:
from tensorflow.keras.models import load_model
cnn_model_suffix = "06"
model_file = models_dir / f"feature_extractor_{cnn_model_suffix}.tf"
model = load_model(str(model_file))

model.summary()

Model: "covid_classifier"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_images (InputLayer)    [(None, 128, 128, 3)]     0         
_________________________________________________________________
conv2d_01 (Conv2D)           (None, 128, 128, 6)       168       
_________________________________________________________________
maxpool2d_01 (MaxPooling2D)  (None, 64, 64, 6)         0         
_________________________________________________________________
conv2d_02 (Conv2D)           (None, 64, 64, 12)        660       
_________________________________________________________________
maxpool2d_02 (MaxPooling2D)  (None, 32, 32, 12)        0         
_________________________________________________________________
conv2d_03 (Conv2D)           (None, 32, 32, 24)        2616      
_________________________________________________________________
maxpool2d_03 (MaxPooling2D)  (None, 16, 16, 24)   

# Load datasets

In [7]:
train_df = repo.load("train_df")
train_df.head(5)

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num
0,NCP,NCP/880/2415/0000.jpg,880,2415,312,5,0,0
1,NCP,NCP/880/2415/0001.jpg,880,2415,312,5,1,0
2,NCP,NCP/880/2415/0002.jpg,880,2415,312,5,2,0
3,NCP,NCP/880/2415/0003.jpg,880,2415,312,5,3,0
4,NCP,NCP/880/2415/0004.jpg,880,2415,312,5,4,0


It seems that the `num_clips`, `seq_num` and `clip_num` columns are not correct, so we'll recompute them.

In [142]:
from pyspark.sql import Window

images_per_clip = config.get_int("tfg.training.images_per_clip")

# One partition per (patient, scan); rows inside it are ordered by file name
scan_window = (
    Window
    .partitionBy("patient_id", "scan_id")
    .orderBy("file")
)

In [143]:
def fix_df(pandas_df):
    """Recompute the clip bookkeeping columns of an image-level dataframe.

    The frame is converted to a Spark dataframe, three columns are
    (re)written, and the result is converted back to pandas:

    * ``seq_num``: zero-based row number inside ``scan_window``.
    * ``num_clips``: ``ceil(n_slice / images_per_clip)``.
    * ``clip_num``: ``floor(seq_num / images_per_clip)``, using the freshly
      computed ``seq_num``.

    Relies on the notebook-level ``spark``, ``scan_window`` and
    ``images_per_clip``.
    """
    spark_df = spark.createDataFrame(pandas_df)

    # Position of each image inside its window, starting at 0
    fixed_df = spark_df.withColumn("seq_num", F.row_number().over(scan_window) - 1)
    fixed_df = fixed_df.withColumn("num_clips", F.ceil(F.col("n_slice") / images_per_clip))
    fixed_df = fixed_df.withColumn("clip_num", F.floor(F.col("seq_num") / images_per_clip))

    return fixed_df.toPandas()

In [101]:
# Reload the stored train split, recompute its clip columns and write it back
train_df = fix_df(repo.load("train_df"))
repo.save("train_df", train_df)
train_df.head()

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num
0,NCP,NCP/880/2415/0000.jpg,880,2415,312,5,0,0
1,NCP,NCP/880/2415/0001.jpg,880,2415,312,5,1,0
2,NCP,NCP/880/2415/0002.jpg,880,2415,312,5,2,0
3,NCP,NCP/880/2415/0003.jpg,880,2415,312,5,3,0
4,NCP,NCP/880/2415/0004.jpg,880,2415,312,5,4,0


In [None]:
# Reload the stored validation split, recompute its clip columns and write it back
val_df = fix_df(repo.load("val_df"))
repo.save("val_df", val_df)
val_df.head()

In [115]:
# Reload the stored test split, recompute its clip columns and write it back
test_df = fix_df(repo.load("test_df"))
repo.save("test_df", test_df)
test_df.head()

True

# Convert images into features

In [8]:
# Image size, random seed and batch size all come from the project configuration
img_size = config.get_int("tfg.training.img_size")
image_target_size = (img_size,) * 2  # square target size
seed = config.get_int("tfg.seed")
batch_size = config.get_int("tfg.training.batch_size")

In [9]:
from keras_preprocessing.image import ImageDataGenerator

test_datagen = ImageDataGenerator(
    rescale=1./255.,
)

## Train

In [10]:
train_df = repo.load("train_df")

# File paths in the "file" column are given relative to images_input_dir;
# shuffling is disabled for this generator.
train_generator = test_datagen.flow_from_dataframe(
    train_df,
    directory=str(images_input_dir),
    x_col="file",
    y_col="label",
    target_size=image_target_size,
    class_mode="categorical",
    batch_size=batch_size,
    shuffle=False,
    seed=seed,
)

Found 331286 validated image filenames belonging to 3 classes.


In [11]:
train_generator.class_indices

{'CP': 0, 'NCP': 1, 'Normal': 2}

In [14]:
# Class name -> index, as assigned by the generator
class_to_idx = train_generator.class_indices
# Reverse lookup: index -> class name
idx_to_class = dict(zip(class_to_idx.values(), class_to_idx.keys()))

In [13]:
# Number of batches needed to cover every image exactly once (ceiling division).
# `n // batch_size + 1` would ask for one batch too many whenever n is an exact
# multiple of the batch size.
NUM_TRAIN_BATCHES = (
    (train_generator.n + train_generator.batch_size - 1) // train_generator.batch_size
)

In [15]:
# NOTE(review): this cell predicts with `model`, while the validation and test
# cells predict with `new_model` — confirm which model is intended here.
raw_train_feats = model.predict(train_generator, steps=NUM_TRAIN_BATCHES, verbose=1)
raw_train_feats.shape



(320, 3)

In [17]:
# NOTE(review): the predictions are stored under "img_probs" here, whereas the
# validation/test cells and make_clips() use an "img_features" column — confirm
# the intended column name.
train_df["img_probs"] = raw_train_feats.tolist()

[[0.006345381494611502, 0.2938346266746521, 0.6998199820518494],
 [0.003827716689556837, 0.3122704327106476, 0.683901846408844],
 [0.005725017748773098, 0.38320884108543396, 0.6110661625862122],
 [0.00022382025781553239, 0.29181888699531555, 0.7079572677612305],
 [3.348174141137861e-05, 0.18306158483028412, 0.8169049024581909],
 [0.00010949972056550905, 0.26848945021629333, 0.7314010262489319],
 [4.633751086657867e-05, 0.19355522096157074, 0.8063984513282776],
 [2.391957059444394e-05, 0.18498347699642181, 0.8149926066398621],
 [1.1948675819439813e-05, 0.19143489003181458, 0.808553159236908],
 [1.1003174194001986e-07, 0.10247980058193207, 0.897520124912262],
 [1.0893384569499176e-05, 0.16673532128334045, 0.8332537412643433],
 [3.7249768070068967e-07, 0.13027869164943695, 0.8697209358215332],
 [4.7297056880779564e-05, 0.23403868079185486, 0.7659140825271606],
 [6.372048915181949e-07, 0.13505439460277557, 0.8649449348449707],
 [5.162765955901705e-05, 0.2951548099517822, 0.7047935724258423

In [75]:
repo.save("train_features", train_df)

True

In [166]:
train_feats = repo.load("train_features")
train_feats.head(5)

Unnamed: 0,file,label,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num,img_features
0,NCP/880/2415/0000.jpg,NCP,880,2415,312,5,0,0,"[0.0, 5.270750045776367, 0.0, 5.07415866851806..."
1,NCP/880/2415/0001.jpg,NCP,880,2415,312,5,1,0,"[0.0, 2.8091320991516113, 0.0, 2.9504299163818..."
2,NCP/880/2415/0002.jpg,NCP,880,2415,312,5,2,0,"[0.0, 3.616379737854004, 0.19390609860420227, ..."
3,NCP/880/2415/0003.jpg,NCP,880,2415,312,5,3,0,"[0.0, 2.734466314315796, 0.0, 2.81957268714904..."
4,NCP/880/2415/0004.jpg,NCP,880,2415,312,5,4,0,"[0.0, 9.06471061706543, 0.0, 7.898210525512695..."


## Validation

In [48]:
val_df = repo.load("val_df")

val_generator = test_datagen.flow_from_dataframe(
    dataframe=val_df,
    directory=str(images_input_dir),
    x_col="file",
    y_col="label",
    batch_size=batch_size,
    seed=seed,
    shuffle=False,
    class_mode="categorical",
    target_size=image_target_size)

# Number of batches needed to cover every image exactly once (ceiling division).
# `n // batch_size + 1` would ask for one batch too many whenever n is an exact
# multiple of the batch size.
NUM_VAL_BATCHES = (
    (val_generator.n + val_generator.batch_size - 1) // val_generator.batch_size
)

# NOTE(review): `new_model` is not defined in the visible part of this notebook
# (only `model` is loaded) — confirm it exists before running this cell.
raw_val_feats = new_model.predict(val_generator, steps=NUM_VAL_BATCHES, verbose=1)
raw_val_feats.shape

Found 39828 validated image filenames belonging to 3 classes.


(39828, 32)

In [132]:
val_df["img_features"] = raw_val_feats.tolist()
val_df.head()

Unnamed: 0,file,img_features
0,CP/1075/3118/0543.jpg,"[0.5893998742103577, 0.0, 3.105539083480835, 0..."
1,CP/1075/3118/0174.jpg,"[2.171123504638672, 0.0, 5.15280818939209, 0.0..."
2,CP/1075/3118/0130.jpg,"[2.2265334129333496, 0.0, 6.776073455810547, 0..."
3,CP/1075/3118/0525.jpg,"[1.0836143493652344, 0.0, 3.375173330307007, 0..."
4,CP/1075/3118/0152.jpg,"[2.058112621307373, 0.0, 5.975271224975586, 0...."


In [134]:
repo.save("val_features", val_df)

True

## Test

In [49]:
test_df = repo.load("test_df")

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=str(images_input_dir),
    x_col="file",
    y_col="label",
    batch_size=batch_size,
    seed=seed,
    shuffle=False,
    class_mode="categorical",
    target_size=image_target_size)

# Number of batches needed to cover every image exactly once (ceiling division).
# `n // batch_size + 1` would ask for one batch too many whenever n is an exact
# multiple of the batch size.
NUM_TEST_BATCHES = (
    (test_generator.n + test_generator.batch_size - 1) // test_generator.batch_size
)

# NOTE(review): `new_model` is not defined in the visible part of this notebook
# (only `model` is loaded) — confirm it exists before running this cell.
raw_test_feats = new_model.predict(test_generator, steps=NUM_TEST_BATCHES, verbose=1)
raw_test_feats.shape

Found 40415 validated image filenames belonging to 3 classes.


(40415, 32)

In [71]:
test_df["img_features"] = raw_test_feats.tolist()
repo.save("test_features", test_df)

test_feats = repo.load("test_features")
test_feats.head(5)

Unnamed: 0,label,file,patient_id,scan_id,n_slice,num_clips,seq_num,clip_num,img_features
0,CP,CP/1/3143/0275.png,1,3143,300,1,0,0,"[0.5805813670158386, 0.0, 1.381737232208252, 0..."
1,CP,CP/1/3143/0064.png,1,3143,300,1,1,0,"[0.4474046230316162, 0.0, 1.0005760192871094, ..."
2,CP,CP/1/3143/0083.png,1,3143,300,1,2,0,"[0.157572939991951, 0.0, 1.0315451622009277, 0..."
3,CP,CP/1/3143/0160.png,1,3143,300,1,3,0,"[0.4971274733543396, 0.0, 1.757103681564331, 0..."
4,CP,CP/1/3143/0286.png,1,3143,300,1,4,0,"[0.0, 0.35277533531188965, 2.1408023834228516,..."


# Group features into clips

In [5]:
def agg_features(series):
    """Return the second element of each pair, ordered by the first element.

    ``series`` is an iterable of indexable pairs (in this notebook,
    ``(seq_num, img_features)`` tuples). The pairs are sorted on their first
    element and only the second element of each pair is kept.
    """
    ordered_pairs = list(series)
    ordered_pairs.sort(key=lambda pair: pair[0])

    return [pair[1] for pair in ordered_pairs]

In [6]:
def make_clips(df):
    """Group image-level features into one row per clip.

    Rows are grouped by ``patient_id``, ``scan_id``, ``label``, ``n_slice``
    and ``clip_num``. For every group the result holds:

    * ``clip_start_file`` / ``clip_end_file``: the minimum and maximum of the
      ``file`` column.
    * ``seq_features``: the group's ``img_features`` values as a list ordered
      by ``seq_num`` (see ``agg_features``).

    The input dataframe is not modified; a new dataframe is returned.
    """
    # Pair every feature vector with its sequence number so the aggregation can
    # order them. `assign` returns a copy, so the caller's frame does not gain
    # a "seq_features" column as a side effect.
    with_seq = df.assign(seq_features=list(zip(df.seq_num, df.img_features)))

    df_clips = (
        with_seq
        .groupby(by=["patient_id", "scan_id", "label", "n_slice", "clip_num"])
        .agg({
            'file': ["min", "max"],
            'seq_features': [agg_features]
        })
        .reset_index()
    )

    # Takes care of the multi-index after the groupby
    df_clips.columns = df_clips.columns.map(lambda t: '_'.join(t).rstrip("_"))

    df_clips = df_clips.rename(columns={
        "seq_features_agg_features": "seq_features",
        "file_min": "clip_start_file",
        "file_max": "clip_end_file",
    })

    return df_clips

In [7]:
train_feats = repo.load("train_features")
train_clips = make_clips(train_feats)
train_clips.head()

Unnamed: 0,patient_id,scan_id,label,n_slice,clip_num,clip_start_file,clip_end_file,seq_features
0,0,3131,CP,285,0,CP/0/3131/0000.png,CP/0/3131/0069.png,"[[0.0, 0.0, 2.9882235527038574, 0.0, 0.0, 0.0,..."
1,0,3131,CP,285,1,CP/0/3131/0070.png,CP/0/3131/0139.png,"[[0.0, 0.015588469803333282, 2.548190832138061..."
2,0,3131,CP,285,2,CP/0/3131/0140.png,CP/0/3131/0209.png,"[[0.3391307592391968, 0.016530275344848633, 2...."
3,0,3131,CP,285,3,CP/0/3131/0210.png,CP/0/3131/0279.png,"[[0.29484760761260986, 0.0, 1.7729129791259766..."
4,0,3131,CP,285,4,CP/0/3131/0280.png,CP/0/3131/0284.png,"[[2.112011671066284, 0.0, 3.815093755722046, 0..."


In [8]:
train_clips.shape

(6117, 8)

In [9]:
repo.save("train_clips", train_clips)

True

In [10]:
val_feats = repo.load("val_features")
val_clips = make_clips(val_feats)
val_clips.head()

Unnamed: 0,patient_id,scan_id,label,n_slice,clip_num,clip_start_file,clip_end_file,seq_features
0,4,3505,CP,298,0,CP/4/3505/0000.png,CP/4/3505/0069.png,"[[2.613647699356079, 0.0, 0.9890813827514648, ..."
1,4,3505,CP,298,1,CP/4/3505/0070.png,CP/4/3505/0139.png,"[[3.2934000492095947, 0.0, 5.15690279006958, 0..."
2,4,3505,CP,298,2,CP/4/3505/0140.png,CP/4/3505/0209.png,"[[3.833109140396118, 0.0, 5.286610126495361, 0..."
3,4,3505,CP,298,3,CP/4/3505/0210.png,CP/4/3505/0279.png,"[[3.43273663520813, 0.0, 0.621675968170166, 0...."
4,4,3505,CP,298,4,CP/4/3505/0280.png,CP/4/3505/0297.png,"[[4.024286270141602, 0.0, 1.0903337001800537, ..."


In [11]:
val_clips.shape

(744, 8)

In [12]:
repo.save("val_clips", val_clips)

True

In [13]:
test_feats = repo.load("test_features")
test_clips = make_clips(test_feats)
test_clips.head()

Unnamed: 0,patient_id,scan_id,label,n_slice,clip_num,clip_start_file,clip_end_file,seq_features
0,1,3143,CP,300,0,CP/1/3143/0000.png,CP/1/3143/0069.png,"[[0.7901433110237122, 0.0, 0.4899705648422241,..."
1,1,3143,CP,300,1,CP/1/3143/0070.png,CP/1/3143/0139.png,"[[0.39713576436042786, 0.0, 0.7319959402084351..."
2,1,3143,CP,300,2,CP/1/3143/0140.png,CP/1/3143/0209.png,"[[0.655151903629303, 0.0, 1.253217101097107, 0..."
3,1,3143,CP,300,3,CP/1/3143/0210.png,CP/1/3143/0279.png,"[[1.5254734754562378, 0.0, 3.0512397289276123,..."
4,1,3143,CP,300,4,CP/1/3143/0280.png,CP/1/3143/0299.png,"[[0.0, 0.09542667120695114, 1.2658686637878418..."


In [14]:
test_clips.shape

(752, 8)

In [15]:
repo.save("test_clips", test_clips)

True