# 01 - Preprocessing A4C Echocardiographic Videos

This notebook performs data preprocessing for the A4C (Apical 4 Chamber) view from the HMC-QU echocardiographic dataset. It includes:

- Labeling videos as MI or non-MI based on filenames
- Stratified splitting into training, validation, and test sets
- Copying video files into split folders
- Extracting a fixed number of frames per video
- Saving extracted frames to structured directories

The output of this notebook is used as input to the model training step.


In [17]:
import os
import shutil
from sklearn.model_selection import StratifiedShuffleSplit
import myutils  

In [18]:
# Dataset root, plus the names of the raw-video tree and the extracted-frame tree.
path2data = "../data"
sub_folder = "A4C"
sub_folder_jpg = "A4C_jpg"
path2aCatgs = os.path.join(path2data, sub_folder)

# One destination directory per split, each a sibling of the raw-video folder.
train_folder, val_folder, test_folder = (
    os.path.join(path2data, split_dir)
    for split_dir in ("A4C_training", "A4C_validation", "A4C_test")
)

# Create the split directories up front; directories that already exist are left alone.
for folder in (train_folder, val_folder, test_folder):
    os.makedirs(folder, exist_ok=True)



In [19]:
# Walk the raw A4C folder and build two parallel lists: video paths and labels.
# Label convention used in this notebook: 0 = non-MI, 1 = MI.
videos, labels = [], []
for root, dirs, files in os.walk(path2aCatgs):
    # os.walk yields entries in arbitrary, filesystem-dependent order. Sorting the
    # directory names in place (which steers the top-down traversal) and the file
    # names pins the order of `videos`, so the seeded stratified split selects the
    # same files on every machine.
    dirs.sort()
    for file in sorted(files):
        if file.endswith(".avi"):
            videos.append(os.path.join(root, file))
            # A file is labelled non-MI when its lower-cased name contains the
            # letter "n"; every other .avi file is labelled MI.
            # NOTE(review): this matches "n" anywhere in the name - confirm that
            # no MI file name contains that letter.
            labels.append(0 if "n" in file.lower() else 1)



In [20]:
def _pick(items, indices):
    """Return the elements of `items` at the given positions, in index order."""
    return [items[i] for i in indices]


# First cut: hold out 20% of the videos, stratified by label; the rest is training data.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_idx, test_idx = next(sss.split(videos, labels))

train_videos, train_labels = _pick(videos, train_idx), _pick(labels, train_idx)
test_videos, test_labels = _pick(videos, test_idx), _pick(labels, test_idx)

# Second cut: split the held-out pool in half (again stratified) into the
# validation set and the final test set.
sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
val_idx, final_test_idx = next(sss_val.split(test_videos, test_labels))

val_videos, val_labels = _pick(test_videos, val_idx), _pick(test_labels, val_idx)
final_test_videos = _pick(test_videos, final_test_idx)
final_test_labels = _pick(test_labels, final_test_idx)

# Report the size of each split.
print(f"Training set: {len(train_videos)} videos")
print(f"Validation set: {len(val_videos)} videos")
print(f"Final Test set: {len(final_test_videos)} videos")



Training set: 101 videos
Validation set: 13 videos
Final Test set: 13 videos


In [21]:
def move_videos(video_list, dest_folder):
    """Copy every video in `video_list` into `dest_folder`.

    Despite the name, files are copied rather than moved, so the source
    files stay where they are.

    Args:
        video_list: Iterable of paths to the source video files.
        dest_folder: Directory that receives the copies. It is created
            (including missing parents) when it does not exist yet.
    """
    # Make the function usable on its own instead of relying on the caller
    # having created the destination beforehand.
    os.makedirs(dest_folder, exist_ok=True)
    for video in video_list:
        # Each copy keeps its original file name; a same-named file already in
        # dest_folder is overwritten.
        shutil.copy(video, os.path.join(dest_folder, os.path.basename(video)))

# Fill each split directory with copies of the videos assigned to it.
# NOTE(review): the visible cells never clear these directories, so files left
# over from an earlier run with a different split would remain - confirm the
# folders are empty before re-running.
for split_videos, split_dir in (
    (train_videos, train_folder),
    (val_videos, val_folder),
    (final_test_videos, test_folder),
):
    move_videos(split_videos, split_dir)


In [22]:
# Number of frames requested from each video.
# NOTE(review): the recorded output of this cell (1616 frames for 101 training
# videos) implies 16 frames per video, so myutils.get_frames apparently returns
# n_frames + 1 frames - confirm in myutils.
n_frames = 15

# Running total of extracted frames per split.
frame_counts = {"train": 0, "val": 0, "final_test": 0}
split_folders = {"train": train_folder, "val": val_folder, "final_test": test_folder}

for split, videos_folder in split_folders.items():
    # Frames are stored in a sibling directory whose name swaps the first
    # occurrence of sub_folder for sub_folder_jpg (e.g. A4C_training ->
    # A4C_jpg_training). Only the folder name is rewritten, so a video file
    # name that happens to contain sub_folder is left untouched.
    parent_dir, folder_name = os.path.split(videos_folder)
    frames_root = os.path.join(
        parent_dir, folder_name.replace(sub_folder, sub_folder_jpg, 1)
    )

    # Sorted listing keeps the processing order stable between runs.
    for video in sorted(os.listdir(videos_folder)):
        # Skip anything that is not a video (hidden/system files, stray folders).
        if not video.endswith(".avi"):
            continue

        video_path = os.path.join(videos_folder, video)
        frames, _ = myutils.get_frames(video_path, n_frames=n_frames)

        # One output directory per video, named after the file without its
        # extension; splitext strips only the trailing extension.
        path2store = os.path.join(frames_root, os.path.splitext(video)[0])
        os.makedirs(path2store, exist_ok=True)
        myutils.store_frames(frames, path2store)

        frame_counts[split] += len(frames)

# Keep the per-split totals available under their original names.
train_frame_count = frame_counts["train"]
val_frame_count = frame_counts["val"]
test_frame_count = frame_counts["final_test"]

print(f"Total frames in training set: {train_frame_count}")
print(f"Total frames in validation set: {val_frame_count}")
print(f"Total frames in final test set: {test_frame_count}")



Total frames in training set: 1616
Total frames in validation set: 208
Total frames in final test set: 208


In [23]:
def count_videos(folder):
    """Count the MI and non-MI ``.avi`` files located directly in ``folder``.

    A file counts as non-MI when its lower-cased name contains the letter
    "n"; every other ``.avi`` file counts as MI. Entries that do not end in
    ``.avi`` are ignored.

    NOTE(review): the "n" test matches anywhere in the file name - confirm
    that no MI file name contains that letter.

    Returns:
        A tuple ``(mi_count, non_mi_count)``.
    """
    avi_names = [entry for entry in os.listdir(folder) if entry.endswith(".avi")]
    non_mi_count = sum(1 for entry in avi_names if "n" in entry.lower())
    mi_count = len(avi_names) - non_mi_count
    return mi_count, non_mi_count

# Print the MI / non-MI balance of every split folder.
for name, path in (
    ("Training", train_folder),
    ("Validation", val_folder),
    ("Test", test_folder),
):
    mi, non_mi = count_videos(path)
    print(f"{name}: {mi} MI videos, {non_mi} non-MI videos")


Training: 60 MI videos, 41 non-MI videos
Validation: 8 MI videos, 5 non-MI videos
Test: 8 MI videos, 5 non-MI videos


In [24]:
import pickle

# Collect the video paths and labels of every split in one dictionary.
split_record = dict(
    train_videos=train_videos,
    val_videos=val_videos,
    final_test_videos=final_test_videos,
    train_labels=train_labels,
    val_labels=val_labels,
    final_test_labels=final_test_labels,
)

# Write the record to a pickle file in the current working directory.
with open("a4c_video_splits.pkl", "wb") as f:
    pickle.dump(split_record, f)

print("✅ Saved A4C video splits to a4c_video_splits.pkl")


✅ Saved A4C video splits to a4c_video_splits.pkl
