
## Data Preprocessing


**Author(s):** Zhanxiang (Sean) Sun [@seansunn](https://github.com/seansunn)

**Contributor(s):** Harvey Mannering [@harveymannering](https://github.com/harveymannering) and Miguel Xochicale [@mxochicale](https://github.com/mxochicale)


### Introduction
This notebook filters the FETAL_PLANES_DB images by brain plane and ultrasound machine. Only images from the Trans-thalamic, Trans-cerebellum, and Trans-ventricular planes that were acquired with the Voluson E6 machine are kept.

### Running notebook
Go to repository path: `cd $HOME/repositories/budai4medtech/midl2023/notebooks`   
Open the repository in PyCharm and, in its terminal, type:
```
git checkout master # or the branch
git pull # to bring a local branch up-to-date with its remote version
```
Launch Notebook server:
```
conda activate febusisVE
jupyter notebook --browser=firefox
```
which will open your web-browser.


### Logbook
* [DATE]: Short description of the update


### References
* FETAL_PLANES_DB: Common maternal-fetal ultrasound images.    
The final dataset comprises over 12,400 images from 1,792 patients. 
https://zenodo.org/record/3904280


In [1]:
import os
import zipfile
import torch
from torchvision.transforms import Compose, Resize, ToTensor
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
# seed NumPy's global RNG so the np.random-based shuffling used for the
# train/test split later in this notebook is repeatable between runs
np.random.seed(seed=42)
%matplotlib inline


In [2]:
# read the dataset's metadata table; the csv uses ';' as the field separator
df = pd.read_csv('FETAL_PLANES_DB_data.csv', sep=';')
df.head()


Unnamed: 0,Image_name,Patient_num,Plane,Brain_plane,Operator,US_Machine,Train
0,Patient00001_Plane1_1_of_15,1,Other,Not A Brain,Other,Aloka,1
1,Patient00001_Plane1_2_of_15,1,Other,Not A Brain,Other,Aloka,1
2,Patient00001_Plane1_3_of_15,1,Other,Not A Brain,Other,Aloka,1
3,Patient00001_Plane1_4_of_15,1,Other,Not A Brain,Other,Aloka,1
4,Patient00001_Plane1_5_of_15,1,Other,Not A Brain,Other,Aloka,1


In [3]:
# keep only the rows whose 'Plane' label is 'Fetal brain'
df = df.loc[df['Plane'].eq('Fetal brain')]

# list the distinct brain-plane labels among the remaining rows
df.loc[:, 'Brain_plane'].unique()


array(['Trans-thalamic', 'Trans-cerebellum', 'Trans-ventricular', 'Other'],
      dtype=object)

In [4]:
# list the distinct ultrasound machines among the remaining rows
df.loc[:, 'US_Machine'].unique()


array(['Aloka', 'Voluson E6', 'Voluson S10', 'Other'], dtype=object)

In [5]:
# remove the rows labelled 'Other' for either the brain plane or the machine
not_other = (df['Brain_plane'] != 'Other') & (df['US_Machine'] != 'Other')

# keep necessary columns; drop=True renumbers the rows from 0 without
# materialising the old row labels as an extra 'index' column
df = df.loc[not_other, ['Image_name', 'Brain_plane', 'US_Machine']].reset_index(drop=True)
df.head()


Unnamed: 0,Image_name,Brain_plane,US_Machine
0,Patient00168_Plane3_1_of_3,Trans-thalamic,Aloka
1,Patient00168_Plane3_2_of_3,Trans-thalamic,Aloka
2,Patient00168_Plane3_3_of_3,Trans-cerebellum,Aloka
3,Patient00188_Plane3_1_of_3,Trans-thalamic,Aloka
4,Patient00188_Plane3_2_of_3,Trans-ventricular,Aloka


In [6]:
def train_test_split(lst, train_frac=0.8):
    """Randomly split a sequence into a train part and a test part.

    The items are copied before shuffling, so the caller's sequence keeps its
    original order. Shuffling uses NumPy's global RNG (np.random.shuffle);
    seed it with np.random.seed beforehand to get a repeatable split.

    Args:
        lst: list or 1-D NumPy array of items to split.
        train_frac: fraction of the items that goes to the train part.
            Defaults to 0.8, i.e. an 80-20 split.

    Returns:
        A (train, test) pair. A NumPy array input gives two arrays, any other
        sequence gives two lists. The train part holds the first
        int(len(lst) * train_frac) shuffled items, the test part the rest.

    Raises:
        ValueError: if train_frac is outside [0, 1].
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(f'train_frac must be within [0, 1], got {train_frac}')

    # shuffle a private copy: np.random.shuffle works in place and would
    # otherwise reorder the sequence owned by the caller
    items = lst.copy() if isinstance(lst, np.ndarray) else list(lst)
    np.random.shuffle(items)

    split_len = int(len(items) * train_frac)
    return items[:split_len], items[split_len:]


def get_data(df, plane, machine='Voluson E6'):
    """Return the train/test image names for one brain plane on one machine.

    Args:
        df: DataFrame with 'Brain_plane', 'US_Machine' and 'Image_name' columns.
        plane: value of 'Brain_plane' to select.
        machine: value of 'US_Machine' to select.

    Returns:
        The (train, test) pair produced by train_test_split from the
        matching 'Image_name' values.
    """
    on_plane = df['Brain_plane'] == plane
    on_machine = df['US_Machine'] == machine
    image_names = df.loc[on_plane & on_machine, 'Image_name'].to_numpy()
    return train_test_split(image_names)


In [7]:
train_list, test_list = [], []

# split every brain plane on its own: train names are pooled into one flat
# list, test names are kept as one array per plane (same order as unique())
for i in df['Brain_plane'].unique():
    plane_train, plane_test = get_data(df, i)
    test_list.append(plane_test)
    train_list.extend(plane_train)
    # the train names of this plane are also written to '<plane>.npy'
    np.save(f'{i}.npy', plane_train)


In [8]:
# pack test images to tensor for the convenience of calculating FID
# already expanded to 3 channels
def get_tensor(plane_name, img_names, size=256):
    """Stack the named images into one uint8 tensor and save it as '<plane_name>.pt'.

    Every image is read from 'Images/<name>.png', converted to single-channel
    grayscale, resized to (size, size), stored as uint8 in the 0-255 range and
    repeated along the channel axis to give 3 channels.

    Args:
        plane_name: stem of the output file name.
        img_names: iterable of image names, without directory or '.png' suffix.
        size: side length in pixels of the square output images.

    Side effects:
        Writes '<plane_name>.pt' to the current working directory and prints
        the file name together with the shape of the saved tensor.
    """
    transforms = Compose([
        Resize((size, size)),
        ToTensor()
        ])

    tensor_list = []
    for name in img_names:
        # the context manager closes the image file once its pixels are read
        with Image.open(f'Images/{name}.png') as img:
            # 'L' guarantees exactly one channel, so the repeat below always
            # ends up with 3 channels
            x = transforms(img.convert('L'))
        # ToTensor rescales 8-bit pixels to floats in [0, 1]; casting those
        # directly to uint8 truncates nearly every pixel to 0, so map the
        # values back to 0-255 before the cast
        x = (x * 255).round().to(torch.uint8)
        x = x.unsqueeze(0).repeat(1, 3, 1, 1)
        tensor_list.append(x)

    tensor_cat = torch.cat(tensor_list, 0)
    file_name = f'{plane_name}.pt'
    torch.save(tensor_cat, file_name)
    print(f'File name: {file_name}\nTensor shape: {tensor_cat.size()}\n')


In [9]:
# test_list was filled by iterating over df['Brain_plane'].unique(), so
# zipping it with the same unique() call pairs each plane with its own test
# names without hard-coding the number of planes
for plane_name, test_names in zip(df['Brain_plane'].unique(), test_list):
    get_tensor(plane_name, test_names)


File name: Trans-thalamic.pt
Tensor shape: torch.Size([215, 3, 256, 256])

File name: Trans-cerebellum.pt
Tensor shape: torch.Size([99, 3, 256, 256])

File name: Trans-ventricular.pt
Tensor shape: torch.Size([82, 3, 256, 256])



In [10]:
# zip the train images for upload purpose
with zipfile.ZipFile('data.zip', mode='w') as archive:
    for i in train_list:
        # each image is stored under its relative path 'Images/<name>.png'
        archive.write(f'Images/{i}.png')
