# Imports

In [1]:
!git config --global core.autocrlf true

In [2]:
import pandas as pd
import os
import SimpleITK as sitk
import numpy as np
import imageio
import matplotlib.pyplot as plt
import seaborn as sb
import pydicom
import warnings
import cv2
from itertools import islice
import random
import glob

In [3]:
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Read the patient metadata table; both names are kept because later cells
# work on `df_patientdata`.
readIn = pd.read_csv("data/scd_patientdata.csv")
df_patientdata = pd.DataFrame(data=readIn)
print(df_patientdata.shape)

(45, 5)


In [4]:
# Preview only the first rows instead of dumping the whole 45-row table.
df_patientdata.head(10)

Unnamed: 0,PatientID,OriginalID,Gender,Age,Pathology
0,SCD0000101,SC-HF-I-1,Male,53,Heart failure with infarct
1,SCD0000201,SC-HF-I-2,Male,48,Heart failure with infarct
2,SCD0000301,SC-HF-I-4,Male,79,Heart failure with infarct
3,SCD0000401,SC-HF-I-5,Male,45,Heart failure with infarct
4,SCD0000501,SC-HF-I-6,Male,60,Heart failure with infarct
5,SCD0000601,SC-HF-I-7,Male,74,Heart failure with infarct
6,SCD0000701,SC-HF-I-8,Male,46,Heart failure with infarct
7,SCD0000801,SC-HF-I-9,Male,57,Heart failure with infarct
8,SCD0000901,SC-HF-I-10,Male,69,Heart failure with infarct
9,SCD0001001,SC-HF-I-11,Male,55,Heart failure with infarct


In [5]:
# Resolve the data folder relative to the current working directory.
# NOTE: os.path.abspath() of a bare file name resolves against the working
# directory, so this only points next to the notebook when the kernel was
# started in the notebook's own folder.
script_dir = os.path.dirname(os.path.abspath('Cardiac MRI segmentation.ipynb'))

# Construct the path to the image folder
data_folder = os.path.join(script_dir, 'data', 'SCD_IMAGES')

# Maps every directory below data_folder (forward-slash path) to the list of
# file paths it directly contains.
files = {}

for dirname, subdirs, filenames in os.walk(data_folder):
    # os.walk yields entries in an arbitrary, filesystem-dependent order.
    # Sorting both lists in place fixes the traversal order, so the
    # index-based splits built from this dictionary are reproducible.
    subdirs.sort()
    filenames.sort()

    # Ensure directory paths use forward slashes for consistency
    dirname = dirname.replace('\\', '/')

    # os.walk (without followlinks) visits each directory once, so no
    # duplicate-key check is needed before inserting.
    files[dirname] = [os.path.join(dirname, filename) for filename in filenames]

In [6]:
# dicom_files=[]
# i=0
# for key,value in files.items():
#     print(f'Loop {i}', end='\r')
#     if (value != []) and (len(value) != 1):
#         for item in value:
#             dicom_data=pydicom.dcmread(item)
#             dicom_files.append(dicom_data)

#     i+=1

In [7]:
# def reproducible_shuffle(lst, seed):
#     random.seed(seed)
#     shuffled_lst = lst.copy()  # Az eredeti lista megőrzése
#     random.shuffle(shuffled_lst)
    
#     return shuffled_lst

In [8]:
# files=reproducible_shuffle(files, 42) #sorrend megkeverése, az adatok túl rendezetten vannak

# Image preprocessing

In [9]:
def load(file):
    """Read every DICOM file listed in ``file`` with pydicom.

    Parameters
    ----------
    file : iterable of path-like
        Paths of the DICOM files to read (despite the singular name, this is
        a collection of paths).

    Returns
    -------
    list
        The objects returned by ``pydicom.dcmread``, in input order.
    """
    return [pydicom.dcmread(path) for path in file]

### Format check
The only data that was pulled into the dataframe is DICOM, so it's already done.

### Sizing

In [10]:
def resize(data, size=(256, 256)):
    """Resize the pixel data of each loaded DICOM dataset to a common size.

    Parameters
    ----------
    data : iterable
        Objects exposing ``pixel_array`` and ``PatientID`` attributes.
    size : tuple of int, optional
        Target size handed to ``cv2.resize`` as ``dsize``, i.e.
        ``(width, height)``. Defaults to ``(256, 256)``.
        NOTE: ``load_existing_data`` in this notebook creates its empty image
        stack as ``(0, 256, 256)``, so a different size needs a matching
        change there before the arrays can be concatenated.

    Returns
    -------
    list
        One ``[PatientID, resized_image]`` pair per input item, in input order.
    """
    return [[item.PatientID, cv2.resize(item.pixel_array, size)] for item in data]

### Normalizing data

In [11]:
def normalize(data):
    """Scale every image so that its values lie in ``[0, 1]``.

    An image with a negative minimum is first shifted so that the minimum
    becomes 0. The image is then divided by its (shifted) maximum unless that
    maximum is 0. Images with a non-negative minimum are therefore divided by
    their maximum only; their minimum is not moved to 0.

    Parameters
    ----------
    data : iterable
        Sequences whose first element is an identifier and whose second
        element is the image array (the pairs produced by ``resize``).

    Returns
    -------
    list
        One ``[identifier, normalized_image]`` pair per input item. Images
        with a non-floating dtype are returned as ``float64``.
    """
    normalized_data = []
    for item in data:
        pixel = np.asarray(item[1])

        # Work in floating point: shifting by the minimum in a narrow integer
        # dtype (e.g. int16) can overflow and wrap around silently.
        if not np.issubdtype(pixel.dtype, np.floating):
            pixel = pixel.astype(np.float64)

        minimum = pixel.min()
        maximum = pixel.max()

        if minimum < 0:
            pixel = pixel - minimum
            maximum = maximum - minimum

        # A maximum of 0 means the image is constant; leave it as all zeros.
        if maximum != 0:
            pixel = pixel / maximum

        normalized_data.append([item[0], pixel])

    return normalized_data

### Augmentation

If needed. For now, it's empty.

## Making the data

In [12]:
# Quick check of the container type produced by the directory scan.
type(files)

dict

In [13]:
def load_existing_data(npz_file):
    """Return the ids and images stored in ``npz_file``, or empty arrays.

    Parameters
    ----------
    npz_file : str
        Path of an ``.npz`` archive expected to contain the arrays ``ids``
        and ``images``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(ids, images)`` read from the archive. When the file does not exist
        or lacks either array, an empty 1-D array and an empty array of shape
        ``(0, 256, 256)`` are returned instead.
    """
    if os.path.exists(npz_file):
        # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
        # which can execute arbitrary code. Only use this on archives written
        # by this notebook.
        # The context manager closes the archive before returning, so the same
        # path can be overwritten afterwards.
        with np.load(npz_file, allow_pickle=True) as data:
            if 'ids' in data and 'images' in data:
                return data['ids'], data['images']

    return np.array([]), np.empty((0, 256, 256))

In [17]:
def convert_and_append_to_npz(dicom_folders_batch, npz_file, start_index):
    """Process the DICOM files of a batch of folders and append them to ``npz_file``.

    For every folder the ``*.dcm`` files are loaded, resized and normalized.
    The resulting ids and images are appended to whatever ``npz_file`` already
    contains, and the archive is rewritten once after the whole batch.

    Parameters
    ----------
    dicom_folders_batch : sequence of (str, list)
        ``(folder_path, file_list)`` pairs. Folders with an empty file list
        are skipped.
    npz_file : str
        Path of the ``.npz`` archive to extend (created if missing).
    start_index : int
        Index of the first folder of this batch; used for progress output only.
    """
    existing_ids, existing_images = load_existing_data(npz_file)

    # Collect the per-folder arrays and concatenate once at the end.
    # Concatenating inside the loop re-copies everything gathered so far.
    id_chunks = [existing_ids]
    image_chunks = [existing_images]

    for i, (dicom_folder, value) in enumerate(dicom_folders_batch):
        if not value:
            continue

        # Sorted so that the images are stored in a reproducible order.
        dicom_files = sorted(glob.glob(os.path.join(dicom_folder, '*.dcm')))
        if not dicom_files:
            # The folder has files but none is a .dcm. An empty array here
            # would have the wrong number of dimensions and make the image
            # concatenation fail.
            continue

        print(f'Loading:     {start_index + i}', end='\r')
        loaded_files = load(dicom_files)
        print(f'Resizing:    {start_index + i}', end='\r')
        sized_data = resize(loaded_files)
        print(f'Normalizing: {start_index + i}', end='\r')
        normalized_data = normalize(sized_data)

        id_chunks.append(np.array([item[0] for item in normalized_data]))
        image_chunks.append(np.array([item[1] for item in normalized_data]))

    print(f'Saving: {start_index} to {start_index + len(dicom_folders_batch) - 1}     ')
    np.savez(npz_file, ids=np.concatenate(id_chunks), images=np.concatenate(image_chunks))

In [19]:
# Number of directories found under the data folder (one key per directory).
len(files)

1068

In [18]:
batch_size = 100  # NOTE(review): not used by any cell visible in this notebook
split_num = 12
base_name = 'data/images'

# 'files' is the dictionary with folders as keys and lists of files as values.
# Materialize its items once instead of rebuilding the list for every split.
folder_items = list(files.items())
total_files = len(folder_items)
split_size = total_files // split_num

for j in range(split_num):
    split_name = f'{base_name}_split{j}.npz'

    # convert_and_append_to_npz appends to an existing archive, so re-running
    # this cell (e.g. after an interruption) would store finished splits twice.
    # An archive is written only after its whole split has been processed, so
    # an existing file marks a finished split. Delete it to regenerate.
    if os.path.exists(split_name):
        print(f"{j}. split already exists, skipping")
        continue

    # Calculate start and end index for the current split; the last split
    # also takes the remainder of the integer division.
    start_index = j * split_size
    end_index = (j + 1) * split_size if j < split_num - 1 else total_files

    # Extract the relevant subset of folders for the current split
    split_files = folder_items[start_index:end_index]

    convert_and_append_to_npz(split_files, split_name, start_index)

    print(f"{j}. split is done")

Saving: 0 to 99     
Saving: 100 to 105     


KeyboardInterrupt: 

# Merging the data

# Exploratory Data Analysis

In [None]:
# df_eda=pd.DataFrame(df.dtypes)
# df_eda.rename(columns={0: 'type'}, inplace = True)
# df_eda.reset_index(inplace=True)

# data = df_eda["type"].value_counts()
# ax = data.plot(kind="pie", autopct='%1.1f%%', legend=True, title='Adattípusok', ylabel='', pctdistance=0.7, labeldistance=None, figsize=(5,5), fontsize=11)
# ax.legend(bbox_to_anchor=(1, 1.02), loc='upper left')
# plt.show()

# Defining input and output


### Is_Male
Flag változó

In [None]:
# 1 for 'Male', 0 for every other value (including missing ones).
# A vectorized comparison replaces the row-by-row apply/lambda.
df_patientdata['Is_Male'] = df_patientdata['Gender'].eq('Male').astype('int64')

### Pathology
Dummy változóvá alakítás

| Szöveges Érték                       | Címke |
|--------------------------------------|-------|
| Normal                               | 0     |
| Heart failure with infarct           | 1     |
| Heart failure without infarct        | 2     |
| Hypertrophy                          | 3     |


In [None]:
# Integer label for each pathology class. The mapping is named
# `pathology_labels` rather than `dict` so the built-in `dict` type is not
# shadowed for the rest of the notebook.
pathology_labels = {'Normal': 0, 'Heart failure with infarct': 1, 'Heart failure without infarct': 2, 'Hypertrophy': 3}

# Values that are not keys of the mapping become NaN.
df_patientdata['Pathology'] = df_patientdata['Pathology'].map(pathology_labels)

### Age Group: 
10 évenkénti csoportokban vannak, így dummy változóként lehet használni. Nem tudhatjuk hogy a kor lineárisan befolyásolja-e a betegségek kialakulását.

In [None]:
# Decade of life: age divided by ten, truncated to an integer.
df_patientdata['Age_Group'] = df_patientdata['Age'].div(10).astype(int)

In [None]:
# One indicator column per age group; the first level is dropped.
ps_dummies = pd.get_dummies(data=df_patientdata['Age_Group'], prefix='Age_Group', drop_first=True)
# Attach the indicator columns to the patient table via the shared row index.
df_patientdata = pd.merge(df_patientdata, ps_dummies, how='left', left_index=True, right_index=True)

In [None]:
# Remove the raw columns that were replaced by the encoded features, plus the original ID.
df_patientdata = df_patientdata.drop(columns=['Gender', 'Age', 'Age_Group', 'OriginalID'])

In [None]:
# Persist the encoded patient table without the row index.
df_patientdata.to_csv(path_or_buf='data/df_patientdata.csv', index=False)


In [None]:
# Remove the notebook's reference to the patient table; it was written to
# 'data/df_patientdata.csv' by the preceding cell.
del df_patientdata