# Imports

In [1]:
# IPython shell escape: set git's global core.autocrlf option to true.
!git config --global core.autocrlf true

In [2]:
import pandas as pd
import os
import SimpleITK as sitk
import numpy as np
import imageio
import matplotlib.pyplot as plt
import seaborn as sb
import pydicom
import warnings
import cv2
from itertools import islice
import random

In [3]:
# Load the patient metadata table. read_csv already returns a DataFrame, so no
# extra pd.DataFrame(...) wrapper is needed.
# NOTE: the path is relative to the current working directory; start the
# notebook from the folder that contains "data/", otherwise this raises
# FileNotFoundError.
df_patientdata = pd.read_csv("data/scd_patientdata.csv")

# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
print(df_patientdata.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'data/scd_patientdata.csv'

In [None]:
# Preview the first 50 rows of the patient metadata table.
df_patientdata.head(50)

In [None]:
# Resolve the notebook's file name against the current working directory and
# take its parent folder; the image data lives in data/SCD_IMAGES below it.
script_dir = os.path.dirname(os.path.abspath('Cardiac MRI segmentation.ipynb'))
data_folder = os.path.join(script_dir, 'data', 'SCD_IMAGES')

# Map every directory under data_folder (written with forward slashes) to the
# list of file paths it directly contains.
files = {}
for raw_dir, _, names in os.walk(data_folder):
    dirname = raw_dir.replace('\\', '/')
    # setdefault leaves an already recorded directory untouched
    files.setdefault(dirname, [os.path.join(dirname, name) for name in names])

In [None]:
# Read every file of each directory that holds more than one file into a
# pydicom dataset.
# NOTE(review): directories with zero or exactly one file are skipped -
# presumably they are not image series; confirm against the data layout.
dicom_files = []
for i, paths in enumerate(files.values()):
    # '\r' keeps the progress counter on a single console line
    print(f'Loop {i}', end='\r')
    if len(paths) > 1:
        dicom_files.extend(pydicom.dcmread(path) for path in paths)

In [None]:
# Remove the directory-to-paths mapping; its files were already read into dicom_files.
del files

In [None]:
def reproducible_shuffle(lst, seed):
    """Return a shuffled copy of *lst* whose order depends only on *seed*.

    A private ``random.Random`` instance is used, so the module-level random
    generator is not reseeded as a side effect. The resulting order is the
    same as seeding the global generator with *seed* and then calling
    ``random.shuffle``.

    Args:
        lst: Iterable of items to shuffle; it is left unmodified.
        seed: Seed for the private generator.

    Returns:
        A new list holding the items of *lst* in shuffled order.
    """
    shuffled_lst = list(lst)  # keep the original list intact
    random.Random(seed).shuffle(shuffled_lst)

    return shuffled_lst

In [None]:
# Number of DICOM datasets that were read.
len(dicom_files)

In [None]:
dicom_files=reproducible_shuffle(dicom_files, 42) # shuffle the order (fixed seed 42); the data is too ordered as loaded

In [None]:
# Take the pixel array of one sample dataset (index 222) for a quick look.
pixel_data = dicom_files[222].pixel_array

# Report the dimensions of that sample array
print("Shape of NumPy array:", pixel_data.shape)

In [None]:
# Display the sample image selected above.
plt.imshow(pixel_data, cmap='gray')  # 'gray' colormap is suitable for grayscale images
plt.show()

# Image preprocessing

### Format check
Only DICOM data was pulled into the dataframe, so this step is already done.

### Sizing

In [None]:
# Upper bound used by the sizing and normalizing loops below.
max_batch=20000 # 48327 samples are available in total; if this is very slow, set it to 10000!
batch_size = 50  # Choose a suitable batch size

In [None]:
# Resize up to `max_batch` DICOM images to 256x256 and keep each one together
# with its PatientID as a [PatientID, image] pair.
resized_data = []

# Stop at the data actually loaded, and never go past max_batch even when it
# is not a multiple of batch_size.
limit = min(max_batch, len(dicom_files))
for i in range(0, limit, batch_size):
    print(f'Processing {i}', end='\r')
    # Plain slicing jumps straight to index i; islice() on a list would
    # re-walk it from index 0 for every batch.
    for item in dicom_files[i:min(i + batch_size, limit)]:
        resized_image = cv2.resize(item.pixel_array, (256, 256))
        resized_data.append([item.PatientID, resized_image])


In [None]:
# Drop the raw DICOM datasets; the resized copies are kept in resized_data.
del dicom_files

### Normalizing data

In [None]:
# Scale every resized image by its maximum value. Images with negative values
# are first shifted so their minimum becomes 0. Each result is stored as a
# [PatientID, image] pair.
normalized_data = []

# Stop at the data actually present, and never go past max_batch even when it
# is not a multiple of batch_size.
limit = min(max_batch, len(resized_data))
for i in range(0, limit, batch_size):
    print(f'Processing {i}', end='\r')
    # Plain slicing jumps straight to index i; islice() on a list would
    # re-walk it from index 0 for every batch.
    for patient_id, image in resized_data[i:min(i + batch_size, limit)]:
        # Convert to float64 before any arithmetic: shifting in the image's
        # own integer dtype can wrap around for signed pixel data. This also
        # gives all-zero images the same dtype as every other result.
        pixel = image.astype(np.float64)
        minimum = pixel.min()
        maximum = pixel.max()

        if minimum < 0:
            pixel = pixel - minimum
            maximum -= minimum

        # An image that is entirely zero is left as zeros (no division by 0)
        if maximum != 0:
            pixel = pixel / maximum

        normalized_data.append([patient_id, pixel])


In [None]:
# Drop the resized images; the normalized copies are kept in normalized_data.
del resized_data

### Augmentation

If needed. For now, it's empty.

# Merging the data

In [None]:
# Build a two-column table: the patient identifier and its normalized image.
df_image=pd.DataFrame(normalized_data, columns=['PatientID', 'Image'])

In [None]:
# Preview the first 10 rows of the image table.
df_image.head(10)

In [None]:
# Pull both columns out of the table as NumPy arrays.
ids = df_image['PatientID'].values
images = df_image['Image'].values

In [None]:
# Save both arrays into one .npz archive under the keys 'ids' and 'images'.
# NOTE(review): each 'Image' cell holds an array, so `images` is presumably an
# object-dtype array; NumPy pickles such arrays and np.load then needs
# allow_pickle=True to read them back - confirm the loading code does this.
np.savez('data/images.npz', ids=ids, images=images)

# Exploratory Data Analysis

In [None]:
# df_eda=pd.DataFrame(df.dtypes)
# df_eda.rename(columns={0: 'type'}, inplace = True)
# df_eda.reset_index(inplace=True)

# data = df_eda["type"].value_counts()
# ax = data.plot(kind="pie", autopct='%1.1f%%', legend=True, title='Adattípusok', ylabel='', pctdistance=0.7, labeldistance=None, figsize=(5,5), fontsize=11)
# ax.legend(bbox_to_anchor=(1, 1.02), loc='upper left')
# plt.show()

# Defining input and output


### Is_Male
Flag változó

In [None]:
# Binary flag: 1 where Gender is exactly 'Male', 0 for every other value.
df_patientdata['Is_Male'] = df_patientdata['Gender'].eq('Male').astype('int64')

### Pathology
Dummy változóvá alakítás

| Szöveges Érték                       | Címke |
|--------------------------------------|-------|
| Normal                               | 0     |
| Heart failure with infarct           | 1     |
| Heart failure without infarct        | 2     |
| Hypertrophy                          | 3     |


In [None]:
dict = {'Normal': 0, 'Heart failure with infarct': 1, 'Heart failure without infarct' :2, 'Hypertrophy': 3}

In [None]:
df_patientdata['Pathology']=df_patientdata['Pathology'].map(dict)

### Age Group: 
10 évenkénti csoportokban vannak, így dummy változóként lehet használni. Nem tudhatjuk hogy a kor lineárisan befolyásolja-e a betegségek kialakulását.

In [None]:
# Bucket ages into decades: divide by 10 and convert to an integer (e.g. 47 -> 4).
df_patientdata['Age_Group'] = (df_patientdata['Age']/10).astype(int)

In [None]:
# One-hot encode the decade buckets (the first level is dropped) and attach
# the indicator columns to the patient table, matching rows by index.
ps_dummies = pd.get_dummies(df_patientdata['Age_Group'], prefix='Age_Group', drop_first=True)
df_patientdata = df_patientdata.join(ps_dummies, how='left')

In [None]:
# Remove the raw Gender/Age columns and the intermediate Age_Group bucket now
# that their encoded versions exist, together with OriginalID.
df_patientdata = df_patientdata.drop(columns=['Gender', 'Age', 'Age_Group', 'OriginalID'])

In [None]:
# Write the processed patient table to CSV without the index column.
df_patientdata.to_csv('data/df_patientdata.csv', index=False)


In [None]:
# Remove the patient table from the namespace; it was saved to CSV above.
del df_patientdata