In [11]:
import os
import pydicom
import matplotlib.pyplot as plt
import numpy as np
import sys
import pickle
import time

In [12]:
class TrainPatient:
    """A patient that has a segmentation available (training case).

    Attributes:
        name: the patient identifier passed as ``p_id``.
        dcm:  whatever the caller passes as ``dcm_obj`` (stored as-is).
        mask: whatever the caller passes as ``mask_obj`` (stored as-is).
    """

    def __init__(self, p_id, dcm_obj, mask_obj):
        # Attribute names are part of the pickled representation; keep them stable.
        self.name, self.dcm, self.mask = p_id, dcm_obj, mask_obj

class TestPatient:
    """A patient without a segmentation (test case).

    Attributes:
        name: the patient identifier passed as ``p_id``.
        dcm:  whatever the caller passes as ``dcm_obj`` (stored as-is).
    """

    def __init__(self, p_id, dcm_obj):
        # Attribute names are part of the pickled representation; keep them stable.
        self.name, self.dcm = p_id, dcm_obj

In [13]:
# Each patient should contain the FLAIR, axial T1 and T2, and post T1 scans
# patient instance
#     |___ name
#     |___ scans (dict)
#          |___ axial t1 - dcms
#          |___ axial t2 - dcms
#          |___ flair    - dcms
#          |___ ax post  - dcms

# Original Data Directory Hierarchy
# patient_name
#     |__ date and scan dir
#         |_ type
#             |_ dcms
#         |_ type
#             |_ dcms
#         |_ type
#             |_ dcms



In [14]:
# Paths to dir
original_data_path = '/media/omar/New Volume/TCIA Data/GBM/Data/TCGA-GBM'
segmented_data_path = '/home/omar/Desktop/DropboxData/Dropbox-GBM/TCGA-GBM'

# Testing dump on windows
# dump_folder = '/media/omar/New Volume/TCIA Data/pickle dump'

# Key to alternates for solving issue with classifying original data
# K1: Storing names of folders chosen that contain dcm files   <- what this cell does
# K2: Storing pixel array of all dcm files                     <- not done (too much memory)
# K3: Taking only one dcm                                      <- not done

# os.chdir(dump_folder)

# A series folder is kept when its name contains ALL substrings of at least one
# of these groups (case-sensitive, exactly the spellings seen in the data set).
WANTED_SERIES_KEYWORDS = (
    ("AX", "T2"), ("Axial", "T2"),
    ("AX", "T1"), ("AXIAL", "T1"),
    ("AX", "POST"), ("Ax", "POST"), ("ax", "post"), ("AX", "Post"), ("Ax", "Post"),
    ("AX", "FLAIR"), ("ax", "flair"), ("AXIAL", "FLAIR"),
    ("AxFlair",), ("AX T2 FLAIR",), ("AX T1 FLAIR",),
)


def is_wanted_series(series_name):
    """Return True if ``series_name`` matches one of WANTED_SERIES_KEYWORDS."""
    return any(all(word in series_name for word in words)
               for words in WANTED_SERIES_KEYWORDS)


def log_faulty_dcms(log_file, patient_name, study_dir, series_name, series_path, dcm_names):
    """Try to decode the pixel data of every DICOM in a series and log failures.

    Args:
        log_file:     open text file that receives one line per faulty DICOM.
        patient_name: patient folder name (used in the log line only).
        study_dir:    date-and-scan folder name (used in the log line only).
        series_name:  series/view folder name (used in the log line only).
        series_path:  path of the folder that holds the DICOM files.
        dcm_names:    file names inside ``series_path`` to check.

    The decoded arrays are discarded immediately; only decodability is checked,
    so memory use stays bounded by a single slice.
    """
    for dcm_name in dcm_names:
        ds = pydicom.dcmread(os.path.join(series_path, dcm_name))
        try:
            ds.pixel_array
        except Exception:
            # `Exception` (not a bare except) so that Ctrl-C / kernel interrupt
            # still stops the run instead of being logged as a faulty file.
            log = "Patient name: %s - Scan File: %s - View: %s - Faulty DCM: %s" % (
                patient_name, study_dir, series_name, dcm_name)
            log_file.write("%s\n" % log)


# List of files in original and segmented data
original_dir = os.listdir(original_data_path)
segmented_dir = os.listdir(segmented_data_path)
segmented_patients = set(segmented_dir)     # O(1) membership test per patient

# Array to withold data that would be stored in the pickle files
train_patients = []
test_patients = []

start = time.time()
# Some datafiles do not have PixelData, Log of these cases are kept for reference.
# `with` guarantees the logs are flushed and closed even if the run is interrupted.
with open('TrainErrorLog.txt', 'w') as train_log, open('TestErrorLog.txt', 'w') as test_log:
    for patient_name in original_dir:
        scans_path = os.path.join(original_data_path, patient_name)    # path/patient_name

        # Testing Classifier: patient has no segmentation
        if patient_name not in segmented_patients:
            patient_scans = []                                          # K1
            for study_dir in os.listdir(scans_path):                    # date and scan dirs
                image_list = os.path.join(scans_path, study_dir)
                for series_name in os.listdir(image_list):              # type
                    if not is_wanted_series(series_name):
                        continue
                    series_path = os.path.join(image_list, series_name)
                    dcm_names = os.listdir(series_path)                 # dcms
                    log_faulty_dcms(test_log, patient_name, study_dir,
                                    series_name, series_path, dcm_names)
                    # Record the folder once (not once per slice), and only
                    # when it actually contains files.
                    if dcm_names:
                        patient_scans.append(series_name)
            test_patients.append(TestPatient(patient_name, patient_scans))

        # Training Classifier: patient has segmentations under <patient>/DSO
        else:
            original_scans = []
            segmentation_scans = {}
            seg_dcm_path = os.path.join(segmented_data_path, patient_name, "DSO")
            seg_dcm_files = os.listdir(seg_dcm_path)
            for study_dir in os.listdir(scans_path):
                image_list = os.path.join(scans_path, study_dir)
                for series_name in os.listdir(image_list):
                    # A segmentation belongs to a series when the last five
                    # characters of both names (before any ".dcm") agree.
                    series_suffix = series_name.split(".dcm")[0][-5:]
                    for seg_k in seg_dcm_files:
                        if "seg_corrected" not in seg_k:
                            continue
                        if seg_k.split(".dcm")[0][-5:] != series_suffix:
                            continue
                        seg_ds = pydicom.dcmread(os.path.join(seg_dcm_path, seg_k))
                        segmentation_scans[seg_k] = seg_ds.pixel_array
                        series_path = os.path.join(image_list, series_name)
                        log_faulty_dcms(train_log, patient_name, study_dir, series_name,
                                        series_path, os.listdir(series_path))
                        original_scans.append(series_name)
            train_patients.append(
                TrainPatient(patient_name, original_scans, segmentation_scans))

# Pickle files are opened only now, so an interrupted run does not leave
# truncated/empty pickles behind.
with open("Test.pickle", "wb") as test:
    pickle.dump(test_patients, test)
with open("Train.pickle", "wb") as train:
    pickle.dump(train_patients, train)

end = time.time()
print("Execution time for classifying test cases: ", end - start)

KeyboardInterrupt: 

In [5]:
# Load the patient lists written by the classification cell.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by
# this notebook. Unpickling also requires the TrainPatient / TestPatient
# classes to be defined in the current kernel.
with open("Train.pickle", "rb") as load_train:
    train_data = pickle.load(load_train)
with open("Test.pickle", "rb") as load_test:
    test_data = pickle.load(load_test)


print("TRAIN DATA \n-----------------------------------", len(train_data))
for i in train_data:
    print ("Name: ", i.name)
    print ("DCMs: ", i.dcm)
#     print ("MASK: ", i.mask)
    print("__________________\n")
    
print("TEST DATA \n-----------------------------------", len(test_data))
# Iterate the test patients here (the original looped over train_data again).
for i in test_data:
    print ("Name: ", i.name)
    print ("DCMs: ", i.dcm)
    print("__________________\n")

    
# Notes:
# """
# original data issues:
# 1- Some dcms don't have a pixel array - resolved with try/except + detailed log kept in a text file
# 2- Storing all dcms per scan in the object - UNRESOLVED due to kernel dying
# """

TRAIN DATA 
----------------------------------- 29
Name:  TCGA-02-0006
DCMs:  ['3-AX T2 FSE-27597', '4-AX FLAIR-23922', '5-AX T1-62419', '6-AX T1 POST-26504']
__________________

Name:  TCGA-02-0009
DCMs:  ['3-AX T2 FSE-01423', '4-AX FLAIR-31290', '5-AX T1-38458', '6-AX T1 POST-70502']
__________________

Name:  TCGA-02-0046
DCMs:  ['3-AX T2 FSE-71371', '4-AX FLAIR-45181', '5-AX T1-73320', '6-AX T1 POST-76642']
__________________

Name:  TCGA-02-0054
DCMs:  ['4-AX T2 FSE-25529', '5-AX FLAIR-89982', '6-AX T1-67057', '9-AX T1 POST-33611']
__________________

Name:  TCGA-02-0075
DCMs:  ['10-FSE T2-35736', '11-T2 FLAIR-76096', '14-FSPGR 3D-90164', '17-Ax SE T1 Post-08723']
__________________

Name:  TCGA-02-0085
DCMs:  ['3-AX T2 FSE-87118', '4-AX FLAIR-14351', '5-AX T1-84008', '6-AX T1 POST-23335']
__________________

Name:  TCGA-02-0086
DCMs:  ['11-AX T1 POST-86265', '3-AX T2 FSE-43139', '4-AX FLAIR-08895', '6-AX T1-94367']
__________________

Name:  TCGA-02-0106
DCMs:  ['3-AX T2 FSE-0337