### Import Libraries and install packages

In [9]:
!pip install -U dicom

Collecting dicom
  Downloading https://files.pythonhosted.org/packages/a0/51/6c3134fb6f33bb674b25929b8091a921d5ceedd571d79844ce5b244fab1d/dicom-0.9.9.post1.tar.gz (423kB)
Building wheels for collected packages: dicom
  Running setup.py bdist_wheel for dicom: started
  Running setup.py bdist_wheel for dicom: finished with status 'done'
  Stored in directory: C:\Users\Ayala\AppData\Local\pip\Cache\wheels\4e\d2\a3\ca1790a3223c07110ce515272de55b890484fa39b82a3d040f
Successfully built dicom
Installing collected packages: dicom
Successfully installed dicom-0.9.9.post1


distributed 1.21.8 requires msgpack, which is not installed.
grin 1.2.1 requires argparse>=1.1, which is not installed.


In [11]:
import dicom
import os
import numpy as np
from matplotlib import pyplot, cm
import tarfile
import urllib
import pandas

### Get Files

In [12]:
url = 'https://s3.amazonaws.com/viz_data/DM_TH.tgz' # Import file from URL to notebook location 
urllib.urlretrieve(url, './DM_TH.tgz')  

('./DM_TH.tgz', <httplib.HTTPMessage instance at 0x0000000008744088>)

In [13]:
TarPath = "./DM_TH.tgz" # Extract tar file in the same location - get all the DICOM files
tar = tarfile.open(TarPath)
tar.extractall()
tar.close()

In [14]:
DicomPath = "./"
DcomFilesList = []  # Create an empty list
for dirName, subdirList, fileList in os.walk(DicomPath):
    for filename in fileList:
        if ".dcm" in filename.lower():  # Add to list only if the file is DICOM
            DcomFilesList.append(os.path.join(dirName,filename))

### Read and understand data

In [63]:
# Save one file as a sample record
RefDs = dicom.read_file(DcomFilesList[0])

In [64]:
RefDs # view sample record

(0008, 0000) Group Length                        UL: 956
(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'AXIAL']
(0008, 0012) Instance Creation Date              DA: '20150120'
(0008, 0013) Instance Creation Time              TM: '163239'
(0008, 0016) SOP Class UID                       UI: CT Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.2.840.113619.2.337.3.2831186181.442.1421722000.429.1
(0008, 0020) Study Date                          DA: '20000101'
(0008, 0021) Series Date                         DA: ''
(0008, 0022) Acquisition Date                    DA: ''
(0008, 0023) Content Date                        DA: ''
(0008, 0030) Study Time                          TM: '163150'
(0008, 0031) Series Time                         TM: '163222'
(0008, 0032) Acquisition Time                    TM: '163234.571806'
(0008, 0033) Content Time                        TM: '163239'


### Transfer data into dataframe

In [71]:
column_names = ['PatientName','StudyInstanceUID','SeriesInstanceUID','PatientAge','PatientSex','InstitutionName'] # dataframe headers

In [94]:
n = len(DcomFilesList); # length of loop
DcmArray = np.empty((0, 6)) # init array

# This loop inserts each DICOM file into one array row, extracting relevant info only using the data_element function
for i in range(0,n):
    
    RefDs = dicom.read_file(DcomFilesList[i])
    PatientName = RefDs.data_element("PatientName")
    PatientName = PatientName.value
    StudyInstanceUID = RefDs.data_element("StudyInstanceUID") 
    StudyInstanceUID = StudyInstanceUID.value
    SeriesInstanceUID = RefDs.data_element("SeriesInstanceUID")  
    SeriesInstanceUID = SeriesInstanceUID.value
    PatientAge = RefDs.data_element("PatientAge")
    PatientAge = PatientAge.value
    PatientSex = RefDs.data_element("PatientSex")
    PatientSex = PatientSex.value
    InstitutionName = RefDs.data_element("InstitutionName")
    InstitutionName = InstitutionName.value
  
    result = [PatientName,StudyInstanceUID,SeriesInstanceUID,PatientAge,PatientSex,InstitutionName]
    DcmArray = np.append(DcmArray,[result], axis=0) # adding file #i data into the array
   
    i=i+1

df = pd.DataFrame(DcmArray, columns=column_names)
print df.head()


                                         PatientName  \
0  1.2.840.113619.2.337.3.2831186181.442.14217220...   
1  1.2.840.113619.2.337.3.2831186181.442.14217220...   
2  1.2.840.113619.2.337.3.2831186181.442.14217220...   
3  1.2.840.113619.2.337.3.2831186181.442.14217220...   
4  1.2.840.113619.2.337.3.2831186181.442.14217220...   

                                    StudyInstanceUID  \
0  1.2.840.113619.2.337.3.2831186181.442.14217220...   
1  1.2.840.113619.2.337.3.2831186181.442.14217220...   
2  1.2.840.113619.2.337.3.2831186181.442.14217220...   
3  1.2.840.113619.2.337.3.2831186181.442.14217220...   
4  1.2.840.113619.2.337.3.2831186181.442.14217220...   

                                   SeriesInstanceUID PatientAge PatientSex  \
0  1.2.840.113619.2.337.3.2831186181.442.14217220...       011Y          M   
1  1.2.840.113619.2.337.3.2831186181.442.14217220...       011Y          M   
2  1.2.840.113619.2.337.3.2831186181.442.14217220...       011Y          M   
3  1.2.840.113

### Question #1: generate a list of patients, their age and sex

In [104]:
PatientList = df[['PatientName','PatientAge','PatientSex']] # Subsetting the dataframe
PatientList.drop_duplicates(subset=None, keep="first", inplace=True) # Dropping duplicates
PatientList.to_csv('PatientList.csv') # Save as CSV

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Question #2: how many different hospitals do the data come from?

In [106]:
df['InstitutionName'].nunique() # count unique records in the InstitutionName column

3

### Question #3: explore the following DICOM tags, and try to explain what they mean, and the differences and relationships between them. Feel free to use appropriate visualizations as necessary. 

In [187]:
column_names = ['PatientName','StudyInstanceUID','SeriesInstanceUID','PatientAge','PatientSex','InstitutionName','Col1','Col2','Col3','Col4']

# Same array as before, adding 4 new columns
n = len(DcomFilesList);
DcmArray1 = np.empty((0, 10))
for i in range(0,n):
    
    RefDs = dicom.read_file(DcomFilesList[i])
    PatientName = RefDs.data_element("PatientName")
    PatientName = PatientName.value
    StudyInstanceUID = RefDs.data_element("StudyInstanceUID") 
    StudyInstanceUID = StudyInstanceUID.value
    SeriesInstanceUID = RefDs.data_element("SeriesInstanceUID")  
    SeriesInstanceUID = SeriesInstanceUID.value
    PatientAge = RefDs.data_element("PatientAge")
    PatientAge = PatientAge.value
    PatientSex = RefDs.data_element("PatientSex")
    PatientSex = PatientSex.value
    InstitutionName = RefDs.data_element("InstitutionName")
    InstitutionName = InstitutionName.value
    if [0x0008,0x0013] in RefDs: # Saving specific tag values in meaningless column headers (since I don't know what they mean yet)
        Col1 = RefDs[0x0008,0x0013].value
    if [0x0008,0x0013] in RefDs:
        Col2 = RefDs[0x0008,0x0032].value
    if [0x0008,0x0013] in RefDs:
        Col3 = RefDs[0x0020,0x0012].value
    if [0x0008,0x0013] in RefDs:
        Col4 = RefDs[0x0020,0x0013].value
    result = [PatientName,StudyInstanceUID,SeriesInstanceUID,PatientAge,PatientSex,InstitutionName,Col1,Col2,Col3,Col4]
    DcmArray1 = np.append(DcmArray1,[result], axis=0)
    i=i+1

df1 = pd.DataFrame(DcmArray1, columns=column_names)
print df1.head()
df1.to_csv('df1.csv') # Save as CSV

                                         PatientName  \
0  1.2.840.113619.2.337.3.2831186181.442.14217220...   
1  1.2.840.113619.2.337.3.2831186181.442.14217220...   
2  1.2.840.113619.2.337.3.2831186181.442.14217220...   
3  1.2.840.113619.2.337.3.2831186181.442.14217220...   
4  1.2.840.113619.2.337.3.2831186181.442.14217220...   

                                    StudyInstanceUID  \
0  1.2.840.113619.2.337.3.2831186181.442.14217220...   
1  1.2.840.113619.2.337.3.2831186181.442.14217220...   
2  1.2.840.113619.2.337.3.2831186181.442.14217220...   
3  1.2.840.113619.2.337.3.2831186181.442.14217220...   
4  1.2.840.113619.2.337.3.2831186181.442.14217220...   

                                   SeriesInstanceUID PatientAge PatientSex  \
0  1.2.840.113619.2.337.3.2831186181.442.14217220...       011Y          M   
1  1.2.840.113619.2.337.3.2831186181.442.14217220...       011Y          M   
2  1.2.840.113619.2.337.3.2831186181.442.14217220...       011Y          M   
3  1.2.840.113

After exploring the CSV file, it seems that Col 1 and Col 2 hold date or time data - I will convert them as my next step. Col 4 is the instance number. Col 3 is not the series number, but some kind of other breakdown - possibly change between angles(?) You can see that the time columns have the same values in all instances of the same series, probably refering to the start and the end times of the series.

### Question #4: How long does a typical CT scan take? 

In [144]:
from datetime import datetime # I need this package to deal with the date and time data

In [193]:
# Calculating the min and max time for each series and then extracting the time differences so get the scan length
df2 = df1.groupby(['PatientName','SeriesInstanceUID'])['Col1'].max().reset_index()
df3 = df1.groupby(['PatientName','SeriesInstanceUID'])['Col2'].min().reset_index()
df4 = pd.concat([df2, df3], axis=1)

df4['Col1'] = pandas.to_datetime(df1['Col1'], unit='s') # convert to date and time
df4['Col2'] = pandas.to_datetime(df1['Col2'], unit='s')
df4['ExamTime'] = (df4['Col1']-df4['Col2']).dt.total_seconds() # calculate differences between the time columns to get the number of seconds the exam takes

df4

Unnamed: 0,PatientName,SeriesInstanceUID,Col1,PatientName.1,SeriesInstanceUID.1,Col2,ExamTime
0,1.2.840.113619.2.337.3.2831186181.442.14217220...,1.2.840.113619.2.337.3.2831186181.442.14217220...,1970-01-02 21:20:39,1.2.840.113619.2.337.3.2831186181.442.14217220...,1.2.840.113619.2.337.3.2831186181.442.14217220...,1970-01-02 21:20:34.571806,4.428194
1,1.2.840.113619.2.337.3.2831186181.704.14202533...,1.2.840.113619.2.337.3.2831186181.704.14202533...,1970-01-02 21:20:39,1.2.840.113619.2.337.3.2831186181.704.14202533...,1.2.840.113619.2.337.3.2831186181.704.14202533...,1970-01-02 21:20:34.571806,4.428194
2,1.2.840.113619.2.337.3.2831186181.801.14145504...,1.2.840.113619.2.337.3.2831186181.801.14145504...,1970-01-02 21:20:40,1.2.840.113619.2.337.3.2831186181.801.14145504...,1.2.840.113619.2.337.3.2831186181.801.14145504...,1970-01-02 21:20:34.571806,5.428194
3,1.2.840.113619.2.337.3.2831186181.801.14145504...,1.2.840.113619.2.337.3.2831186181.801.14145504...,1970-01-02 21:20:40,1.2.840.113619.2.337.3.2831186181.801.14145504...,1.2.840.113619.2.337.3.2831186181.801.14145504...,1970-01-02 21:20:34.571806,5.428194
4,1.3.12.2.1107.5.1.4.0.300000160805094520515000...,1.3.12.2.1107.5.1.4.0.300000160805094525000000...,1970-01-02 21:20:44,1.3.12.2.1107.5.1.4.0.300000160805094520515000...,1.3.12.2.1107.5.1.4.0.300000160805094525000000...,1970-01-02 21:20:37.877725,6.122275
5,1.3.12.2.1107.5.1.4.0.300000160820013410609000...,1.3.12.2.1107.5.1.4.0.300000160820011900390000...,1970-01-02 21:20:44,1.3.12.2.1107.5.1.4.0.300000160820013410609000...,1.3.12.2.1107.5.1.4.0.300000160820011900390000...,1970-01-02 21:20:37.877725,6.122275
6,1.3.12.2.1107.5.1.4.0.300000160820013410609000...,1.3.12.2.1107.5.1.4.0.300000160820011900390000...,1970-01-02 21:20:44,1.3.12.2.1107.5.1.4.0.300000160820013410609000...,1.3.12.2.1107.5.1.4.0.300000160820011900390000...,1970-01-02 21:20:37.877725,6.122275


In [194]:
df4['ExamTime'].mean() # calculate the mean of the ExamTime column

5.439943