### Imports

In [None]:
import logging

from IPython.core.display import display, HTML

from scripts.main import *
from scripts.retrieve_data import *
from scripts.extract_data import *

# Load the autoreload extension; the extra %reload_ext keeps the cell re-runnable
# when the extension has already been loaded in this kernel.
%load_ext autoreload
%reload_ext autoreload
# Mode 2: reload all modules before executing code, so edits to the imported
# project modules are picked up without restarting the kernel.
%autoreload 2

# Inject CSS so the notebook container uses 95% of the browser width
display(HTML("<style>.container { width:95% !important; }</style>"))

### Prepare for tests

In [None]:
# Set up logging for this session.
# NOTE(review): create_logger/load_config are not defined in this notebook; presumably
# they come from the star-imported `scripts` modules — confirm.
create_logger()
# Configuration object passed to every retrieval/extraction helper below
config = load_config()

In [None]:
# Retrieve the data from the PACS and save it, driven by `config`.
# NOTE(review): presumably a long-running network call — confirm before re-running.
retrieve_and_save_data_from_PACS(config)

In [None]:
# Extract and transform the data from the saved files; the helper returns two objects:
# `df` (filtered and concatenated in later cells) and `df_count` (displayed below).
df, df_count = extract_transform_and_save_data_from_files(config)
#display(df)
display(df_count)

In [None]:
# Rows of `df` lacking a start time, an end time, a machine or an institution name
has_missing_info = (
    df['Start Time'].isnull()
    | df['End Time'].isnull()
    | df['Machine'].eq('')
    | df['Institution Name'].eq('')
)
df[has_missing_info]

### Try to figure out why some series failed

In [None]:
# `df_failed` is not defined in any cell of this notebook, so this cell raised a
# NameError on a fresh kernel. Build it from the extracted series that still lack
# a start time, an end time, a machine or an institution name.
# NOTE(review): assumes "failed" means "incomplete row of `df`" — confirm.
df_failed = df[
    df['Start Time'].isnull()
    | df['End Time'].isnull()
    | (df['Machine'] == '')
    | (df['Institution Name'] == '')]
# Try to fetch the missing information for those series
df_failed_with_info = fetch_info_for_series(config, df_failed)

In [None]:
# Stack the extracted series and the re-fetched ones; sort=True sorts the column
# axis when the two frames' columns are not already aligned.
frames_to_stack = [df, df_failed_with_info]
df2 = pd.concat(frames_to_stack, sort=True)
# Display one row per series; the de-duplicated frame is shown only, `df2` is unchanged
df2.drop_duplicates(subset='Series Instance UID')

In [None]:
# Split the re-fetched series into those that are still incomplete ("failed")
# and those for which all the information could be gathered ("rescued").
df_rescued_series = df_failed_with_info.copy()
df_failed_series = df_rescued_series[
            (df_rescued_series['Start Time'].isnull())
            | (df_rescued_series['End Time'].isnull())
            | (df_rescued_series['Machine'] == '')
            | (df_rescued_series['Institution Name'] == '')]
# exclude series where some information could still not be gathered (e.g. no end time or no machine)
df_rescued_series = df_rescued_series.loc[~df_rescued_series.index.isin(df_failed_series.index), :]
# append the rescued series to the successfully extracted ones
df2 = pd.concat([df, df_rescued_series], sort=True)
# Only the last bare expression of a cell is rendered, so use display() explicitly
# to show both frames.
display(df_failed_series)
display(df2)


### Find all studies and series

In [None]:
# Configured start date with the dashes removed, as passed to the study search
query_day = config['main']['start_date'].replace('-', '')
# Modalities to look for
query_modalities = ['PT', 'NM']
df_studies = find_studies_for_day(config, query_day, query_modalities)
# All series belonging to the studies found above
df_all_series = find_series_for_studies(config, df_studies)

In [None]:
# Work on a small sample: the first rows of every modality, ordered by series time.
# groupby().head() replaces the per-modality concat over a set (whose iteration order
# is not deterministic); the stable sort keeps ties in their original row order.
n_series_per_modality = 10
df_series_subset = (
    df_all_series
    .groupby('Modality')
    .head(n_series_per_modality)
    .sort_values('Series Time', kind='mergesort')
    .reset_index(drop=True)
)
# independent working copy used by the following cells
df_series = df_series_subset.copy()
df_series

In [None]:
# Fetch additional information for every selected series.
# NOTE(review): this rebinds `df_series` from its own previous value, so re-running the
# cell feeds already-processed rows back into fetch_info_for_series — confirm that is harmless.
df_series = fetch_info_for_series(config, df_series)

In [None]:
# Inspect the series after the info fetch
df_series

### Step by step, before turning it into an API

In [None]:
# Header fields requested for CT and PT series
to_fetch_fields_ctpt = [
    'SeriesInstanceUID',
    'PatientID',
    'InstanceNumber',
    'ManufacturerModelName',
    'AcquisitionTime',
    'Modality',
]
# NM series need the same fields plus frame information and two raw tags
# NOTE(review): 0x00540032 / 0x00540052 are presumably the Phase / Rotation
# Information Sequences — confirm against the DICOM data dictionary.
to_fetch_fields_nm = to_fetch_fields_ctpt + [
    'ActualFrameDuration',
    'NumberOfFrames',
    '0x00540032',
    '0x00540052',
]

# Split the series by modality: CT/PT on one side, NM on the other
is_ctpt_series = df_series['Modality'].isin(['PT', 'CT'])
is_nm_series = df_series['Modality'].eq('NM')
df_series_ctpt = df_series[is_ctpt_series]
df_series_nm = df_series[is_nm_series]
display(df_series_ctpt, df_series_nm)

In [None]:
def _series_query(row, instance_number=None):
    """Build the query dict identifying one series.

    Args:
        row: a row of the series DataFrame; must provide 'Series Date',
            'Patient ID' and 'Series Instance UID'.
        instance_number: when given, restricts the query to that instance
            (added under the 'InstanceNumber' key).

    Returns:
        dict: the query parameters for this series.
    """
    query = {
        'SeriesDate': row['Series Date'],
        'PatientID': row['Patient ID'],
        'SeriesInstanceUID': row['Series Instance UID'],
    }
    if instance_number is not None:
        query['InstanceNumber'] = instance_number
    return query


# Rows are iterated explicitly: list(df.apply(..., axis=1)) on an EMPTY frame yields the
# column names instead of query dicts, which would send bogus queries.

# prepare the CT/PT queries for the first instance (first image)
query_dicts_ctpt = [_series_query(row, '1') for _, row in df_series_ctpt.iterrows()]
# prepare the CT/PT queries for the last instance (last image); series with a single
# instance are skipped, their first image is also their last one
df_last_frames = df_series_ctpt[df_series_ctpt['Number of Series Related Instances'] != '1']
query_dicts_ctpt.extend(
    _series_query(row, row['Number of Series Related Instances'])
    for _, row in df_last_frames.iterrows())
# fetch the CT/PT data
logging.info('Getting CT/PT data (%d queries)', len(query_dicts_ctpt))
df_info_ctpt = get_data(config, query_dicts_ctpt, to_fetch_fields_ctpt)

# prepare the NM queries (one per series, no instance restriction)
query_dicts_nm = [_series_query(row) for _, row in df_series_nm.iterrows()]
# fetch the NM data
logging.info('Getting NM data (%d queries)', len(query_dicts_nm))
df_info_nm = get_data(config, query_dicts_nm, to_fetch_fields_nm)

In [None]:
# Snapshot the current frames so the processing cell below can be re-run from a known state
df_series_save = df_series.copy()
df_info_ctpt_save = df_info_ctpt.copy()

In [None]:
# Restore BOTH working frames from their snapshots so this cell can be re-run.
# (The original assigned `df_info_ctpt_save` to itself, so `df_info_ctpt` was never restored.)
df_series = df_series_save.copy()
df_info_ctpt = df_info_ctpt_save.copy()

# get the series made of a single instance
single_instances_UIDs = df_series.loc[
    (df_series['Series Instance UID'].isin(df_info_ctpt['SeriesInstanceUID']))
    & (df_series['Number of Series Related Instances'] == '1'), 'Series Instance UID']
# duplicate them into the info DataFrame with a large sentinel instance number, so that
# they can be merged like the other series, as if there were two frames (first and last)
df_info_ctpt_single_inst_copies = df_info_ctpt[df_info_ctpt['SeriesInstanceUID'].isin(single_instances_UIDs)].copy()
df_info_ctpt_single_inst_copies['InstanceNumber'] = 999999
df_info_ctpt_extended = pd.concat([df_info_ctpt, df_info_ctpt_single_inst_copies], sort=True)

# clean up the acquisition times: keep only the part before the first '.'
df_info_ctpt_extended.loc[:, 'AcquisitionTime'] = df_info_ctpt_extended.loc[:, 'AcquisitionTime']\
    .apply(lambda t: str(t).split('.')[0])
# regroup the first (InstanceNumber == 1) and last (InstanceNumber > 1) instance rows on a single row
df_info_ctpt_merged = df_info_ctpt_extended[df_info_ctpt_extended['InstanceNumber'] == 1]\
    .merge(df_info_ctpt_extended[df_info_ctpt_extended['InstanceNumber'] > 1],
           on=['SeriesInstanceUID', 'PatientID', 'ManufacturerModelName', 'Modality'],
           suffixes=('_start', '_end'))
# rename the columns and keep the appropriate ones
df_info_ctpt_clean = df_info_ctpt_merged.rename(columns={
        'SeriesInstanceUID': 'Series Instance UID',
        'PatientID': 'Patient ID',
        'ManufacturerModelName': 'Machine',
        'AcquisitionTime_start': 'Start Time',
        'AcquisitionTime_end': 'End Time'})\
    .drop(columns=['InstanceNumber_start', 'InstanceNumber_end'])
# merge the info into the series DataFrame
df_series = df_series.merge(df_info_ctpt_clean, on=['Patient ID', 'Series Instance UID', 'Modality'],
    how='outer')
# collapse the '_x' (already in df_series) / '_y' (freshly fetched) column pairs:
# prefer the fetched value and fall back to the existing one when it is missing
for f in ['Start Time', 'End Time', 'Machine']:
    df_series[f] = df_series[f + '_y'].where(df_series[f + '_y'].notnull(), df_series[f + '_x'])
    df_series = df_series.drop(columns=[f + '_y', f + '_x'])

In [None]:
# Keep only the part of each acquisition time that precedes the first '.'
df_info_nm.loc[:, 'AcquisitionTime'] = df_info_nm.loc[:, 'AcquisitionTime']\
    .map(lambda acq_time: str(acq_time).split('.')[0])
# The acquisition time is the start of the series
df_info_nm['Start Time'] = df_info_nm['AcquisitionTime']
# The end time is computed row by row by get_NM_series_end_time
df_info_nm['End Time'] = df_info_nm.apply(get_NM_series_end_time, axis=1)

# Rename to the column names used by the series DataFrame, then select the useful columns
nm_column_names = {
    'SeriesInstanceUID': 'Series Instance UID',
    'PatientID': 'Patient ID',
    'ManufacturerModelName': 'Machine',
}
nm_kept_columns = ['Series Instance UID', 'Patient ID', 'Modality', 'Start Time', 'End Time', 'Machine']
df_info_nm_clean = df_info_nm.rename(columns=nm_column_names)[nm_kept_columns]

# Bring the NM info into the series DataFrame
nm_merge_keys = ['Patient ID', 'Series Instance UID', 'Modality']
df_series = df_series.merge(df_info_nm_clean, on=nm_merge_keys, how='outer')

# Collapse each '_x' / '_y' column pair: take the merged-in ('_y') value and fall back
# to the pre-existing ('_x') one where it is null
for f in ['Start Time', 'End Time', 'Machine']:
    fetched_col = f + '_y'
    existing_col = f + '_x'
    df_series[f] = df_series[fetched_col].mask(df_series[fetched_col].isnull(), df_series[existing_col])
    df_series = df_series.drop(columns=[fetched_col, existing_col])

In [None]:
# Series that still lack a start time, an end time, a machine or an institution name
still_incomplete = (
    df_series['Start Time'].isnull()
    | df_series['End Time'].isnull()
    | df_series['Machine'].eq('')
    | df_series['Institution Name'].eq('')
)
dfmf = df_series[still_incomplete]
dfmf

In [None]:
# Run the info fetch once more on the series that are still incomplete, and display the result
dfmf2 = fetch_info_for_series(config, dfmf)
dfmf2

# Test: query all images with a single query dataset

### Fetch info for all "first" CT/PT images

In [None]:
# `df_ctpt` is never defined in this notebook (NameError on a fresh kernel); show the
# CT/PT series selected in the step-by-step section instead.
# NOTE(review): assumes `df_ctpt` was an earlier name for `df_series_ctpt` — confirm.
df_series_ctpt

In [None]:
# fields to fetch from the DICOM header
to_fetch_fields = ['SeriesInstanceUID', 'PatientID', 'InstanceNumber', 'ManufacturerModelName', 'AcquisitionTime',
    'Modality', 'ImageType', 'ActualFrameDuration', 'NumberOfFrames', '0x00540032', '0x00540052']

# Retry a bounded number of times: the original `while` loop never terminated when the
# query kept returning nothing.
max_attempts = 5
df_info_ctpt_first = []
for attempt in range(1, max_attempts + 1):
    # a single IMAGE-level query matching the first instance of every CT/PT series
    ds = Dataset()
    ds.QueryRetrieveLevel = 'IMAGE'
    # use the configured day (same derivation as the study search) instead of a hard-coded date
    ds.SeriesDate = config['main']['start_date'].replace('-', '')
    # `df_ctpt` is never defined in this notebook; use the CT/PT series selected in the
    # step-by-step section (NOTE(review): confirm this is the intended frame)
    ds.SeriesInstanceUID = list(set(df_series_ctpt['Series Instance UID']))
    ds.PatientID = list(set(df_series_ctpt['Patient ID']))
    ds.InstanceNumber = '1'
    ds.Modality = ['CT', 'PT']

    # find information about these series by fetching some images
    df_info_ctpt_first = get_data(config, [ds], to_fetch_fields)
    if len(df_info_ctpt_first) > 0:
        break
    logging.warning('No data for the first CT/PT instances (attempt %d/%d)', attempt, max_attempts)
else:
    raise RuntimeError('No data for the first CT/PT instances after {} attempts'.format(max_attempts))

In [None]:
# Inspect the result of the "first instance" query
df_info_ctpt_first

### Fetch info for all "last" CT/PT images

In [None]:
# fields to fetch from the DICOM header
to_fetch_fields = ['SeriesInstanceUID', 'PatientID', 'InstanceNumber', 'ManufacturerModelName', 'AcquisitionTime',
    'Modality', 'ImageType', 'ActualFrameDuration', 'NumberOfFrames', '0x00540032', '0x00540052']

# Retry a bounded number of times: the original `while` loop never terminated when the
# query kept returning nothing.
max_attempts = 5
df_info_last = []
for attempt in range(1, max_attempts + 1):
    # a single IMAGE-level query; the instance numbers are the distinct instance counts
    # of the series, so it matches (at least) the last instance of every series
    ds = Dataset()
    ds.QueryRetrieveLevel = 'IMAGE'
    # use the configured day (same derivation as the study search) instead of a hard-coded date
    ds.SeriesDate = config['main']['start_date'].replace('-', '')
    # `df_ctpt` is never defined in this notebook; use the CT/PT series selected in the
    # step-by-step section (NOTE(review): confirm this is the intended frame)
    ds.SeriesInstanceUID = list(set(df_series_ctpt['Series Instance UID']))
    ds.PatientID = list(set(df_series_ctpt['Patient ID']))
    ds.InstanceNumber = list(set(df_series_ctpt['Number of Series Related Instances']))
    ds.Modality = ['CT', 'PT']

    # find information about these series by fetching some images
    df_info_last = get_data(config, [ds], to_fetch_fields)
    if len(df_info_last) > 0:
        break
    logging.warning('No data for the last CT/PT instances (attempt %d/%d)', attempt, max_attempts)
else:
    raise RuntimeError('No data for the last CT/PT instances after {} attempts'.format(max_attempts))

In [None]:
# Keep, for every series, the row with the highest instance number.
# The result of the chain was previously discarded and `df_grouped` never assigned,
# so the next line raised a NameError.
df_grouped = (
    df_info_last
    .sort_values(by="InstanceNumber")
    .drop_duplicates(subset=["SeriesInstanceUID"], keep="last")
    .reset_index()
)
# number the rows starting from 1 for display
df_grouped.index = df_grouped.index + 1
df_grouped

### Fetch info for all NM images

In [None]:
# fields to fetch from the DICOM header
to_fetch_fields = ['SeriesInstanceUID', 'PatientID', 'ManufacturerModelName', 'AcquisitionTime',
    'Modality', 'ImageType', 'ActualFrameDuration', 'NumberOfFrames', '0x00540032', '0x00540052']

# only the first few NM series are queried in this test
n_nm_series = 5
# `df_nm` is never defined in this notebook; use the NM series selected in the
# step-by-step section (NOTE(review): confirm this is the intended frame)
df_nm_sample = df_series_nm.iloc[:n_nm_series]

# Retry a bounded number of times: the original `while` loop never terminated when the
# query kept returning nothing.
max_attempts = 5
df_info_nm = []
for attempt in range(1, max_attempts + 1):
    ds = Dataset()
    ds.QueryRetrieveLevel = 'IMAGE'
    # use the configured day (same derivation as the study search) instead of a hard-coded date
    ds.SeriesDate = config['main']['start_date'].replace('-', '')
    ds.SeriesInstanceUID = list(set(df_nm_sample['Series Instance UID']))
    ds.PatientID = list(set(df_nm_sample['Patient ID']))
    ds.Modality = 'NM'

    # find information about these series by fetching some images
    df_info_nm = get_data(config, [ds], to_fetch_fields)
    if len(df_info_nm) > 0:
        break
    logging.warning('No data for the NM series (attempt %d/%d)', attempt, max_attempts)
else:
    raise RuntimeError('No data for the NM series after {} attempts'.format(max_attempts))

In [None]:
# Inspect the result of the NM query
df_info_nm

### Merge the results

In [None]:
df_merged = df_series.merge(df_info.drop(columns='Modality').rename(columns={'SeriesInstanceUID': 'Series Instance UID'}), on='Series Instance UID', how='outer')
#df_merged[['Series Date', 'Series Time', 'AcquisitionTime']]
df_merged

In [None]:
df_info = []
while len(df_info) == 0:
    ds = Dataset()
    ds.QueryRetrieveLevel = 'IMAGE'
    ds.SeriesInstanceUID = df_series['Series Instance UID']
    ds.PatientID =  df_series['Patient ID']
    ds.Modality = 'NM'

    # fields to fetch from the DICOM header
    to_fetch_fields = ['SeriesInstanceUID', 'PatientID', 'InstanceNumber', 'ManufacturerModelName', 'AcquisitionTime',
        'Modality', 'ImageType', 'ActualFrameDuration', 'NumberOfFrames', '0x00540032', '0x00540052']

    # find information about this series by fetching some images
    df_info = get_data(config, [ds], to_fetch_fields)