### Imports

In [None]:
import logging
import datetime
import numpy as np

from datetime import datetime as dt
import pandas as pd

from IPython.core.display import display, HTML

import os
# Run from the project root so that relative paths ('src', 'data/...') resolve.
# The location can be overridden with the SCHEDVISU_ROOT environment variable; the default is the original path.
os.chdir(os.environ.get('SCHEDVISU_ROOT', 'H:/Mes Documents/ServiceCivil2019/schedvisu'))
import sys
# make the project's modules importable; guarded so re-running the cell does not add duplicate entries
if 'src' not in sys.path:
    sys.path.append('src')

from main import run, load_config, get_day_range
from retrieve_data import *
from extract_data import load_transform_and_save_data_from_files
from create_report import create_report, get_report_type

# load the autoreload extension (and reload it if the cell is executed again)
%load_ext autoreload
%reload_ext autoreload
# mode 2: edits to the imported project modules are picked up without restarting the kernel
%autoreload 2

# set the width of the notebook
display(HTML("<style>.container { width:95% !important; }</style>"))

### Tests for creating a multi-page PDF

In [None]:
# `run` is imported directly with `from main import run, ...`; the module name `main` itself is never bound,
# so `main.run()` would raise a NameError.
run()

### Tests for "Dose report #5"

###### Prepare the studies and series

In [None]:
# load the series saved by a previous run
df_series = pd.read_pickle('C:/TEMP/SchedVisu_data/series.pkl')
print(df_series.columns)

# keep the series that carry a CTDIvol value (either of the two columns) and only the columns of interest
has_ctdivol = df_series['CTDIvol'].notnull() | df_series['CTDIvol_start'].notnull()
df_ctdivol = df_series.loc[has_ctdivol, ['Date', 'CTDIvol_start', 'Series Description', 'Study Description',
    'Protocol Name', 'Modality', 'Patient ID', 'ImageType']]

# overview of the distinct descriptions / protocols
display(set(df_ctdivol['Series Description']))
display(set(df_ctdivol['Protocol Name']))
display(set(df_ctdivol['Study Description']))

# show the series whose description matches neither "low" nor "Topogram".
# na=False: a missing description counts as "no match"; without it the mask contains NaN and `~` raises a TypeError.
matches_low = df_ctdivol['Series Description'].str.match('.*low.*', na=False)
matches_topogram = df_ctdivol['Series Description'].str.match('.*Topogram.*', na=False)
display(df_ctdivol[~matches_low & ~matches_topogram])

df_studies = pd.read_pickle('C:/TEMP/SchedVisu_data/studies.pkl')

### Tests for "Fetch more fields from the DICOMs #38"

###### Prepare the studies and series

In [None]:
# day and patients used for this test
day = dt(2019, 2, 27)
day_str = day.strftime('%Y%m%d')
patientID = ['2026682', '138821']

# restrict the configured date range to that single day
# NOTE(review): other cells set these dates as 'YYYY-MM-DD' strings — confirm which format load_config's consumers expect
config = load_config()
config['main']['start_date'] = config['main']['end_date'] = day_str

# PT studies of the day, limited to the selected patients
df_studies = find_studies_for_day(config, day_str, ['PT']).query('`Patient ID` in @patientID')

# all series belonging to those studies
df_series = find_series_for_studies(config, df_studies)

###### Manually fetch the info

In [None]:
# keep an untouched copy of the series so that the fetch in the next cell can be re-run from the same input
df_series_save = df_series.copy()

In [None]:
# fetch the info on a copy of the saved series; the call returns a pair
# (the second element presumably holds the series for which fetching failed — confirm in retrieve_data)
df_series, df_series_failed = fetch_info_for_series_with_batches(config, df_series_save.copy())

In [None]:
# inspect the first row of the fetched series
df_series.iloc[0]

## Tests for retrieving all data for calculating correct End Times

### Modify the fetching so that it keeps all info for calculating end times

In [None]:
# retrieve and save the data of a single day (2019-01-11); `dt` is the datetime class imported at the top
retrieve_and_save_single_day_data_from_PACS(config, dt.strptime('20190111', '%Y%m%d'))

### Run the pipeline for a single day

In [None]:
create_logger()

# limit the configured date range to one day
config = load_config()
config['main']['start_date'] = config['main']['end_date'] = '2019-01-10'

# the three pipeline stages, in order: retrieve, load/transform, report
retrieve_and_save_data_from_PACS(config)
load_transform_and_save_data_from_files(config)
create_report(config)

### Prepare for tests

In [None]:
create_logger()

# configuration restricted to the single day 2019-09-05
config = load_config()
config['main']['start_date'] = config['main']['end_date'] = '2019-09-05'

In [None]:
# retrieve and save the data for the day configured above; `dt` is the datetime class imported at the top
retrieve_and_save_single_day_data_from_PACS(config, dt(2019, 9, 5))

In [None]:
# NOTE(review): `extract_transform_and_save_data_from_files` is not among the names imported explicitly at the top
# of the notebook (only `load_transform_and_save_data_from_files` is) — confirm that the star import provides it.
df = extract_transform_and_save_data_from_files(config)
#display(df)
# temporarily raise the row limit so that up to 1000 rows are rendered
with pd.option_context("display.max_rows", 1000): display(df)

In [None]:
# NOTE(review): `df_failed` is not assigned in any cell visible above; this cell relies on it already being
# present in the kernel — confirm where it comes from.
df_failed_with_info = fetch_info_for_series(config, df_failed)

In [None]:
# stack the regular series and the ones fetched separately (columns sorted by name)
df2 = pd.concat([df, df_failed_with_info], sort=True)
# preview without duplicated series; the result is only displayed, df2 itself is left unchanged
df2.drop_duplicates(subset='Series Instance UID')

In [None]:
df_rescued_series = df_failed_with_info.copy()
# series where some information could still not be gathered (e.g. no end time or no machine)
df_failed_series = df_rescued_series[
            (df_rescued_series['Start Time'].isnull())
            | (df_rescued_series['End Time'].isnull())
            | (df_rescued_series['Machine'] == '')
            | (df_rescued_series['Institution Name'] == '')]
# exclude those series from the rescued ones
df_rescued_series = df_rescued_series.loc[~df_rescued_series.index.isin(df_failed_series.index), :]
df2 = pd.concat([df, df_rescued_series], sort=True)
# a bare expression in the middle of a cell is not rendered, so show the failed series explicitly
display(df_failed_series)
df2


### Find all studies and series

In [None]:
# configuration restricted to the single day 2019-01-10
config = load_config()
config['main']['start_date'] = config['main']['end_date'] = '2019-01-10'

# PT and NM studies of that day, narrowed down to one patient
df_studies = find_studies_for_day(config, '20190110', ['PT', 'NM'])
df_studies = df_studies.loc[df_studies['Patient ID'].eq('2370187')]

# every series belonging to the remaining studies
df_all_series = find_series_for_studies(config, df_studies)

In [None]:
# modalities that occur among the series having more than one related instance
set(df_all_series.loc[df_all_series['Number of Series Related Instances'].astype(int).gt(1), 'Modality'])

In [None]:
# Work on a subset: at most 30 series per modality, ordered by series time.
# (The former `df_series = df_all_series.copy()` was dropped: it was overwritten below before ever being used.)
df_series_subset = pd.concat(
    [df_all_series[df_all_series.Modality == modality].head(n=30) for modality in set(df_all_series.Modality)]
).sort_values('Series Time').reset_index(drop=True)
df_series = df_series_subset.copy()
df_series

In [None]:
# replace df_series with the result of fetch_info_for_series; note that re-running this cell feeds the
# already processed frame back into the function
df_series = fetch_info_for_series(config, df_series)

In [None]:
# show the series as returned by the previous cell
df_series

### Step by step before turning it into an API

In [None]:
# list of field names to extract for each modality
to_fetch_fields_ctpt = ['SeriesInstanceUID', 'PatientID', 'InstanceNumber', 'ManufacturerModelName',
    'AcquisitionTime', 'Modality', 'ActualFrameDuration']
to_fetch_fields_nm = ['SeriesInstanceUID', 'PatientID', 'InstanceNumber', 'ManufacturerModelName',
    'AcquisitionTime', 'Modality', 'ActualFrameDuration', 'NumberOfFrames', '0x00540032', '0x00540052']

# create modality specific masks of the DataFrame
df_series_ctpt = df_series[df_series['Modality'].isin(['PT', 'CT'])]
df_series_nm = df_series[df_series['Modality'] == 'NM']
display(df_series_ctpt)
display(df_series_nm)

# prepare the CT/PT queries for the first instance (first image)
query_dicts_ctpt = list(df_series_ctpt.apply(lambda row: {
    'SeriesDate': row['Series Date'],
    'PatientID': row['Patient ID'],
    'SeriesInstanceUID': row['Series Instance UID'],
    'InstanceNumber': '1'
}, axis=1))
# prepare the CT/PT queries for the last instance (last image)
df_last_frames = df_series_ctpt[df_series_ctpt['Number of Series Related Instances'] != '1']
if len(df_last_frames) > 0:
    query_dicts_ctpt.extend(
        df_last_frames.apply(lambda row: {
            'SeriesDate': row['Series Date'],
            'PatientID': row['Patient ID'],
            'SeriesInstanceUID': row['Series Instance UID'],
            'InstanceNumber': row['Number of Series Related Instances']
        }, axis=1))
# fetch the CT/PT data
logging.info('Getting CT/PT data ({} queries)'.format(len(query_dicts_ctpt)))
df_info_ctpt = get_data(config, query_dicts_ctpt, to_fetch_fields_ctpt)

# prepare the NM queries for the first instance (first image)
query_dicts_nm = list(df_series_nm.apply(lambda row: {
    'SeriesDate': row['Series Date'],
    'PatientID': row['Patient ID'],
    'SeriesInstanceUID': row['Series Instance UID']
}, axis=1))
# fetch the NM data
logging.info('Getting NM data ({} queries)'.format(len(query_dicts_nm)))
df_info_nm = get_data(config, query_dicts_nm, to_fetch_fields_nm)

In [None]:
# show both info tables in full (up to 1000 rows each), ordered by machine / patient / series
with pd.option_context("display.max_rows", 1000):
    display(df_info_ctpt.sort_values(
        ['ManufacturerModelName', 'PatientID','SeriesInstanceUID', 'AcquisitionTime', 'InstanceNumber']))
    display(df_info_nm.sort_values(
        ['ManufacturerModelName', 'PatientID', 'SeriesInstanceUID', 'Modality', 'AcquisitionTime', 'InstanceNumber']))

### Save the retrieved info DataFrames

# keep copies of the series and of the fetched info, so that the processing below can be re-run
# without querying again
df_series_save = df_series.copy()
df_info_ctpt_save = df_info_ctpt.copy()
df_info_nm_save = df_info_nm.copy()

### Manually process the info and merge it back to the series DataFrame

In [None]:
# Restore the working DataFrames from the saved copies so that the cell can be re-run from the same input.
# (The info frames used to be copied onto their own "_save" names, which restored nothing; the NM frame is
# modified in place below, so a second run would have started from already processed data.)
df_series = df_series_save.copy()
df_info_ctpt = df_info_ctpt_save.copy()
df_info_nm = df_info_nm_save.copy()


def _coalesce_merged_fields(df, fields=('Start Time', 'End Time', 'Machine')):
    """Collapse the '<field>_x' / '<field>_y' column pairs left by a merge into a single '<field>' column.

    The '_y' value (right-hand side of the merge) is used where it is not null, the '_x' value otherwise.

    Args:
        df: merged DataFrame containing both suffixed columns for every name in ``fields``.
        fields: base names of the column pairs to collapse.

    Returns:
        A new DataFrame with one column per field (appended in the order of ``fields``) and without the
        suffixed columns.
    """
    df = df.copy()
    for f in fields:
        df[f] = df[f + '_y'].where(df[f + '_y'].notnull(), df[f + '_x'])
    return df.drop(columns=[f + suffix for f in fields for suffix in ('_y', '_x')])


# Process PT/CT images
if len(df_info_ctpt) > 0:

    # get the images with a single instance
    single_instances_UIDs = df_series.loc[
        (df_series['Series Instance UID'].isin(df_info_ctpt['SeriesInstanceUID']))\
        & (df_series['Number of Series Related Instances'] == '1'), 'Series Instance UID']
    logging.info('single_instances_UIDs')
    display(single_instances_UIDs)
    # duplicate them in the info DataFrame with a large dummy instance number, so that they can be merged
    # like the others, as if there were two frames
    df_info_ctpt_single_inst = df_info_ctpt[df_info_ctpt['SeriesInstanceUID'].isin(single_instances_UIDs)].copy()
    df_info_ctpt_single_inst['InstanceNumber'] = 999999
    df_info_ctpt_extended = pd.concat([df_info_ctpt, df_info_ctpt_single_inst], sort=True)
    logging.info('df_info_ctpt_extended')
    display(df_info_ctpt_extended)

    # clean up the start times (drop everything after the first '.')
    df_info_ctpt_extended.loc[:, 'AcquisitionTime'] = df_info_ctpt_extended.loc[:, 'AcquisitionTime']\
        .apply(lambda t: str(t).split('.')[0])

    # regroup the first and last instance rows on a single row
    df_info_ctpt_merged = df_info_ctpt_extended[df_info_ctpt_extended['InstanceNumber'] == 1]\
        .merge(df_info_ctpt_extended[df_info_ctpt_extended['InstanceNumber'] > 1],
               on=['SeriesInstanceUID', 'PatientID', 'ManufacturerModelName', 'Modality'],
               suffixes=['_start', '_end'])
    logging.info('df_info_ctpt_merged')
    display(df_info_ctpt_merged)

    # rename the columns and keep the appropriate ones
    df_info_ctpt_clean = df_info_ctpt_merged.rename(columns={
            'SeriesInstanceUID': 'Series Instance UID',
            'PatientID': 'Patient ID',
            'ManufacturerModelName': 'Machine',
            'AcquisitionTime_start': 'Start Time',
            'AcquisitionTime_end': 'End Time'})\
        .drop(columns=['InstanceNumber_start', 'InstanceNumber_end'])
    logging.info('df_info_ctpt_clean')
    display(df_info_ctpt_clean)

    # swap the start and end times of the rows where the start is later than the end
    start_times = pd.to_datetime(df_info_ctpt_clean['Start Time'], format='%H%M%S')
    end_times = pd.to_datetime(df_info_ctpt_clean['End Time'], format='%H%M%S')
    inverted = start_times > end_times
    if inverted.any():
        # raw values are assigned because .loc would otherwise align on the column names and undo the swap
        df_info_ctpt_clean.loc[inverted, ['Start Time', 'End Time']] = \
            df_info_ctpt_clean.loc[inverted, ['End Time', 'Start Time']].values
    logging.info('df_info_ctpt_clean 2')
    display(df_info_ctpt_clean)

    # merge the info into the series DataFrame
    df_series = df_series.merge(df_info_ctpt_clean, on=['Patient ID', 'Series Instance UID', 'Modality'], how='outer')
    logging.info('df_series 1')
    display(df_series)

    # keep only the relevant columns
    df_series = _coalesce_merged_fields(df_series)
    logging.info('df_series post-CT')
    display(df_series)

# Process NM images
if len(df_info_nm) > 0:
    # clean up the start times (drop everything after the first '.')
    df_info_nm.loc[:, 'AcquisitionTime'] = df_info_nm.loc[:, 'AcquisitionTime']\
        .apply(lambda t: str(t).split('.')[0])
    # use the AcquisitionTime as Start Time
    df_info_nm['Start Time'] = df_info_nm['AcquisitionTime']
    # call a function to calculate the End Times
    df_info_nm['End Time'] = df_info_nm.apply(get_NM_series_end_time, axis=1)
    # rename the columns and select the appropriate ones
    df_info_nm_clean = df_info_nm.rename(columns={
            'SeriesInstanceUID': 'Series Instance UID',
            'PatientID': 'Patient ID',
            'ManufacturerModelName': 'Machine'})
    # merge the info into the series DataFrame
    df_series = df_series.merge(df_info_nm_clean, on=['Patient ID', 'Series Instance UID', 'Modality'],
        how='outer')
    # keep only the relevant columns
    df_series = _coalesce_merged_fields(df_series)
    logging.info('df_series post-NM')
    display(df_series)

# remove duplicates
df_series = df_series.drop_duplicates('Series Instance UID')

In [None]:
# compact view of the timing columns; wrapped in display() because a bare expression that is not the last
# line of a cell is not rendered
display(df_series[['Patient ID', 'Modality', 'Start Time', 'End Time']])

display(df_series)
# persist the processed series for that day
df_series.to_pickle('data/2019/2019-01/2019-01-10.pkl')

### Finish the exploration after all this manual processing

In [None]:
# load in the data
#df_series = load_data_from_files(config)
# mark the rektakes and the machine group for each series
df_series = mark_retakes(config, df_series)
df_series = mark_machine_group(config, df_series)
display(df_series[df_series['Patient ID'] == '2370187'])

df_studies = df_series.replace(np.nan, '').groupby('SUID').agg({
        'Series Date': lambda x: '/'.join(set(x)),
        'Start Time': 'min',
        'End Time': 'max',
        'Study Description': lambda x: '/'.join(set(x)),
        'Patient ID': lambda x: '/'.join(set(x)),
        'Machine Group': lambda x: '/'.join(set(x)),
        'Modality': lambda x: '/'.join(set(x)),
        'Protocol Name': lambda x: '/'.join(set(x))
    }).sort_values(['Series Date', 'Start Time', 'Machine Group', 'SUID'])\
    .rename(columns={'Series Date': 'Date'})

df = df_studies[df_studies['Patient ID'] == '2370187'].copy()
display(df)

df['Machine'] = df['Machine Group'].str.replace('NoCT', '')
df = df[df['Machine'] != 'mixed cases'].drop(columns='Machine Group')
df.sort_values('Machine')[['Patient ID', 'Machine', 'Modality', 'Start Time', 'End Time', 'Study Description']]

# Test to query all images in one query data set

### Fetch info for all "first" CT/PT images

In [None]:
# NOTE(review): `df_ctpt` is not assigned in any cell visible above; this cell relies on it already being
# present in the kernel — confirm where it comes from.
df_ctpt

In [None]:
# Query all "first" CT/PT images (InstanceNumber 1) in a single IMAGE-level dataset.
# The query is repeated while the result is empty, but only a bounded number of times: the former
# `while len(...) == 0` loop would spin forever if the query never returned anything.
max_attempts = 10
df_info_ctpt_first = []
for attempt in range(1, max_attempts + 1):
    ds = Dataset()
    ds.QueryRetrieveLevel = 'IMAGE'
    ds.SeriesDate = '20191021'
    ds.SeriesInstanceUID = list(set(df_ctpt['Series Instance UID']))
    ds.PatientID =  list(set(df_ctpt['Patient ID']))
    ds.InstanceNumber = '1'
    ds.Modality = ['CT','PT']

    # fields to fetch from the DICOM header
    to_fetch_fields = ['SeriesInstanceUID', 'PatientID', 'InstanceNumber', 'ManufacturerModelName', 'AcquisitionTime',
        'Modality', 'ImageType', 'ActualFrameDuration', 'NumberOfFrames', '0x00540032', '0x00540052']

    # find information about this series by fetching some images
    df_info_ctpt_first = get_data(config, [ds], to_fetch_fields)
    if len(df_info_ctpt_first) > 0:
        break
    logging.warning('No data returned for the first CT/PT images (attempt {}/{})'.format(attempt, max_attempts))

In [None]:
# show what the query for the first CT/PT images returned
df_info_ctpt_first

### Fetch info for all "last" CT/PT images

In [None]:
# Query the "last" CT/PT images in a single IMAGE-level dataset: the instance numbers asked for are the
# distinct instance counts of the series.
# The query is repeated while the result is empty, but only a bounded number of times: the former
# `while len(...) == 0` loop would spin forever if the query never returned anything.
max_attempts = 10
df_info_last = []
for attempt in range(1, max_attempts + 1):
    ds = Dataset()
    ds.QueryRetrieveLevel = 'IMAGE'
    ds.SeriesDate = '20191021'
    ds.SeriesInstanceUID = list(set(df_ctpt['Series Instance UID']))
    ds.PatientID =  list(set(df_ctpt['Patient ID']))
    ds.InstanceNumber = list(set(df_ctpt['Number of Series Related Instances']))
    ds.Modality = ['CT','PT']

    # fields to fetch from the DICOM header
    to_fetch_fields = ['SeriesInstanceUID', 'PatientID', 'InstanceNumber', 'ManufacturerModelName', 'AcquisitionTime',
        'Modality', 'ImageType', 'ActualFrameDuration', 'NumberOfFrames', '0x00540032', '0x00540052']

    # find information about this series by fetching some images
    df_info_last = get_data(config, [ds], to_fetch_fields)
    if len(df_info_last) > 0:
        break
    logging.warning('No data returned for the last CT/PT images (attempt {}/{})'.format(attempt, max_attempts))

In [None]:
# Keep, for every series, the row with the highest InstanceNumber (last row after sorting).
# The result has to be assigned: it used to be discarded, leaving `df_grouped` undefined for the lines below.
df_grouped = df_info_last.sort_values(by="InstanceNumber").drop_duplicates(subset=["SeriesInstanceUID"], keep="last").reset_index()
# number the rows starting from 1 instead of 0
df_grouped.index += 1
df_grouped

### Fetch info for all NM images

In [None]:
# Query the NM images of the first five entries of df_nm in a single IMAGE-level dataset.
# The query is repeated while the result is empty, but only a bounded number of times: the former
# `while len(...) == 0` loop would spin forever if the query never returned anything.
max_attempts = 10
df_info_nm = []
for attempt in range(1, max_attempts + 1):
    ds = Dataset()
    ds.QueryRetrieveLevel = 'IMAGE'
    ds.SeriesDate = '20191021'
    ds.SeriesInstanceUID = list(set(df_nm['Series Instance UID'][0:5]))
    ds.PatientID =  list(set(df_nm['Patient ID'][0:5]))
    ds.Modality = 'NM'

    # fields to fetch from the DICOM header
    to_fetch_fields = ['SeriesInstanceUID', 'PatientID', 'ManufacturerModelName', 'AcquisitionTime',
        'Modality', 'ImageType', 'ActualFrameDuration', 'NumberOfFrames', '0x00540032', '0x00540052']

    # find information about this series by fetching some images
    df_info_nm = get_data(config, [ds], to_fetch_fields)
    if len(df_info_nm) > 0:
        break
    logging.warning('No data returned for the NM images (attempt {}/{})'.format(attempt, max_attempts))

In [None]:
# show what the NM query returned
df_info_nm

###  Merge the results

In [None]:
df_merged = df_series.merge(df_info.drop(columns='Modality').rename(columns={'SeriesInstanceUID': 'Series Instance UID'}), on='Series Instance UID', how='outer')
#df_merged[['Series Date', 'Series Time', 'AcquisitionTime']]
df_merged

In [None]:
df_info = []
while len(df_info) == 0:
    ds = Dataset()
    ds.QueryRetrieveLevel = 'IMAGE'
    ds.SeriesInstanceUID = df_series['Series Instance UID']
    ds.PatientID =  df_series['Patient ID']
    ds.Modality = 'NM'

    # fields to fetch from the DICOM header
    to_fetch_fields = ['SeriesInstanceUID', 'PatientID', 'InstanceNumber', 'ManufacturerModelName', 'AcquisitionTime',
        'Modality', 'ImageType', 'ActualFrameDuration', 'NumberOfFrames', '0x00540032', '0x00540052']

    # find information about this series by fetching some images
    df_info = get_data(config, [ds], to_fetch_fields)