Tests for retrieving data from PACS
--

Imports
--

In [None]:
import logging
import configparser
import pandas as pd
from collections import namedtuple

from datetime import datetime as dt

from IPython.core import display as ICD

from pydicom.dataset import Dataset

from scripts.run_all import run_all
from scripts.retrieve_data_from_PACS import *

#from pynetdicom import debug_logger
#debug_logger()

# do not truncate the content of DataFrame cells when displaying them; None is the
# documented "no limit" value (passing -1 has been deprecated since pandas 1.0)
pd.set_option('display.max_colwidth', None)
# set the level of pynetdicom module's logger to ERROR, to avoid any logs
logging.getLogger('pynetdicom').setLevel(logging.ERROR)

# IPython magics: load (or reload, when the cell is re-run) the "autoreload" extension
# and select its mode 2
# NOTE(review): mode 2 is expected to re-import all modules before each execution, so
# that edits to the project's scripts are picked up without restarting the kernel -
# confirm against the IPython autoreload documentation
%load_ext autoreload
%reload_ext autoreload
%autoreload 2

Initialize the "config" object
--

In [None]:
# build the configuration object that is passed to every retrieval helper below;
# it is later indexed as config['main']['start_date']
config = run_all()

Find all 'PT' and 'NM' studies for a day
--

In [None]:
# find the 'PT' and 'NM' studies for the configured start date
df_studies = find_studies_for_day(config, config['main']['start_date'], ['PT', 'NM'])

# filter out irrelevant studies
# - keep only the studies whose patient ID is purely numeric; the pattern is a raw
#   string so that '\d' reaches the regex engine instead of being an invalid string
#   escape, and na=False makes a missing patient ID count as "no match" instead of
#   producing a non-boolean mask
df_studies = df_studies[df_studies['Patient ID'].str.match(r'^\d+$', na=False)]
# - drop the studies whose description is exactly 'EXTRINSEQUE'
df_studies = df_studies[~df_studies['Study Description'].isin(['EXTRINSEQUE'])]
# re-number the rows from 0 so that they can be addressed by position afterwards
df_studies.reset_index(drop=True, inplace=True)

df_studies

Get all series for the found studies and get their time ranges
--

In [None]:
# institution names (lower-cased, spaces removed) whose studies are processed
accepted_inst_names = ['centrehospitalieruniversitairevaudois', 'medecinenucleairechuvlausanne',
                        'radiologiechuv', 'petctchuv']

# make sure the result columns exist even if every single study ends up being skipped
for result_column in ('machines', 'time_ranges', 'overall_time_range'):
    if result_column not in df_studies.columns:
        df_studies[result_column] = None

# row labels of the studies that could not be processed; they are removed after the
# loop so that the following cells only see studies with complete time information
skipped_studies = []

# go through each study and find its series
logging.info('Going through {} studie(s)'.format(len(df_studies)))
for i_study in range(len(df_studies)):
    logging.debug('DataFrame row:\n' + str(df_studies.loc[i_study, :]))
    df_series = find_series_for_study(config, df_studies.loc[i_study, :])

    # nothing to process if the study has no series at all
    if df_series is None or len(df_series) == 0:
        logging.warning('Skipping study because no series were found')
        skipped_studies.append(i_study)
        continue

    # filter for the institution name (taken from the first series of the study)
    inst_name = df_series['Institution Name'].iloc[0].lower().replace(' ', '')
    if inst_name not in accepted_inst_names:
        logging.warning('Skipping study because it is not from CHUV (institution name: "{}")'.format(inst_name))
        # actually skip the study, as announced by the log message
        skipped_studies.append(i_study)
        continue

    # go through each series and find relevant information by fetching its image(s)
    logging.info('Going through {} series'.format(len(df_series)))
    found_series_info = False
    for i_series in df_series.index:
        logging.debug('Series: ' + str(df_series.loc[i_series, :]))
        row_info = fetch_info_for_series(config, df_series.loc[i_series, :])

        # abort processing for this series if there is no data
        if row_info is None:
            logging.error('Skipping series {}: no data found.'.format(df_series.loc[i_series, 'Series Instance UID']))
            continue

        # copy the relevant parameters into the series DataFrame
        df_series.loc[i_series, 'start_time'] = row_info['start_time']
        df_series.loc[i_series, 'end_time'] = row_info['end_time']
        df_series.loc[i_series, 'machine'] = row_info['machine']
        found_series_info = True

    # without any fetched information the time/machine columns do not even exist
    if not found_series_info:
        logging.warning('Skipping study because none of its series returned any data')
        skipped_studies.append(i_study)
        continue

    # remove redundant series
    df_series = prunes_series_by_time_overlap(df_series)

    # create time ranges from the start/end times; iterate over the values themselves
    # so that this does not depend on the row labels left after the pruning
    time_ranges = ['{}-{}'.format(start_time, end_time)
                   for start_time, end_time in zip(df_series['start_time'], df_series['end_time'])]

    # propagate back the time range and machine name information
    df_studies.loc[i_study, 'machines'] = ','.join(set(df_series['machine']))
    df_studies.loc[i_study, 'time_ranges'] = ','.join(time_ranges)
    df_studies.loc[i_study, 'overall_time_range'] = '{}-{}'.format(
        df_series.iloc[0]['start_time'], df_series.iloc[-1]['end_time'])

# remove the skipped studies and re-number the remaining rows from 0
if skipped_studies:
    df_studies.drop(skipped_studies, inplace=True)
    df_studies.reset_index(drop=True, inplace=True)

Save the data
--

In [None]:
# store the studies in a pickle file named after the configured start date
studies_pickle_name = config['main']['start_date'] + '.pickle'
df_studies.to_pickle(studies_pickle_name)

Further process the studies
--

In [None]:
# keep only the columns of interest and give them short names
short_column_names = {'Study Date': 'date', 'Study Description': 'descr',
                      'Patient ID': 'pid', 'machines': 'machine'}
df = df_studies[list(short_column_names)].rename(columns=short_column_names)

# a study starts at its "Study Time" and ends at the end of its overall time range,
# i.e. at the part found after the '-' separator
df['start_time'] = df_studies['Study Time']
overall_time_ranges = df_studies['overall_time_range']
df['end_time'] = overall_time_ranges.apply(lambda time_range: time_range.split('-')[1])
df

Rename the machines to have some consensus
--

In [None]:
# consensus names: any machine entry containing one of them (case-insensitive)
# is replaced by that name
machine_names = ['Vision 600', 'Discovery 690', 'Millennium MPR', 'Intevo 16', 'Biograph20', 'Encore2']
for consensus_name in machine_names:
    name_pattern = '.*' + consensus_name + '.*'
    matching_rows = df['machine'].str.match(name_pattern, case=False)
    n_matching = matching_rows.sum()
    if n_matching > 0:
        logging.info('Found {} rows matching the name "{}":'.format(n_matching, consensus_name))
    df.loc[matching_rows, 'machine'] = consensus_name

# replace the "Encore2" machine name to "Intevo", since it is the same machine,
# and take it out of the list of consensus names
is_encore = df['machine'] == 'Encore2'
df.loc[is_encore, 'machine'] = 'Intevo 16'
machine_names.remove('Encore2')

df

Rename the descriptions to have some consensus
--

In [None]:
# consensus description -> pattern searched in the normalized description
# (lower-cased, with the characters '-', '_', '^', ' ', '(' and ')' removed)
description_patterns = {'FDG Corps Entier': 'fdgcorpsentier', 'FDG Tronc': 'fdgtronc', 'Rb82 Coeur': 'rb82coeur',
                       'FDG Abdomen TAP Veineux Corps Entier': 'abdomen1fdgtapveineuxpetcorpsentierflowadult',
                       'Scintigraphie OctreoScan': 'scintioctreoscan'}
for descr, pattern in description_patterns.items():
    # regex=True is given explicitly: the character class must be interpreted as a
    # regular expression, which is no longer the default of str.replace in pandas >= 2.0
    normalized_descr = df['descr'].str.lower().str.replace('[-_^ ()]', '', regex=True)
    matching_rows = normalized_descr.str.match('.*' + pattern + '.*', case=False)
    n_matching = matching_rows.sum()
    if n_matching > 0:
        logging.info('Found {} rows matching the name "{}":'.format(n_matching, descr))
    # replace every matching description by its consensus name
    df.loc[matching_rows, 'descr'] = descr

df

Check for overlap for each machine
--

In [None]:
# run the overlap pruning separately on the rows of each machine and write the
# result back into the same rows
# NOTE(review): rows of a machine that the pruning does not return are presumably
# left in df as missing values rather than being removed - confirm
for machine in machine_names:
    logging.info('Checking overlap for machine "{}"'.format(machine))
    machine_rows = df['machine'] == machine
    df_machine = df[machine_rows]
    df[machine_rows] = prunes_series_by_time_overlap(df_machine)
df

Save processed data
--

In [None]:
# store the processed studies in a pickle file named after the configured start date
processed_pickle_name = config['main']['start_date'] + '_processed.pickle'
df.to_pickle(processed_pickle_name)