In [None]:
import logging

from IPython.core.display import display, HTML

import os
os.chdir('H:/Mes Documents/ServiceCivil2019/schedvisu')
import sys
sys.path.append('scripts')

from main import *
from retrieve_data import *
from extract_data import *
from create_report import *

from datetime import datetime as dt
import pandas as pd

%load_ext autoreload
%reload_ext autoreload
%autoreload 2

# set the width of the notebook
display(HTML("<style>.container { width:95% !important; }</style>"))

### DEBUGGING "Some studies are overlapping, although they should be split up" #19

###### Prepare the studies and series

In [None]:
# Load the configuration and restrict it to the single day under investigation.
config = load_config()

day = dt(2019, 2, 27)
day_str = day.strftime('%Y%m%d')
# the same day is used as start and end of the date range
config['main']['start_date'] = config['main']['end_date'] = day_str

# Optional debugging switch: restrict the retrieval to a few patients.
#patientIDs = ['2026682', '138821']
#config['retrieve']['debug_patient_ids'] = ','.join(patientIDs)

# Retrieve the data, then load it back as the studies and series DataFrames.
retrieve_and_save_data_from_PACS(config)
df_studies, df_series = load_transform_and_save_data_from_files(config)

# Uncomment to inspect the loaded tables.
#display(df_studies)
#display(df_series)

######  Fetch info for series

In [None]:
# Call the project helper with the current config and series table and rebind
# `df_series` to its first return value; the second return value is kept as
# `df_series_failed` (presumably the series for which fetching failed -- confirm
# in fetch_info_for_series_with_batches).
# NOTE: `df_series` is both the input and the output of this cell, so re-running
# the cell passes the already-updated frame back into the helper.
df_series, df_series_failed = fetch_info_for_series_with_batches(config, df_series)

###### Mark retakes

In [None]:
# Mark the "take" index of every series. Within a study (series sorted by start
# time), a new take starts at a series when
#   - the time difference to the previous series exceeds the configured
#     threshold, or
#   - a series of another study on the same machine group lies strictly inside
#     the time span of the study.
# The result is written to df_series['i_take'] and combined with the Study
# Instance UID into df_series['SUID'].

# get the list of Study Instance UIDs (unique() keeps the order of first
# appearance, so the processing order is the same on every run)
study_UIDs = list(df_series['Study Instance UID'].unique())
logging.info('Found {} unique study UIDs'.format(len(study_UIDs)))

FMT = '%H%M%S'
# get from the config the threshold in seconds for splitting a study in "first/second take"
study_split_thresh = int(config['extract']['n_sec_second_take_split_thresh'])
split_thresh_delta = pd.Timedelta(seconds=study_split_thresh)

# create a column to mark the "take" index of the series. It starts out empty;
# every series gets its take index assigned in the loop below.
df_series['i_take'] = None

# NOTE(review): this list is never filled or read in this cell; it is kept only
# in case another cell relies on the name.
indices_to_keep = []

# the start/end times of all the series are needed for every study when looking
# for series of other studies, so convert them only once
all_start_times = pd.to_datetime(df_series['Start Time'], format=FMT)
all_end_times = pd.to_datetime(df_series['End Time'], format=FMT)

for i_study, sUID in enumerate(study_UIDs):

    # get the series related to the current Study Instance UID and sort them
    is_current_study = df_series['Study Instance UID'] == sUID
    df_series_for_study = df_series[is_current_study].sort_values('Start Time')

    study_str = '[{:4d}/{:4d}] IPP {}, {:52}'.format(i_study, len(study_UIDs) - 1, df_series_for_study.iloc[0]['Patient ID'], sUID)
    logging.debug('Processing  {}: found {:2d} series'.format(study_str, len(df_series_for_study)))

    # there must be at least 2 series for any splitting
    if len(df_series_for_study) < 2:
        df_series.loc[df_series_for_study.index, 'i_take'] = 1
        continue

    # convert the columns to datetime format
    df_series_for_study['Start Time'] = pd.to_datetime(df_series_for_study['Start Time'], format=FMT)
    df_series_for_study['End Time'] = pd.to_datetime(df_series_for_study['End Time'], format=FMT)
    # absolute time difference between the end of the previous row and the start
    # of the current row (NaT for the first row, which has no previous row).
    # NOTE(review): because the absolute value is taken, a large overlap with
    # the previous series counts the same as a large gap -- confirm this is intended.
    df_series_for_study['time_to_prev'] = (
        df_series_for_study['End Time'].shift() - df_series_for_study['Start Time']).abs()
    # get the series where a split should be done
    df_series_split = df_series_for_study[df_series_for_study['time_to_prev'] > split_thresh_delta]
    # index labels of all the split points; a set avoids counting a series twice
    # when both criteria select it
    split_indices = set(df_series_split.index)

    # also check whether there is a series from another study inbetween our study
    machine_group = df_series_for_study.iloc[0]['Machine Group']
    is_other_study = (df_series['Study Instance UID'] != sUID) & (df_series['Machine Group'] == machine_group)
    # Series.min()/max() skip missing values, unlike the builtin min()/max()
    study_start = df_series_for_study['Start Time'].min()
    study_end = df_series_for_study['End Time'].max()
    is_inbetween = is_other_study & (all_start_times > study_start) & (all_end_times < study_end)
    n_series_inbetween = int(is_inbetween.sum())
    if n_series_inbetween > 0:
        # NOTE(review): only the earliest in-between series is considered, so at
        # most one split point is added here per study.
        inbetween_start = all_start_times[is_inbetween].min()
        df_series_after = df_series_for_study[df_series_for_study['Start Time'] > inbetween_start]
        # the in-between series may start after the last series of our study has
        # started; in that case there is no series to split at
        if len(df_series_after) > 0:
            split_indices.add(df_series_after['Start Time'].idxmin())

    # if there is no splitting indices
    if len(split_indices) == 0:
        logging.debug('  Passing   {}: no second take (max time diff: {})'
            .format(study_str, df_series_for_study['time_to_prev'].max()))
        df_series.loc[df_series_for_study.index, 'i_take'] = 1
        continue

    # there is at least one split point: increment the take index at each of them
    logging.info('  Found {} series to split'.format(len(split_indices)))
    # go through all the series in chronological order
    i_take = 1
    prev_ind = None
    for ind in df_series_for_study.index:
        if ind in split_indices:
            if prev_ind is None:
                # the first series of a study has no predecessor to split from
                logging.error('  Error at {}: trying to split at index "{}". Aborting.'
                    .format(study_str, ind))
            else:
                logging.info('  Splitting {}: split {} between {:>3}/{:>3} [T={}/{}, D={}]'
                    .format(study_str, i_take, prev_ind, ind, df_series.loc[prev_ind, 'End Time'],
                    df_series.loc[ind, 'Start Time'], df_series_for_study.loc[ind, 'time_to_prev']))
                i_take += 1
        # mark the series according to the split index (always, so that no
        # series is left without a take index)
        df_series.loc[ind, 'i_take'] = i_take
        prev_ind = ind

# create a new unique ID that includes the retake information
df_series['SUID'] = df_series['Study Instance UID'] + '_' + df_series['i_take'].astype(str)

# NOTE(review): columns are selected by position; presumably 24 and 25 are the
# 'i_take' and 'SUID' columns added above -- confirm against the table layout.
display(df_series.iloc[:, [0,1,2,3,4,5,6,7,24,25]])

###  Read in all the data files day by day and search for problems

In [None]:
# Load the data files one day at a time, collect all the series in
# `df_all_series` and the series with missing information in `df_all_problems`
# (start/end time, machine or institution name), and display the problematic
# rows of each day.

config = load_config()
# NOTE(review): the dates are written here as 'YYYY-MM-DD', whereas the
# "Prepare the studies and series" cell uses 'YYYYMMDD' -- confirm which format
# the loading helpers expect.
config['main']['start_date'] = '2019-12-02'
config['main']['end_date'] = '2019-12-02'

# get the date range from the config
start_date = dt.strptime(config['main']['start_date'], '%Y-%m-%d')
end_date = dt.strptime(config['main']['end_date'], '%Y-%m-%d')
days_range = pd.date_range(start_date, end_date)

# create the variables holding all the series for all days. The per-day frames
# are gathered in lists and concatenated once after the loop, instead of
# re-copying the accumulated frame on every iteration.
df_all_problems = None
df_all_series = None
series_per_day = []
problems_per_day = []

# go through the date range day by day
for day in days_range:

    # restrict the config to the current day (the config is left pointing at the
    # last processed day once the loop is over)
    current_day_str = day.strftime('%Y-%m-%d')
    config['main']['start_date'] = current_day_str
    config['main']['end_date'] = current_day_str

    df = load_data_from_files(config)
    # a day for which the loader returned nothing is skipped
    if df is None:
        continue
    series_per_day.append(df)

    # a series is problematic when a time, the machine or the institution is missing
    is_problem = (
        df['Start Time'].isnull()
        | df['End Time'].isnull()
        | (df['Machine'] == '')
        | df['Machine'].isnull()
        | (df['Institution Name'] == '')
        | (df['Institution Name'] == 'NONE')
        | df['Institution Name'].isnull())
    df_problem = df[is_problem]
    if df_problem.empty:
        continue
    problems_per_day.append(df_problem)

    logging.info('Current day: {}'.format(current_day_str))
    with pd.option_context("display.max_colwidth", 1000):
        display(df_problem)
        # all the rows of the day sharing a Series Instance UID with a problematic row
        display(df[df['Series Instance UID'].isin(df_problem['Series Instance UID'])])

# build the final tables; they stay None when nothing was collected
if series_per_day:
    df_all_series = pd.concat(series_per_day, sort=False)
if problems_per_day:
    df_all_problems = pd.concat(problems_per_day, sort=False)