In [None]:
import logging
import datetime

from IPython.core.display import display, HTML

from scripts.main import *
from scripts.retrieve_data import *
from scripts.extract_data import *

%load_ext autoreload
%reload_ext autoreload
%autoreload 2

# set the width of the notebook
display(HTML("<style>.container { width:95% !important; }</style>"))

### Load the config and get the series using the API

In [None]:
# Load the configuration inside this cell: no earlier cell defines `config`, so on a
# freshly started kernel the call below would otherwise fail with a NameError.
config = load_config()
# Run the packaged pipeline and keep its result as `df_studies`.
df_studies = load_transform_and_save_data_from_files(config)

### Load the config and get the series manually

In [None]:
# Load the configuration, then overwrite the start/end dates of its 'main' section
config = load_config()
config['main'].update({
    'start_date': '2019-01-07',
    'end_date': '2019-04-26',
})

In [None]:
# Load the configuration, set the start/end dates and read the series from the files
config = load_config()
config['main']['start_date'] = '2019-01-07'
config['main']['end_date'] = '2019-04-26'
df_series = load_data_from_files(config)

logging.info('Found {} series before filtering description'.format(len(df_series)))

# Build ONE boolean mask of the rows whose 'Series Description' matches any of the
# exclusion patterns (one regex per line in the config value). Using a single mask
# means a row matched by several patterns is counted and listed only once.
series_descriptions = df_series['Series Description']
# all-False mask aligned with the rows of df_series
exclude_mask = series_descriptions.isna() & False
for descr_pattern in config['retrieve']['series_descr_patterns_to_exclude'].split('\n'):
    # skip blank lines: an empty regex matches every description and would drop all rows
    if not descr_pattern.strip():
        continue
    # na=False: a missing description never counts as a match
    exclude_mask = exclude_mask | series_descriptions.str.match(descr_pattern, case=False, na=False)

# index labels of the rows to exclude (each label appears once per matching row)
indices_to_exclude = exclude_mask[exclude_mask].index.tolist()
# if there is something to exclude, show a message and drop the rows
if len(indices_to_exclude) > 0:
    logging.info('Found {} series to exclude based on their description: "{}"'.format(len(indices_to_exclude),
        '", "'.join(df_series.loc[exclude_mask, 'Series Description'])))
    df_series = df_series[~exclude_mask]
logging.info('Found {} series after filtering description'.format(len(df_series)))

# further filter out some Series that are not primary acquisitions (and do not contain any relevant time information)
df_series = df_series[~df_series['Protocol Name'].isin(config['retrieve']['series_protocols_to_exclude'].split('\n'))]
logging.debug('Found {} series after filtering protocol names'.format(len(df_series)))

# optional manual filters, currently disabled
#df_series = df_series[~df_series['Series Description'].isin(['Protocole patient', 'Enhancement curve'])]
#df_series = df_series[df_series.Machine != 'syngo.via.VB30A']
df_series

In [None]:
# Show only the series whose 'Machine' value is 'syngo.via.VB30A'
df_series.loc[df_series['Machine'].eq('syngo.via.VB30A')]

### Do the marking of the re-takes

In [None]:
# Pass the filtered series through `mark_retakes` and rebind `df_series` to the result.
# NOTE(review): presumably this adds the `i_take` column used by the counting cell
# further down — confirm against `mark_retakes`.
df_series = mark_retakes(config, df_series)

### Do the grouping

In [None]:
# `do_series_groupby` returns six frames: the (rebound) series plus five count tables
(
    df_series,
    df_count_series,
    df_count_studies,
    df_count_series_day,
    df_count_study_day,
    df_count_study_weekday,
) = do_series_groupby(config, df_series)

In [None]:
# Display `df_count_studies`, one of the frames returned by `do_series_groupby`
df_count_studies

### Check what happened in the "mixed cases" studies

In [None]:
# Show the series assigned to the 'mixed cases' machine group, limiting the
# rendered table to 20 rows for this one display only
with pd.option_context("display.max_rows", 20):
    display(df_series.loc[df_series['Machine Group'].eq('mixed cases')])

In [None]:
# Distinct study descriptions found among the 'mixed cases' series
set(df_series.loc[df_series['Machine Group'] == 'mixed cases', 'Study Description'])

### Do some counting on different fields

In [None]:
# For each field, show how many series and how many studies fall into each of its values
for field in ['Institution Name', 'Machine', 'Machine Group', 'Modality', 'Series Description', 'Study Description', 'Patient ID', 'i_take']:
    logging.info('Number of *Series* groupped by "{}"'.format(field))
    # one row of df_series per series, so counting the SUID entries counts the series
    display(df_series.groupby(field)['SUID'].count())
    logging.info('Number of *Studies* groupped by "{}"'.format(field))
    # studies are grouped by SUID, so the number of studies is the number of distinct
    # SUIDs per group; `nunique` computes it directly instead of counting every column
    # for each (field, SUID) pair and then counting the resulting rows
    display(df_series.groupby(field)['SUID'].nunique())
    logging.info('='*160)

### Figure out the start and end times of each study

In [None]:
def _join_unique(values):
    """Join the distinct values of a group with '/', in sorted order.

    Sorting makes the result reproducible: the iteration order of a set of strings
    can differ between interpreter sessions, which would otherwise change the joined
    text (and the sort order and saved file that depend on it) from run to run.
    """
    return '/'.join(sorted(set(values)))


# One row per study (SUID): earliest start time, latest end time, and the distinct
# values of the descriptive columns joined with '/'.
# NOTE(review): `dropna()` without `subset` discards a series as soon as ANY of its
# columns is missing — confirm that this is intended.
df_studies = df_series.dropna().groupby('SUID').agg({
    'Series Date': _join_unique,
    'Start Time': 'min',
    'End Time': 'max',
    'Study Description': _join_unique,
    'Machine Group': _join_unique,
    'Modality': _join_unique,
    'Institution Name': _join_unique,
    'Protocol Name': _join_unique
}).sort_values(['Series Date', 'Machine Group', 'Start Time', 'SUID'])
# the dashes of the configured dates are stripped from the file name
studies_save_path = 'data/studies/studies_{}_{}.pkl'.format(config['main']['start_date'], config['main']['end_date']).replace('-', '')
df_studies.to_pickle(studies_save_path)

In [None]:
# Rebind `df_studies` to the result of the packaged pipeline, replacing the frame built
# by hand in the previous cell.
# NOTE(review): presumably this performs the same load/transform/save steps as the
# manual cells above — confirm against `load_transform_and_save_data_from_files`.
df_studies = load_transform_and_save_data_from_files(config)