## Tests for retrieving data from PACS

### Imports

In [None]:
import time
import re
import logging
import configparser
import pandas as pd
from collections import namedtuple

from datetime import datetime as dt

from IPython.core import display as ICD

from pydicom.dataset import Dataset

from scripts.run_all import run_all
from scripts.retrieve_data_from_PACS import *

# set the width of display to infinite for all pandas DataFrame
#pd.set_option('display.max_colwidth', -1)
#pd.set_option('display.max_rows', None)

# raise the level of the pynetdicom module's logger to ERROR, so that only
# messages of level ERROR and above are emitted by that logger
logging.getLogger('pynetdicom').setLevel(logging.ERROR)

# IPython magics (only valid inside IPython/Jupyter, not in plain Python):
# load the "autoreload" extension, reload it in case it was already loaded,
# and select autoreload mode 2
%load_ext autoreload
%reload_ext autoreload
%autoreload 2

### Initialize the "config" object

In [None]:
# build the "config" object with the project's run_all() helper; the cells
# below read it like a nested mapping, e.g. config['main']['start_date']
config = run_all()

### Find all 'PT' and 'NM' studies for a day

In [None]:
# find all 'PT' and 'NM' studies for the start date given in the config
df_studies = find_studies_for_day(config, config['main']['start_date'], ['PT', 'NM'])

# filter out irrelevant studies:
# - keep only the studies whose patient ID is purely numeric (the pattern is a
#   raw string so that "\d" reaches the regex engine instead of being an
#   invalid string escape sequence)
has_numeric_patient_id = df_studies['Patient ID'].str.match(r'^\d+$')
df_studies = df_studies[has_numeric_patient_id]
# - drop the studies whose description is exactly 'EXTRINSEQUE'
df_studies = df_studies[~df_studies['Study Description'].isin(['EXTRINSEQUE'])]
# renumber the rows from 0 so that positions and index labels match again
df_studies.reset_index(drop=True, inplace=True)

df_studies

### Get all series for the found studies

In [None]:
# accepted (valid) institution names from the config, one per line
# (loop-invariant, so it is read only once)
accepted_inst_names = config['extract']['accepted_institution_names'].split('\n')

# series DataFrames of all the accepted studies; they are concatenated once
# after the loop (DataFrame.append was removed in pandas 2.0)
series_per_study = []

# go through each study
logging.info('Going through {} studie(s)'.format(len(df_studies)))
for i_study in range(len(df_studies)):

    # find all series of the current study
    df_series = find_series_for_study(config, df_studies.iloc[i_study])
    if df_series is None:
        logging.warning('Skipping study because there are no usable Series inside')
        continue

    # gather the distinct institution names of the series (double spaces are
    # collapsed into a single space before comparing)
    inst_names = list({name.replace('  ', ' ') for name in df_series.loc[:, 'Institution Name']})
    if len(inst_names) > 1:
        logging.warning('Multiple institution names for study: "{}"'.format(' / '.join(inst_names)))
        inst_name = 'mixed'
    else:
        inst_name = inst_names[0]

    # store the institution name on the study row; i_study is a position that
    # is used here as a label, which works because df_studies has a fresh
    # 0..n-1 index
    df_studies.loc[i_study, 'Institution Name'] = inst_name

    # filter for the institution name (compared in lower case, without spaces)
    if inst_name.lower().replace(' ', '') not in accepted_inst_names:
        logging.warning('Skipping study because it is not from CHUV (but from "{}")'.format(inst_name))
        continue

    # keep the series of this study and show them if the names are mixed
    series_per_study.append(df_series)
    if inst_name == 'mixed':
        ICD.display(df_series)

# this DataFrame stores the list of all the series found for all studies
if series_per_study:
    df_all_series = pd.concat(series_per_study, sort=False, ignore_index=True)
else:
    df_all_series = pd.DataFrame()

df_all_series

### Check whether there are any studies/series with a mixed "Institution Name"

In [None]:
# Study Instance UIDs of the studies flagged with a 'mixed' institution name
mixed_study_uids = df_studies.loc[df_studies['Institution Name'] == 'mixed', 'Study Instance UID']
# show every series that belongs to one of those studies
df_all_series[df_all_series['Study Instance UID'].isin(mixed_study_uids)]

### Go through each series and fetch information about it, allowing retries if a series cannot be fetched immediately

In [None]:
# keep an independent copy of the series list as it is before the fetch loop
# below adds the 'i_try', 'start_time', 'end_time' and 'machine' columns
df_all_series_save = df_all_series.copy()

In [None]:
# go through each series, trying up to n_max_try times to fetch its information
logging.info('Going through {} series'.format(len(df_all_series)))
df_all_series['i_try'] = None
n_max_try = 10
for i_series in df_all_series.index:

    for attempt in range(1, n_max_try + 1):
        # 'i_try' always holds the number of the latest attempt
        df_all_series.loc[i_series, 'i_try'] = attempt
        # find information about this series by fetching some images
        row_info = fetch_info_for_series(config, df_all_series.loc[i_series])
        if row_info is not None:
            break
        # no data yet: wait a little before the next attempt (if any is left)
        if attempt < n_max_try:
            time.sleep(0.5)
    else:
        # every attempt came back empty: mark the row as a failed trial
        df_all_series.loc[i_series, 'i_try'] = -1

    # give up on this series when no data could be fetched
    if row_info is None:
        logging.error('ERROR with series {}: no data found'.format(df_all_series.loc[i_series, 'Series Instance UID']))
        continue

    # copy the relevant parameters into the main DataFrame
    for column in ('start_time', 'end_time', 'machine'):
        df_all_series.loc[i_series, column] = row_info[column]

### Get some statistics on the success / failure rates of fetching info for SERIES

In [None]:
def _percent(count, total):
    """Return `count` as a percentage of `total`, or NaN when `total` is 0."""
    return 100 * count / total if total else float('nan')


# 'i_try' holds -1 for the series that could not be fetched and the number of
# the successful attempt (1, 2, ...) for the other ones
i_try = df_all_series['i_try']
n = len(df_all_series)
failures = i_try[i_try == -1]
successes = i_try[i_try > 0]
first_tries = successes[successes == 1]
multi_tries = successes[successes > 1]

# the percentages go through _percent() so that an empty series list or a run
# without any success is reported as "nan" instead of a ZeroDivisionError
logging.info('Success:     {:03d} / {:03d} ({:.1f}%)'.format(len(successes), n, _percent(len(successes), n)))
logging.info('Failures:    {:03d} / {:03d} ({:.1f}%)'.format(len(failures), n, _percent(len(failures), n)))
logging.info('First tries: {:03d} / {:03d} ({:.1f}%)'.format(len(first_tries), len(successes), _percent(len(first_tries), len(successes))))
logging.info('Multi-tries: {:03d} / {:03d} ({:.1f}%)'.format(len(multi_tries), len(successes), _percent(len(multi_tries), len(successes))))
logging.info('Mean ± SD multi-tries: {:.2f} ± {:.2f}'.format(multi_tries.mean(), multi_tries.std()))

### Go through the failed series again

In [None]:
# keep an independent copy of the series list as it is before the second pass
# over the failed series (next cell)
df_all_series_save2 = df_all_series.copy()

In [None]:
# second pass: only the series marked as failed (i_try == -1) by the first
# pass are fetched again, with up to n_max_try attempts each
df_all_series['i_try2'] = None
n_max_try = 10
failed_indices = df_all_series[df_all_series['i_try'] == -1].index
# report the number of series that are actually processed by this pass
logging.info('Going through {} series'.format(len(failed_indices)))
for i_series in failed_indices:

    row_info = None
    i_try = 0
    while row_info is None:
        i_try += 1
        # 'i_try2' always holds the number of the latest attempt
        df_all_series.loc[i_series, 'i_try2'] = i_try
        # find information about this series by fetching some images
        row_info = fetch_info_for_series(config, df_all_series.loc[i_series])
        # still no data after the last allowed attempt: mark as failed and stop
        if row_info is None and i_try >= n_max_try:
            df_all_series.loc[i_series, 'i_try2'] = -1
            break

    # give up on this series when no data could be fetched
    if row_info is None:
        logging.error('ERROR with series {}: no data found'.format(df_all_series.loc[i_series, 'Series Instance UID']))
        continue

    # copy the relevant parameters into the main DataFrame
    df_all_series.loc[i_series, 'start_time'] = row_info['start_time']
    df_all_series.loc[i_series, 'end_time'] = row_info['end_time']
    df_all_series.loc[i_series, 'machine'] = row_info['machine']

### Get some statistics on the success / failure rates of fetching info for SERIES

In [None]:
def _percent(count, total):
    """Return `count` as a percentage of `total`, or NaN when `total` is 0."""
    return 100 * count / total if total else float('nan')


# 'i_try' describes the first pass and 'i_try2' the second pass over the
# failed series; in both columns -1 marks a series that could not be fetched
i_try = df_all_series['i_try']
i_try2 = df_all_series['i_try2']
n = len(df_all_series)

# first pass
failures = i_try[i_try == -1]
successes = i_try[i_try != -1]
first_tries = successes[successes == 1]
multi_tries = successes[successes > 1]

# second pass, restricted to the series that failed the first pass
recoveries = i_try2[(i_try == -1) & (i_try2 != -1)]
total_failures = i_try2[(i_try == -1) & (i_try2 == -1)]
recov_first_tries = recoveries[recoveries == 1]
recov_multi_tries = recoveries[recoveries > 1]

# the percentages go through _percent() so that a run without any failure (or
# without any recovery) is reported as "nan" instead of a ZeroDivisionError
logging.info('Success    (1):     {:03d} / {:03d} ({:.1f}%)'.format(len(successes), n, _percent(len(successes), n)))
logging.info('Failures    (1):    {:03d} / {:03d} ({:.1f}%)'.format(len(failures), n, _percent(len(failures), n)))
logging.info('First tries (1): {:03d} / {:03d} ({:.1f}%)'.format(len(first_tries), len(successes), _percent(len(first_tries), len(successes))))
logging.info('Multi-tries (1): {:03d} / {:03d} ({:.1f}%)'.format(len(multi_tries), len(successes), _percent(len(multi_tries), len(successes))))
logging.info('Mean ± SD multi-tries (1): {:.2f} ± {:.2f}'.format(multi_tries.mean(), multi_tries.std()))
logging.info('Recoveries  (2):  {:03d} / {:03d} ({:.1f}%)'.format(len(recoveries), len(failures), _percent(len(recoveries), len(failures))))
logging.info('Total fails (2):  {:03d} / {:03d} ({:.1f}%)'.format(len(total_failures), len(failures), _percent(len(total_failures), len(failures))))
logging.info('First tries (2): {:03d} / {:03d} ({:.1f}%)'.format(len(recov_first_tries), len(recoveries), _percent(len(recov_first_tries), len(recoveries))))
logging.info('Multi-tries (2): {:03d} / {:03d} ({:.1f}%)'.format(len(recov_multi_tries), len(recoveries), _percent(len(recov_multi_tries), len(recoveries))))
logging.info('Mean ± SD multi-tries (2): {:.2f} ± {:.2f}'.format(recov_multi_tries.mean(), recov_multi_tries.std()))

### Check how many failed series we have

In [None]:
# split the series according to whether a start time could be fetched
has_start_time = df_all_series['start_time'].notnull()
df_no_start_time = df_all_series[~has_start_time]
df_with_start_time = df_all_series[has_start_time]

# report both groups as counts and as percentages of all the series
n_total = len(df_all_series)
n_with = len(df_with_start_time)
n_without = len(df_no_start_time)
logging.info('{}/{} rows with start time ({:.1f} %), {}/{} rows without start time ({:.1f} %)'.format(
    n_with, n_total, 100 * n_with / n_total,
    n_without, n_total, 100 * n_without / n_total))

### Find the 2 latest series for each study

In [None]:
# restore the series list from the backup taken before the fetch loops; a copy
# is used so that the columns added by the following cells (e.g.
# 'second_take') do not end up in the backup itself
df_all_series = df_all_series_save.copy()

In [None]:
study_UIDs = list(set(df_all_series['Study Instance UID']))
logging.info('Found {} unique study UIDs'.format(len(study_UIDs)))

# format of the 'Series Time' strings
FMT = '%H%M%S'
# put a threshold at 59 minutes (in seconds): a bigger gap between two
# successive series is treated as a split point of the study
study_split_thresh = 59 * 60

# create a column to mark the second-take series
df_all_series['second_take'] = False

# build the list of rows to keep
indices_to_keep = []
for i, sUID in enumerate(study_UIDs):

    # get the series related to the current Study Instance UID
    series_str = '[{:2d}/{:2d}] {}...{}'.format(i, len(study_UIDs) - 1, sUID[:8], sUID[-5:])
    df_series_for_study = df_all_series[df_all_series['Study Instance UID'] == sUID]
    logging.debug('Processing {}: found {:2d} series'.format(series_str, len(df_series_for_study)))

    # extract the list of index labels for the current study (before sorting)
    indices_for_study = list(df_series_for_study.index.values)

    # sort according to time and keep the last 2 indices (last 2 series)
    df_series_for_study = df_series_for_study.sort_values('Series Time')
    indices_to_add = df_series_for_study.iloc[-2:, :].index.values
    logging.debug(indices_to_add)
    indices_to_keep.extend(list(indices_to_add))

    # there must be at least 2 series for any splitting
    if len(df_series_for_study) < 2:
        continue

    # time difference (in seconds) between each series and the previous one;
    # the first element has no predecessor (NaT) and is therefore skipped
    time_diff = df_series_for_study['Series Time'].apply(lambda t: dt.strptime(t, FMT)).diff()
    time_diff_sec = time_diff[1:].apply(lambda td: td.seconds)
    # get the DataFrame indices where the time difference is bigger than the threshold
    split_indices = list(time_diff_sec[time_diff_sec > study_split_thresh].index)

    # if there is no splitting index, the last 2 series (already added) are enough
    if len(split_indices) == 0:
        logging.debug('No splitting required for study {}'.format(series_str))
        continue

    # if there is more than one split point, log an error and do not do any splitting
    if len(split_indices) > 1:
        # each index is converted on its own: joining str(split_indices) would
        # put a separator between every character of the list's repr
        logging.error('Error: too many splitting points found for study {}: [{}]'
                      .format(series_str, ', '.join(str(idx) for idx in split_indices)))
        continue

    # there is exactly one split point: also keep the (up to) 2 series that
    # come right before it
    # NOTE(review): this label arithmetic assumes that the index labels of a
    # study are consecutive integers in chronological order - to be confirmed
    first_index_of_study = min(indices_for_study)
    indices_to_add = [
        max(split_indices[0] - 2, first_index_of_study),
        max(split_indices[0] - 1, first_index_of_study)
    ]
    # both values collapse into one when they are clamped to the first index
    indices_to_add = list(set(indices_to_add))
    logging.info(indices_to_add)
    indices_to_keep.extend(indices_to_add)
    logging.info('Splitting {} at {}'.format(series_str, split_indices[0]))
    # everything from the split point onwards belongs to the second take
    df_all_series.loc[[idx for idx in indices_for_study if idx >= split_indices[0]], 'second_take'] = True

In [None]:
# select the rows collected in indices_to_keep, ordered by patient and time
df_series_pruned = df_all_series.loc[indices_to_keep].sort_values(['Patient ID', 'Series Time'])

# the 'UID' column holds "<Series Date>-<Patient ID>-<4-digit running number>-A"
df_series_pruned['UID'] = ''
for uid_number, row_label in enumerate(df_series_pruned.index):
    series_date, patient_id = df_series_pruned.loc[row_label, ['Series Date', 'Patient ID']]
    df_series_pruned.loc[row_label, 'UID'] = '{}-{}'.format(series_date, patient_id) + '-{:04d}-A'.format(uid_number)

# show a selection of the columns (by position)
df_series_pruned.iloc[:,[0,1,2,3,6,12]]

### Get a summary of what machines are used in which institution names

In [None]:
# filter for series where information could be gathered
df_series_filtered = df_all_series[~df_all_series.start_time.isnull()]
# get the set of (non-null) machine names
machines = set(df_series_filtered.machine[~df_series_filtered.machine.isnull()])
logging.info('Machines:\n  - {}'.format('\n  - '.join(machines)))
# names of the machine groups that were already handled
groups = []
# running number given to each new machine group (it has to be initialised
# here, otherwise the first use in the loop raises a NameError)
i_group = 0
# create a DataFrame that regroups all the sub-DataFrames
all_grouped_df = None
# go through the list of machines
for machine in machines:
    # get all the study IDs for this machine
    study_UIDs = set(df_series_filtered[df_series_filtered.machine == machine]['Study Instance UID'])
    # all the series of those studies, whatever machine they come from
    rows_with_same_study_UIDs = df_series_filtered['Study Instance UID'].isin(study_UIDs)
    sub_df = df_series_filtered[rows_with_same_study_UIDs].loc[:, ['machine', 'Institution Name', 'Study Instance UID']] \
        .rename(columns = {'machine': 'Machine Name', 'Study Instance UID': 'Number of Series'})
    # count the series per (machine, institution) pair
    sub_df_grouped = sub_df.groupby(['Machine Name', 'Institution Name']).count()
    logging.debug(sub_df)
    # a group is identified by the sorted names of the machines sharing studies
    group_name = ', '.join(sorted(list(set(sub_df['Machine Name']))))
    # skip the groups that were already reached through another machine
    if group_name in groups: continue
    # log which configured machine patterns match the current machine name
    for conf_machine in config['machines']:
        logging.info('conf_machine: {}'.format(conf_machine))
        for conf_machine_pattern in config['machines'][conf_machine].split(','):
            if re.match(conf_machine_pattern, machine, re.IGNORECASE):
                logging.info('match')
    sub_df_grouped['Machine Group'] = i_group
    i_group += 1
    groups.append(group_name)
    if all_grouped_df is None:
        all_grouped_df = sub_df_grouped
    else:
        all_grouped_df = pd.concat([all_grouped_df, sub_df_grouped])
    ICD.display(sub_df_grouped)
logging.debug(set(groups))
all_grouped_df.groupby(['Machine Group', 'Machine Name', 'Institution Name']).sum()

### Get a summary of what machines are used together

In [None]:
# filter for series where information could be gathered
df_series_filtered = df_all_series[~df_all_series.start_time.isnull()]
# get the set of (non-null) machine names
machines = set(df_series_filtered.machine[~df_series_filtered.machine.isnull()])
logging.info('Machines:\n  - {}'.format('\n  - '.join(machines)))
# names of the machine groups that were already handled
groups = []
# running number given to each new machine group (it has to be initialised
# here, otherwise the first use in the loop raises a NameError)
i_group = 0
# create a DataFrame that regroups all the sub-DataFrames
all_grouped_df = None
# go through the list of machines
for machine in machines:
    # get all the study IDs for this machine
    study_UIDs = set(df_series_filtered[df_series_filtered.machine == machine]['Study Instance UID'])
    # all the series of those studies, whatever machine they come from
    rows_with_same_study_UIDs = df_series_filtered['Study Instance UID'].isin(study_UIDs)
    sub_df = df_series_filtered[rows_with_same_study_UIDs].loc[:, ['machine', 'Modality', 'Study Instance UID']] \
        .rename(columns = {'machine': 'Machine Name', 'Study Instance UID': 'Number of Series'})
    # count the series per (machine, modality) pair
    sub_df_grouped = sub_df.groupby(['Machine Name', 'Modality']).count()
    logging.debug(sub_df_grouped.index)
    # a group is identified by the sorted names of the machines sharing studies
    group_name = ', '.join(sorted(list(set(sub_df['Machine Name']))))
    # skip the groups that were already reached through another machine
    if group_name in groups: continue
    # log which configured machine patterns match the current machine name
    for conf_machine in config['machines']:
        logging.info('conf_machine: {}'.format(conf_machine))
        for conf_machine_pattern in config['machines'][conf_machine].split(','):
            if re.match(conf_machine_pattern, machine, re.IGNORECASE):
                logging.info('match')
    sub_df_grouped['Machine Group'] = i_group
    i_group += 1
    groups.append(group_name)
    if all_grouped_df is None:
        all_grouped_df = sub_df_grouped
    else:
        all_grouped_df = pd.concat([all_grouped_df, sub_df_grouped])
    #ICD.display(sub_df_grouped)
logging.debug(set(groups))
all_grouped_df.groupby(['Machine Group', 'Machine Name', 'Modality']).sum()

### Go through failed series again

In [None]:
# index labels of the pruned series that still have no start time
labels_without_start_time = df_series_pruned[df_series_pruned['start_time'].isnull()].index
logging.info('Going through {} series'.format(len(labels_without_start_time)))
for i_series in labels_without_start_time:

    # try once more to fetch the information of this series
    row_info = fetch_info_for_series(config, df_series_pruned.loc[i_series])

    # nothing could be fetched: report it and move on to the next series
    if row_info is None:
        logging.error('ERROR with series {}: no data found'.format(df_series_pruned.loc[i_series, 'Series Instance UID']))
        continue

    # store the fetched parameters in the pruned DataFrame
    for column in ('start_time', 'end_time', 'machine'):
        df_series_pruned.loc[i_series, column] = row_info[column]

### Show the series that still failed (no start time), as candidates for removal

In [None]:
# display the pruned series that have no start time; this only selects the
# rows for display, nothing is removed from df_series_pruned
df_series_pruned[df_series_pruned['start_time'].isnull()]

### Prune series by time overlap

In [None]:
study_UIDs = list(set(df_all_series['Study Instance UID']))
logging.info('Found {} unique study UIDs'.format(len(study_UIDs)))

# list of rows to exclude (only referenced by the disabled code below)
indices_to_exclude = []
for i, sUID in enumerate(study_UIDs):

    # select the series of the current study
    df_series_for_study = df_all_series[df_all_series['Study Instance UID'] == sUID]
    n_series_for_study = len(df_series_for_study)
    logging.info('Processing [{:2d}/{:2d}] {}...{}: found {:2d} series'.format(i, len(study_UIDs) - 1,
        sUID[:8], sUID[-5:], n_series_for_study))

    # prune them by time overlap and log how many series are left
    df_series_for_study_pruned = prune_by_time_overlap(df_series_for_study)
    logging.info(len(df_series_for_study_pruned))

    # the string literal below is disabled code kept for reference; it is a
    # plain expression statement and is never executed
    """
    to_exclude_rows = df_series['Series Description'].str.match(descr_pattern, case=False)
    # gather all the indices
    indices_to_exclude.append(to_exclude_rows[to_exclude_rows == True].index)
# flatten the list
indices_to_exclude = [index for indices in indices_to_exclude for index in indices.values]
# if there is something to exclude, show a message and drop the rows
if len(indices_to_exclude) > 0:
    logging.debug('Found {} series to exclude based on their description: "{}"'.format(len(indices_to_exclude),
        '", "'.join(df_series.loc[indices_to_exclude]['Series Description'])))
    df_series.drop(indices_to_exclude, inplace=True)
logging.debug('Found {} series after filtering description'.format(len(df_series)))
df_series = prune_by_time_overlap(df_series)
    """

In [None]:
# display df_series, i.e. the series of the study that was processed last by
# an earlier cell
df_series

### Get all series for the found studies and get their time ranges

In [None]:
# get the list of accepted (valid) institution names from the config
accepted_inst_names = config['extract']['accepted_institution_names'].split('\n')

# pruned series DataFrames of all the accepted studies; they are concatenated
# once after the loop (DataFrame.append was removed in pandas 2.0)
series_per_study = []

# go through each study
logging.info('Going through {} studie(s)'.format(len(df_studies)))
# NOTE: testing restriction, only the first study is processed
#for i_study in df_studies.index:
for i_study in df_studies.index[0:1]:

    # find all series of the current study
    df_series = find_series_for_study(config, df_studies.loc[i_study, :])
    # find_series_for_study() may return None when nothing usable was found
    if df_series is None:
        logging.warning('Skipping study because there are no usable Series inside')
        continue

    # filter for the institution name, taken from the first series (double
    # spaces are collapsed into a single space)
    inst_name = df_series.loc[0, 'Institution Name'].replace('  ', ' ')
    df_studies.loc[i_study, 'Institution Name'] = inst_name
    if inst_name.lower().replace(' ', '') not in accepted_inst_names:
        logging.warning('  Skipping study because it is not from CHUV (institution name: "{}")'.format(inst_name))
        df_studies.drop(i_study, inplace=True)
        continue

    # go through each series
    logging.info('Going through {} series'.format(len(df_series)))
    # NOTE: testing restriction, only the third series is processed
    #for i_series in df_series.index:
    for i_series in df_series.index[2:3]:

        # find information about this series by fetching some images
        row_info = fetch_info_for_series(config, df_series.loc[i_series, :])

        # give up on this series when no data could be fetched
        if row_info is None:
            logging.error('  Skipping series {}: no data found.'.format(df_series.loc[i_series, 'Series Instance UID']))
            continue

        # the debugging stop "raise(TypeException, 'stop')" that used to be
        # here was removed: TypeException is not defined and a tuple cannot be
        # raised, so it always failed and the code below was never reached

        # copy the relevant parameters into the series DataFrame
        df_series.loc[i_series, 'start_time'] = row_info['start_time']
        df_series.loc[i_series, 'end_time'] = row_info['end_time']
        df_series.loc[i_series, 'machine'] = row_info['machine']

    # the 'start_time' column only exists if at least one series was fetched
    if 'start_time' not in df_series.columns:
        # the UID is read from the study row, because i_series is not
        # defined when the loop above did not run at all
        logging.error('  Skipping study {} because no valid series was found.'
                      .format(df_studies.loc[i_study, 'Study Instance UID']))
        continue

    # remove redundant series
    df_series = prune_by_time_overlap(df_series)

    # keep the series of this study for the main series DataFrame
    series_per_study.append(df_series)

# this DataFrame stores the list of all the series found for all studies
if series_per_study:
    df_all_series = pd.concat(series_per_study, sort=False, ignore_index=True)
else:
    df_all_series = pd.DataFrame()

# the string below is disabled code, kept for reference: it is only written to
# the log (at DEBUG level) and never executed
logging.debug("""
    # create time ranges from the start/end times
    time_ranges = []
    for i_serie in range(len(df_series)):
        time_ranges.append('{}-{}'.format(df_series.loc[i_serie, 'start_time'],
                                          df_series.loc[i_serie, 'end_time']))
    
    # propagate back the time range and machine name information
    df_studies.loc[i_study, 'machines'] = ','.join(list(set(df_series['machine'])))
    df_studies.loc[i_study, 'time_ranges'] = ','.join(time_ranges)
    df_studies.loc[i_study, 'overall_time_range'] = '{}-{}'.format(
        df_series.iloc[0]['start_time'], df_series.iloc[-1]['end_time'])
""")

### Show some example data

In [None]:
#df_studies
# display a selection of the columns of df_series, chosen by position
# NOTE(review): positions 13-15 are presumably the 'start_time', 'end_time'
# and 'machine' columns added by the fetch loop - to be confirmed
df_series.iloc[:, [0,3,4,5,6,7,13,14,15]]

### Look for a study with a "reprise"

In [None]:
# the two disabled lines below would load the series of the third study
#logging.info(df_studies.iloc[2, :])
#df_series = find_series_for_study(config, df_studies.iloc[2, :])
# display a selection of the columns of the current df_series, chosen by position
ICD.display(df_series.iloc[:, [1,3,4,5,6,7,11,12]])

### Save the data

In [None]:
#df_studies.to_pickle(config['main']['start_date'] + '.pickle')

### Further process the studies

In [None]:
def _end_of_time_range(time_range):
    """Return the part after the first '-' of a "<start>-<end>" string."""
    return time_range.split('-')[1]


# take the study columns of interest and give them short names
study_columns = ['Study Date', 'Study Description', 'Patient ID', 'machines']
df = df_studies.loc[:, study_columns]
df.columns = ['date', 'descr', 'pid', 'machine']
# the start time is the study time, the end time is the end of the overall time range
df['start_time'] = df_studies['Study Time']
df['end_time'] = df_studies['overall_time_range'].apply(_end_of_time_range)
df

Rename the machines to have some consensus
--

In [None]:
# consensus names; a row gets one of them when its machine name contains it
machine_names = ['Vision 600', 'Discovery 690', 'Millennium MPR', 'Intevo 16', 'Discovery 670']

"""
Biograph64/vision PT
discovery 690 PT
*discovery 670 SPECT
Millennium SPECT sans CT
Encore2/Intevo SPECT
"""

for machine_name in machine_names:
    # case-insensitive search of the consensus name anywhere in the machine name
    matching_rows = df['machine'].str.match('.*{}.*'.format(machine_name), case=False)
    n_matching_rows = matching_rows.sum()
    if n_matching_rows > 0:
        logging.info('Found {} rows matching the name "{}":'.format(n_matching_rows, machine_name))
    # overwrite the machine name of the matching rows with the consensus name
    df.loc[matching_rows, 'machine'] = machine_name

# replace the "Encore2" machine name to "Intevo", since it is the same machine
#df.loc[df['machine'] == 'Encore2', 'machine'] = 'Intevo 16'
#machine_names.remove('Encore2')

df

Rename the descriptions to have some consensus
--

In [None]:
# consensus description -> pattern searched in the normalized description
# (lower case, without the characters "-", "_", "^", " ", "(" and ")")
description_patterns = {'FDG Corps Entier': 'fdgcorpsentier', 'FDG Tronc': 'fdgtronc', 'Rb82 Coeur': 'rb82coeur',
                       'FDG Abdomen TAP Veineux Corps Entier': 'abdomen1fdgtapveineuxpetcorpsentierflowadult',
                       'Scintigraphie OctreoScan': 'scintioctreoscan', 'FDG WB Child': 'pet1petfdgwbflowchild'}
for descr, pattern in description_patterns.items():
    # normalize inside the loop, because df['descr'] is changed by each iteration;
    # regex=True is explicit because the character class is a regular expression
    # and str.replace() treats the pattern as a literal string by default since
    # pandas 2.0
    normalized_descr = df['descr'].str.lower().str.replace('[-_^ ()]', '', regex=True)
    matching_rows = normalized_descr.str.match('.*' + pattern + '.*', case=False)
    if matching_rows.sum() > 0:
        logging.info('Found {} rows matching the name "{}":'.format(matching_rows.sum(), descr))
    # overwrite the description of the matching rows with the consensus name
    df.loc[matching_rows, 'descr'] = descr

df

Check for overlap for each machine
--

In [None]:
# run prune_by_time_overlap() separately on the rows of each consensus machine
for machine in machine_names:
    logging.info('Checking overlap for machine "{}"'.format(machine))
    # rows of the current machine
    df_machine = df[df['machine'] == machine]
    # write the pruned rows back over the rows of this machine
    # NOTE(review): if prune_by_time_overlap() returns fewer rows than it was
    # given, this masked assignment presumably leaves NaN in the pruned rows
    # instead of removing them - to be confirmed
    df[df['machine'] == machine] = prune_by_time_overlap(df_machine)
df

In [None]:
# log the distinct machine names and display the rows sorted by machine
logging.info(set(df['machine']))
df.sort_values('machine')

Save processed data
--

In [None]:
#df.to_pickle(config['main']['start_date'] + '_processed.pickle')

In [None]:
# load a previously saved DataFrame from a hard-coded pickle file in the
# current working directory
# NOTE(review): unpickling executes arbitrary code, only load trusted files
df2 = pd.read_pickle('20190806.pickle')

In [None]:
# display the DataFrame loaded from the pickle file
df2

In [None]:
def _last_two_end_times(time_ranges):
    """Return the end part ("<start>-<end>") of the last two time ranges."""
    return [time_range.split('-')[1] for time_range in time_ranges[-2:]]


def _positions_sorted_by_value(values):
    """Return the positions of `values`, ordered by increasing value."""
    return sorted(range(len(values)), key=values.__getitem__)


# end times of the last two time ranges of each row
end_times = df2['time_ranges'].str.split(',').apply(_last_two_end_times)
# keep only the rows that really have two time ranges
end_times = end_times[end_times.apply(len) == 2]
# [0, 1] when the two end times are in increasing order, [1, 0] otherwise
end_times.apply(_positions_sorted_by_value)

In [None]:
def _last_two_end_times(time_ranges):
    """Return the end part ("<start>-<end>") of the last two time ranges."""
    return [time_range.split('-')[1] for time_range in time_ranges[-2:]]


# end times of the last two time ranges of each row
end_times = df2['time_ranges'].str.split(',').apply(_last_two_end_times)
# keep only the rows that really have two time ranges
has_two_end_times = end_times.apply(len) == 2
end_times = end_times[has_two_end_times]
df2