### Matching echo and pet studies based on study dates ###

In [9]:
import os
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

#### PET REPORTS ####

In [10]:
# PET REPORTS
cfr_data_root = os.path.normpath('/mnt/obi0/andreas/data/cfr')
pet_reports_file = os.path.join(cfr_data_root, '210_getStressTest.xlsx')
pet_df = pd.read_excel(pet_reports_file)
keep_cols = ['reportID', 'mrn', 'subjectid', 'report_number', 'study_date', 'report_date', 'cfr']
pet = pet_df[keep_cols]
pet.loc[pet.study_date.index].study_date = pd.to_datetime(pet.study_date)
pet.loc[pet.report_date.index].study_date = pd.to_datetime(pet.report_date)

print('Number of reports: {}'.format(len(pet.report_number.unique())))
print('Unique patients:   {}'.format(len(pet.mrn.unique())))
print('Unique cfr values: {}'.format(len(pet.cfr.unique())))

# Lets keep only a few columns 
pet.head()

Number of reports: 3350
Unique patients:   3072
Unique cfr values: 3414


Unnamed: 0,reportID,mrn,subjectid,report_number,study_date,report_date,cfr
0,72946,4474599,3,0904-0007,28-Sep-07,04-Sep-07,1.428232
1,91480,9774688,6,EVS0154093,11-Nov-08,11-Nov-08,2.015579
2,78543,8880684,7,0802806B,31-Oct-07,29-Jan-08,2.401606
3,73048,16480857,10,0728923B,26-Nov-07,29-Oct-07,1.1075
4,100873,7633498,11,EVS0266850,29-Jan-10,24-Sep-10,1.776545


#### NPY ECHO DATA ####

In [11]:
echo_files_name = 'echo_npyFiles_BWH.parquet'
echo = pd.read_parquet(os.path.join(cfr_data_root, echo_files_name))
echo = echo.astype({'mrn': 'int64'})
echo = echo.assign(echo_study_date = echo.datetime.dt.date,
                   echo_study_time = echo.datetime.dt.time)

In [12]:
print('Number of npy files: {}'.format(len(echo.filename.unique())))
print('Unique echo studies: {}'.format(len(echo.study.unique())))
print('Unique patients:     {}'.format(len(echo.mrn.unique())))
print()
echo.head()

Number of npy files: 1021868
Unique echo studies: 20371
Unique patients:     11846



Unnamed: 0,filename,dir,study,mrn,datetime,echo_study_date,echo_study_time
0,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
1,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
2,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
3,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24
4,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec_...,/mnt/obi0/phi/echo/npyFiles/BWH/4a1e/4a1ed563c...,4a1ed563ca4f684e_4903a580509f95be6ede48ce4fec,18789404,2015-08-11 16:05:24,2015-08-11,16:05:24


#### PET STUDIES + NPY ECHO DATA ####

In [13]:
pet_echo = pet.merge(right = echo, on = 'mrn', how = 'left')
pet_echo.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,study_date,report_date,cfr,filename,dir,study,datetime,echo_study_date,echo_study_time
0,72946,4474599,3,0904-0007,28-Sep-07,04-Sep-07,1.428232,,,,NaT,,
1,91480,9774688,6,EVS0154093,11-Nov-08,11-Nov-08,2.015579,,,,NaT,,
2,78543,8880684,7,0802806B,31-Oct-07,29-Jan-08,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21
3,78543,8880684,7,0802806B,31-Oct-07,29-Jan-08,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21
4,78543,8880684,7,0802806B,31-Oct-07,29-Jan-08,2.401606,4b7351bee96d1016_4903a444c23a2830ab58223ad84d_...,/mnt/obi0/phi/echo/npyFiles/BWH/4b73/4b7351bee...,4b7351bee96d1016_4903a444c23a2830ab58223ad84d,2007-10-29 08:56:21,2007-10-29,08:56:21


In [14]:
# Filter pet studies with current echo
pet_echo_nofiles = pet_echo.loc[pet_echo.filename.isnull()].reset_index(drop = True)
pet_echo_files = pet_echo.loc[~pet_echo.filename.isnull()].reset_index(drop = True)

# Save those
pet_echo_nofiles_name = 'pet_no_npyFiles_BWH.parquet'
pet_echo_files_name = 'pet_npyFiles_BWH.parquet'

pet_echo_nofiles.to_parquet(os.path.join(cfr_data_root, pet_echo_nofiles_name))
pet_echo_files.to_parquet(os.path.join(cfr_data_root, pet_echo_files_name))

In [15]:
print('PET with ECHO:')
print('Number of npy files: {}'.format(len(pet_echo_files.filename.unique())))
print('Unique echo studies: {}'.format(len(pet_echo_files.study.unique())))
print('Unique patients:     {}'.format(len(pet_echo_files.mrn.unique())))

PET with ECHO:
Number of npy files: 170158
Unique echo studies: 3711
Unique patients:     1759


In [16]:
print('PET without ECHO:')
print('Number of reports:   {}'.format(len(pet_echo_nofiles.reportID.unique())))
print('Unique patients:     {}'.format(len(pet_echo_nofiles.mrn.unique())))
print('Total PET patients:  {}'.format(len(pet.mrn.unique())))

PET without ECHO:
Number of reports:   1384
Unique patients:     1313
Total PET patients:  3072


In [18]:
pet_echo_nofiles.head()

Unnamed: 0,reportID,mrn,subjectid,report_number,study_date,report_date,cfr,filename,dir,study,datetime,echo_study_date,echo_study_time
0,72946,4474599,3,0904-0007,28-Sep-07,04-Sep-07,1.428232,,,,NaT,,
1,91480,9774688,6,EVS0154093,11-Nov-08,11-Nov-08,2.015579,,,,NaT,,
2,73048,16480857,10,0728923B,26-Nov-07,29-Oct-07,1.1075,,,,NaT,,
3,104144,25001462,13,EVS0310491,30-Jun-10,29-Jun-11,1.603968,,,,NaT,,
4,103368,3584661,17,EVS0246472,21-May-10,11-May-10,2.457241,,,,NaT,,
