Merge 3 metadata files to create `../data/meta/meta_merged.csv`:

1. `_ImageID_PDMRID_CrossRef.xlsx`:  metadata that comes with the PDX slides (referred to as crossref)
2. `PDX_Meta_Information.csv`:       metadata from Yitan (referred to as pdx_meta)
3. `meta_from_wsi_slides.csv`:       metadata extracted from the SVS slides using openslide (referred to as slides_meta)

Note! Before running this notebook, generate `meta_from_wsi_slides.csv` with `get_meta_from_slides.py` or `01_get_meta_from_slides.ipynb`.

Note! Yitan's file is missing some samples for which we do have the slides.<br>
The missing samples lack either response data or expression data.

In [1]:
import os
import sys
from pathlib import Path
import glob
from pprint import pprint
import pandas as pd
import numpy as np

# Working directory of the notebook; the data and source paths are built from it.
dirpath = Path.cwd()
print(dirpath)

# Add ../src (relative to the working directory) to the module search path.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
src_dir = str(dirpath/'../src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

/vol/ml/apartin/projects/pdx-histo/nbs


In [2]:
# Directory layout, built from the notebook's working directory
datapath = dirpath/'../data'
slidespath = datapath/'doe-globus-pdx-data'  # path to raw WSI data
metapath = datapath/'meta'

# Meta file names
crossref_fname = '_ImageID_PDMRID_CrossRef.xlsx'  # comes with the svs slides
slides_meta_fname = 'meta_from_wsi_slides.csv'
# NOTE(review): the notebook intro lists this file as PDX_Meta_Information.csv --
# confirm which extension is current.
pdx_meta_fname = 'PDX_Meta_Information.xlsx'

# Alias kept for any code that still refers to the older name; it is derived
# from crossref_fname so the two names cannot drift apart.
crossref_meta_fname = crossref_fname

# Peek at the raw crossref sheet (PDX image meta from NCI/Globus).
# header=2 takes the column names from the third row of the sheet.
df_img = pd.read_excel(metapath/crossref_fname, engine='openpyxl', header=2)
display(df_img.head(2))

Unnamed: 0,Model,Sample ID,Image ID,Capture Date,Date Loaded to BW_Transfers,Notes
0,114434~197-R,A38WG0JH1,29249.0,2017-01-05,2020-09-15,
1,114434~197-R,A38WG3J91,29250.0,2017-01-05,2020-09-15,


In [4]:
# Load the three metadata tables with the loaders from merge_meta_files
from merge_meta_files import (
    load_crossref,
    load_pdx_meta,
    load_slides_meta,
)

cref = load_crossref(metapath, crossref_fname)
pdx = load_pdx_meta(metapath, pdx_meta_fname)
slides_meta = load_slides_meta(metapath, slides_meta_fname)

# Print both shapes first, then the first two rows of each table
# (slides_meta is loaded here but not previewed in this cell).
previews = (('Crossref: {}', cref), ('PDX meta: {}', pdx))
for label, table in previews:
    print(label.format(table.shape))
for label, table in previews:
    pprint(table.head(2))

Crossref: (593, 8)
PDX meta: (97, 7)
          model patient_id specimen_id  sample_id  image_id capture_date  \
0  114434~197-R     114434       197-R  A38WG0JH1     29249   2017-01-05   
1  114434~197-R     114434       197-R  A38WG3J91     29250   2017-01-05   

  date_loaded_to_bw_transfers Notes  
0                  2020-09-15   NaN  
1                  2020-09-15   NaN  
  patient_id specimen_id    tumor_site_from_data_src tumor_type_from_data_src  \
0     135848       042-T  Digestive/Gastrointestinal   Adenocarcinoma - colon   
1     172845       121-B  Digestive/Gastrointestinal   Adenocarcinoma - colon   

        simplified_tumor_site       simplified_tumor_type stage_or_grade  
0  digestive/gastrointestinal  digestive/gastrointestinal        grade 2  
1  digestive/gastrointestinal  digestive/gastrointestinal        grade 2  


In [5]:
# Merge crossref and pdx_meta: keep only the (patient_id, specimen_id) pairs
# found in both tables, drop the two date columns, and order the rows by
# patient_id, specimen_id and sample_id.
merge_keys = ['patient_id', 'specimen_id']
date_cols = ['capture_date', 'date_loaded_to_bw_transfers']
df_mrg = (
    cref
    .merge(pdx, how='inner', on=merge_keys)
    .reset_index(drop=True)
    .drop(columns=date_cols)
    .sort_values(merge_keys + ['sample_id'], ascending=True)
    .reset_index(drop=True)
)

# Shapes of both inputs and of the merged table
shape_report = (
    ('Crossref:  {}', cref),
    ('PDX meta:  {}', pdx),
    ('1st merge: {}', df_mrg),
)
for label, table in shape_report:
    print(label.format(table.shape))

Crossref:  (593, 8)
PDX meta:  (97, 7)
1st merge: (584, 11)


In [6]:
# Merge with slides_meta: keep only the rows whose image_id appears in both tables.
df_final = pd.merge(df_mrg, slides_meta, on='image_id', how='inner')

# Shapes before and after the merge, followed by a three-row preview.
shape_report = (
    ('\n1st merge:   {}', df_mrg),
    ('slides_meta: {}', slides_meta),
    ('df_final:    {}\n', df_final),
)
for label, table in shape_report:
    print(label.format(table.shape))
pprint(df_final.head(3))


1st merge:   (584, 11)
slides_meta: (596, 5)
df_final:    (583, 15)

          model patient_id specimen_id  sample_id  image_id Notes  \
0  114434~197-R     114434       197-R     A35YC3     27166   NaN   
1  114434~197-R     114434       197-R     A36YC9     25127   NaN   
2  114434~197-R     114434       197-R  A38WG0JH1     29249   NaN   

  tumor_site_from_data_src         tumor_type_from_data_src  \
0          Musculoskeletal  Non-Rhabdo. soft tissue sarcoma   
1          Musculoskeletal  Non-Rhabdo. soft tissue sarcoma   
2          Musculoskeletal  Non-Rhabdo. soft tissue sarcoma   

  simplified_tumor_site simplified_tumor_type stage_or_grade  power  width  \
0       musculoskeletal  sarcoma/mesothelioma        grade 3     20  23904   
1       musculoskeletal  sarcoma/mesothelioma        grade 3     20  19920   
2       musculoskeletal  sarcoma/mesothelioma        grade 3     20  11952   

   height     MPP  
0   17995  0.5027  
1   17339  0.5027  
2   13573  0.5027  


In [None]:
# Writing the merged table is opt-in: set SAVE_MERGED to True to (re)create
# meta_merged.csv in the meta directory. The default of False matches the
# previously disabled save, so running the notebook does not overwrite an
# existing file.
SAVE_MERGED = False
if SAVE_MERGED:
    df_final.to_csv(metapath/'meta_merged.csv', index=False)
print('\nDone.')