Merge 3 metadata files to create `../data/meta/meta_merged.csv`:

1. `_ImageID_PDMRID_CrossRef.xlsx`:  metadata that comes with the PDX slides (`crossref`)
2. `PDX_Meta_Information.csv`:       metadata from Yitan (`pdx_meta`)
3. `meta_from_wsi_slides.csv`:       metadata extracted from the SVS slides using openslide (`slides_meta`)

Note! Before running this notebook, generate `meta_from_wsi_slides.csv` with `get_meta_from_slides.py` or `01_get_meta_from_slides.ipynb`.

Note! Yitan's file is missing some samples for which we do have histology slides.<br>
The missing samples lack either response data or expression data.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
from pathlib import Path
import glob
from pprint import pprint
import pandas as pd
import numpy as np

# Locate the notebook's working directory and put the sibling src/ folder on
# the import path so that the project-local `config` module can be imported.
fdir = Path.cwd()
print(fdir)
src_dir = fdir / '..' / 'src'
sys.path.append(str(src_dir))
from config import cfg

/vol/ml/apartin/projects/pdx-histo/nbs


In [2]:
# Path to the metadata folder: the 'meta' entry under cfg.DATADIR.
# The crossref, PDX-meta and slides-meta files are all read from here.
METAPATH = cfg.DATADIR/'meta'

Unnamed: 0,Model,Sample ID,Image ID,Capture Date,Date Loaded to BW_Transfers,Notes
0,114434~197-R,A38WG0JH1,29249.0,2017-01-05,2020-09-15,
1,114434~197-R,A38WG3J91,29250.0,2017-01-05,2020-09-15,


In [4]:
# Read the three metadata tables with the project loaders
from merge_meta_files import load_crossref, load_pdx_meta, load_slides_meta

cref = load_crossref(METAPATH / cfg.CROSSREF_FNAME)
pdx = load_pdx_meta(METAPATH / cfg.PDX_META_FNAME)
slides_meta = load_slides_meta(METAPATH / cfg.SLIDES_META_FNAME)

# Report the table sizes first, then preview the first two rows of each
previewed = (('Crossref', cref), ('PDX meta', pdx))
for label, frame in previewed:
    print('{}: {}'.format(label, frame.shape))
for _, frame in previewed:
    display(frame.head(2))

Crossref: (593, 8)
PDX meta: (97, 7)


Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,capture_date,date_loaded_to_bw_transfers,Notes
0,114434~197-R,114434,197-R,A38WG0JH1,29249,2017-01-05,2020-09-15,
1,114434~197-R,114434,197-R,A38WG3J91,29250,2017-01-05,2020-09-15,


Unnamed: 0,patient_id,specimen_id,tumor_site_from_data_src,tumor_type_from_data_src,simplified_tumor_site,simplified_tumor_type,stage_or_grade
0,135848,042-T,Digestive/Gastrointestinal,Adenocarcinoma - colon,digestive/gastrointestinal,digestive/gastrointestinal,grade 2
1,172845,121-B,Digestive/Gastrointestinal,Adenocarcinoma - colon,digestive/gastrointestinal,digestive/gastrointestinal,grade 2


In [5]:
# Merge crossref and pdx_meta:
# inner-join on (patient_id, specimen_id), discard the two date columns,
# then order the rows by patient, specimen and sample with a fresh index.
df_mrg = (
    cref
    .merge(pdx, how='inner', on=['patient_id', 'specimen_id'])
    .drop(columns=['capture_date', 'date_loaded_to_bw_transfers'])
    .sort_values(['patient_id', 'specimen_id', 'sample_id'], ascending=True)
    .reset_index(drop=True)
)

# Show how many rows survive the join
for template, frame in (
    ('Crossref:  {}', cref),
    ('PDX meta:  {}', pdx),
    ('1st merge: {}', df_mrg),
):
    print(template.format(frame.shape))

Crossref:  (593, 8)
PDX meta:  (97, 7)
1st merge: (584, 11)


In [6]:
# Merge with slides_meta:
# inner-join the first merge result with the per-slide metadata on image_id.
df_final = df_mrg.merge(slides_meta, on='image_id', how='inner')

# Compare the input sizes with the size of the final table, then preview it
shape_report = (
    ('\n1st merge:   {}', df_mrg),
    ('slides_meta: {}', slides_meta),
    ('df_final:    {}\n', df_final),
)
for template, frame in shape_report:
    print(template.format(frame.shape))
display(df_final.head(3))


1st merge:   (584, 11)
slides_meta: (596, 5)
df_final:    (583, 15)



Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,Notes,tumor_site_from_data_src,tumor_type_from_data_src,simplified_tumor_site,simplified_tumor_type,stage_or_grade,width,height,power,MPP
0,114434~197-R,114434,197-R,A35YC3,27166,,Musculoskeletal,Non-Rhabdo. soft tissue sarcoma,musculoskeletal,sarcoma/mesothelioma,grade 3,23904,17995,20,0.5027
1,114434~197-R,114434,197-R,A36YC9,25127,,Musculoskeletal,Non-Rhabdo. soft tissue sarcoma,musculoskeletal,sarcoma/mesothelioma,grade 3,19920,17339,20,0.5027
2,114434~197-R,114434,197-R,A38WG0JH1,29249,,Musculoskeletal,Non-Rhabdo. soft tissue sarcoma,musculoskeletal,sarcoma/mesothelioma,grade 3,11952,13573,20,0.5027


In [7]:
# Write the merged metadata table to METAPATH/'meta_merged.csv'.
# Saving is off by default so that re-running the notebook does not overwrite
# an existing file; set SAVE_MERGED = True to (re)generate it.
# (The previous commented-out save referred to `metapath`, which is not
# defined in this notebook; the constant is METAPATH.)
SAVE_MERGED = False

if SAVE_MERGED:
    outpath = METAPATH/'meta_merged.csv'
    df_final.to_csv(outpath, index=False)
    print('Saved: {}'.format(outpath))
print('\nDone.')


Done.
