Merge three metadata files to create `../data/meta/meta_merged.csv`:
1. `ImageID_PDMRID_CrossRef.csv` Meta that came with PDX images
2. `PDX_Meta_Information.csv` Meta from Yitan
3. `meta_from_wsi_images.csv` Meta that I (ap) extracted from WSI (SVS) images using histolab

Note! Before you run this, you need to generate `meta_from_wsi_images.csv` with `01-get-meta-from-images.ipynb`.

Note! Yitan's file has some missing samples for which we do have the images.<br>
The missing samples either don't have response data or expression data.

In [2]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
from glob import glob
from pprint import pprint

dirpath = Path.cwd()
print(dirpath)

/vol/ml/apartin/projects/pdx-histo/nbs


# Part 1 - Merge meta files 1 and 2

## Load data
Loads two meta files.

In [3]:
# Data path
datapath = dirpath/'../data'
metapath = datapath/'meta'

# PDX image meta (from NCI/Globus)
df_img = pd.read_csv(metapath/'ImageID_PDMRID_CrossRef.csv')
display(df_img[:2])

Unnamed: 0,Model,Sample ID,Image ID,Capture Date,Date Loaded to BW_Transfers,Notes
0,114434~197-R,A38WG0JH1,29249,1/5/17,9/15/20,
1,114434~197-R,A38WG3J91,29250,1/5/17,9/15/20,


In [4]:
# [x for x in df_img['model'].values if '~' not in x ] #.map(lambda x: x.split('~')[1])

In [5]:
df_img = df_img.rename(columns={'Model': 'model'})
df_img = df_img[df_img['model'].notna()]
df_img = df_img.rename(columns={'Sample ID': 'sample_id', 
                                'Image ID': 'image_id',
                                'Capture Date': 'capture_date',
                                'Date Loaded to BW_Transfers': 'date_loaded_to_bw_transfers'})
df_img.insert(loc=1, column='patient_id',  value=df_img['model'].map(lambda x: x.split('~')[0]), allow_duplicates=True)
df_img.insert(loc=2, column='specimen_id', value=df_img['model'].map(lambda x: x.split('~')[1]), allow_duplicates=True)

# PDX Meta (from Yitan)
# Yitan doesn't have sample_id (??)
df_typ = pd.read_csv(metapath/'PDX_Meta_Information.csv')
print('Image meta', df_img.shape)
print('Yitan meta', df_typ.shape)

display(df_img[:2])
display(df_typ[:2])

Image meta (593, 8)
Yitan meta (97, 7)


Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,capture_date,date_loaded_to_bw_transfers,Notes
0,114434~197-R,114434,197-R,A38WG0JH1,29249,1/5/17,9/15/20,
1,114434~197-R,114434,197-R,A38WG3J91,29250,1/5/17,9/15/20,


Unnamed: 0,patient_id,specimen_id,tumor_site_from_data_src,tumor_type_from_data_src,simplified_tumor_site,simplified_tumor_type,stage_or_grade
0,135848,042-T,Digestive/Gastrointestinal,Adenocarcinoma - colon,digestive/gastrointestinal,digestive/gastrointestinal,grade 2
1,172845,121-B,Digestive/Gastrointestinal,Adenocarcinoma - colon,digestive/gastrointestinal,digestive/gastrointestinal,grade 2


In [6]:
# Drop items w/o images
df_img = df_img.dropna(subset=['image_id'])
df_img = df_img.astype({'image_id': int})
print(df_img.shape)
display(df_img[:2])

(593, 8)


Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,capture_date,date_loaded_to_bw_transfers,Notes
0,114434~197-R,114434,197-R,A38WG0JH1,29249,1/5/17,9/15/20,
1,114434~197-R,114434,197-R,A38WG3J91,29250,1/5/17,9/15/20,


In [7]:
c = 'tumor_site_from_data_src'
print(df_typ[c].nunique())
tt = df_typ.groupby([c]).agg({'patient_id': 'nunique', 'specimen_id': 'nunique'}).reset_index()
tt

9


Unnamed: 0,tumor_site_from_data_src,patient_id,specimen_id
0,Digestive/Gastrointestinal,25,31
1,Endocrine and Neuroendocrine,3,3
2,Genitourinary,11,12
3,Gynecologic,6,6
4,Head and Neck,8,8
5,Musculoskeletal,18,17
6,Respiratory/Thoracic,8,7
7,Skin,9,8
8,Unknown Primary,2,2


## Explore data (but don't modify)

In [8]:
# Subset the columns
df1 = df_img[['model', 'patient_id', 'specimen_id', 'sample_id', 'image_id']]
df2 = df_typ[['patient_id', 'specimen_id', 'stage_or_grade']]

display(df1[:2])
display(df2[:2])

Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id
0,114434~197-R,114434,197-R,A38WG0JH1,29249
1,114434~197-R,114434,197-R,A38WG3J91,29250


Unnamed: 0,patient_id,specimen_id,stage_or_grade
0,135848,042-T,grade 2
1,172845,121-B,grade 2


In [9]:
# Merge meta files
df = df1.merge(df2, on=['patient_id', 'specimen_id'], how='inner').reset_index(drop=True)

print(df1.shape)
print(df2.shape)
print(df.shape)

(593, 5)
(97, 3)
(584, 6)


Note that some items are missing in Yitan's file.<br>
The missing samples either don't have response or expression data.

In [10]:
# Explore (merge and identify from which df the items are coming from)
# https://kanoki.org/2019/07/04/pandas-difference-between-two-dataframes/
df = df1.merge(df2, on=['patient_id', 'specimen_id'], how='outer', indicator=True)
print('Inner merge', df.shape)
display(df[:2])

print('In both         ', df[df['_merge']=='both'].shape)
print('In left or right', df[df['_merge']!='both'].shape)

# Find which items are missing in Yitan's file
# df = df1.merge(df2, on=['patient_id', 'specimen_id'], how='outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']
df_miss = df1.merge(df2, on=['patient_id', 'specimen_id'], how='outer', indicator=True).loc[lambda x : x['_merge']=='left_only']
df_miss = df_miss.sort_values(['patient_id', 'specimen_id'], ascending=True)
print('\nMissing items', df_miss.shape)
display(df_miss)

Inner merge (593, 7)


Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,stage_or_grade,_merge
0,114434~197-R,114434,197-R,A38WG0JH1,29249,grade 3,both
1,114434~197-R,114434,197-R,A38WG3J91,29250,grade 3,both


In both          (584, 7)
In left or right (9, 7)

Missing items (9, 7)


Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,stage_or_grade,_merge
45,146476~266-R,146476,266-R,ORIGINATOR,13946,,left_only
46,146476~266-R,146476,266-R,E19YJ0,20252,,left_only
47,146476~266-R,146476,266-R,E19YJ1,20307,,left_only
48,146476~266-R,146476,266-R,E20F08F67,20976,,left_only
49,146476~266-R,146476,266-R,E20F07,21545,,left_only
148,237351~077-R,237351,077-R,AL-IR0_AL-M977,20467,,left_only
149,237351~077-R,237351,077-R,AL-IR0,20729,,left_only
150,237351~077-R,237351,077-R,AL-IR0_AL-M977_AL-M302,23429,,left_only
151,237351~077-R,237351,077-R,AL-IR0_AL-M977_AL-M303_AL-ER0,26305,,left_only


## Merge the two files

In [11]:
# Merge meta files
df_mrg = df_img.merge(df_typ, on=['patient_id', 'specimen_id'], how='inner').reset_index(drop=True)

# Drop cols
df_mrg = df_mrg.drop(columns=['capture_date', 'date_loaded_to_bw_transfers'])

# Sort
df_mrg = df_mrg.sort_values(['patient_id', 'specimen_id', 'sample_id'], ascending=True).reset_index(drop=True)

print(df1.shape)
print(df2.shape)
print(df_mrg.shape)
display(df_mrg[:2])

(593, 5)
(97, 3)
(584, 11)


Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,Notes,tumor_site_from_data_src,tumor_type_from_data_src,simplified_tumor_site,simplified_tumor_type,stage_or_grade
0,114434~197-R,114434,197-R,A35YC3,27166,,Musculoskeletal,Non-Rhabdo. soft tissue sarcoma,musculoskeletal,sarcoma/mesothelioma,grade 3
1,114434~197-R,114434,197-R,A36YC9,25127,,Musculoskeletal,Non-Rhabdo. soft tissue sarcoma,musculoskeletal,sarcoma/mesothelioma,grade 3


## Stats

In [12]:
aa = df_mrg.groupby(['patient_id', 'specimen_id']).agg({'image_id': 'nunique'}).reset_index().sort_values(by=['patient_id', 'specimen_id'])
print('Total images', aa['image_id'].sum())
display(aa)

Total images 584


Unnamed: 0,patient_id,specimen_id,image_id
0,114434,197-R,6
1,114551,080-T,9
2,119177,322-R1,6
3,128128,338-R,6
4,135848,042-T,6
...,...,...,...
92,BL0293,F563,5
93,BL0382,F1232,5
94,BL0479,F1894,8
95,CN0330,F216,6


In [13]:
c = 'tumor_site_from_data_src'
print(df_mrg[c].nunique())
tt = df_mrg.groupby([c]).agg({'patient_id': 'nunique', 'specimen_id': 'nunique', 'image_id': 'nunique'}).reset_index()
tt

9


Unnamed: 0,tumor_site_from_data_src,patient_id,specimen_id,image_id
0,Digestive/Gastrointestinal,25,31,180
1,Endocrine and Neuroendocrine,3,3,18
2,Genitourinary,11,12,73
3,Gynecologic,6,6,38
4,Head and Neck,8,8,52
5,Musculoskeletal,18,17,110
6,Respiratory/Thoracic,8,7,50
7,Skin,9,8,51
8,Unknown Primary,2,2,12


In [14]:
# print(df_mrg['simplified_tumor_site'].nunique())
# df_mrg['simplified_tumor_site'].value_counts()

In [15]:
c = 'tumor_type_from_data_src'
print(df_mrg[c].nunique())
# tt = df_mrg[c].value_counts().reset_index().sort_values('index')
tt = df_mrg.groupby(['tumor_type_from_data_src']).agg({'patient_id': 'nunique', 'specimen_id': 'nunique', 'image_id': 'nunique'}).reset_index()
tt

37


Unnamed: 0,tumor_type_from_data_src,patient_id,specimen_id,image_id
0,Adenocarcinoma - cervix,1,1,6
1,Adenocarcinoma - colon,10,13,77
2,Adenocarcinoma - pancreas,9,12,74
3,Adenocarcinoma - rectum,2,2,8
4,Adenocarcinoma - small intest.,1,1,7
5,Carcinosarcoma of the uterus,2,2,12
6,Chondrosarcoma,1,1,7
7,"Colorectal cancer, NOS",1,1,4
8,Ewing sarcoma/Peripheral PNET,2,2,12
9,"Female reprod. system cancer, NOS",1,1,7


In [16]:
# print(df_mrg['simplified_tumor_type'].nunique())
# df_mrg['simplified_tumor_type'].value_counts()

In [17]:
df_mrg['stage_or_grade'].value_counts()

grade 3    211
grade 2    210
grade 1     47
Name: stage_or_grade, dtype: int64

# Part 2 - Merge with meta that we extracted from images

In [16]:
meta_img = pd.read_csv(metapath/'meta_from_wsi_images.csv')
col_rename = {'aperio.ImageID': 'image_id',
              'aperio.MPP': 'MPP',
              'openslide.level[0].height': 'height',
              'openslide.level[0].width': 'width',
              'openslide.objective-power': 'power'
             }
meta_img = meta_img.rename(columns=col_rename)
print(meta_img.shape)

(469, 45)


In [17]:
cols = set(meta_img.columns).intersection(set(list(col_rename.values())))
cols = ['image_id'] + [c for c in cols if c != 'image_id']
meta_img = meta_img[cols]
meta_img[:3]

Unnamed: 0,image_id,MPP,width,power,height
0,10060,0.5006,15935,20,15220
1,10065,0.5006,13943,20,18556
2,10096,0.5006,25895,20,18922


In [18]:
# Finally, merge
df_final = df_mrg.merge(meta_img, how='inner', on='image_id')

print(df_mrg.shape)
print(meta_img.shape)
print(df_final.shape)
df_final[:3]

(461, 11)
(469, 5)
(458, 15)


Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,Notes,tumor_site_from_data_src,tumor_type_from_data_src,simplified_tumor_site,simplified_tumor_type,stage_or_grade,MPP,width,power,height
0,114434~197-R,114434,197-R,A35YC3,27166,,Musculoskeletal,Non-Rhabdo. soft tissue sarcoma,musculoskeletal,sarcoma/mesothelioma,grade 3,0.5027,23904,20,17995
1,114434~197-R,114434,197-R,A36YC9,25127,,Musculoskeletal,Non-Rhabdo. soft tissue sarcoma,musculoskeletal,sarcoma/mesothelioma,grade 3,0.5027,19920,20,17339
2,114434~197-R,114434,197-R,A38WG0JH1,29249,,Musculoskeletal,Non-Rhabdo. soft tissue sarcoma,musculoskeletal,sarcoma/mesothelioma,grade 3,0.5027,11952,20,13573


In [19]:
df_final.to_csv(metapath/'meta_merged.csv', index=False)