Merge three metadata files to create `../data/meta_merged.csv`:
1. `ImageID_PDMRID_CrossRef.csv` Meta that came with PDX images
2. `PDX_Meta_Information.csv` Meta from Yitan
3. `meta_from_wsi_images.csv` Meta that I (ap) extracted from WSI (SVS) images using histolab

Note! Before you run this, you need to generate `meta_from_wsi_images.csv` with `01-get-meta-from-images.ipynb`.

Note! Yitan's file has some missing samples for which we have images.<br>
The missing samples either don't have response or expression data.

In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
from glob import glob

dirpath = Path.cwd()
print(dirpath)

/Users/apartin/work/jdacs/pdx-histo/nbs


# Part 1 - Merge meta files 1 and 2

## Load data
Loads two meta files.

In [2]:
# Data path
datapath = dirpath/'../data'

In [3]:
# PDX image meta (from NCI/Globus)
df_img = pd.read_csv(datapath/'ImageID_PDMRID_CrossRef.csv')
df_img = df_img.rename(columns={'Model': 'model'})
df_img.insert(loc=1, column='patient_id',  value=df_img['model'].map(lambda x: x.split('~')[0]), allow_duplicates=True)
df_img.insert(loc=2, column='specimen_id', value=df_img['model'].map(lambda x: x.split('~')[1]), allow_duplicates=True)

# PDX Meta (from Yitan)
# Yitan doesn't have sample_id (??)
df_typ = pd.read_csv(datapath/'PDX_Meta_Information.csv')

print('Image meta', df_img.shape)
print('Yitan meta', df_typ.shape)

display(df_img[:2])
display(df_typ[:2])

Image meta (594, 7)
Yitan meta (97, 7)


Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,capture_date,date_loaded_to_bw_transfers
0,114434~197-R,114434,197-R,A36YC9,25127.0,9/16/16,9/17/20
1,114434~197-R,114434,197-R,A38WG5,26591.0,11/2/16,9/17/20


Unnamed: 0,patient_id,specimen_id,tumor_site_from_data_src,tumor_type_from_data_src,simplified_tumor_site,simplified_tumor_type,stage_or_grade
0,135848,042-T,Digestive/Gastrointestinal,Adenocarcinoma - colon,digestive/gastrointestinal,digestive/gastrointestinal,grade 2
1,172845,121-B,Digestive/Gastrointestinal,Adenocarcinoma - colon,digestive/gastrointestinal,digestive/gastrointestinal,grade 2


In [4]:
# Drop items w/o images
df_img = df_img.dropna(subset=['image_id'])
df_img = df_img.astype({'image_id': int})
print(df_img.shape)
display(df_img[:2])

(146, 7)


Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,capture_date,date_loaded_to_bw_transfers
0,114434~197-R,114434,197-R,A36YC9,25127,9/16/16,9/17/20
1,114434~197-R,114434,197-R,A38WG5,26591,11/2/16,9/17/20


## Explore data
Explore but don't modify the original data.

In [5]:
# Subset the columns
df1 = df_img[['model', 'patient_id', 'specimen_id', 'sample_id', 'image_id']]
df2 = df_typ[['patient_id', 'specimen_id', 'stage_or_grade']]

display(df1[:2])
display(df2[:2])

Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id
0,114434~197-R,114434,197-R,A36YC9,25127
1,114434~197-R,114434,197-R,A38WG5,26591


Unnamed: 0,patient_id,specimen_id,stage_or_grade
0,135848,042-T,grade 2
1,172845,121-B,grade 2


In [6]:
# Merge meta files
df = df1.merge(df2, on=['patient_id', 'specimen_id'], how='inner').reset_index(drop=True)

print(df1.shape)
print(df2.shape)
print(df.shape)

(146, 5)
(97, 3)
(137, 6)


Note that some items are missing in Yitan's file.<br>
The missing samples either don't have response or expression data.

In [7]:
# Explore (merge and identify from which df the items are coming from)
# https://kanoki.org/2019/07/04/pandas-difference-between-two-dataframes/
df = df1.merge(df2, on=['patient_id', 'specimen_id'], how='outer', indicator=True)
print('Inner merge', df.shape)
display(df[:2])

print('In both         ', df[df['_merge']=='both'].shape)
print('In left or right', df[df['_merge']!='both'].shape)

# Find which items are missing in Yitan's file
# df = df1.merge(df2, on=['patient_id', 'specimen_id'], how='outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']
df_miss = df1.merge(df2, on=['patient_id', 'specimen_id'], how='outer', indicator=True).loc[lambda x : x['_merge']=='left_only']
df_miss = df_miss.sort_values(['patient_id', 'specimen_id'], ascending=True)
print('\nMissing items', df_miss.shape)
display(df_miss)

Inner merge (221, 7)


Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,stage_or_grade,_merge
0,114434~197-R,114434,197-R,A36YC9,25127.0,grade 3,both
1,114434~197-R,114434,197-R,A38WG5,26591.0,grade 3,both


In both          (137, 7)
In left or right (84, 7)

Missing items (9, 7)


Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,stage_or_grade,_merge
39,146476~266-R,146476,266-R,ORIGINATOR,13946.0,,left_only
40,146476~266-R,146476,266-R,E19YJ0,20252.0,,left_only
41,146476~266-R,146476,266-R,E19YJ1,20307.0,,left_only
42,146476~266-R,146476,266-R,E20F08F67,20976.0,,left_only
43,146476~266-R,146476,266-R,E20F07,21545.0,,left_only
142,237351~077-R,237351,077-R,AL-IR0_AL-M977,20467.0,,left_only
143,237351~077-R,237351,077-R,AL-IR0,20729.0,,left_only
144,237351~077-R,237351,077-R,AL-IR0_AL-M977_AL-M302,23429.0,,left_only
145,237351~077-R,237351,077-R,AL-IR0_AL-M977_AL-M303_AL-ER0,26305.0,,left_only


## Merge meta files
Merge the two files and save.

In [8]:
# Merge meta files
df_mrg = df_img.merge(df_typ, on=['patient_id', 'specimen_id'], how='inner').reset_index(drop=True)

# Drop cols
df_mrg = df_mrg.drop(columns=['capture_date', 'date_loaded_to_bw_transfers'])

# Sort
df_mrg = df_mrg.sort_values(['patient_id', 'specimen_id', 'sample_id'], ascending=True).reset_index(drop=True)

print(df1.shape)
print(df2.shape)
print(df_mrg.shape)
display(df_mrg[:2])

(146, 5)
(97, 3)
(137, 10)


Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,tumor_site_from_data_src,tumor_type_from_data_src,simplified_tumor_site,simplified_tumor_type,stage_or_grade
0,114434~197-R,114434,197-R,A35YC3,27166,Musculoskeletal,Non-Rhabdo. soft tissue sarcoma,musculoskeletal,sarcoma/mesothelioma,grade 3
1,114434~197-R,114434,197-R,A36YC9,25127,Musculoskeletal,Non-Rhabdo. soft tissue sarcoma,musculoskeletal,sarcoma/mesothelioma,grade 3


## Stats
Show some counts.

In [9]:
aa = df_mrg.groupby(['patient_id', 'specimen_id']).agg({'image_id': 'nunique'}).reset_index().sort_values(by=['patient_id', 'specimen_id'])
print('Total images', aa['image_id'].sum())
display(aa)

Total images 137


Unnamed: 0,patient_id,specimen_id,image_id
0,114434,197-R,6
1,114551,080-T,9
2,119177,322-R1,6
3,135848,042-T,6
4,138582,337-R,6
5,144126,210-T,6
6,156681,154-R,5
7,165739,295-R,7
8,172845,121-B,6
9,172845,121-T,6


In [10]:
c = 'tumor_site_from_data_src'
print(df_mrg[c].nunique())
# tt = df_mrg[c].value_counts().reset_index().sort_values('index')
tt = df_mrg.groupby([c]).agg({'patient_id': 'nunique', 'specimen_id': 'nunique', 'image_id': 'nunique'}).reset_index()
tt

7


Unnamed: 0,tumor_site_from_data_src,patient_id,specimen_id,image_id
0,Digestive/Gastrointestinal,7,10,62
1,Endocrine and Neuroendocrine,1,1,6
2,Genitourinary,1,1,6
3,Gynecologic,2,2,13
4,Head and Neck,2,2,14
5,Musculoskeletal,3,3,19
6,Skin,3,3,17


In [11]:
# print(df_mrg['simplified_tumor_site'].nunique())
# df_mrg['simplified_tumor_site'].value_counts()

In [12]:
c = 'tumor_type_from_data_src'
print(df_mrg[c].nunique())
# tt = df_mrg[c].value_counts().reset_index().sort_values('index')
tt = df_mrg.groupby(['tumor_type_from_data_src']).agg({'patient_id': 'nunique', 'specimen_id': 'nunique', 'image_id': 'nunique'}).reset_index()
tt

13


Unnamed: 0,tumor_type_from_data_src,patient_id,specimen_id,image_id
0,Adenocarcinoma - cervix,1,1,6
1,Adenocarcinoma - colon,4,7,42
2,Adenocarcinoma - pancreas,2,2,14
3,"Female reprod. system cancer, NOS",1,1,7
4,"Gastric cancer, NOS",1,1,6
5,"H & N squamous cell car., NOS",1,1,5
6,Melanoma,2,2,11
7,Merkel cell tumor,1,1,6
8,"Neuroendocrine cancer, NOS",1,1,6
9,Non-Rhabdo. soft tissue sarcoma,1,1,6


In [13]:
# print(df_mrg['simplified_tumor_type'].nunique())
# df_mrg['simplified_tumor_type'].value_counts()

In [14]:
df_mrg['stage_or_grade'].value_counts()

grade 2    70
grade 3    25
Name: stage_or_grade, dtype: int64

# Part 2 - Merge with meta that we extracted from images

In [15]:
meta_img = pd.read_csv(datapath/'meta_from_wsi_images.csv')
col_rename = {'aperio.ImageID': 'image_id',
              'aperio.MPP': 'MPP',
              'openslide.level[0].height': 'height',
              'openslide.level[0].width': 'width',
              'openslide.objective-power': 'power'
             }
meta_img = meta_img.rename(columns=col_rename)
print(meta_img.shape)

(146, 45)


In [16]:
cols = set(meta_img.columns).intersection(set(list(col_rename.values())))
cols = ['image_id'] + [c for c in cols if c != 'image_id']
meta_img = meta_img[cols]
meta_img[:3]

Unnamed: 0,image_id,power,MPP,width,height
0,10394,20,0.5006,21911,24376
1,10545,20,0.5006,19919,21595
2,10547,20,0.5006,13943,11944


In [17]:
# Finally, merge
df_final = df_mrg.merge(meta_img, how='inner', on='image_id')

print(df_mrg.shape)
print(meta_img.shape)
print(df_final.shape)
df_final[:3]

(137, 10)
(146, 5)
(137, 14)


Unnamed: 0,model,patient_id,specimen_id,sample_id,image_id,tumor_site_from_data_src,tumor_type_from_data_src,simplified_tumor_site,simplified_tumor_type,stage_or_grade,power,MPP,width,height
0,114434~197-R,114434,197-R,A35YC3,27166,Musculoskeletal,Non-Rhabdo. soft tissue sarcoma,musculoskeletal,sarcoma/mesothelioma,grade 3,20,0.5027,23904,17995
1,114434~197-R,114434,197-R,A36YC9,25127,Musculoskeletal,Non-Rhabdo. soft tissue sarcoma,musculoskeletal,sarcoma/mesothelioma,grade 3,20,0.5027,19920,17339
2,114434~197-R,114434,197-R,A38WG0JH1,29249,Musculoskeletal,Non-Rhabdo. soft tissue sarcoma,musculoskeletal,sarcoma/mesothelioma,grade 3,20,0.5027,11952,13573


In [18]:
df_final.to_csv(datapath/'meta_merged.csv', index=False)