<a href="https://colab.research.google.com/github/naomifridman/BreastDCEDL/blob/main/ISPY2/BreastDCEDL_ISPY2_tcia_metadata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ISPY2 - Read TCIA metadata
## Harmonize ISPY2 medical data into BreastDCEDL
#### Author: Bubby Solway
#### Date: 2025-02-20
> BreastDCEDL/ISPY2/BreastDCEDL_ISPY2_tcia_metadata.ipynb
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/naomifridman/BreastDCEDL/blob/main/ISPY2/BreastDCEDL_ISPY2_tcia_metadata.ipynb)

In [None]:
from sklearn.metrics import classification_report,auc,roc_auc_score
from PIL import Image
import time
from pathlib import Path


import os
import numpy as np
import pandas as pd
from PIL import Image


import warnings
warnings.filterwarnings('ignore', '.*do not.*', )
warnings.warn('Do not show this message')

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from glob import glob
#from skimage import io
from sklearn.utils import shuffle

#from nipype.interfaces.ants import N4BiasFieldCorrection
import sys
import os
import ast

import warnings
warnings.filterwarnings('ignore')

# Utils

In [None]:
# Check if running in Google Colab

if 'google.colab' in str(get_ipython()):
    print("Running in Google Colab")
    # Clone the repository
    !git clone https://github.com/naomifridman/BreastDCEDL.git
    !pip install pydicom

    # Change to the repository directory
    os.chdir('/content/BreastDCEDL/ISPY2/')


# Load medical data

In [None]:
# Clinical spreadsheet shipped with the TCIA download (one row per patient)
clinical_xlsx = os.path.join('TCIA_metadata', 'ISPY2-Imaging-Cohort-1-Clinical-Data.xlsx')
df_med = pd.read_excel(clinical_xlsx)
df_med

Unnamed: 0,Patient_ID,Arm,HR,HER2,MP,pCR,Age_at_Screening,Race,menopausal_status,ethnicity
0,756412,Paclitaxel + ABT 888 + Carboplatin,1,0,0,0,46.0,White,Premenopausal(< 6 months since LMP AND no prio...,Not Hispanic or Latino
1,111881,Paclitaxel + Neratinib,1,1,1,1,37.0,White,Premenopausal(<6 months since LMP AND no prior...,Not Hispanic or Latino
2,451816,Paclitaxel + ABT 888 + Carboplatin,1,0,0,0,43.0,White,Premenopausal(< 6 months since LMP AND no prio...,Not Hispanic or Latino
3,243836,Paclitaxel + Neratinib,1,0,1,1,31.0,White,Premenopausal(< 6 months since LMP AND no prio...,Not Hispanic or Latino
4,402265,Paclitaxel + Trastuzumab,1,1,0,0,56.0,White,Postmenopausal (prior bilateral ovariectomy OR...,Not Hispanic or Latino
...,...,...,...,...,...,...,...,...,...,...
980,231633,Paclitaxel + Pembrolizumab,0,0,1,1,53.0,White,Postmenopausal (prior bilateral ovariectomy OR...,Not Hispanic or Latino
981,892390,Paclitaxel + Pembrolizumab,0,0,1,1,59.0,White,Postmenopausal (prior bilateral ovariectomy OR...,Not Hispanic or Latino
982,666715,Paclitaxel + Ganetespib,1,0,0,0,32.0,White,Premenopausal(<6 months since LMP AND no prior...,Hispanic or Latino
983,241998,Paclitaxel,1,0,0,0,54.0,Black or African American,Postmenopausal (prior bilateral ovariectomy OR...,Not Hispanic or Latino


In [None]:
# Number of patients per treatment arm (missing arms included)
df_med['Arm'].value_counts(dropna=False)

Paclitaxel                               178
Paclitaxel + AMG 386                     115
Paclitaxel + Neratinib                   114
Paclitaxel + Ganitumab                   106
Paclitaxel + Ganetespib                   93
Paclitaxel + ABT 888 + Carboplatin        70
Paclitaxel + Pembrolizumab                69
Paclitaxel + MK-2206                      60
T-DM1 + Pertuzumab                        52
Paclitaxel + Pertuzumab + Trastuzumab     44
Paclitaxel + MK-2206 + Trastuzumab        34
Paclitaxel + Trastuzumab                  31
Paclitaxel + AMG 386 + Trastuzumab        19
Name: Arm, dtype: int64

In [None]:
# Split each arm string (e.g. "Paclitaxel + Neratinib") into a list of drug names.
# The separator is a regular expression, so it is written as a raw string:
# in a normal string literal '\+' is an invalid escape sequence.
df_med['Arm_split'] = df_med['Arm'].str.split(r' \+ ')

# Sorted list of every distinct drug that appears in any arm
all_drugs = sorted({drug for arm_drugs in df_med['Arm_split'].dropna() for drug in arm_drugs})

# One 0/1 indicator column per drug; rows whose arm is not a list (missing) get 0
for drug in all_drugs:
    df_med[drug] = df_med['Arm_split'].apply(lambda x: int(drug in x) if isinstance(x, list) else 0)
df_med.head()

Unnamed: 0,Patient_ID,Arm,HR,HER2,MP,pCR,Age_at_Screening,Race,menopausal_status,ethnicity,...,Carboplatin,Ganetespib,Ganitumab,MK-2206,Neratinib,Paclitaxel,Pembrolizumab,Pertuzumab,T-DM1,Trastuzumab
0,756412,Paclitaxel + ABT 888 + Carboplatin,1,0,0,0,46.0,White,Premenopausal(< 6 months since LMP AND no prio...,Not Hispanic or Latino,...,1,0,0,0,0,1,0,0,0,0
1,111881,Paclitaxel + Neratinib,1,1,1,1,37.0,White,Premenopausal(<6 months since LMP AND no prior...,Not Hispanic or Latino,...,0,0,0,0,1,1,0,0,0,0
2,451816,Paclitaxel + ABT 888 + Carboplatin,1,0,0,0,43.0,White,Premenopausal(< 6 months since LMP AND no prio...,Not Hispanic or Latino,...,1,0,0,0,0,1,0,0,0,0
3,243836,Paclitaxel + Neratinib,1,0,1,1,31.0,White,Premenopausal(< 6 months since LMP AND no prio...,Not Hispanic or Latino,...,0,0,0,0,1,1,0,0,0,0
4,402265,Paclitaxel + Trastuzumab,1,1,0,0,56.0,White,Postmenopausal (prior bilateral ovariectomy OR...,Not Hispanic or Latino,...,0,0,0,0,0,1,0,0,0,1


In [None]:
# Distribution of the AMG 386 indicator column created above
df_med['AMG 386'].value_counts()

0    851
1    134
Name: AMG 386, dtype: int64

### T‑DM1
T‑DM1 is an antibody–drug conjugate rather than a traditional chemotherapeutic agent, but when it is combined with pertuzumab and administered before surgery, it is used as a neoadjuvant treatment. For example, the KRISTINE trial evaluated T‑DM1 plus pertuzumab in the neoadjuvant setting for HER2-positive early breast cancer. So, although it isn’t conventional chemotherapy, it is considered a neoadjuvant regimen when given preoperatively.

In [None]:
# Column names after the per-drug indicator columns were added
df_med.columns

Index(['Patient_ID', 'Arm', 'HR', 'HER2', 'MP', 'pCR', 'Age_at_Screening',
       'Race', 'menopausal_status', 'ethnicity', 'Arm_split', 'ABT 888',
       'AMG 386', 'Carboplatin', 'Ganetespib', 'Ganitumab', 'MK-2206',
       'Neratinib', 'Paclitaxel', 'Pembrolizumab', 'Pertuzumab', 'T-DM1',
       'Trastuzumab'],
      dtype='object')

In [None]:
# Names of the per-drug 0/1 indicator columns, in the order used by the summaries below
drag_cc = [
    'AMG 386', 'Ganetespib', 'Neratinib', 'T-DM1',
    'Ganitumab', 'Trastuzumab', 'ABT 888', 'Carboplatin',
    'Paclitaxel', 'MK-2206', 'Pembrolizumab', 'Pertuzumab',
]

In [None]:
# One summary line per drug: rows flagged 1, rows flagged 0, and missing rows
for drug in drag_cc:
    flags = df_med[drug]
    n_one = (flags == 1).sum()
    n_zero = (flags == 0).sum()
    n_missing = flags.isna().sum()
    print(f"{drug}: #1 = {n_one}, #0 = {n_zero}, #NaN = {n_missing}")

AMG 386: #1 = 134, #0 = 851, #NaN = 0
Ganetespib: #1 = 93, #0 = 892, #NaN = 0
Neratinib: #1 = 114, #0 = 871, #NaN = 0
T-DM1: #1 = 52, #0 = 933, #NaN = 0
Ganitumab: #1 = 106, #0 = 879, #NaN = 0
Trastuzumab: #1 = 128, #0 = 857, #NaN = 0
ABT 888: #1 = 70, #0 = 915, #NaN = 0
Carboplatin: #1 = 70, #0 = 915, #NaN = 0
Paclitaxel: #1 = 933, #0 = 52, #NaN = 0
MK-2206: #1 = 94, #0 = 891, #NaN = 0
Pembrolizumab: #1 = 69, #0 = 916, #NaN = 0
Pertuzumab: #1 = 96, #0 = 889, #NaN = 0


In [None]:
# Same per-drug counts as above, laid out as an aligned text table
row_template = "{:<15} {:>5} {:>5} {:>5}"
header = row_template.format("Drug", "#1", "#0", "#NaN")
print(header)
print("-" * len(header))
for drug in drag_cc:
    flags = df_med[drug]
    print(row_template.format(drug, (flags == 1).sum(), (flags == 0).sum(), flags.isna().sum()))

Drug               #1    #0  #NaN
---------------------------------
AMG 386           134   851     0
Ganetespib         93   892     0
Neratinib         114   871     0
T-DM1              52   933     0
Ganitumab         106   879     0
Trastuzumab       128   857     0
ABT 888            70   915     0
Carboplatin        70   915     0
Paclitaxel        933    52     0
MK-2206            94   891     0
Pembrolizumab      69   916     0
Pertuzumab         96   889     0


In [None]:
# Full value_counts (NaN included) for every per-drug indicator column.
# Iterates over drag_cc instead of repeating the same list of names; the import that
# used to sit inside the loop body was redundant (pandas is imported at the top).
for c in drag_cc:
    print('=======', c)
    print(df_med[c].value_counts(dropna=False))



0    851
1    134
Name: AMG 386, dtype: int64
0    892
1     93
Name: Ganetespib, dtype: int64
0    871
1    114
Name: Neratinib, dtype: int64
0    933
1     52
Name: T-DM1, dtype: int64
0    879
1    106
Name: Ganitumab, dtype: int64
0    857
1    128
Name: Trastuzumab, dtype: int64
0    915
1     70
Name: ABT 888, dtype: int64
0    915
1     70
Name: Carboplatin, dtype: int64
1    933
0     52
Name: Paclitaxel, dtype: int64
0    891
1     94
Name: MK-2206, dtype: int64
0    916
1     69
Name: Pembrolizumab, dtype: int64
0    889
1     96
Name: Pertuzumab, dtype: int64


In [None]:
# Raw ethnicity labels before they are encoded
df_med['ethnicity'].value_counts()

Not Hispanic or Latino    863
Hispanic or Latino        121
Name: ethnicity, dtype: int64

In [None]:
# Encode ethnicity as a 0/1 flag (1 = 'Hispanic or Latino') and rename the column
# to e_hispanic_latino. Labels outside the mapping, and missing values, become NaN.
# The guard makes the cell safe to re-run: mapping the already-encoded 0/1 values
# a second time would turn the whole column into NaN.
ethnicity_codes = {'Not Hispanic or Latino': 0,
                   'Hispanic or Latino': 1}
if 'ethnicity' in df_med.columns:
    df_med['ethnicity'] = df_med['ethnicity'].map(ethnicity_codes)
    df_med = df_med.rename(columns={'ethnicity': 'e_hispanic_latino'})

In [None]:
# Raw Race labels; note the inconsistent separators and the truncated label
df_med['Race'].value_counts()

White                                        780
Black or African American                    118
Asian                                         68
Native Hawaiian or Pacific Islander            4
American Indian or Alaska Native               4
Asian;White                                    4
Native Hawaiian or Other Pacific Islande       1
Asian,White                                    1
Native Hawaiian or Pacific Islander;White      1
American Indian or Alaska Native;White         1
Name: Race, dtype: int64

In [None]:
# Race label -> 1 when the label includes "White" (alone or in a multi-race entry), else 0.
# Keys reproduce the raw spellings found in the Race column, including the truncated
# "Native Hawaiian or Other Pacific Islande" and both the ";" and "," separators.
white_mapping = {
    **dict.fromkeys(
        (
            "White",
            "Asian;White",
            "Asian,White",
            "Native Hawaiian or Pacific Islander;White",
            "American Indian or Alaska Native;White",
        ),
        1,
    ),
    **dict.fromkeys(
        (
            "Black or African American",
            "Asian",
            "Native Hawaiian or Pacific Islander",
            "American Indian or Alaska Native",
            "Native Hawaiian or Other Pacific Islande",
        ),
        0,
    ),
}

In [None]:
# Race label -> 1 only for "Black or African American"; every other listed label -> 0.
# Keys reproduce the raw spellings found in the Race column.
black_mapping = {
    "Black or African American": 1,
    **dict.fromkeys(
        (
            "White",
            "Asian",
            "Native Hawaiian or Pacific Islander",
            "American Indian or Alaska Native",
            "Asian;White",
            "Native Hawaiian or Other Pacific Islande",
            "Asian,White",
            "Native Hawaiian or Pacific Islander;White",
            "American Indian or Alaska Native;White",
        ),
        0,
    ),
}

In [None]:
# Derive the two race indicator columns from the raw Race label.
# Labels missing from a mapping, and missing Race values, come out as NaN.
for flag_column, label_codes in (('race_white', white_mapping), ('race_black', black_mapping)):
    df_med[flag_column] = df_med['Race'].map(label_codes)

In [None]:
# perimenops = 1.0 for pre- or peri-menopausal patients, 0.0 otherwise, NaN when the
# status is missing. A status counts as pre/peri when it mentions 'Perimenopausal' or
# 'Premenopausal', or is the free-text 'Above categories not applicable AND Age < 50'.
# This must run before the category labels are stripped from menopausal_status.
#
# na=False matters: without it str.contains returns NaN for missing rows, and NaN is
# truthy inside np.where, so missing rows were flagged 1 until a later cell reset them.
status = df_med['menopausal_status']
is_pre_or_peri = (
    status.str.contains('Perimenopausal', na=False, regex=False)
    | status.str.contains('Premenopausal', na=False, regex=False)
    | status.eq('Above categories not applicable AND Age < 50')
)
# float dtype so that missing statuses can be represented as NaN
df_med['perimenops'] = is_pre_or_peri.astype(float).where(status.notna())

In [None]:
# postmenops = 1.0 for post-menopausal patients, 0.0 otherwise, NaN when the status is
# missing. A status counts as post-menopausal when it mentions 'Postmenopausal' or is
# the free-text 'Above categories not applicable AND Age > 50'.
# This must run before the category labels are stripped from menopausal_status.
#
# na=False keeps missing rows out of the match; NaN from str.contains is truthy in
# np.where, which previously flagged them 1 until a later cell reset them.
status = df_med['menopausal_status']
is_post = (
    status.str.contains('Postmenopausal', na=False, regex=False)
    | status.eq('Above categories not applicable AND Age > 50')
)
# float dtype so that missing statuses can be represented as NaN
df_med['postmenops'] = is_post.astype(float).where(status.notna())

In [None]:
# Sanity check: the two flags should never be equal for the same patient
pd.crosstab(index=df_med['perimenops'], columns=df_med['postmenops'], dropna=False)

postmenops,0.0,1.0
perimenops,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0,383
1.0,564,0


In [None]:
# Raw statuses of rows where both flags hold the same value (expected: none)
flags_agree = df_med['postmenops'] == df_med['perimenops']
df_med.loc[flags_agree, 'menopausal_status'].value_counts()

Series([], Name: menopausal_status, dtype: int64)

In [None]:
# The same rows as above, shown in full (expected: empty)
df_med.loc[df_med['postmenops'].eq(df_med['perimenops']),
           ['menopausal_status', 'postmenops', 'perimenops']]

Unnamed: 0,menopausal_status,postmenops,perimenops


In [None]:
# Remove the category label from menopausal_status, leaving only the descriptive remainder.
# The perimenops / postmenops flags are derived from these labels, so they must be
# computed before this cell runs.
# NOTE(review): 'Above categories not applicable AND Age > 50' is not stripped, unlike
# its '< 50' counterpart — confirm whether that is intended.
for category_label in ('Perimenopausal',
                       'Premenopausal',
                       'Postmenopausal',
                       'Above categories not applicable AND Age < 50'):
    df_med['menopausal_status'] = df_med['menopausal_status'].str.replace(category_label, '')

In [None]:
# Use the short identifier column name 'pid' from here on
id_rename = {'Patient_ID': 'pid'}
df_med = df_med.rename(columns=id_rename)
df_med.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 985 entries, 0 to 984
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pid                985 non-null    int64  
 1   Arm                985 non-null    object 
 2   HR                 985 non-null    int64  
 3   HER2               985 non-null    int64  
 4   MP                 985 non-null    int64  
 5   pCR                985 non-null    int64  
 6   Age_at_Screening   982 non-null    float64
 7   Race               982 non-null    object 
 8   menopausal_status  947 non-null    object 
 9   e_hispanic_latino  984 non-null    float64
 10  Arm_split          985 non-null    object 
 11  ABT 888            985 non-null    int64  
 12  AMG 386            985 non-null    int64  
 13  Carboplatin        985 non-null    int64  
 14  Ganetespib         985 non-null    int64  
 15  Ganitumab          985 non-null    int64  
 16  MK-2206            985 non

In [None]:
# The post-menopausal flag is exported under the name 'menopause'
menopause_rename = {'postmenops': 'menopause'}
df_med = df_med.rename(columns=menopause_rename)

In [None]:
# Non-null count shows how many patients have a known menopausal flag
df_med.loc[:, ['pid', 'menopause']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 985 entries, 0 to 984
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pid        985 non-null    int64  
 1   menopause  947 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 15.5 KB


In [None]:
# Preview after the race and menopausal flag columns were added
df_med.head()

Unnamed: 0,pid,Arm,HR,HER2,MP,pCR,Age_at_Screening,Race,menopausal_status,e_hispanic_latino,...,Neratinib,Paclitaxel,Pembrolizumab,Pertuzumab,T-DM1,Trastuzumab,race_white,race_black,perimenops,menopause
0,756412,Paclitaxel + ABT 888 + Carboplatin,1,0,0,0,46.0,White,(< 6 months since LMP AND no prior bilateral o...,0.0,...,0,1,0,0,0,0,1.0,0.0,1.0,0.0
1,111881,Paclitaxel + Neratinib,1,1,1,1,37.0,White,(<6 months since LMP AND no prior bilateral ov...,0.0,...,1,1,0,0,0,0,1.0,0.0,1.0,0.0
2,451816,Paclitaxel + ABT 888 + Carboplatin,1,0,0,0,43.0,White,(< 6 months since LMP AND no prior bilateral o...,0.0,...,0,1,0,0,0,0,1.0,0.0,1.0,0.0
3,243836,Paclitaxel + Neratinib,1,0,1,1,31.0,White,(< 6 months since LMP AND no prior bilateral o...,0.0,...,1,1,0,0,0,0,1.0,0.0,1.0,0.0
4,402265,Paclitaxel + Trastuzumab,1,1,0,0,56.0,White,(prior bilateral ovariectomy OR > 12 months s...,0.0,...,0,1,0,0,0,1,1.0,0.0,0.0,1.0


In [None]:
# Hormone-receptor status distribution (missing values included)
df_med['HR'].value_counts(dropna=False)

1    537
0    448
Name: HR, dtype: int64

In [None]:
# HER2 status distribution (missing values included)
df_med['HER2'].value_counts(dropna=False)

0    741
1    244
Name: HER2, dtype: int64

In [None]:
# NOTE(review): this repeats the HER2 count of the previous cell
df_med['HER2'].value_counts(dropna=False)

0    741
1    244
Name: HER2, dtype: int64

In [None]:
# Distribution of the MP column (missing values included)
df_med['MP'].value_counts(dropna=False)

0    504
1    481
Name: MP, dtype: int64

In [None]:
# Distribution of the pCR outcome column
df_med['pCR'].value_counts()

0    668
1    317
Name: pCR, dtype: int64

In [None]:
# Column dtypes and non-null counts of the cleaned clinical table
df_med.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 985 entries, 0 to 984
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pid                985 non-null    int64  
 1   Arm                985 non-null    object 
 2   HR                 985 non-null    int64  
 3   HER2               985 non-null    int64  
 4   MP                 985 non-null    int64  
 5   pCR                985 non-null    int64  
 6   Age_at_Screening   982 non-null    float64
 7   Race               982 non-null    object 
 8   menopausal_status  947 non-null    object 
 9   e_hispanic_latino  984 non-null    float64
 10  Arm_split          985 non-null    object 
 11  ABT 888            985 non-null    int64  
 12  AMG 386            985 non-null    int64  
 13  Carboplatin        985 non-null    int64  
 14  Ganetespib         985 non-null    int64  
 15  Ganitumab          985 non-null    int64  
 16  MK-2206            985 non

# Map patient IDs to TCIA subject IDs

In [None]:
# Show the working directory; the relative paths below are resolved against it.
# Path.cwd() does not depend on a Unix-style shell being available.
Path.cwd()

/c/Users/naomi/Downloads/BreastDCEDL/ISPY2


In [None]:
# List the files shipped in TCIA_metadata without relying on a shell `ls`
sorted(os.listdir('TCIA_metadata'))

ACRIN 6698 ISPY2 DWI and DCE MRI Data Descriptions_20210520.pdf
ACRIN-6698-ISPY2-Shared-Private-Tag-Data-Dictionary_20210520.xlsx
Analysis-mask-files-description.v20211020.docx
ISPY2-Imaging-Cohort-1-Clinical-Data.xlsx
metadata.csv


In [None]:
# TCIA download manifest: one row per imaging series
series_metadata_csv = os.path.join('TCIA_metadata', 'metadata.csv')
df = pd.read_csv(series_metadata_csv)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43356 entries, 0 to 43355
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Series UID            43356 non-null  object
 1   Collection            43356 non-null  object
 2   3rd Party Analysis    43356 non-null  object
 3   Data Description URI  43128 non-null  object
 4   Subject ID            43356 non-null  object
 5   Study UID             43356 non-null  object
 6   Study Description     43356 non-null  object
 7   Study Date            43356 non-null  object
 8   Series Description    43352 non-null  object
 9   Manufacturer          43356 non-null  object
 10  Modality              43356 non-null  object
 11  SOP Class Name        43356 non-null  object
 12  SOP Class UID         43356 non-null  object
 13  Number of Images      43356 non-null  int64 
 14  File Size             43356 non-null  object
 15  File Location         43356 non-null

In [None]:
# Replace the numeric clinical id by the prefixed TCIA 'Subject ID'
# ('ISPY2-<id>' or 'ACRIN-6698-<id>'), whichever occurs in the series metadata.
pids = df['Subject ID'].values
known_subject_ids = set(pids)  # set lookup instead of scanning the array for every patient


def _tcia_subject_id(raw_pid):
    """Return the TCIA subject id for one clinical patient id.

    'ISPY2-' is tried before 'ACRIN-6698-'. Ids that already are a known subject id
    are returned unchanged, so re-running the cell is harmless. Ids that match
    neither prefix are reported with the same '<id> none' line as before and are
    returned as-is.
    """
    if raw_pid in known_subject_ids:
        return raw_pid
    for prefix in ('ISPY2-', 'ACRIN-6698-'):
        candidate = prefix + str(raw_pid)
        if candidate in known_subject_ids:
            return candidate
    print(str(raw_pid), 'none')
    return raw_pid


# Build the whole column at once instead of writing strings cell-by-cell into an
# int64 column with .at inside an iterrows loop.
df_med['pid'] = df_med['pid'].map(_tcia_subject_id)

In [None]:
# Number of distinct patient ids after the mapping
df_med['pid'].nunique(dropna=False)

985

In [None]:
# pid should now be an object (string) column
df_med.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 985 entries, 0 to 984
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pid                985 non-null    object 
 1   Arm                985 non-null    object 
 2   HR                 985 non-null    int64  
 3   HER2               985 non-null    int64  
 4   MP                 985 non-null    int64  
 5   pCR                985 non-null    int64  
 6   Age_at_Screening   982 non-null    float64
 7   Race               982 non-null    object 
 8   menopausal_status  947 non-null    object 
 9   e_hispanic_latino  984 non-null    float64
 10  Arm_split          985 non-null    object 
 11  ABT 888            985 non-null    int64  
 12  AMG 386            985 non-null    int64  
 13  Carboplatin        985 non-null    int64  
 14  Ganetespib         985 non-null    int64  
 15  Ganitumab          985 non-null    int64  
 16  MK-2206            985 non

# Keep only patients with a pre-treatment MRI

In [None]:
# Number of series per study description
df['Study Description'].value_counts()

ISPY2MRIT0              8610
ISPY2MRIT1              8293
ISPY2MRIT2              7765
ISPY2MRIT3              7743
ACRIN-6698ISPY2MRIT0    3012
ACRIN-6698ISPY2MRIT1    2798
ACRIN-6698ISPY2MRIT2    2608
ACRIN-6698ISPY2MRIT3    2527
Name: Study Description, dtype: int64

In [None]:
# Flag series that belong to a T0 study and collect the subjects that have one
baseline_studies = ['ISPY2MRIT0', 'ACRIN-6698ISPY2MRIT0']
is_baseline = df['Study Description'].isin(baseline_studies)
df['pre_treatment'] = np.where(is_baseline, 1, 0)
pids_pre_treatment = df.loc[df['pre_treatment'] == 1, 'Subject ID'].values
len(set(pids_pre_treatment))

982

In [None]:
# Clinical patients that have no T0 series in the metadata
has_baseline = df_med['pid'].isin(pids_pre_treatment)
df_med.loc[~has_baseline, 'pid'].tolist()

['ACRIN-6698-547405', 'ISPY2-835137', 'ISPY2-733962']

In [None]:
# Keep only patients that have a pre-treatment series.
# .copy() makes the result an independent frame, so the columns assigned to it later
# are not written to a slice of the unfiltered table (SettingWithCopyWarning).
df_med = df_med[df_med.pid.isin(pids_pre_treatment)].copy()

In [None]:
# Save the filtered clinical table (written before the derived columns below are added)
tcia_csv = 'ISPY2_tcia_metadata.csv'
df_med.to_csv(tcia_csv, index=False)

In [None]:
# Row count after dropping patients without a pre-treatment series
df_med.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 982 entries, 0 to 984
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pid                982 non-null    object 
 1   Arm                982 non-null    object 
 2   HR                 982 non-null    int64  
 3   HER2               982 non-null    int64  
 4   MP                 982 non-null    int64  
 5   pCR                982 non-null    int64  
 6   Age_at_Screening   979 non-null    float64
 7   Race               979 non-null    object 
 8   menopausal_status  944 non-null    object 
 9   e_hispanic_latino  981 non-null    float64
 10  Arm_split          982 non-null    object 
 11  ABT 888            982 non-null    int64  
 12  AMG 386            982 non-null    int64  
 13  Carboplatin        982 non-null    int64  
 14  Ganetespib         982 non-null    int64  
 15  Ganitumab          982 non-null    int64  
 16  MK-2206            982 non

In [None]:
# Shorter column name for the age at screening
age_rename = {'Age_at_Screening': 'age'}
df_med = df_med.rename(columns=age_rename)

In [None]:
# Menopause flag distribution in the filtered cohort
df_med['menopause'].value_counts()

0.0    562
1.0    382
Name: menopause, dtype: int64

In [None]:
# Cross-check of the two menopausal flags after filtering
pd.crosstab(index=df_med['perimenops'], columns=df_med['menopause'], dropna=False)

menopause,0.0,1.0
perimenops,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0,382
1.0,562,0


In [None]:
# NOTE: `df` held the TCIA series metadata until now. This rebinds it to the clinical
# table. It is the same object as df_med, not a copy, so columns added to `df` below
# also appear on df_med.
df = df_med

In [None]:
# Receptor-subtype indicator columns derived from HR and HER2:
#   HER2pos      - copy of HER2
#   HRposHER2neg - 1 when HR == 1 and HER2 == 0
#   TripleNeg    - 1 when HR == 0 and HER2 == 0
# Rows with a missing HR or HER2 get NaN. Series.where keeps the flags numeric;
# np.where(..., None, ...) always produced object-dtype columns, even when no value
# was missing.
receptor_known = df['HR'].notna() & df['HER2'].notna()
her2_negative = df['HER2'] == 0

df['HER2pos'] = df['HER2']
df['HRposHER2neg'] = ((df['HR'] == 1) & her2_negative).astype(int).where(receptor_known)
df['TripleNeg'] = ((df['HR'] == 0) & her2_negative).astype(int).where(receptor_known)

In [None]:
# Constant tag written to every row of the 'dataset' column
df['dataset']='spy2'

In [None]:
# Collapse the three subtype flags into one categorical label column.
# Later flags overwrite earlier ones; rows with a missing HR or HER2 end up as None.
subtype_label = np.full(len(df), None, dtype=object)
for flag_column in ('HRposHER2neg', 'HER2pos', 'TripleNeg'):
    subtype_label = np.where(df[flag_column] == 1, flag_column, subtype_label)
for receptor_column in ('HR', 'HER2'):
    subtype_label = np.where(df[receptor_column].isna(), None, subtype_label)
df['HR_HER2_STATUS'] = subtype_label

In [None]:
# Table layout after the subtype columns were added
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 982 entries, 0 to 984
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pid                982 non-null    object 
 1   Arm                982 non-null    object 
 2   HR                 982 non-null    int64  
 3   HER2               982 non-null    int64  
 4   MP                 982 non-null    int64  
 5   pCR                982 non-null    int64  
 6   age                979 non-null    float64
 7   Race               979 non-null    object 
 8   menopausal_status  944 non-null    object 
 9   e_hispanic_latino  981 non-null    float64
 10  Arm_split          982 non-null    object 
 11  ABT 888            982 non-null    int64  
 12  AMG 386            982 non-null    int64  
 13  Carboplatin        982 non-null    int64  
 14  Ganetespib         982 non-null    int64  
 15  Ganitumab          982 non-null    int64  
 16  MK-2206            982 non

## Merge data from DICOM files

In [None]:
# Per-patient values extracted from the DICOM files.
# NOTE(review): this CSV is not created in this notebook — presumably produced by a
# separate DICOM-processing step; confirm.
dicom_metadata_csv = 'spy2_dicom_metadata.csv'
dfd = pd.read_csv(dicom_metadata_csv)
dfd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 982 entries, 0 to 981
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pid          982 non-null    object 
 1   n_xy         982 non-null    float64
 2   n_z          982 non-null    float64
 3   n_times      982 non-null    float64
 4   pre          982 non-null    float64
 5   post_early   982 non-null    float64
 6   post_late    982 non-null    float64
 7   slice_thick  982 non-null    float64
 8   xy_spacing   982 non-null    float64
 9   mask_start   982 non-null    float64
 10  mask_end     982 non-null    float64
 11  mask_count   982 non-null    float64
 12  is_mask      982 non-null    float64
 13  sraw         982 non-null    float64
 14  eraw         982 non-null    float64
 15  scol         982 non-null    float64
 16  ecol         982 non-null    float64
dtypes: float64(16), object(1)
memory usage: 130.5+ KB


In [None]:
# Columns available in the DICOM-derived table
dfd.columns

Index(['pid', 'n_xy', 'n_z', 'n_times', 'pre', 'post_early', 'post_late',
       'slice_thick', 'xy_spacing', 'mask_start', 'mask_end', 'mask_count',
       'is_mask', 'sraw', 'eraw', 'scol', 'ecol'],
      dtype='object')

In [None]:
# Attach the DICOM-derived columns to the clinical table.
# Left join on pid: every clinical row is kept, unmatched rows get NaN.
dicom_columns = [
    'pid', 'n_xy', 'n_z', 'n_times', 'pre', 'post_early', 'post_late',
    'slice_thick', 'xy_spacing', 'mask_start', 'mask_end', 'mask_count',
    'is_mask', 'sraw', 'eraw', 'scol', 'ecol',
]
df = pd.merge(df, dfd[dicom_columns], on="pid", how="left")

In [None]:
# Row count should be unchanged by the left merge
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 982 entries, 0 to 981
Data columns (total 48 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pid                982 non-null    object 
 1   Arm                982 non-null    object 
 2   HR                 982 non-null    int64  
 3   HER2               982 non-null    int64  
 4   MP                 982 non-null    int64  
 5   pCR                982 non-null    int64  
 6   age                979 non-null    float64
 7   Race               979 non-null    object 
 8   menopausal_status  944 non-null    object 
 9   e_hispanic_latino  981 non-null    float64
 10  Arm_split          982 non-null    object 
 11  ABT 888            982 non-null    int64  
 12  AMG 386            982 non-null    int64  
 13  Carboplatin        982 non-null    int64  
 14  Ganetespib         982 non-null    int64  
 15  Ganitumab          982 non-null    int64  
 16  MK-2206            982 non

In [None]:
# NOTE(review): 'dataset' was already set to 'spy2' before the merge, and the left
# merge keeps that column, so this assignment looks redundant.
df['dataset']='spy2'

In [None]:
# Bring in the `test` column from the project-wide metadata file (left join on pid).
# NOTE(review): `test` presumably marks the held-out split — confirm.
dd = pd.read_csv('../../BreastDCEDL/BreastDCEDL_metadata.csv')
split_columns = dd[["pid", "test"]]
df = pd.merge(df, split_columns, on="pid", how="left")

In [None]:
# Write the harmonized table
harmonized_csv = 'BreastDCEDL_spy2_tcia_metadata.csv'
df.to_csv(harmonized_csv, index=False)

In [None]:
# Print the value distribution of every column except the first (pid).
for c in df.columns[1:]:
    column = df[c]
    try:
        counts = column.value_counts()
    except TypeError:
        # Columns holding Python lists (e.g. 'Arm_split') are unhashable and cannot
        # be counted directly; count their string form instead.
        counts = column.astype(str).value_counts()
    print(c, counts)

Arm Paclitaxel                               178
Paclitaxel + AMG 386                     115
Paclitaxel + Neratinib                   113
Paclitaxel + Ganitumab                   106
Paclitaxel + Ganetespib                   93
Paclitaxel + ABT 888 + Carboplatin        70
Paclitaxel + Pembrolizumab                69
Paclitaxel + MK-2206                      59
T-DM1 + Pertuzumab                        52
Paclitaxel + Pertuzumab + Trastuzumab     44
Paclitaxel + MK-2206 + Trastuzumab        33
Paclitaxel + Trastuzumab                  31
Paclitaxel + AMG 386 + Trastuzumab        19
Name: Arm, dtype: int64
HR 1    536
0    446
Name: HR, dtype: int64
HER2 0    740
1    242
Name: HER2, dtype: int64
MP 0    502
1    480
Name: MP, dtype: int64
pCR 0    666
1    316
Name: pCR, dtype: int64
age 50.0    43
46.0    41
57.0    39
56.0    37
44.0    36
45.0    35
54.0    35
49.0    32
52.0    32
47.0    31
53.0    29
39.0    28
48.0    28
41.0    28
58.0    27
42.0    27
51.0    26
59.0    25
55.