# SPY1 - Read medical data
## Harmonize I-SPY-1 medical data into BreastDCEDL
### Author: Itamar Barnea
### Date: 2025-03-15



[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/naomifridman/BreastDCEDL/blob/main/ISPY1/spy1_metadata.ipynb)

In [1]:
# Check if running in Google Colab
import os
if 'google.colab' in str(get_ipython()):
    print("Running in Google Colab")
    # Clone the repository
    !git clone https://github.com/naomifridman/BreastDCEDL.git
    #!pip install pydicom

    # Change to the repository directory
    os.chdir('/content/BreastDCEDL/ISPY1')



Running in Google Colab
fatal: destination path 'BreastDCEDL' already exists and is not an empty directory.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline
#import pydicom

In [3]:
!pwd

/content/BreastDCEDL/ISPY1


In [4]:
!ls

BreastDCEDL_spy1_metadata.csv  README.md		 spy1_metadata.ipynb
data_samples		       spy1_data_from_dicom.csv  TCIA_metadata


In [5]:
!ls ISPY1/TCIA_metadata

ls: cannot access 'ISPY1/TCIA_metadata': No such file or directory


In [6]:
dname='TCIA_metadata/I-SPY-1-All-Patient-Clinical-and-Outcome-Data.xlsx'

In [7]:
base_path='.'

In [8]:
os.listdir(os.path.join('.'))

['spy1_data_from_dicom.csv',
 'TCIA_metadata',
 'data_samples',
 'README.md',
 'spy1_metadata.ipynb',
 'BreastDCEDL_spy1_metadata.csv']

In [9]:
import pandas as pd
import numpy as np

def merge_ispy_data(df_clinical, df_outcomes):
    """Join the I-SPY1 clinical and outcome tables into one harmonized table.

    The function does not read or write any file: it only merges the two
    frames it is given, renames the source columns, and keeps a fixed set
    of columns in a fixed order.

    Parameters
    ----------
    df_clinical : pandas.DataFrame
        Clinical table. Must contain ``SUBJECTID`` and the clinical source
        columns named in the rename map below.
    df_outcomes : pandas.DataFrame
        Outcome table keyed by ``SUBJECTID``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df_clinical`` (left join), with the 21
        harmonized columns. The result is an independent copy.

    Raises
    ------
    KeyError
        If ``SUBJECTID`` or any of the expected source columns is missing.
    """
    # Left join: every clinical row is kept; patients without an outcome row
    # get NaN in the outcome columns. Columns present in both inputs (other
    # than the key) receive _x/_y suffixes and are dropped by the selection below.
    df_merged = pd.merge(df_clinical, df_outcomes, on='SUBJECTID', how='left')

    # Source name -> harmonized name. 'age' and 'HR_HER2_STATUS' keep their
    # original names, so they need no entry here.
    df_merged = df_merged.rename(columns={
        'SUBJECTID': 'pid',
        'race_id': 'race',
        'ERpos': 'ER',
        'PgRpos': 'PR',
        'HR Pos': 'HR',
        'Her2MostPos': 'HER2',
        'HR_HER2_CATEGORY': 'hr_her2_category',
        'BilateralCa': 'bilateral',
        'Laterality': 'laterality',
        'MRI LD Baseline': 'MRI_LD_Baseline',
        'MRI LD 1-3dAC': 'early_treatment_mriLD',
        'MRI LD InterReg': 'interreg_mriLD',
        'MRI LD PreSurg': 'presurg_mriLD',
        'sstat': 'survival_status',
        'survDtD2 (tx)': 'survival_days',
        'RFS': 'rfs_days',
        'rfs_ind': 'rfs_indicator',
        'PCR': 'pCR',
        'RCBClass': 'rcb_class'
    })

    # Select and order columns
    columns = [
        'pid', 'age', 'race',
        'ER', 'PR', 'HR', 'HER2',
        'hr_her2_category', 'HR_HER2_STATUS', 'bilateral', 'laterality',
        'MRI_LD_Baseline', 'early_treatment_mriLD', 'interreg_mriLD', 'presurg_mriLD',
        'survival_status', 'survival_days', 'rfs_days', 'rfs_indicator',
        'pCR', 'rcb_class'
    ]

    # Explicit copy so the caller can add columns to the result freely.
    df_final = df_merged[columns].copy()

    # Print summary (nothing is written to disk here, so report a merge, not a save)
    print(f"Data merged successfully. Total patients: {len(df_final)}")
    print("\nColumns in dataset:")
    for col in df_final.columns:
        print(f"- {col}")

    return df_final

# Load both sheets of the TCIA workbook in a single read (the file is opened
# and parsed once instead of twice), then merge them.
# With a list for sheet_name, read_excel returns a dict keyed by sheet name.
ispy_sheets = pd.read_excel(
    os.path.join(base_path, dname),
    sheet_name=['TCIA Patient Clinical Subset', 'TCIA Outcomes Subset'],
)
df_clinical = ispy_sheets['TCIA Patient Clinical Subset']
df_outcomes = ispy_sheets['TCIA Outcomes Subset']
df_med = merge_ispy_data(df_clinical, df_outcomes)

Data saved successfully. Total patients: 221

Columns in dataset:
- pid
- age
- race
- ER
- PR
- HR
- HER2
- hr_her2_category
- HR_HER2_STATUS
- bilateral
- laterality
- MRI_LD_Baseline
- early_treatment_mriLD
- interreg_mriLD
- presurg_mriLD
- survival_status
- survival_days
- rfs_days
- rfs_indicator
- pCR
- rcb_class


  warn(msg)
  warn(msg)


In [10]:
df_clinical

Unnamed: 0,SUBJECTID,DataExtractDt,age,race_id,ERpos,PgRpos,HR Pos,Her2MostPos,HR_HER2_CATEGORY,HR_HER2_STATUS,BilateralCa,Laterality,MRI LD Baseline,MRI LD 1-3dAC,MRI LD InterReg,MRI LD PreSurg
0,1001,2009-09-03,38.73,1,1.0,0.0,1.0,0.0,1.0,HRposHER2neg,0,1,88.0,78.0,30.0,14.0
1,1002,2009-09-03,37.79,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,2,29.0,26.0,66.0,16.0
2,1003,2009-09-03,49.83,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,1,50.0,64.0,54.0,46.0
3,1004,2009-09-03,48.28,1,0.0,0.0,0.0,0.0,3.0,TripleNeg,0,1,91.0,90.0,99.0,43.0
4,1005,2009-09-03,45.80,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,1,98.0,109.0,60.0,42.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,1235,2009-09-03,64.06,1,0.0,0.0,0.0,1.0,2.0,HER2pos,0,2,52.0,50.0,28.0,0.0
217,1236,2009-09-03,38.61,3,0.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,2,51.0,52.0,35.0,23.0
218,1237,2009-09-03,46.46,1,1.0,1.0,1.0,1.0,2.0,HER2pos,0,2,60.0,60.0,37.0,36.0
219,1238,2009-09-03,59.05,3,1.0,1.0,1.0,1.0,2.0,HER2pos,0,2,48.0,,,


In [11]:
df_outcomes

Unnamed: 0,SUBJECTID,DataExtractDt,sstat,survDtD2 (tx),RFS,rfs_ind,PCR,RCBClass
0,1001,2009-09-03,8,1264,751,1,0.0,2.0
1,1002,2009-09-03,8,1155,1043,1,0.0,3.0
2,1003,2009-09-03,7,2387,2387,0,0.0,3.0
3,1004,2009-09-03,7,2436,2436,0,0.0,
4,1005,2009-09-03,7,2220,2520,0,0.0,
...,...,...,...,...,...,...,...,...
216,1235,2009-09-03,7,1026,1026,0,1.0,0.0
217,1236,2009-09-03,8,832,510,1,0.0,3.0
218,1237,2009-09-03,7,1031,1031,0,0.0,2.0
219,1238,2009-09-03,7,1248,1248,0,0.0,


In [12]:
df_med.head()

Unnamed: 0,pid,age,race,ER,PR,HR,HER2,hr_her2_category,HR_HER2_STATUS,bilateral,...,MRI_LD_Baseline,early_treatment_mriLD,interreg_mriLD,presurg_mriLD,survival_status,survival_days,rfs_days,rfs_indicator,pCR,rcb_class
0,1001,38.73,1,1.0,0.0,1.0,0.0,1.0,HRposHER2neg,0,...,88.0,78.0,30.0,14.0,8,1264,751,1,0.0,2.0
1,1002,37.79,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,...,29.0,26.0,66.0,16.0,8,1155,1043,1,0.0,3.0
2,1003,49.83,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,...,50.0,64.0,54.0,46.0,7,2387,2387,0,0.0,3.0
3,1004,48.28,1,0.0,0.0,0.0,0.0,3.0,TripleNeg,0,...,91.0,90.0,99.0,43.0,7,2436,2436,0,0.0,
4,1005,45.8,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,...,98.0,109.0,60.0,42.0,7,2220,2520,0,0.0,


In [13]:

df_med.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   pid                    221 non-null    int64  
 1   age                    221 non-null    float64
 2   race                   221 non-null    int64  
 3   ER                     219 non-null    float64
 4   PR                     219 non-null    float64
 5   HR                     219 non-null    float64
 6   HER2                   216 non-null    float64
 7   hr_her2_category       216 non-null    float64
 8   HR_HER2_STATUS         216 non-null    object 
 9   bilateral              221 non-null    int64  
 10  laterality             221 non-null    int64  
 11  MRI_LD_Baseline        219 non-null    float64
 12  early_treatment_mriLD  210 non-null    float64
 13  interreg_mriLD         202 non-null    float64
 14  presurg_mriLD          208 non-null    float64
 15  surviv

In [14]:
df_med.HR_HER2_STATUS.value_counts()

Unnamed: 0_level_0,count
HR_HER2_STATUS,Unnamed: 1_level_1
HRposHER2neg,96
HER2pos,67
TripleNeg,53


In [15]:
df_med.race.value_counts()

Unnamed: 0_level_0,count
race,Unnamed: 1_level_1
1,165
3,42
4,9
0,2
50,2
5,1


In [16]:
# Binary indicators derived from the numeric race code:
# code 1 -> race_white, code 3 -> race_black; every other code is 0 in both.
df_med['race_white'] = (df_med['race'] == 1).astype(int)
df_med['race_black'] = (df_med['race'] == 3).astype(int)

In [17]:

df_med.columns

Index(['pid', 'age', 'race', 'ER', 'PR', 'HR', 'HER2', 'hr_her2_category',
       'HR_HER2_STATUS', 'bilateral', 'laterality', 'MRI_LD_Baseline',
       'early_treatment_mriLD', 'interreg_mriLD', 'presurg_mriLD',
       'survival_status', 'survival_days', 'rfs_days', 'rfs_indicator', 'pCR',
       'rcb_class', 'race_white', 'race_black'],
      dtype='object')

In [18]:
# Write a first snapshot containing only the core clinical columns.
# Later cells write to the same file name again.
df_med.to_csv(
    'BreastDCEDL_spy1_metadata.csv',
    columns=['pid', 'pCR', 'age', 'race_white', 'race_black',
             'ER', 'PR', 'HR', 'HER2', 'HR_HER2_STATUS',
             'bilateral'],
    index=False,
)
df_med.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   pid                    221 non-null    int64  
 1   age                    221 non-null    float64
 2   race                   221 non-null    int64  
 3   ER                     219 non-null    float64
 4   PR                     219 non-null    float64
 5   HR                     219 non-null    float64
 6   HER2                   216 non-null    float64
 7   hr_her2_category       216 non-null    float64
 8   HR_HER2_STATUS         216 non-null    object 
 9   bilateral              221 non-null    int64  
 10  laterality             221 non-null    int64  
 11  MRI_LD_Baseline        219 non-null    float64
 12  early_treatment_mriLD  210 non-null    float64
 13  interreg_mriLD         202 non-null    float64
 14  presurg_mriLD          208 non-null    float64
 15  surviv

In [19]:
# Turn the numeric subject id into a string id of the form 'ISPY1_<id>'.
# Any prefix that is already present is stripped first, so re-running this
# cell does not produce 'ISPY1_ISPY1_<id>'.
df_med['pid'] = 'ISPY1_' + df_med['pid'].astype(str).str.replace(r'^ISPY1_', '', regex=True)
df_med.head()

Unnamed: 0,pid,age,race,ER,PR,HR,HER2,hr_her2_category,HR_HER2_STATUS,bilateral,...,interreg_mriLD,presurg_mriLD,survival_status,survival_days,rfs_days,rfs_indicator,pCR,rcb_class,race_white,race_black
0,ISPY1_1001,38.73,1,1.0,0.0,1.0,0.0,1.0,HRposHER2neg,0,...,30.0,14.0,8,1264,751,1,0.0,2.0,1,0
1,ISPY1_1002,37.79,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,...,66.0,16.0,8,1155,1043,1,0.0,3.0,1,0
2,ISPY1_1003,49.83,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,...,54.0,46.0,7,2387,2387,0,0.0,3.0,1,0
3,ISPY1_1004,48.28,1,0.0,0.0,0.0,0.0,3.0,TripleNeg,0,...,99.0,43.0,7,2436,2436,0,0.0,,1,0
4,ISPY1_1005,45.8,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,...,60.0,42.0,7,2220,2520,0,0.0,,1,0


In [20]:
# Distinct patient ids (order is arbitrary because they pass through a set);
# the displayed length is the number of unique patients in df_med.
pids_med=list(set(df_med.pid))
len(pids_med)

221

In [21]:
df_med.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   pid                    221 non-null    object 
 1   age                    221 non-null    float64
 2   race                   221 non-null    int64  
 3   ER                     219 non-null    float64
 4   PR                     219 non-null    float64
 5   HR                     219 non-null    float64
 6   HER2                   216 non-null    float64
 7   hr_her2_category       216 non-null    float64
 8   HR_HER2_STATUS         216 non-null    object 
 9   bilateral              221 non-null    int64  
 10  laterality             221 non-null    int64  
 11  MRI_LD_Baseline        219 non-null    float64
 12  early_treatment_mriLD  210 non-null    float64
 13  interreg_mriLD         202 non-null    float64
 14  presurg_mriLD          208 non-null    float64
 15  surviv

In [22]:
df_med.columns

Index(['pid', 'age', 'race', 'ER', 'PR', 'HR', 'HER2', 'hr_her2_category',
       'HR_HER2_STATUS', 'bilateral', 'laterality', 'MRI_LD_Baseline',
       'early_treatment_mriLD', 'interreg_mriLD', 'presurg_mriLD',
       'survival_status', 'survival_days', 'rfs_days', 'rfs_indicator', 'pCR',
       'rcb_class', 'race_white', 'race_black'],
      dtype='object')

In [23]:
### Check patients with a missing (null) pCR value

In [24]:
# Replace the numeric survival codes with labels (7 -> alive, 8 -> dead, 9 -> lost).
# Any value not listed in the mapping becomes NaN.
df_med['survival_status'] = df_med['survival_status'].map(
    {7: 'alive', 8: 'dead', 9: 'lost'}
)

In [25]:
# Replace the 0/1 recurrence-free-survival indicator with a readable label.
# Any value not listed in the mapping becomes NaN.
df_med['rfs_indicator'] = df_med['rfs_indicator'].map(
    {0: 'free of event at last check', 1: 'Event occurred'}
)

In [26]:
# Patients with no recorded pCR, shown next to their MRI longest-diameter
# measurements and follow-up columns.
df_med.loc[
    df_med['pCR'].isna(),
    ['pid', 'MRI_LD_Baseline',
     'early_treatment_mriLD', 'interreg_mriLD', 'presurg_mriLD',
     'survival_status', 'survival_days', 'rfs_days', 'rfs_indicator'],
]

Unnamed: 0,pid,MRI_LD_Baseline,early_treatment_mriLD,interreg_mriLD,presurg_mriLD,survival_status,survival_days,rfs_days,rfs_indicator
20,ISPY1_1025,47.0,45.0,42.0,42.0,dead,718,718,Event occurred
30,ISPY1_1035,100.0,90.0,60.0,41.0,alive,2240,729,Event occurred
58,ISPY1_1064,70.0,67.0,47.0,30.0,alive,2063,369,Event occurred
61,ISPY1_1067,77.0,60.0,0.0,,dead,1044,469,Event occurred
108,ISPY1_1120,74.0,48.0,39.0,,lost,530,530,free of event at last check
186,ISPY1_1205,75.0,70.0,47.0,35.0,lost,185,185,free of event at last check


In [27]:
# Patients with a missing pCR who died or had an event and still show a large
# pre-surgery MRI longest diameter: record them as pCR = 0.
# Rows are selected by patient id instead of positional row label, so the
# assignment still hits the same patients if the row order ever changes.
# (ISPY1_1025, ISPY1_1035, ISPY1_1064 are the patients at row labels 20, 30, 58.)
df_med.loc[df_med['pid'].isin(['ISPY1_1025', 'ISPY1_1035', 'ISPY1_1064']), 'pCR'] = 0

In [28]:
# ISPY1_1205 (row label 186) has a missing pCR, was lost to follow-up with no
# recorded event, and still shows a pre-surgery MRI longest diameter of 35:
# record it as pCR = 0. Selected by patient id rather than row label.
# NOTE(review): this imputes an outcome for a patient without an event — confirm it is intended.
df_med.loc[df_med['pid'] == 'ISPY1_1205', 'pCR'] = 0

In [29]:
# Checkpoint: overwrite the metadata CSV with every column currently in df_med
# (row index not written). Later cells write to the same file again.
df_med.to_csv('BreastDCEDL_spy1_metadata.csv', index = False)

In [30]:
# Load the per-subject VOI table and rename its id column to 'pid' so it can
# be joined with the clinical table.
df_voi = (
    pd.read_csv(os.path.join(base_path, './TCIA_metadata/subject_voi_values.csv'))
    .rename(columns={'Subject ID': 'pid'})
)

In [31]:
df_voi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pid                220 non-null    object 
 1   voi_start_x        173 non-null    float64
 2   voi_start_y        173 non-null    float64
 3   voi_start_z        173 non-null    float64
 4   voi_end_x          173 non-null    float64
 5   voi_end_y          173 non-null    float64
 6   voi_end_z          173 non-null    float64
 7   Number of Entries  220 non-null    int64  
 8   VOI Width          173 non-null    float64
 9   VOI Height         173 non-null    float64
 10  VOI Depth          173 non-null    float64
 11  VOI Volume         173 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 20.8+ KB


In [32]:

# Attach the VOI columns to each patient. Left join: patients without a VOI
# row are kept and get NaN in the VOI columns.
df_med = pd.merge(df_med, df_voi, how="left", on="pid")
df_med.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 34 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   pid                    221 non-null    object 
 1   age                    221 non-null    float64
 2   race                   221 non-null    int64  
 3   ER                     219 non-null    float64
 4   PR                     219 non-null    float64
 5   HR                     219 non-null    float64
 6   HER2                   216 non-null    float64
 7   hr_her2_category       216 non-null    float64
 8   HR_HER2_STATUS         216 non-null    object 
 9   bilateral              221 non-null    int64  
 10  laterality             221 non-null    int64  
 11  MRI_LD_Baseline        219 non-null    float64
 12  early_treatment_mriLD  210 non-null    float64
 13  interreg_mriLD         202 non-null    float64
 14  presurg_mriLD          208 non-null    float64
 15  surviv

The RCB class refers to the Residual Cancer Burden (RCB) classification, which is used to assess the amount of cancer remaining after neoadjuvant therapy (treatment given before surgery) in breast cancer patients. The classification system helps predict prognosis and survival outcomes.
RCB is determined based on factors like:
- Tumor size and cellularity in the breast
- Number of affected lymph nodes
- Extent of metastasis in the largest lymph node

The classification includes:
- RCB-0: Pathologic complete response (no residual cancer)
- RCB-I: Minimal residual disease
- RCB-II: Moderate residual disease
- RCB-III: Extensive residual disease

This system is widely used in clinical settings to guide treatment decisions and


In [33]:
df_med.columns

Index(['pid', 'age', 'race', 'ER', 'PR', 'HR', 'HER2', 'hr_her2_category',
       'HR_HER2_STATUS', 'bilateral', 'laterality', 'MRI_LD_Baseline',
       'early_treatment_mriLD', 'interreg_mriLD', 'presurg_mriLD',
       'survival_status', 'survival_days', 'rfs_days', 'rfs_indicator', 'pCR',
       'rcb_class', 'race_white', 'race_black', 'voi_start_x', 'voi_start_y',
       'voi_start_z', 'voi_end_x', 'voi_end_y', 'voi_end_z',
       'Number of Entries', 'VOI Width', 'VOI Height', 'VOI Depth',
       'VOI Volume'],
      dtype='object')

In [34]:
# Keep only the columns that go into the harmonized table, in export order.
# Race code, laterality, follow-up MRI diameters, survival/RFS columns and
# 'Number of Entries' are not carried forward.
df_med = df_med.loc[:, [
    'pid', 'age', 'ER', 'PR', 'HR', 'HER2',
    'HR_HER2_STATUS', 'bilateral', 'MRI_LD_Baseline',
    'pCR', 'rcb_class', 'race_white', 'race_black',
    'voi_start_x', 'voi_start_y', 'voi_start_z',
    'voi_end_x', 'voi_end_y', 'voi_end_z',
    'VOI Width', 'VOI Height', 'VOI Depth', 'VOI Volume',
]]

In [35]:
df_med.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   pid              221 non-null    object 
 1   age              221 non-null    float64
 2   ER               219 non-null    float64
 3   PR               219 non-null    float64
 4   HR               219 non-null    float64
 5   HER2             216 non-null    float64
 6   HR_HER2_STATUS   216 non-null    object 
 7   bilateral        221 non-null    int64  
 8   MRI_LD_Baseline  219 non-null    float64
 9   pCR              219 non-null    float64
 10  rcb_class        201 non-null    float64
 11  race_white       221 non-null    int64  
 12  race_black       221 non-null    int64  
 13  voi_start_x      173 non-null    float64
 14  voi_start_y      173 non-null    float64
 15  voi_start_z      173 non-null    float64
 16  voi_end_x        173 non-null    float64
 17  voi_end_y       

In [36]:
# Checkpoint: overwrite the metadata CSV with the reduced column set, including the VOI columns.
df_med.to_csv('BreastDCEDL_spy1_metadata.csv', index = False)

In [37]:
# NOTE: this binds a second name to the same DataFrame object, it does not copy it.
# Columns added through `df` in the following cells also appear in `df_med`.
df = df_med

In [38]:
# HR-positive / HER2-negative flag: None when HR or HER2 is missing,
# otherwise 1 if HR == 1 and HER2 == 0, else 0.
df['HRposHER2neg'] = np.where(
    df['HR'].isna() | df['HER2'].isna(),
    None,
    np.where((df['HR'] == 1) & (df['HER2'] == 0), 1, 0),
)
df['HRposHER2neg'].value_counts(dropna=False)

Unnamed: 0_level_0,count
HRposHER2neg,Unnamed: 1_level_1
0.0,120
1.0,96
,5


In [39]:
# Triple-negative flag as computed here: None when HR or HER2 is missing,
# otherwise 1 if HR == 0 and HER2 == 0, else 0.
df['TripleNeg'] = np.where(
    df['HR'].isna() | df['HER2'].isna(),
    None,
    np.where((df['HR'] == 0) & (df['HER2'] == 0), 1, 0),
)
df['TripleNeg'].value_counts(dropna=False)

Unnamed: 0_level_0,count
TripleNeg,Unnamed: 1_level_1
0.0,163
1.0,53
,5


In [40]:
# HER2pos is a direct copy of the HER2 column, so missing HER2 values stay missing.
df['HER2pos'] = df['HER2']
df['HER2pos'].value_counts(dropna=False)

Unnamed: 0_level_0,count
HER2pos,Unnamed: 1_level_1
0.0,149
1.0,67
,5


In [41]:
# The previous rename targeted 'race_a'/'race_b', which are not columns of
# df_med (they are already named race_white/race_black), so it renamed nothing
# and only produced a new frame. Make that intent explicit with a copy.
df = df_med.copy()

In [42]:
# Receptor-subtype indicator columns.
# HER2pos mirrors HER2. The other two flags are None when HR or HER2 is
# missing, and 1/0 otherwise.
df['HER2pos'] = df['HER2']
df['HRposHER2neg'] = np.where(
    df['HR'].isna() | df['HER2'].isna(),
    None,
    np.where((df['HR'] == 1) & (df['HER2'] == 0), 1, 0),
)
df['TripleNeg'] = np.where(
    df['HR'].isna() | df['HER2'].isna(),
    None,
    np.where((df['HR'] == 0) & (df['HER2'] == 0), 1, 0),
)

In [43]:
# Tag every row with its source dataset. This table is the I-SPY1 cohort, so
# the label is 'spy1'; the previous 'duke' value was written into the
# intermediate CSV export before being corrected further down.
df['dataset']='spy1'

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   pid              221 non-null    object 
 1   age              221 non-null    float64
 2   ER               219 non-null    float64
 3   PR               219 non-null    float64
 4   HR               219 non-null    float64
 5   HER2             216 non-null    float64
 6   HR_HER2_STATUS   216 non-null    object 
 7   bilateral        221 non-null    int64  
 8   MRI_LD_Baseline  219 non-null    float64
 9   pCR              219 non-null    float64
 10  rcb_class        201 non-null    float64
 11  race_white       221 non-null    int64  
 12  race_black       221 non-null    int64  
 13  voi_start_x      173 non-null    float64
 14  voi_start_y      173 non-null    float64
 15  voi_start_z      173 non-null    float64
 16  voi_end_x        173 non-null    float64
 17  voi_end_y       

In [45]:
df.HRposHER2neg.value_counts(dropna=False)

Unnamed: 0_level_0,count
HRposHER2neg,Unnamed: 1_level_1
0.0,120
1.0,96
,5


In [46]:
df.HR_HER2_STATUS.value_counts(dropna=False)

Unnamed: 0_level_0,count
HR_HER2_STATUS,Unnamed: 1_level_1
HRposHER2neg,96
HER2pos,67
TripleNeg,53
,5


In [47]:
# Disabled code kept as a bare string literal: nothing below is executed, the
# cell only echoes the text. If enabled, it would rebuild HR_HER2_STATUS from
# the HRposHER2neg / HER2pos / TripleNeg flags and blank it where HR or HER2 is missing.
'''df['HR_HER2_STATUS']=np.where(df['HRposHER2neg']==1, 'HRposHER2neg', 0)
df['HR_HER2_STATUS']=np.where(df['HER2pos']==1, 'HER2pos', df['HR_HER2_STATUS'])
df['HR_HER2_STATUS']=np.where(df['TripleNeg']==1, 'TripleNeg', df['HR_HER2_STATUS'])
df['HR_HER2_STATUS']=np.where(df['HR'].isna(), None, df['HR_HER2_STATUS'])
df['HR_HER2_STATUS']=np.where(df['HER2'].isna(), None, df['HR_HER2_STATUS'])'''

"df['HR_HER2_STATUS']=np.where(df['HRposHER2neg']==1, 'HRposHER2neg', 0)\ndf['HR_HER2_STATUS']=np.where(df['HER2pos']==1, 'HER2pos', df['HR_HER2_STATUS'])\ndf['HR_HER2_STATUS']=np.where(df['TripleNeg']==1, 'TripleNeg', df['HR_HER2_STATUS'])\ndf['HR_HER2_STATUS']=np.where(df['HR'].isna(), None, df['HR_HER2_STATUS'])\ndf['HR_HER2_STATUS']=np.where(df['HER2'].isna(), None, df['HR_HER2_STATUS'])"

In [48]:
# Checkpoint: overwrite the metadata CSV with the table that now includes the
# subtype flags and the dataset column. A later cell writes this file again.
df.to_csv('BreastDCEDL_spy1_metadata.csv', index = False)

## Add info from Dicom

In [49]:
# Load the table of values previously extracted from the DICOM files and list
# its columns; a subset of them is merged into the metadata below.
df_dicom = pd.read_csv(os.path.join(base_path,'spy1_data_from_dicom.csv'))
df_dicom.columns

Index(['pid', 'spt_res', 'xy_spacing', 'slice_thick', 'later', 'tcol',
       'max_dcm0', 'max_dcm1', 'max_dcm2', 'mask_sum', 'is_mask', 'pre',
       'post_early', 'post_late', 'n_times', 'mask_start', 'mask_end', 'sraw',
       'eraw', 'scol', 'ecol', 'n_z', 'n_xy'],
      dtype='object')

In [50]:
# Attach selected DICOM-derived columns to each patient. Left join: patients
# without a row in df_dicom are kept and get NaN in these columns.
df = pd.merge(
    df,
    df_dicom.loc[:, [
        "pid", "pre", "post_early", "post_late",
        'xy_spacing', 'slice_thick', 'later',
        'n_times', 'mask_start', 'mask_end', 'sraw', 'eraw',
        'scol', 'ecol', 'n_z', 'n_xy',
    ]],
    how="left",
    on="pid",
)

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 42 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   pid              221 non-null    object 
 1   age              221 non-null    float64
 2   ER               219 non-null    float64
 3   PR               219 non-null    float64
 4   HR               219 non-null    float64
 5   HER2             216 non-null    float64
 6   HR_HER2_STATUS   216 non-null    object 
 7   bilateral        221 non-null    int64  
 8   MRI_LD_Baseline  219 non-null    float64
 9   pCR              219 non-null    float64
 10  rcb_class        201 non-null    float64
 11  race_white       221 non-null    int64  
 12  race_black       221 non-null    int64  
 13  voi_start_x      173 non-null    float64
 14  voi_start_y      173 non-null    float64
 15  voi_start_z      173 non-null    float64
 16  voi_end_x        173 non-null    float64
 17  voi_end_y       

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 42 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   pid              221 non-null    object 
 1   age              221 non-null    float64
 2   ER               219 non-null    float64
 3   PR               219 non-null    float64
 4   HR               219 non-null    float64
 5   HER2             216 non-null    float64
 6   HR_HER2_STATUS   216 non-null    object 
 7   bilateral        221 non-null    int64  
 8   MRI_LD_Baseline  219 non-null    float64
 9   pCR              219 non-null    float64
 10  rcb_class        201 non-null    float64
 11  race_white       221 non-null    int64  
 12  race_black       221 non-null    int64  
 13  voi_start_x      173 non-null    float64
 14  voi_start_y      173 non-null    float64
 15  voi_start_z      173 non-null    float64
 16  voi_end_x        173 non-null    float64
 17  voi_end_y       

In [53]:
# Use item assignment: attribute-style assignment only updates a column that
# already exists, and would otherwise just set a Python attribute on the
# frame without creating the column.
df['dataset'] = 'spy1'

In [54]:
# Export the table with clinical, VOI and DICOM-derived columns to the metadata CSV (row index not written).
df.to_csv('BreastDCEDL_spy1_metadata.csv', index = False)