# DUKE - Read TCIA metadata
## Harmonize DUKE medical data into BreastDCEDL
#### Author: Bubby Solway
#### Date: 2025-03-12
> BreastDCEDL/DUKE/duke_tcia_metadata.ipynb


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/naomifridman/BreastDCEDL/blob/main/DUKE/duke_tcia_metadata.ipynb)

## Remarks:
#### Bad dcm data
    *Breast_MRI_250 
    *Breast_MRI_577


In [3991]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import struct
import os

In [3992]:
# Data-location switches read by the path-configuration cell below.
# At least one must be True; if both are True the git_samples paths win
# because that branch assigns last.
my_computer=True   # full NIfTI data under a local base path
git_samples=False  # sample data under <DATASET>/data_samples relative to '.'

In [3993]:
# Resolve the data root and, per dataset key ('spy2', 'spy1', 'duke'), the
# directories holding the DCE NIfTI volumes and the masks.
#
# `git_samples` is tested first: in the previous two-`if` layout it ran last and
# therefore overrode `my_computer` when both flags were set, so that precedence
# is preserved here.
if git_samples:
    # Sample data shipped next to the notebooks (relative to the working dir).
    base_path = "."
    _dataset_subdirs = {'spy2': ("ISPY2", "data_samples"),
                        'spy1': ("ISPY1", "data_samples"),
                        'duke': ("DUKE", "data_samples")}
elif my_computer:
    # Full dataset. The historical default location is kept, but it can be
    # overridden without editing the notebook via BREAST_MRI_BASE_PATH.
    base_path = os.environ.get("BREAST_MRI_BASE_PATH", "G:\\My Drive\\breast_mri")
    _dataset_subdirs = {'spy2': ("SPY2", "vis1"),
                        'spy1': ("SPY1", "vis1"),
                        'duke': ("DUKE",)}
else:
    # Previously this case left base_path/nifti_path/mask_path undefined and
    # only failed later with a NameError; fail here with a clear message.
    raise ValueError("Set my_computer or git_samples to True to select a data location")

nifti_path = {key: os.path.join(base_path, *parts, "dce")
              for key, parts in _dataset_subdirs.items()}

mask_path = {key: os.path.join(base_path, *parts, "mask")
             for key, parts in _dataset_subdirs.items()}
    

In [3994]:
import os, sys
# Make the sibling ../utils directory importable (resolved against the current
# working directory) so that data_utils can be imported below.
sys.path.append(os.path.abspath(os.path.join('.',  '../utils')))

import data_utils as ds

# Hand the configured locations to the shared utilities module.
# NOTE(review): setup_paths is defined in data_utils, not shown here; the cell
# output suggests it prints the nifti and mask dictionaries — confirm.
ds.setup_paths(base_path, nifti_path, mask_path)

{'spy2': 'G:\\My Drive\\breast_mri\\SPY2\\vis1\\dce', 'spy1': 'G:\\My Drive\\breast_mri\\SPY1\\vis1\\dce', 'duke': 'G:\\My Drive\\breast_mri\\DUKE\\dce'} {'spy2': 'G:\\My Drive\\breast_mri\\SPY2\\vis1\\mask', 'spy1': 'G:\\My Drive\\breast_mri\\SPY1\\vis1\\mask', 'duke': 'G:\\My Drive\\breast_mri\\DUKE\\mask'}


In [3995]:
# Load the clinical-features spreadsheet from the TCIA_metadata folder
# (relative to the working directory) and preview the first rows.
clinical_xlsx = os.path.join('TCIA_metadata', "Clinical_and_Other_Features_proc.xlsx")
df = pd.read_excel(clinical_xlsx)
df.head()

Unnamed: 0,Patient ID,Slice Thickness,Rows,Columns,FOV Computed (Field of View) in cm,Date of Birth (Days),Menopause (at diagnosis),Race and Ethnicity,ER,PR,...,Tumor Location,Position,Bilateral Information,"Clinical Response, Evaluated Through Imaging",Pathologic Response to Neoadjuvant Therapy,Neoadjuvant Chemotherapy,Adjuvant Chemotherapy,Neoadjuvant Endocrine Therapy Medications,Adjuvant Endocrine Therapy Medications,Near Complete ResponseOverall Near-complete Response: Stricter Definition
0,Breast_MRI_001,5,1,1,15,-15209,0,2,0,0,...,L,L 9 medial,0,2.0,2.0,1.0,0.0,0.0,0.0,0.0
1,Breast_MRI_002,12,2,2,5,-14061,0,2,0,0,...,L,L 1,0,1.0,3.0,1.0,0.0,0.0,0.0,2.0
2,Breast_MRI_003,18,2,2,14,-22685,1,1,1,1,...,,L 2,NC,2.0,2.0,0.0,0.0,1.0,1.0,0.0
3,Breast_MRI_004,18,2,2,15,-21479,1,1,1,1,...,L,L,0,,,0.0,0.0,0.0,1.0,
4,Breast_MRI_005,18,1,1,14,-13932,0,5,1,0,...,R,R 3,0,1.0,1.0,1.0,0.0,0.0,1.0,1.0


In [3996]:
import pandas as pd
import numpy as np

def map_category_to_thickness(category):
    """Translate a slice-thickness category code into its thickness value.

    Parameters
    ----------
    category : int-like
        Category code (0-20). Values are truncated with ``int()`` before the
        lookup, so ``5`` and ``5.0`` are equivalent.

    Returns
    -------
    float or None
        The thickness for the code, or ``None`` when the code is unknown or
        cannot be converted to an integer (``None``, NaN, infinity,
        non-numeric text). Returning ``None`` instead of raising keeps a single
        missing cell from aborting a whole ``Series.apply``.
    """
    # Dictionary mapping categories to thickness values
    category_map = {
        0: 0.90,
        1: 0.95,
        2: 1.00,
        3: 1.04,
        4: 1.06,
        5: 1.10,
        6: 1.12,
        7: 1.15,
        8: 1.20,
        9: 1.23,
        10: 1.24,
        11: 1.25,
        12: 1.30,
        13: 1.40,
        14: 1.45,
        15: 1.50,
        16: 1.60,
        17: 1.80,
        18: 2.00,
        19: 2.20,
        20: 2.50
    }

    try:
        code = int(category)
    except (TypeError, ValueError, OverflowError):
        # None -> TypeError, NaN / non-numeric text -> ValueError, inf -> OverflowError
        return None

    return category_map.get(code)


In [3997]:
# Normalise the header: trim surrounding whitespace and replace every space
# with an underscore. One pass is sufficient, since no spaces remain afterwards.
df.columns = df.columns.map(lambda name: name.strip().replace(' ', '_'))

In [3998]:
df.columns

Index(['Patient_ID', 'Slice_Thickness', 'Rows', 'Columns',
       'FOV_Computed_(Field_of_View)_in_cm', 'Date_of_Birth_(Days)',
       'Menopause_(at_diagnosis)', 'Race_and_Ethnicity', 'ER', 'PR', 'HER2',
       'Mol_Subtype', 'Staging(Tumor_Size)#_[T]', 'Tumor_Grade', 'Unnamed:_14',
       'Unnamed:_15', 'Nottingham_grade', 'Histologic_type', 'Tumor_Location',
       'Position', 'Bilateral_Information',
       'Clinical_Response,_Evaluated_Through_Imaging',
       'Pathologic_Response_to_Neoadjuvant_Therapy',
       'Neoadjuvant_Chemotherapy', 'Adjuvant_Chemotherapy',
       'Neoadjuvant_Endocrine_Therapy_Medications',
       'Adjuvant_Endocrine_Therapy_Medications',
       'Near_Complete_ResponseOverall_Near-complete_Response:__Stricter_Definition'],
      dtype='object')

In [3999]:
# Decode the 'Slice_Thickness' category code into its thickness value.
slice_codes = df['Slice_Thickness']
df['slice_thick'] = slice_codes.map(map_category_to_thickness)

In [4000]:
df[['slice_thick']].describe()

Unnamed: 0,slice_thick
count,922.0
mean,1.72962
std,0.417857
min,1.04
25%,1.2
50%,2.0
75%,2.0
max,2.5


In [4001]:
# Lookup from the 'Rows' category code to the corresponding value
# (presumably the in-plane matrix size in pixels — confirm against TCIA docs).
dicct_rows = {
    0: 320,
    1: 448,
    2: 512,
}

In [4002]:
# Decode the 'Rows' category code through dicct_rows; codes missing from the
# dictionary become NaN.
df['n_xy'] = df['Rows'].map(dicct_rows)

In the Duke Breast MRI dataset, "FOV_Computed_(Field_of_View)_in_cm" or "FOVcm" refers to the Field of View measurement in centimeters used during the MRI acquisition.

The Field of View (FOV) in MRI is a critical technical parameter that defines the spatial area or volume that is captured in the imaging process. Specifically:

1. It represents the physical dimensions of the region being imaged (in this case, in centimeters)
2. It's an important acquisition parameter that affects spatial resolution and image quality
3. In breast MRI, the FOV typically needs to be large enough to encompass the entire breast volume

The FOV value impacts several aspects of the imaging:
- Larger FOV values capture more anatomy but may reduce spatial resolution
- Smaller FOV values provide higher spatial resolution but cover less area
- The FOV directly affects the pixel/voxel size in combination with the matrix size

This parameter might be relevant in analyses as it can potentially influence the apparent size of lesions and the quality of extracted radiomics features. It's one of the technical MRI parameters that researchers might need to account for when comparing images acquired with different protocols.

In [4003]:
# Shorten the unwieldy field-of-view column name.
fov_rename = {'FOV_Computed_(Field_of_View)_in_cm': 'FOVcm'}
df = df.rename(columns=fov_rename)

In [4004]:
# Age in whole years. 'Date_of_Birth_(Days)' holds negative day offsets (see the
# df.head() output), so the sign is flipped, the value divided by 365 and the
# fraction truncated.
# NOTE(review): presumably the offset is relative to diagnosis — confirm.
years_since_birth = -1. * df['Date_of_Birth_(Days)'] / 365.
df['age'] = years_since_birth.astype('int64')

In [4005]:
df.head()

Unnamed: 0,Patient_ID,Slice_Thickness,Rows,Columns,FOVcm,Date_of_Birth_(Days),Menopause_(at_diagnosis),Race_and_Ethnicity,ER,PR,...,"Clinical_Response,_Evaluated_Through_Imaging",Pathologic_Response_to_Neoadjuvant_Therapy,Neoadjuvant_Chemotherapy,Adjuvant_Chemotherapy,Neoadjuvant_Endocrine_Therapy_Medications,Adjuvant_Endocrine_Therapy_Medications,Near_Complete_ResponseOverall_Near-complete_Response:__Stricter_Definition,slice_thick,n_xy,age
0,Breast_MRI_001,5,1,1,15,-15209,0,2,0,0,...,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.1,448,41
1,Breast_MRI_002,12,2,2,5,-14061,0,2,0,0,...,1.0,3.0,1.0,0.0,0.0,0.0,2.0,1.3,512,38
2,Breast_MRI_003,18,2,2,14,-22685,1,1,1,1,...,2.0,2.0,0.0,0.0,1.0,1.0,0.0,2.0,512,62
3,Breast_MRI_004,18,2,2,15,-21479,1,1,1,1,...,,,0.0,0.0,0.0,1.0,,2.0,512,58
4,Breast_MRI_005,18,1,1,14,-13932,0,5,1,0,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,448,38


# pCR

In the Duke breast MRI dataset, the "Pathologic_Response_to_Neoadjuvant_Therapy" field represents the degree of tumor response to neoadjuvant therapy (treatment given before surgery), with the following code meanings:

* 1.0 (64 cases): Complete response - No evidence of invasive tumor remaining after therapy
* 2.0 (224 cases): Not complete response - Residual invasive disease present after therapy
* 3.0 (11 cases): DCIS only remaining - All invasive disease eliminated, but ductal carcinoma in situ still present
* 4.0 (1 case): LCIS only remaining - All invasive disease eliminated, but lobular carcinoma in situ still present
* 5.0 (12 cases): Treatment response assessment unavailable - Unable to determine response

The values show that most patients (224) did not achieve a complete response to neoadjuvant therapy, while 64 patients had a complete response (pCR - pathologic Complete Response). The relatively small number of patients with only DCIS or LCIS remaining represents an intermediate response category where the invasive component was eliminated but pre-invasive disease remained.

This variable is often used as a primary endpoint in breast cancer research, as achieving pCR is associated with improved long-term outcomes in many breast cancer subtypes.

In [4006]:
df.Pathologic_Response_to_Neoadjuvant_Therapy.value_counts()

Pathologic_Response_to_Neoadjuvant_Therapy
2.0    224
1.0     64
5.0     12
3.0     11
4.0      1
Name: count, dtype: int64

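Mapping of the response codes to the binary pCR label:

* 5 -> None (response assessment unavailable)
* 1 -> 1 (complete response)
* 2, 3, 4 -> 0 (no complete response)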

In [4007]:
# Collapse the pathologic-response codes into a binary pCR label:
# 1 -> 1, codes 2/3/4 -> 0, code 5 -> missing. Rows whose response is already
# missing stay missing.
# NOTE(review): the notebook prose says patients with
# Neoadjuvant_Chemotherapy == 0 should have pCR = None; this mapping does not
# apply that rule — confirm the intended handling.
pcr_dict = {1: 1, 2: 0, 3: 0, 4: 0, 5: None}
pathologic_response = df['Pathologic_Response_to_Neoadjuvant_Therapy']
df['pCR'] = pathologic_response.map(pcr_dict)

In [4008]:
df['pCR'].value_counts(dropna=False)

pCR
NaN    622
0.0    236
1.0     64
Name: count, dtype: int64

In [4009]:
pd.crosstab(df.Neoadjuvant_Chemotherapy, df['pCR'], dropna=False)

pCR,0.0,1.0,NaN
Neoadjuvant_Chemotherapy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,19,0,582
1.0,217,64,11
,0,0,29


In [4010]:
pd.crosstab(df.Neoadjuvant_Chemotherapy, df.Neoadjuvant_Endocrine_Therapy_Medications, dropna=False)

Neoadjuvant_Endocrine_Therapy_Medications,0.0,1.0,NaN
Neoadjuvant_Chemotherapy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,582,19,0
1.0,288,3,1
,0,0,29


Patients who did not receive neoadjuvant chemotherapy (Neoadjuvant_Chemotherapy == 0) should have pCR = None.

* 0 = not complete or near-complete
* 1 = complete
* 2 = near-complete
* 3 = pathologic assessment not available
* NA = not applicable

In [4011]:
pd.crosstab(df['Near_Complete_ResponseOverall_Near-complete_Response:__Stricter_Definition'],
            df.Adjuvant_Chemotherapy)

Adjuvant_Chemotherapy,0.0,1.0
Near_Complete_ResponseOverall_Near-complete_Response:__Stricter_Definition,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,157,48
1.0,62,2
2.0,28,0
3.0,9,1


In [4012]:
pd.crosstab(df['Near_Complete_ResponseOverall_Near-complete_Response:__Stricter_Definition'],
            df.Neoadjuvant_Chemotherapy, dropna=False)

Neoadjuvant_Chemotherapy,0.0,1.0,NaN
Near_Complete_ResponseOverall_Near-complete_Response:__Stricter_Definition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,19,188,0
1.0,0,64,0
2.0,0,29,0
3.0,1,11,0
,581,0,29


In [4013]:
df.pCR.value_counts()

pCR
0.0    236
1.0     64
Name: count, dtype: int64

In [4014]:
df.pCR.value_counts(dropna = False)

pCR
NaN    622
0.0    236
1.0     64
Name: count, dtype: int64

In [4015]:
df.pCR.value_counts(dropna = False)

pCR
NaN    622
0.0    236
1.0     64
Name: count, dtype: int64

In [4016]:
# Keep only the columns used in the rest of the notebook.
# .copy() makes the result an independent DataFrame: without it, the later
# `df['...'] = ...` column assignments operate on a column-subset slice and can
# trigger pandas' SettingWithCopyWarning.
keep_columns = [
    'Patient_ID', 'Slice_Thickness', 'Rows', 'Columns', 'FOVcm',
    'Date_of_Birth_(Days)', 'Menopause_(at_diagnosis)',
    'Race_and_Ethnicity', 'ER', 'PR', 'HER2', 'Mol_Subtype',
    'Staging(Tumor_Size)#_[T]', 'Tumor_Grade',
    'Nottingham_grade', 'Histologic_type',
    'Bilateral_Information',
    'slice_thick', 'n_xy', 'age', 'pCR',
]
df = df[keep_columns].copy()

In [4017]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 922 entries, 0 to 921
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Patient_ID                922 non-null    object 
 1   Slice_Thickness           922 non-null    int64  
 2   Rows                      922 non-null    int64  
 3   Columns                   922 non-null    int64  
 4   FOVcm                     922 non-null    int64  
 5   Date_of_Birth_(Days)      922 non-null    int64  
 6   Menopause_(at_diagnosis)  922 non-null    int64  
 7   Race_and_Ethnicity        922 non-null    int64  
 8   ER                        922 non-null    int64  
 9   PR                        922 non-null    int64  
 10  HER2                      922 non-null    int64  
 11  Mol_Subtype               922 non-null    int64  
 12  Staging(Tumor_Size)#_[T]  916 non-null    float64
 13  Tumor_Grade               907 non-null    float64
 14  Nottingham

In [4018]:
df['ER'].value_counts()

ER
1    686
0    236
Name: count, dtype: int64

In [4019]:
df['PR'].value_counts()

PR
1    598
0    324
Name: count, dtype: int64

In [4020]:
df['HER2'].value_counts()

HER2
0    759
1    163
Name: count, dtype: int64

In [4021]:
# Hormone-receptor flag: 1 when ER + PR is greater than zero, else 0.
receptor_sum = df['ER'] + df['PR']
df['HR'] = (receptor_sum > 0).astype(int)

In [4022]:
df['HR'].value_counts()

HR
1    699
0    223
Name: count, dtype: int64

In [4023]:
df['HER2'].value_counts()

HER2
0    759
1    163
Name: count, dtype: int64

# Mol_Subtype

In [4024]:
df['Mol_Subtype'].value_counts()

Mol_Subtype
0    595
3    164
1    104
2     59
Name: count, dtype: int64

**Mol_Subtype** refers to molecular subtypes of breast cancer, which are important classifications based on receptor status that help determine treatment strategies and predict prognosis.

- **0 = luminal-like**: These are estrogen receptor (ER) and/or progesterone receptor (PR) positive, but HER2 negative cancers. Often divided into Luminal A and Luminal B subtypes. They generally have better prognosis and respond to hormone therapy.

- **1 = ER/PR pos, HER2 pos**: These are "triple positive" cancers that express both hormone receptors (ER/PR) and HER2. They can be treated with a combination of hormone therapy and HER2-targeted therapies.

- **2 = her2**: These are HER2-positive but hormone receptor-negative (ER/PR negative) cancers. They tend to be more aggressive but respond well to HER2-targeted therapies.

- **3 = trip neg**: Triple-negative breast cancers (TNBC) lack expression of all three receptors (ER-negative, PR-negative, and HER2-negative). These are generally more aggressive with fewer targeted treatment options, often requiring chemotherapy.

In breast MRI research and clinical trials like ISPY2 or ACRIN-6698, these molecular subtypes are crucial variables as they influence:
- Contrast enhancement patterns in DCE-MRI
- Response to neoadjuvant therapy
- Correlation between imaging features and tumor biology
- Prediction of pathological complete response (pCR)

This classification system helps researchers stratify patients in analysis and may reveal different imaging characteristics across molecular subtypes.

### Mol_Subtype
tncs_dict={
* 0 = luminal-like, # (ER) and/or progesterone receptor (PR) positive, but HER2 negative
* 1 = ER/PR pos, HER2 pos,
* 2 = her2, # her2: These are HER2-positive but hormone receptor-negative (ER/PR negative) cancers.
* 3 = trip neg
}

In [4025]:
df['Mol_Subtype'].value_counts()

Mol_Subtype
0    595
3    164
1    104
2     59
Name: count, dtype: int64

In [4026]:
# Readable label for each Mol_Subtype code; codes outside 0-3 become NaN.
subtype_labels = {
    0: 'luminal_hrpos_her2neg',
    1: 'hrpos_her2pos',
    2: 'hrneg_her2pos',
    3: 'triple_neg',
}
df['hormon_status'] = df['Mol_Subtype'].map(subtype_labels)

In [4027]:
pd.crosstab(df['Mol_Subtype'],df['HR'])

HR,0,1
Mol_Subtype,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,595
1,0,104
2,59,0
3,164,0


In [4028]:
pd.crosstab(df['Mol_Subtype'],df['HER2'])

HER2,0,1
Mol_Subtype,Unnamed: 1_level_1,Unnamed: 2_level_1
0,595,0
1,0,104
2,0,59
3,164,0


In [4029]:
import pandas as pd
import numpy as np

def create_hr_her2_status(df):
    """
    Creates HR_HER2 status column matching the categories:
    HER2pos, HRposHER2neg, TripleNeg

    Parameters:
    -----------
    df : pandas DataFrame
        Must contain columns 'HR' and 'HER2'

    Returns:
    --------
    pandas Series
        HR_HER2 status with categories matching the target format. Rows that
        match none of the rules (e.g. missing HER2) are labelled 'Unknown'.
        The Series carries ``df.index`` so that ``df['HR_HER2'] = result``
        aligns row-for-row even when ``df`` has a non-default index (for
        example after filtering).
    """
    her2_positive = df['HER2'] == 1
    her2_negative = df['HER2'] == 0

    # np.select picks the first matching condition, so order matters.
    conditions = [
        her2_positive,                        # HER2 positive (any HR status)
        (df['HR'] == 1) & her2_negative,      # HR positive and HER2 negative
        (df['HR'] == 0) & her2_negative,      # Triple negative
    ]

    choices = ['HER2pos', 'HRposHER2neg', 'TripleNeg']

    labels = np.select(conditions, choices, default='Unknown')
    # Re-attach the caller's index; a bare pd.Series(labels) would get a fresh
    # 0..n-1 index and misalign on assignment.
    return pd.Series(labels, index=df.index)

In [4030]:
# Derive the combined HR/HER2 label and show how many patients fall into each
# category (NaN included, if any).
df['HR_HER2'] = create_hr_her2_status(df)
df['HR_HER2'].value_counts(dropna=False)

HR_HER2
HRposHER2neg    595
TripleNeg       164
HER2pos         163
Name: count, dtype: int64

In [4031]:
# One 0/1 indicator column per HR_HER2 category.
hr_her2 = df['HR_HER2']
df['TripleNeg'] = hr_her2.eq('TripleNeg').astype(int)
df['HER2pos'] = hr_her2.eq('HER2pos').astype(int)
df['HRposHER2neg'] = hr_her2.eq('HRposHER2neg').astype(int)

In [4032]:
pd.crosstab(df['Mol_Subtype'], df['HR_HER2'])

HR_HER2,HER2pos,HRposHER2neg,TripleNeg
Mol_Subtype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,595,0
1,104,0,0
2,59,0,0
3,0,0,164


In [4033]:
pd.crosstab(df['Mol_Subtype'], df['HER2'])

HER2,0,1
Mol_Subtype,Unnamed: 1_level_1,Unnamed: 2_level_1
0,595,0
1,0,104
2,0,59
3,164,0


In [4034]:
pd.crosstab(df['Mol_Subtype'], df['HR'])

HR,0,1
Mol_Subtype,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,595
1,0,104
2,59,0
3,164,0


In [4035]:
pd.crosstab(df['Mol_Subtype'], df['ER'])

ER,0,1
Mol_Subtype,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10,585
1,3,101
2,59,0
3,164,0


In [4036]:
pd.crosstab(df['Mol_Subtype'], df['PR'])

PR,0,1
Mol_Subtype,Unnamed: 1_level_1,Unnamed: 2_level_1
0,77,518
1,24,80
2,59,0
3,164,0


In [4037]:
df['Menopause_(at_diagnosis)'].value_counts()

Menopause_(at_diagnosis)
1    499
0    407
2     16
Name: count, dtype: int64

In [4038]:
df['Menopause_(at_diagnosis)'].value_counts()

Menopause_(at_diagnosis)
1    499
0    407
2     16
Name: count, dtype: int64

In [4039]:
# Binary menopause flag: codes 0 and 1 are kept, code 2 is treated as missing.
# NOTE(review): code 2 presumably means "not available" — confirm against the
# TCIA data dictionary.
menopause_codes = {0: 0, 1: 1, 2: None}
df['menopause'] = df['Menopause_(at_diagnosis)'].map(menopause_codes)

In [4040]:
df[df.menopause.isna()].age

6      44
28     64
87     42
119    46
121    47
308    62
381    32
471    47
539    48
583    54
598    51
668    80
812    40
866    55
875    56
905    45
Name: age, dtype: int64

In [4041]:

# Sanity check: distribution of the recoded flag and number of missing values.
menopause_counts = df['menopause'].value_counts()
menopause_missing = df['menopause'].isna().sum()
print(f"menopause: {menopause_counts}")
print(f"Missing values remaining: {menopause_missing}")

menopause: menopause
1.0    499
0.0    407
Name: count, dtype: int64
Missing values remaining: 16


In [4042]:
import warnings
# NOTE(review): this silences every warning from every library for the rest of
# the session, which can hide real problems (e.g. pandas copy/assignment
# warnings). Consider narrowing it with the `category=`/`module=` arguments.
warnings.filterwarnings('ignore')

T (Tumor) describes the size and extent of the primary tumor.
- T1: Small tumor (≤2 cm)
- T2: Medium tumor (2–5 cm)
- T3: Large tumor (>5 cm)
- T4: Tumor has spread beyond the organ
This gives the diameter in cm.


In [4043]:
df.Race_and_Ethnicity.value_counts()

Race_and_Ethnicity
1    651
2    203
0     19
5     18
3     14
6      9
8      4
4      3
7      1
Name: count, dtype: int64

In [4044]:
# 0/1 indicator columns for the two most frequent Race_and_Ethnicity codes
# (code 1 -> race_white, code 2 -> race_black).
race_code = df['Race_and_Ethnicity']
df['race_white'] = race_code.eq(1).astype(int)
df['race_black'] = race_code.eq(2).astype(int)


In [4045]:
df.race_black.value_counts(dropna=False)

race_black
0    719
1    203
Name: count, dtype: int64

# Tumor characteristics
## Tumor stage

Let me break down the meaning of these values in the context of tumor staging:

The values are the T (primary-tumor size) categories recorded in `Staging(Tumor_Size)#_[T]` of the Duke Breast MRI dataset; they are not overall cancer stages:

1. T1 (409 cases): Small tumor (≤2 cm), best prognosis of the four categories
2. T2 (395 cases): Medium tumor (2–5 cm)
3. T3 (90 cases): Large tumor (>5 cm)
4. T4 (22 cases): Tumor has spread beyond the breast tissue itself (e.g. chest wall or skin); this is the most advanced T category but is not the same as metastatic disease

This distribution shows that most cases in the dataset are in the early to intermediate stages (stages 1.0 and 2.0), with fewer cases in the advanced stages (3.0 and 4.0). This is typical in cancer datasets, as early detection is crucial in breast cancer management.

Great question! I'll explain the nuanced relationship between tumor size and severity.

Tumor staging is far more complex than just size. There can absolutely be cases where a smaller tumor is more severe than a larger one. The staging takes into account multiple factors:

1. **Tumor Size**: Only one component of staging
2. **Cellular Aggressiveness**: How quickly the tumor grows
3. **Genetic Characteristics**: Molecular markers
4. **Spread Potential**: Lymph node involvement
5. **Invasion Capability**: How the tumor interacts with surrounding tissues

Real-world example: A small 1 cm tumor with aggressive, rapidly dividing cells and high metastatic potential could be more severe than a 5 cm tumor with slow-growing, less invasive cells.

Think of it like this:
- A tiny but "smart" and aggressive cancer cell can be more dangerous than a larger, slower-growing tumor
- Staging tries to capture this complexity, not just measure size

Oncologists use multiple tests to determine:
- Tumor size
- Cellular characteristics
- Genetic markers
- Potential for spread

So while size is important, it's not the only factor determining a cancer's severity or stage. Each tumor is unique and requires a comprehensive assessment.

Yes, precisely! The staging is defined in the pre-treatment stage, before any treatment begins. This initial staging is crucial as it determines the entire treatment approach.

The staging is definitively determined in the pre-treatment stage, which means:
- Before any surgery
- Before chemotherapy
- Before radiation
- Before any cancer treatments begin

This initial staging is crucial because:
1. It provides the baseline understanding of the cancer
2. It guides the entire treatment plan
3. It helps predict potential outcomes
4. It allows for personalized treatment approach

The process involves:
- Comprehensive imaging (MRI, CT scans)
- Detailed biopsy
- Molecular and genetic testing
- Physical examinations
- Assessing potential spread

Think of it like a complete medical detective work before deciding on the treatment strategy. The goal is to understand the cancer as comprehensively as possible before taking any action.

After initial treatment, there might be a pathological restaging, which can refine the initial assessment, but the original pre-treatment staging remains the primary reference point.

Would you like me to elaborate on how this pre-treatment staging is conducted in detail?

## Tumor grade
Tumor grading is both standardized and subjective. Here's a nuanced explanation:

Standardization Efforts:
1. Worldwide organizations like WHO have created guidelines
2. Most common system is the Nottingham Grading System
3. Specific scoring criteria for:
   - Cell differentiation
   - Mitotic activity
   - Nuclear characteristics

Sources of Subjectivity:
- Different pathologists may interpret cellular characteristics slightly differently
- Studies show up to 20-30% variation in grade assignment
- More subjective in complex or borderline cases

Reduction of Subjectivity:
- Multiple pathologist reviews
- Quantitative scoring systems
- Emerging AI-assisted digital pathology tools
- Molecular testing to complement visual grading

Real-World Analogy:
Think of it like art criticism. While there are established criteria for evaluating art, there's still room for individual interpretation. Similarly, tumor grading has standard guidelines, but some expert judgment is always involved.

Key Takeaway: While there are strong standardization efforts, some level of subjectivity is inherent in the process. This is why:
- Multiple expert opinions are valuable
- Molecular testing is increasingly used
- Continuous research refines grading techniques

Would you like me to elaborate on how pathologists work to minimize this subjectivity?

In [4046]:
df['Tumor_Grade'].value_counts(dropna=False)

Tumor_Grade
3.0    695
2.0    144
1.0     68
NaN     15
Name: count, dtype: int64

In [4047]:
df['Nottingham_grade'].value_counts(dropna=False)

Nottingham_grade
2.0    318
NaN    284
3.0    207
1.0    113
Name: count, dtype: int64

**Key Point: These NaN values are not random errors, but likely represent real limitations in the diagnostic process.**

In [4048]:
df[df['Nottingham_grade'].isna()]

Unnamed: 0,Patient_ID,Slice_Thickness,Rows,Columns,FOVcm,Date_of_Birth_(Days),Menopause_(at_diagnosis),Race_and_Ethnicity,ER,PR,...,pCR,HR,hormon_status,HR_HER2,TripleNeg,HER2pos,HRposHER2neg,menopause,race_white,race_black
2,Breast_MRI_003,18,2,2,14,-22685,1,1,1,1,...,0.0,1,luminal_hrpos_her2neg,HRposHER2neg,0,0,1,1.0,1,0
5,Breast_MRI_006,18,2,2,14,-16735,0,1,1,1,...,,1,luminal_hrpos_her2neg,HRposHER2neg,0,0,1,0.0,1,0
10,Breast_MRI_011,18,2,2,14,-19389,1,1,0,0,...,,0,triple_neg,TripleNeg,1,0,0,1.0,1,0
15,Breast_MRI_016,18,2,2,11,-19059,1,6,1,1,...,,1,luminal_hrpos_her2neg,HRposHER2neg,0,0,1,1.0,0,0
21,Breast_MRI_022,5,1,1,10,-19510,1,1,1,1,...,0.0,1,hrpos_her2pos,HER2pos,0,1,0,1.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
902,Breast_MRI_903,8,2,2,9,-18978,1,2,0,0,...,,0,hrneg_her2pos,HER2pos,0,1,0,1.0,0,1
903,Breast_MRI_904,5,0,0,15,-15089,0,0,1,1,...,,1,luminal_hrpos_her2neg,HRposHER2neg,0,0,1,0.0,0,0
904,Breast_MRI_905,18,2,2,14,-14552,1,1,1,1,...,,1,luminal_hrpos_her2neg,HRposHER2neg,0,0,1,1.0,1,0
910,Breast_MRI_911,18,2,2,14,-15563,0,1,1,0,...,,1,luminal_hrpos_her2neg,HRposHER2neg,0,0,1,0.0,1,0


In [4049]:
pd.crosstab(df['Nottingham_grade'], df.pCR, dropna=False)

pCR,0.0,1.0,NaN
Nottingham_grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,11,0,102
2.0,78,9,231
3.0,64,37,106
,83,18,183


In [4050]:
pd.crosstab(df['Nottingham_grade'], df.Histologic_type, dropna=False)

Histologic_type,0.0,1.0,2.0,3.0,5.0,9.0,NaN
Nottingham_grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0,0,93,18,0,2,0,0
2.0,1,273,39,1,0,4,0
3.0,0,201,6,0,0,0,0
,0,8,0,0,0,0,276


In [4051]:
pd.crosstab(df.pCR, df.Histologic_type, dropna=False)

Histologic_type,0.0,1.0,2.0,3.0,5.0,9.0,NaN
pCR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,0,136,16,0,0,1,83
1.0,0,46,0,0,0,0,18
,1,393,47,1,2,3,175


## Histologic Type distribution in the Duke MRI dataset:

This section explains the Histologic Type field in the context of the Duke MRI dataset.

Let me break down the Histologic Type values in more detail:

1. **1.0 (Invasive Ductal Carcinoma - IDC)**
   - Most common type
   - Starts in milk ducts
   - Breaks through duct wall
   - Represents majority of cases (575/922 cases)

2. **2.0 (Invasive Lobular Carcinoma - ILC)**
   - Begins in milk-producing lobules
   - Spreads to surrounding tissue
   - Less common (63/922 cases)

3. **Other Values (3.0, 5.0, 9.0)**
   - Rare or specialized subtypes
   - Could include:
     * Medullary Carcinoma
     * Metaplastic Carcinoma
     * Inflammatory Breast Cancer
     * Mixed cell types

4. **NaN (276 cases)**
   - Missing or undetermined histologic type
   - Could be due to:
     * Incomplete medical records
     * Challenging sample analysis
     * Preliminary diagnosis

The key takeaway is that the dataset is predominantly composed of Invasive Ductal Carcinoma (1.0), with a small number of other types and a significant portion of unclassified (NaN) cases.

Would you like me to elaborate on any specific aspect of histologic typing or the distribution in this dataset?

In [4052]:
df['Histologic_type'].value_counts(dropna=False)

Histologic_type
1.0    575
NaN    276
2.0     63
9.0      4
5.0      2
3.0      1
0.0      1
Name: count, dtype: int64

In [4053]:
# Histologic-type code -> label. Codes 1 and 2 get their own label; the
# infrequent codes 9, 3, 0 and 5 are pooled as 'rare'.
hist_dict = {1: 'IDC', 2: 'ILC'}
hist_dict.update(dict.fromkeys((9, 3, 0, 5), 'rare'))

In [4054]:
# Label each patient's histologic type via hist_dict; missing or unmapped codes
# stay NaN.
df['hist_type']=df['Histologic_type'].map(hist_dict)

In [4055]:
df['hist_type'].value_counts(dropna=False)

hist_type
IDC     575
NaN     276
ILC      63
rare      8
Name: count, dtype: int64

### Bilateral_Information
Practical Implications

0 (Unilateral): Standard treatment approach
1 (Bilateral):

More comprehensive surgery
Potential bilateral mastectomy
Enhanced genetic testing


NC: Requires further investigation

In [4056]:
df['Bilateral_Information'].value_counts(dropna=False)

Bilateral_Information
0     623
NC    271
1      28
Name: count, dtype: int64

In [4057]:
df['Bilateral_Information'].values[3]==0

True

In [4058]:
df['Bilateral_Information'].values[2]=='NC'

True

In [4059]:
# Numeric bilateral flag. The source column mixes the integers 0/1 with the
# string 'NC', which is turned into a missing value.
bilateral_codes = {0: 0, 1: 1, 'NC': np.nan}
df['bilateral'] = df['Bilateral_Information'].map(bilateral_codes)

In [4094]:
df[~df.pCR.isna()]['bilateral'].value_counts(dropna=False)

bilateral
0.0    193
NaN     99
1.0      8
Name: count, dtype: int64

In [4061]:
df['Staging(Tumor_Size)#_[T]'].value_counts(dropna=False)

Staging(Tumor_Size)#_[T]
1.0    409
2.0    395
3.0     90
4.0     22
NaN      6
Name: count, dtype: int64

In [4062]:
# Give the T-staging column a short, identifier-friendly name.
stage_rename = {'Staging(Tumor_Size)#_[T]': 'stage_tum_s'}
df = df.rename(columns=stage_rename)

In [4063]:
# Representative numeric size for each T category (1 -> 1.5, 2 -> 2.5,
# 3 -> 6, 4 -> 8); a missing category stays NaN.
# NOTE(review): despite the "vol" in the column name, these look like
# diameters in cm rather than volumes — confirm the intended unit.
stage_to_size = {1: 1.5, 2: 2.5, 3: 6, 4: 8}
df['tum_vol_stage'] = df['stage_tum_s'].map(stage_to_size)

## Tumor Stage and Tumor Volume, Merge more tcia data

In [4064]:
# Load the TCIA imaging-features spreadsheet (path relative to the working
# directory) and list its columns.
imaging_xlsx = os.path.join('TCIA_metadata', "Imaging_Features.xlsx")
dd = pd.read_excel(imaging_xlsx)
dd.columns

Index(['Patient ID', 'F1_DT_POSTCON (T11=0.05,T12=0.5)',
       'F1_DT_POSTCON (T11=0.05,T12=0.1)', 'F1_DT_POSTCON (T11=0.02,T12=0.5)',
       'F1_DT_POSTCON (T11=0.02,T12=0.8)', 'F1_DT_POSTCON (T11=0.05,T12=0.8)',
       'F1_DT_POSTCON (T11=0.1,T12=0.5)', 'F1_DT_POSTCON (T11=0.1,T12=0.8)',
       'F1_DT_POSTCON (T11=0.2,T12=0.5)', 'F1_DT_POSTCON (T11=0.2,T12=0.8)',
       ...
       'WashinRate_map_difference_entropy_tissue_PostCon',
       'WashinRate_map_information_measure_correlation1_tissue_PostCon',
       'WashinRate_map_information_measure_correlation2_tissue_PostCon',
       'WashinRate_map_inverse_difference_is_homom_tissue_PostCon',
       'WashinRate_map_inverse_difference_normalized_tissue_PostCon',
       'WashinRate_map_inverse_difference_moment_normalized_tissue_PostCon',
       'WashinRate_map_mean_tissue_PostCon',
       'WashinRate_map_std_dev_tissue_PostCon',
       'WashinRate_map_skewness_tissue_PostCon',
       'WashinRate_map_kurtosis_tissue_PostCon'],
      dt

In [4065]:
dd.head()

Unnamed: 0,Patient ID,"F1_DT_POSTCON (T11=0.05,T12=0.5)","F1_DT_POSTCON (T11=0.05,T12=0.1)","F1_DT_POSTCON (T11=0.02,T12=0.5)","F1_DT_POSTCON (T11=0.02,T12=0.8)","F1_DT_POSTCON (T11=0.05,T12=0.8)","F1_DT_POSTCON (T11=0.1,T12=0.5)","F1_DT_POSTCON (T11=0.1,T12=0.8)","F1_DT_POSTCON (T11=0.2,T12=0.5)","F1_DT_POSTCON (T11=0.2,T12=0.8)",...,WashinRate_map_difference_entropy_tissue_PostCon,WashinRate_map_information_measure_correlation1_tissue_PostCon,WashinRate_map_information_measure_correlation2_tissue_PostCon,WashinRate_map_inverse_difference_is_homom_tissue_PostCon,WashinRate_map_inverse_difference_normalized_tissue_PostCon,WashinRate_map_inverse_difference_moment_normalized_tissue_PostCon,WashinRate_map_mean_tissue_PostCon,WashinRate_map_std_dev_tissue_PostCon,WashinRate_map_skewness_tissue_PostCon,WashinRate_map_kurtosis_tissue_PostCon
0,Breast_MRI_001,1.0,0.120721,0.530395,1.0,1.0,1.0,1.0,1.0,1.0,...,3.380663,-0.025575,0.422391,0.171959,0.960359,0.996829,14.517894,20.347506,1.62587,11.406955
1,Breast_MRI_002,1.0,0.129546,0.485217,1.0,1.0,1.0,1.0,1.0,1.0,...,3.444474,-0.036063,0.505652,0.177087,0.959067,0.996363,47.29795,83.909561,0.251498,5.659428
2,Breast_MRI_003,0.174775,0.062051,0.06991,0.132265,0.330662,0.34955,0.661324,0.6991,1.0,...,3.478455,-0.04373,0.546674,0.170507,0.957527,0.995981,114.171582,129.252343,1.928743,11.554948
3,Breast_MRI_004,0.086546,0.045111,0.034619,0.051265,0.128162,0.173093,0.256325,0.346185,0.51265,...,3.389678,-0.017802,0.363818,0.17721,0.960705,0.996827,33.499175,69.164227,1.171314,8.493319
4,Breast_MRI_005,0.289669,0.052031,0.115868,0.378575,0.839984,0.579338,1.0,0.958287,1.0,...,4.009938,-0.049294,0.603426,0.117966,0.930624,0.989135,34.406635,26.951415,0.985464,4.331451


In [4066]:
# Use the short patient-id column name 'pid', then save the imaging features
# as CSV next to the source spreadsheet (without the row index).
dd = dd.rename(columns={'Patient ID': 'pid'})

imaging_csv = os.path.join('TCIA_metadata', "duke_imaging_features.csv")
dd.to_csv(imaging_csv, index=False)

In [4067]:
dd.columns[15:70]

Index(['F1_DT_T1NFS (T11=0.1,T12=0.5)', 'F1_DT_T1NFS (T11=0.1,T12=0.8)',
       'F1_DT_T1NFS (T11=0.2,T12=0.5)', 'F1_DT_T1NFS (T11=0.2,T12=0.8)',
       'TumorMajorAxisLength_mm', 'BEVR_Tumor', 'BEDR1_Tumor', 'BEDR2_Tumor',
       'MF_Tumor', 'ASD_Tumor', 'Volume_cu_mm_Tumor', 'Autocorrelation_Tumor',
       'Contrast_Tumor', 'Correlation1_Tumor', 'Correlation2_Tumor',
       'Cluster_Prominence_Tumor', 'Cluster_Shade_Tumor',
       'Dissimilarity_Tumor', 'Energy_Tumor', 'Entropy_Tumor',
       'Homogeneity1_Tumor', 'Homogeneity2_Tumor', 'Max_probability_Tumor',
       'Sum_of_squares_variance_Tumor', 'Sum_avg_Tumor', 'Sum_variance_Tumor',
       'Sum_entropy_Tumor', 'Diff_entropy_Tumor', 'Inf_mea_of_corr1_Tumor',
       'Inf_mea_of_corr2_Tumor', 'Inv_Diff_homo_Tumor',
       'Inv_Diff_normalized_Tumor', 'Inv_diff_moment_normalized_Tumor',
       'Median_solidity_Tumor', 'Median_Elongation_Tumor',
       'Median_Euler_No_Tumor', 'Max_Enhancement_from_char_curv',
       'Time_to_Peak_fr

In [4068]:
# Summary statistics for the tumor volume feature
# (the column name suggests the unit is cubic millimetres).
dd[['Volume_cu_mm_Tumor']].describe()

Unnamed: 0,Volume_cu_mm_Tumor
count,921.0
mean,9915.76485
std,24114.85682
min,25.077509
25%,994.726562
50%,2833.614621
75%,8240.313649
max,424316.27016


In [4069]:
# Scale the tumor volume down by 1000 into a new column.
# NOTE(review): presumably mm^3 -> cm^3, going by the "cu_mm" source column name - confirm.
dd['tum_vol_tcia_imaging'] = dd['Volume_cu_mm_Tumor'].div(1000.)
dd.head()

Unnamed: 0,pid,"F1_DT_POSTCON (T11=0.05,T12=0.5)","F1_DT_POSTCON (T11=0.05,T12=0.1)","F1_DT_POSTCON (T11=0.02,T12=0.5)","F1_DT_POSTCON (T11=0.02,T12=0.8)","F1_DT_POSTCON (T11=0.05,T12=0.8)","F1_DT_POSTCON (T11=0.1,T12=0.5)","F1_DT_POSTCON (T11=0.1,T12=0.8)","F1_DT_POSTCON (T11=0.2,T12=0.5)","F1_DT_POSTCON (T11=0.2,T12=0.8)",...,WashinRate_map_information_measure_correlation1_tissue_PostCon,WashinRate_map_information_measure_correlation2_tissue_PostCon,WashinRate_map_inverse_difference_is_homom_tissue_PostCon,WashinRate_map_inverse_difference_normalized_tissue_PostCon,WashinRate_map_inverse_difference_moment_normalized_tissue_PostCon,WashinRate_map_mean_tissue_PostCon,WashinRate_map_std_dev_tissue_PostCon,WashinRate_map_skewness_tissue_PostCon,WashinRate_map_kurtosis_tissue_PostCon,tum_vol_tcia_imaging
0,Breast_MRI_001,1.0,0.120721,0.530395,1.0,1.0,1.0,1.0,1.0,1.0,...,-0.025575,0.422391,0.171959,0.960359,0.996829,14.517894,20.347506,1.62587,11.406955,5.526842
1,Breast_MRI_002,1.0,0.129546,0.485217,1.0,1.0,1.0,1.0,1.0,1.0,...,-0.036063,0.505652,0.177087,0.959067,0.996363,47.29795,83.909561,0.251498,5.659428,1.279434
2,Breast_MRI_003,0.174775,0.062051,0.06991,0.132265,0.330662,0.34955,0.661324,0.6991,1.0,...,-0.04373,0.546674,0.170507,0.957527,0.995981,114.171582,129.252343,1.928743,11.554948,2.442624
3,Breast_MRI_004,0.086546,0.045111,0.034619,0.051265,0.128162,0.173093,0.256325,0.346185,0.51265,...,-0.017802,0.363818,0.17721,0.960705,0.996827,33.499175,69.164227,1.171314,8.493319,0.491384
4,Breast_MRI_005,0.289669,0.052031,0.115868,0.378575,0.839984,0.579338,1.0,0.958287,1.0,...,-0.049294,0.603426,0.117966,0.930624,0.989135,34.406635,26.951415,0.985464,4.331451,10.006104


In [4070]:
# Dimensions of the imaging-features table: (rows, columns).
dd.shape

(922, 531)

In [4071]:
# Display the patient key next to the rescaled tumor volume.
dd.loc[:, ["pid", "tum_vol_tcia_imaging"]]

Unnamed: 0,pid,tum_vol_tcia_imaging
0,Breast_MRI_001,5.526842
1,Breast_MRI_002,1.279434
2,Breast_MRI_003,2.442624
3,Breast_MRI_004,0.491384
4,Breast_MRI_005,10.006104
...,...,...
917,Breast_MRI_918,14.826966
918,Breast_MRI_919,0.874207
919,Breast_MRI_920,0.603619
920,Breast_MRI_921,9.800327


In [4072]:
# Rename df's patient key to 'pid' so it matches the key column used in dd
# (required by the on="pid" merge further down).
df=df.rename(columns={'Patient_ID':'pid'})

In [4073]:
# Preview the first rows of df after the key rename.
df.head()

Unnamed: 0,pid,Slice_Thickness,Rows,Columns,FOVcm,Date_of_Birth_(Days),Menopause_(at_diagnosis),Race_and_Ethnicity,ER,PR,...,HR_HER2,TripleNeg,HER2pos,HRposHER2neg,menopause,race_white,race_black,hist_type,bilateral,tum_vol_stage
0,Breast_MRI_001,5,1,1,15,-15209,0,2,0,0,...,HER2pos,0,1,0,0.0,0,1,IDC,0.0,2.5
1,Breast_MRI_002,12,2,2,5,-14061,0,2,0,0,...,TripleNeg,1,0,0,0.0,0,1,IDC,0.0,2.5
2,Breast_MRI_003,18,2,2,14,-22685,1,1,1,1,...,HRposHER2neg,0,0,1,1.0,1,0,,,2.5
3,Breast_MRI_004,18,2,2,15,-21479,1,1,1,1,...,HRposHER2neg,0,0,1,1.0,1,0,IDC,0.0,1.5
4,Breast_MRI_005,18,1,1,14,-13932,0,5,1,0,...,HER2pos,0,1,0,0.0,0,0,IDC,0.0,2.5


In [4074]:

# Attach the imaging-derived tumor volume to every patient in df.
# - how="left" keeps all df rows; patients without a match get NaN.
# - Dropping an existing copy of the column first keeps this cell idempotent:
#   re-running a plain merge would create tum_vol_tcia_imaging_x / _y columns
#   and break the later df[cols] selection.
# - validate="many_to_one" makes pandas raise if 'pid' is duplicated in dd,
#   instead of silently multiplying rows in df.
df = (
    df.drop(columns=["tum_vol_tcia_imaging"], errors="ignore")
      .merge(dd[["pid", "tum_vol_tcia_imaging"]], on="pid", how="left",
             validate="many_to_one")
)

In [4075]:
# Check column dtypes and non-null counts after merging in the tumor volume.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 922 entries, 0 to 921
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   pid                       922 non-null    object 
 1   Slice_Thickness           922 non-null    int64  
 2   Rows                      922 non-null    int64  
 3   Columns                   922 non-null    int64  
 4   FOVcm                     922 non-null    int64  
 5   Date_of_Birth_(Days)      922 non-null    int64  
 6   Menopause_(at_diagnosis)  922 non-null    int64  
 7   Race_and_Ethnicity        922 non-null    int64  
 8   ER                        922 non-null    int64  
 9   PR                        922 non-null    int64  
 10  HER2                      922 non-null    int64  
 11  Mol_Subtype               922 non-null    int64  
 12  stage_tum_s               916 non-null    float64
 13  Tumor_Grade               907 non-null    float64
 14  Nottingham

## Annotation of Bounding Box




In [4076]:
# Count how many rows of df have each n_xy value.
df['n_xy'].value_counts()

n_xy
512    628
448    261
320     33
Name: count, dtype: int64

In [4077]:
# List the columns currently present in df.
df.columns

Index(['pid', 'Slice_Thickness', 'Rows', 'Columns', 'FOVcm',
       'Date_of_Birth_(Days)', 'Menopause_(at_diagnosis)',
       'Race_and_Ethnicity', 'ER', 'PR', 'HER2', 'Mol_Subtype', 'stage_tum_s',
       'Tumor_Grade', 'Nottingham_grade', 'Histologic_type',
       'Bilateral_Information', 'slice_thick', 'n_xy', 'age', 'pCR', 'HR',
       'hormon_status', 'HR_HER2', 'TripleNeg', 'HER2pos', 'HRposHER2neg',
       'menopause', 'race_white', 'race_black', 'hist_type', 'bilateral',
       'tum_vol_stage', 'tum_vol_tcia_imaging'],
      dtype='object')

In [4078]:
# Sanity check: list any patients whose 'Rows' and 'Columns' values differ.
df[df['Rows'].ne(df['Columns'])]

Unnamed: 0,pid,Slice_Thickness,Rows,Columns,FOVcm,Date_of_Birth_(Days),Menopause_(at_diagnosis),Race_and_Ethnicity,ER,PR,...,TripleNeg,HER2pos,HRposHER2neg,menopause,race_white,race_black,hist_type,bilateral,tum_vol_stage,tum_vol_tcia_imaging


In [4079]:
# Load the annotation boxes spreadsheet and preview its first row.
annotation_boxes_xlsx = os.path.join('TCIA_metadata/', 'Annotation_Boxes.xlsx')
df_masks = pd.read_excel(annotation_boxes_xlsx)
df_masks.head(1)

Unnamed: 0,Patient ID,Start Row,End Row,Start Column,End Column,Start Slice,End Slice
0,Breast_MRI_001,234,271,308,341,89,112


In [4080]:
# Dimensions of the annotation-box table: (rows, columns).
df_masks.shape

(922, 7)

In [4081]:
# Copy each patient's annotation box from df_masks into df.
# One keyed lookup per column replaces the previous row-by-row loop, which
# re-scanned the whole of df_masks for every patient and wrote cell by cell.

# Spreadsheet column -> column name used in df (names kept as-is because the
# export column list further down refers to them).
box_columns = {
    'Start Row': 'sraw',
    'End Row': 'eraw',
    'Start Column': 'scol',
    'End Column': 'ecol',
    'Start Slice': 'mask_start',
    'End Slice': 'mask_end',
}

# Keep the first box per patient (the same row the old loop picked via
# .values[0]); this also gives the unique index that Series.map needs.
boxes_by_pid = df_masks.drop_duplicates(subset='Patient ID').set_index('Patient ID')

for src_col, dst_col in box_columns.items():
    # Patients without an annotation get NaN. Cast to float so the dtype does
    # not depend on whether every patient matched (the cell-wise assignment
    # used before is expected to have produced float columns as well).
    df[dst_col] = df['pid'].map(boxes_by_pid[src_col]).astype(float)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 922 entries, 0 to 921
Data columns (total 40 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   pid                       922 non-null    object 
 1   Slice_Thickness           922 non-null    int64  
 2   Rows                      922 non-null    int64  
 3   Columns                   922 non-null    int64  
 4   FOVcm                     922 non-null    int64  
 5   Date_of_Birth_(Days)      922 non-null    int64  
 6   Menopause_(at_diagnosis)  922 non-null    int64  
 7   Race_and_Ethnicity        922 non-null    int64  
 8   ER                        922 non-null    int64  
 9   PR                        922 non-null    int64  
 10  HER2                      922 non-null    int64  
 11  Mol_Subtype               922 non-null    int64  
 12  stage_tum_s               916 non-null    float64
 13  Tumor_Grade               907 non-null    float64
 14  Nottingham

## Add information from dicom

In [4082]:
# Load the DICOM-to-NIfTI mapping table and inspect its columns and null counts.
dicom_mapping_csv = 'duke_dicom_nifti_mapping.csv'
df_dcm = pd.read_csv(dicom_mapping_csv)
df_dcm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   pid           920 non-null    object 
 1   n_times       920 non-null    int64  
 2   n_xy          920 non-null    int64  
 3   n_z           920 non-null    int64  
 4   slice_space   920 non-null    float64
 5   xy_spacing    920 non-null    float64
 6   voxel_z       920 non-null    float64
 7   top_folder    920 non-null    object 
 8   file_count    920 non-null    float64
 9   t1_file       920 non-null    object 
 10  acq0          920 non-null    object 
 11  acq1          920 non-null    object 
 12  acq2          920 non-null    object 
 13  acq3          913 non-null    object 
 14  acq4          433 non-null    object 
 15  acq0_increas  920 non-null    float64
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [4083]:
# Add the per-patient values taken from the DICOM mapping table.
# - how="left" keeps all df rows; patients missing from df_dcm get NaN in
#   these columns (df_dcm lists fewer patients than df).
# - Dropping existing copies first keeps the cell idempotent: re-running a
#   plain merge would create *_x / *_y duplicates of these columns.
# - validate="many_to_one" makes pandas raise if 'pid' is duplicated in
#   df_dcm, instead of silently multiplying rows in df.
dicom_cols = ["xy_spacing", "voxel_z", "n_times", "n_z"]
df = (
    df.drop(columns=dicom_cols, errors="ignore")
      .merge(df_dcm[["pid"] + dicom_cols], on="pid", how="left",
             validate="many_to_one")
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 922 entries, 0 to 921
Data columns (total 44 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   pid                       922 non-null    object 
 1   Slice_Thickness           922 non-null    int64  
 2   Rows                      922 non-null    int64  
 3   Columns                   922 non-null    int64  
 4   FOVcm                     922 non-null    int64  
 5   Date_of_Birth_(Days)      922 non-null    int64  
 6   Menopause_(at_diagnosis)  922 non-null    int64  
 7   Race_and_Ethnicity        922 non-null    int64  
 8   ER                        922 non-null    int64  
 9   PR                        922 non-null    int64  
 10  HER2                      922 non-null    int64  
 11  Mol_Subtype               922 non-null    int64  
 12  stage_tum_s               916 non-null    float64
 13  Tumor_Grade               907 non-null    float64
 14  Nottingham

In [4084]:
# Rename the HR/HER2 column to the name used in the export column list.
df = df.rename(columns={'HR_HER2': 'HR_HER2_STATUS'})

In [4085]:
# Per-patient acquisition indices: 0, 1 and the last one (n_times - 1).
# NOTE(review): presumably the pre-contrast, early and late post-contrast
# acquisitions - confirm. 'post_late' is NaN wherever n_times is missing.
df['pre'] = 0
df['post_early'] = 1
df['post_late'] = df['n_times'].sub(1)

In [4086]:
# List df's columns after adding the box, DICOM and acquisition-index columns.
df.columns

Index(['pid', 'Slice_Thickness', 'Rows', 'Columns', 'FOVcm',
       'Date_of_Birth_(Days)', 'Menopause_(at_diagnosis)',
       'Race_and_Ethnicity', 'ER', 'PR', 'HER2', 'Mol_Subtype', 'stage_tum_s',
       'Tumor_Grade', 'Nottingham_grade', 'Histologic_type',
       'Bilateral_Information', 'slice_thick', 'n_xy', 'age', 'pCR', 'HR',
       'hormon_status', 'HR_HER2_STATUS', 'TripleNeg', 'HER2pos',
       'HRposHER2neg', 'menopause', 'race_white', 'race_black', 'hist_type',
       'bilateral', 'tum_vol_stage', 'tum_vol_tcia_imaging', 'sraw', 'eraw',
       'scol', 'ecol', 'mask_start', 'mask_end', 'xy_spacing', 'voxel_z',
       'n_times', 'n_z', 'pre', 'post_early', 'post_late'],
      dtype='object')

In [4087]:
# pCR label distribution, and the shape of the subset that has a pCR label.
df['pCR'].value_counts(), df[df['pCR'].notna()].shape

(pCR
 0.0    236
 1.0     64
 Name: count, dtype: int64,
 (300, 47))

In [4088]:
# Column listing used as a reference when choosing the export columns below.
df.columns

Index(['pid', 'Slice_Thickness', 'Rows', 'Columns', 'FOVcm',
       'Date_of_Birth_(Days)', 'Menopause_(at_diagnosis)',
       'Race_and_Ethnicity', 'ER', 'PR', 'HER2', 'Mol_Subtype', 'stage_tum_s',
       'Tumor_Grade', 'Nottingham_grade', 'Histologic_type',
       'Bilateral_Information', 'slice_thick', 'n_xy', 'age', 'pCR', 'HR',
       'hormon_status', 'HR_HER2_STATUS', 'TripleNeg', 'HER2pos',
       'HRposHER2neg', 'menopause', 'race_white', 'race_black', 'hist_type',
       'bilateral', 'tum_vol_stage', 'tum_vol_tcia_imaging', 'sraw', 'eraw',
       'scol', 'ecol', 'mask_start', 'mask_end', 'xy_spacing', 'voxel_z',
       'n_times', 'n_z', 'pre', 'post_early', 'post_late'],
      dtype='object')

In [4089]:
# Columns written to the harmonized metadata CSV, in export order.
cols = [
    # patient key and image size
    'pid', 'n_xy',
    # receptor status, subtype, stage and grade
    'ER', 'PR', 'HER2',
    'Mol_Subtype', 'stage_tum_s',
    'Tumor_Grade', 'Nottingham_grade',
    # scan parameters
    'FOVcm',
    'slice_thick',
    # patient and response information
    'age', 'pCR',
    'HR', 'hormon_status', 'HR_HER2_STATUS', 'TripleNeg', 'HER2pos',
    'HRposHER2neg', 'menopause',
    'race_white', 'race_black',
    'hist_type',
    'bilateral',
    # tumor volume from the imaging features table
    'tum_vol_tcia_imaging',
    # annotation box
    'sraw', 'eraw',
    'scol', 'ecol', 'mask_start', 'mask_end',
    # values taken from the DICOM mapping table
    'n_z',
    'n_times',
    'xy_spacing', 'voxel_z',
    # acquisition indices
    'pre', 'post_early', 'post_late',
]

In [4095]:
# Confirm every export column exists in df (a missing one raises KeyError)
# and print each column's shape.
for col_name in cols:
    print(col_name, df[col_name].shape)

pid (922,)
n_xy (922,)
ER (922,)
PR (922,)
HER2 (922,)
Mol_Subtype (922,)
stage_tum_s (922,)
Tumor_Grade (922,)
Nottingham_grade (922,)
FOVcm (922,)
slice_thick (922,)
age (922,)
pCR (922,)
HR (922,)
hormon_status (922,)
HR_HER2_STATUS (922,)
TripleNeg (922,)
HER2pos (922,)
HRposHER2neg (922,)
menopause (922,)
race_white (922,)
race_black (922,)
hist_type (922,)
bilateral (922,)
tum_vol_tcia_imaging (922,)
sraw (922,)
eraw (922,)
scol (922,)
ecol (922,)
mask_start (922,)
mask_end (922,)
n_z (922,)
n_times (922,)
xy_spacing (922,)
voxel_z (922,)
pre (922,)
post_early (922,)
post_late (922,)


In [4096]:
# Final check of the pCR label distribution before export.
df['pCR'].value_counts()

pCR
0.0    236
1.0     64
Name: count, dtype: int64

In [4097]:

# Write only the selected columns to the metadata CSV; the index is not saved.
export_csv = 'BreastDCEDL_duke_metadata.csv'
df[cols].to_csv(export_csv, index=False)