In [29]:
import pandas as pd
import numpy as np
from tableone import tableone
from tqdm import tqdm

In [18]:
def dataframe_stats(df):
    num_patients = df.empi_anon.nunique()
    num_exams = df.acc_anon.nunique()
    
    print(f"Patients: {num_patients}")
    print(f"Exams: {num_exams}")
    
    if 'png_path' in df.columns:
        print(f"Images: {len(df)}")

In [19]:
base_directory = "/data/beatrice/emory_datathon/embed_upload"
mag_path = base_directory + "/tables/embed_datathon_magview_reduced.csv"
meta_path = base_directory + "/tables/embed_datathon_metadata_reduced.csv"

# Load clinical data from magview
mag_df = pd.read_csv(mag_path)
dataframe_stats(mag_df)

  exec(code_obj, self.user_global_ns, self.user_ns)


Patients: 115883
Exams: 364733


In [24]:
categorical_cols = ['ETHNICITY_DESC', 'asses']
continuous_cols = ['age_at_study']

tableone(
    mag_df,
    columns = categorical_cols + continuous_cols,
    categorical = categorical_cols,
)

Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,408464
"ETHNICITY_DESC, n (%)",African American or Black,1016.0,180922 (44.4)
"ETHNICITY_DESC, n (%)",American Indian or Alaskan Native,,846 (0.2)
"ETHNICITY_DESC, n (%)",Asian,,21365 (5.2)
"ETHNICITY_DESC, n (%)",Caucasian or White,,171948 (42.2)
"ETHNICITY_DESC, n (%)",Hispanic,,34 (0.0)
"ETHNICITY_DESC, n (%)",Multiple,,1307 (0.3)
"ETHNICITY_DESC, n (%)",Native Hawaiian or Other Pacific Islander,,3835 (0.9)
"ETHNICITY_DESC, n (%)",Not Recorded,,32 (0.0)
"ETHNICITY_DESC, n (%)",Patient Declines,,20 (0.0)


In [21]:
# Filtering magview clinical data for birads 0 since these cases contain abnormalities with descriptors
br0_df = mag_df.loc[mag_df.asses == "A"]
dataframe_stats(br0_df)

Patients: 40721
Exams: 46911


In [23]:
categorical_cols = ['ETHNICITY_DESC', 'asses']
continuous_cols = ['age_at_study']

tableone(
    br0_df,
    columns = categorical_cols + continuous_cols,
    categorical = categorical_cols,
)

Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,57479
"ETHNICITY_DESC, n (%)",African American or Black,148.0,25718 (44.9)
"ETHNICITY_DESC, n (%)",American Indian or Alaskan Native,,140 (0.2)
"ETHNICITY_DESC, n (%)",Asian,,3570 (6.2)
"ETHNICITY_DESC, n (%)",Caucasian or White,,21998 (38.4)
"ETHNICITY_DESC, n (%)",Hispanic,,7 (0.0)
"ETHNICITY_DESC, n (%)",Multiple,,245 (0.4)
"ETHNICITY_DESC, n (%)",Native Hawaiian or Other Pacific Islander,,646 (1.1)
"ETHNICITY_DESC, n (%)",Not Recorded,,2 (0.0)
"ETHNICITY_DESC, n (%)",Patient Declines,,3 (0.0)


In [31]:
# Adding columns "mass", "asymmetry", "arch_distortion" and "calc" as a summary of imaging findings contained in the 
#other columns. This will be coded as 1 = present; 0 = absent

findings_df = br0_df.copy().reset_index()

# Instantiate lists for the four finding type -  mass, asymmetry, architectural distortion and calcification
# Default value set to 0. 

df_len = findings_df.shape[0]
mass_list = [0] * df_len
asymmetry_list = [0] * df_len
arch_distortion_list = [0] * df_len
calc_list = [0] * df_len


# Architectural Distortion is defined as: 'massshape' ['Q', 'A']
# Asymmetry is defined as: 'massshape' in ['T', 'B', 'S', 'F', 'V']
# Mass is defined as: 'massshape' in ['G', 'R', 'O', 'X', 'N', 'Y', 'D', 'L']
#       or 'massmargin' in ['D', 'U', 'M', 'I', 'S']
#       or 'massdens' in ['+', '-', '=']
# Calcification: defined as presence of any non-zero or non-null value in "calcdistri", "calcfind" or "calcnumber"

#iterate through rows and assign values to the lists based on above rules
for i, row in tqdm(findings_df.iterrows(), total=df_len):
    if (row['massshape'] in ['G', 'R', 'O', 'X', 'N', 'Y', 'D', 'L'])\
    or (row['massmargin'] in ['D', 'U', 'M', 'I', 'S'])\
    or (row['massdens'] in ['+', '-', '=']):
        mass_list[i] = 1
        
    if row['massshape'] in ['T', 'B', 'S', 'F', 'V']:
        asymmetry_list[i] = 1

    if row['massshape']in ['Q', 'A']:
        arch_distortion_list[i] = 1
        
    if (row['calcdistri'] is not np.nan)\
    or (row['calcfind'] is not np.nan)\
    or (row['calcnumber'] != 0):
        calc_list[i] = 1        

# Append the final image findings columns to the dataframe        
findings_df['mass'] = mass_list
findings_df['asymmetry'] = asymmetry_list
findings_df['arch_distortion'] = arch_distortion_list
findings_df['calc'] = calc_list

findings_df.sample(10)

100%|██████████| 57479/57479 [00:05<00:00, 11478.17it/s]


Unnamed: 0,index,empi_anon,acc_anon,desc,numfind,side,asses,tissueden,bside,type,...,massshape,massmargin,massdens,calcfind,calcdistri,calcnumber,mass,asymmetry,arch_distortion,calc
32928,235384,84045214,9247071816278616,MG Screen Bilat w/Tomo/CAD Stnd Protocol,1,R,A,4.0,,,...,O,,,,,0,1,0,0,0
3972,27872,69608661,4603674472140793,MG Screen Bilat w/Tomo/CAD Stnd Protocol,1,B,A,3.0,,,...,,,,,,0,0,0,0,0
54555,388844,92248508,3838535052993292,MG Screen Bilat w/Tomo/CAD Stnd Protocol,1,R,A,3.0,,,...,O,D,=,,,0,1,0,0,0
50217,357901,92836736,3369040666839953,MG Screen Bilat w/Tomo/CAD Stnd Protocol,1,R,A,2.0,,,...,S,,,,,0,0,1,0,0
46815,334731,14876292,2841517934557309,MG Screening Bilateral,1,L,A,2.0,,,...,F,,,,,0,0,1,0,0
36972,264400,50602743,1489366395836495,MG Screening Bilateral,1,L,A,3.0,,,...,F,,,,,0,0,1,0,0
19527,140301,34226345,5557182890270605,MG Screen Bilat w/Tomo/CAD Stnd Protocol,1,R,A,1.0,,,...,,,,,,0,0,0,0,0
30294,218066,12945478,8948156729781421,MG Screen Bilat w/Tomo/CAD Stnd Protocol,2,L,A,3.0,,,...,,,,G,G,0,0,0,0,1
10832,76659,66564257,9542872491924293,MG Screen Bilat w/Tomo/CAD Stnd Protocol,1,L,A,2.0,,,...,S,,,,,0,0,1,0,0
56090,398879,72709244,4919732463445331,MG Screen Bilat w/Tomo/CAD Stnd Protocol,1,L,A,4.0,,,...,O,D,,,,0,1,0,0,0


In [33]:
categorical_cols = ['mass', 'asymmetry', 'arch_distortion', 'calc']

tableone(
    findings_df,
    columns = categorical_cols,
    categorical = categorical_cols
)

Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,57479
"mass, n (%)",0.0,0.0,49378 (85.9)
"mass, n (%)",1.0,,8101 (14.1)
"asymmetry, n (%)",0.0,0.0,28450 (49.5)
"asymmetry, n (%)",1.0,,29029 (50.5)
"arch_distortion, n (%)",0.0,0.0,54587 (95.0)
"arch_distortion, n (%)",1.0,,2892 (5.0)
"calc, n (%)",0.0,0.0,47777 (83.1)
"calc, n (%)",1.0,,9702 (16.9)
