# Data analysis of skeletal muscle area and SMI for normal patients

In [113]:
from IPython.display import HTML

HTML('''<script>
code_show=false; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

###### Load libraries and directories

In [114]:
# from IPython import get_ipython
from IPython.display import display, HTML
#from tqdm.notebook import tqdm
import pickle
import os
import pprint
pp = pprint.PrettyPrinter(indent=1)

import pandas as pd
import numpy as np

import json

# Custom functions
import pickle
def save_object(obj, filename):
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

def load_object(filename):        
    with open(filename, 'rb') as input:
        return pickle.load(input)
    

In [115]:
# Install statsmodels if it doesn't exist
import pip
def import_or_install(package):
    try:
        return __import__(package)
    except ImportError:
        pip.main(['install', package]) 
        
packages = ['statsmodels','matplotlib','scipy','pygrowup','sklearn']

for package in packages:
    import_or_install(package)

In [116]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import scipy.stats as stats
get_ipython().run_line_magic('tb', '')

AttributeError: 'DataFrame' object has no attribute 'summary'

In [117]:
cwd = os.getcwd()
print(cwd)

/home/jupyteruser/smipipeline


In [118]:
data = '/home/jupyteruser/data'
pickles = '/home/jupyteruser/pickles'
models = '/home/jupyteruser/models'
output = '/home/jupyteruser/output/v5_results'

In [119]:
# Import modules and config file
configfile = os.path.join(cwd,'config/debug_ES/v5_run_prediction_CV_poorl3.json')
with open(configfile, "r") as f:
        config = json.load(f)
pp.pprint(config)

{'l3_finder': {'cache_dir': '/tf/_cache/',
               'cache_intermediate_results': True,
               'dicom_dir': '/tf/data',
               'model_path_dir': '/tf/models/l3/cv_final',
               'new_tim_dicom_dir_structure': True,
               'output_directory': '/tf/output/v5_cv_poorl3/l3',
               'overwrite': True,
               'save_plots': True,
               'show_plots': False},
 'muscle_segmentor': {'model_path_dir': '/tf/models/muscle/cv_final',
                      'output_directory': '/tf/output/v5_cv_poorl3/ms'}}


In [120]:
for key in config.keys():
    for label, value in config[key].items():
        if type(value) is str:
            config[key][label] = value.replace('/tf', '/home/jupyteruser')

pp.pprint(config)

{'l3_finder': {'cache_dir': '/home/jupyteruser/_cache/',
               'cache_intermediate_results': True,
               'dicom_dir': '/home/jupyteruser/data',
               'model_path_dir': '/home/jupyteruser/models/l3/cv_final',
               'new_tim_dicom_dir_structure': True,
               'output_directory': '/home/jupyteruser/output/v5_cv_poorl3/l3',
               'overwrite': True,
               'save_plots': True,
               'show_plots': False},
 'muscle_segmentor': {'model_path_dir': '/home/jupyteruser/models/muscle/cv_final',
                      'output_directory': '/home/jupyteruser/output/v5_cv_poorl3/ms'}}


## Produce data for analysis using prediction results and preprocessing results

In [121]:
# Load the area csv
filename = 'areas-mm2_by_subject_id_manual_outliers.csv'
#filename = 'areas-mm2_by_subject_id.csv'


sma_csv = os.path.join(config["muscle_segmentor"]["output_directory"],filename)
df_sma = pd.read_csv(sma_csv, index_col=False)
print('Total number of patients for analysis before manual eliminations: ', len(df_sma))

Total number of patients for analysis before manual eliminations:  2197


In [122]:
df_man = df_sma[~df_sma['area_mm2_Elan'].isnull()]
len(df_man)

43

In [123]:
df_man.head(1)

Unnamed: 0,subject_id,area_mm2,%auto mask pixels,% manual mask pixels,area_mm2_Elan,sagittal_series,axial_series
45,Z1756629,3590.897184,7.043,7.444,3795.34838,Z1756629-SE-8-Body_Std._Axial_AIDR_-_H_Sagittal_3.000,Z1756629-SE-2-Body_Std._Axial_AIDR_-_H__5.0


In [124]:
if 'outliers' in filename: # MOdified area sheet with manual area calculated for outlier patient
    df_sma_manuals = df_sma.loc[~df_sma['area_mm2_Elan'].isnull()]
    for i,row in df_sma_manuals.iterrows():
        df_sma.loc[df_sma['subject_id']==row['subject_id'],'area_mm2'] = row['area_mm2_Elan']
    

# Manually identified diseases cases [by Andrew]:
man_disease = ['Z832424','Z1256768']

# Manually identified poor L3s [by Elan for v5]:
man_poorl3 = ['Z627604','Z1292977','Z1150365','Z1016357','Z1335481','Z567376']

man_eliminations = man_disease + man_poorl3

df_eliminations = df_sma.loc[df_sma['subject_id'].isin(man_eliminations)]
display(df_eliminations)

Unnamed: 0,subject_id,area_mm2,%auto mask pixels,% manual mask pixels,area_mm2_Elan,sagittal_series,axial_series
635,Z1292977,6910.657248,,,,recon from: Z1292977-SE-9-_ORAL_CONTRAST;ISOVUE_300_FC12_ORG_,Z1292977-SE-9-_ORAL_CONTRAST;ISOVUE_300_FC12_ORG_
941,Z627604,11793.75,,,,Z627604-SE-9-Sagittal_Bone_Sagittal_3.000,Z627604-SE-4-Axial_Body_5.0
1007,Z1016357,4313.28222,,,,recon from: Z1016357-SE-8-Ax-MIP_Lung_Ax-MIP_8.0,Z1016357-SE-8-Ax-MIP_Lung_Ax-MIP_8.0
1038,Z1335481,10616.9378,,,,recon from: Z1335481-SE-10-Standard_Axial,Z1335481-SE-10-Standard_Axial
1100,Z1150365,12304.90181,,,,recon from: Z1150365-SE-4-Axial_Body_5.0,Z1150365-SE-4-Axial_Body_5.0
2188,Z567376,7402.861004,,,,recon from: Z567376-SE-2-Body_Std._Axial_AIDR_-_H__5.0,Z567376-SE-2-Body_Std._Axial_AIDR_-_H__5.0


In [125]:
df_sma= df_sma.loc[~df_sma['subject_id'].isin(man_eliminations)]
df_sma = df_sma[['subject_id','area_mm2','sagittal_series','axial_series']]
print('Final patients for analysis: ',len(df_sma))

Final patients for analysis:  2191


In [126]:
# change subject_id to id
df_sma.columns=['ID','area_mm2','sagittals','axials']
#display(df_sma.head(2))

In [127]:
# Now merge on ID
df_analysis = df_sma
#display(df_analysis.head(10))

In [128]:
# Load age information from patlist csv
# Load normal patient list
infile  = 'patlist_with_validBMI_corrected_v5.csv'
df_P = pd.read_csv(infile, index_col=False)
df_P = df_P.loc[:, ~df_P.columns.str.contains('^Unnamed')]

In [129]:
# Keep only columns needed
df_P = df_P[['PAT_ID','WEIGHT','HEIGHT','BMI_FOR_AGE_PERCENTILE','BMI_CALCULATED','Age',
 'Sex',
 'Race',
 'AgeGroup','STUDYRESULT','IMPRESSION']]
df_P.columns = ['ID','WEIGHT','HEIGHT','BMI_AGE_P','BMI_CALC','Age','Sex','Race','AgeGroup','STUDYRESULT','IMPRESSION']
#display(df_P.head(10))

In [130]:
# Get rid of patients not in df_analysis
print("Total number of normal patients identified", len(df_P))
df_P = df_P[df_P['ID'].isin(df_analysis['ID'].values)]
print("Total number of normal patients with images", len(df_P))

Total number of normal patients identified 2238
Total number of normal patients with images 2191


In [131]:
# Merge the patient fields with df_analysis
df_analysis = pd.merge(df_P,df_analysis, on="ID")

print('Length of final analysis df: ', len(df_analysis))

# Redefine AgeGroup be flooring instead of rounding.
df_analysis['AgeGroup'] = np.floor(df_analysis['Age'].values)

Length of final analysis df:  2191


In [132]:
# Manual L3 check
infile  = 'poorl3.csv'
df_poorl3 = pd.read_csv(infile, index_col=False)

l3_present = df_poorl3.loc[~df_poorl3['L3slice'].isnull(),'ID'].values.tolist()
print('Cases with manually identified L3s: ', len(l3_present))

Cases with manually identified L3s:  30


In [133]:
# This was used to filter exams for prediction in run_prediction_cv
# normal_patients_corrected = df_analysis.ID.values.tolist()
# print(len(normal_patients_corrected))
# save_object(normal_patients_corrected,os.path.join(pickles,'normal_patients_corrected.pkl'))

In [134]:
l3_present_analysis = [l3 for l3 in l3_present if l3 in df_analysis.ID.values]
print('Normal L3s that are valid: ', len(l3_present_analysis))
for l3 in l3_present_analysis:
    print(l3)

Normal L3s that are valid:  26
Z1762020
Z418856
Z357478
Z670302
Z1211258
Z717870
Z890238
Z1302641
Z627309
Z1041413
Z511402
Z1722108
Z837620
Z441830
Z5745
Z489517
Z1332420
Z1263347
Z486791
Z320930
Z678707
Z1221549
Z1274627
Z1119985
Z1000800
Z362374


In [135]:
# Find missing BMI_CALC
print('Patients missing BMI calculated:' ,len(df_analysis[df_analysis['BMI_CALC'].isnull()]))
print('Patients missing BMI age percentile:' ,len(df_analysis[df_analysis['BMI_AGE_P'].isnull()]))
print('Patients missing Height:' ,len(df_analysis[df_analysis['HEIGHT'].isnull()]))
print('Patients missing Weight' ,len(df_analysis[df_analysis['WEIGHT'].isnull()]))
print('Patients missing Age' ,len(df_analysis[df_analysis['Age'].isnull()]))
print('Patients missing Race' ,len(df_analysis[df_analysis['Race'].isnull()]))
print('Patients missing Sex' ,len(df_analysis[df_analysis['Sex'].isnull()]))

Patients missing BMI calculated: 251
Patients missing BMI age percentile: 0
Patients missing Height: 11
Patients missing Weight 27
Patients missing Age 0
Patients missing Race 0
Patients missing Sex 0


In [136]:
df_analysis = df_analysis.sort_values(by=['AgeGroup'],ascending='False')
#print(df_analysis['AgeGroup'].value_counts().sort_values())

In [137]:
list(df_analysis)

['ID',
 'WEIGHT',
 'HEIGHT',
 'BMI_AGE_P',
 'BMI_CALC',
 'Age',
 'Sex',
 'Race',
 'AgeGroup',
 'STUDYRESULT',
 'IMPRESSION',
 'area_mm2',
 'sagittals',
 'axials']

In [138]:
# Handling an outlier that had 2 CT exams, but the exam and EHR data were mismatched. Manually correcting for it.
df_analysis.loc[df_analysis['ID']=='Z549850','Age'] = 13.99
df_analysis.loc[df_analysis['ID']=='Z549850','WEIGHT'] = np.nan
df_analysis.loc[df_analysis['ID']=='Z549850','HEIGHT'] = np.nan
df_analysis.loc[df_analysis['ID']=='Z549850','BMI_AGE_P'] = 86.5
df_analysis.loc[df_analysis['ID']=='Z549850','BMI_CALC'] = 23.66
df_analysis.loc[df_analysis['ID']=='Z549850','AgeGroup'] = 13

In [178]:
#df_analysis.loc[df_analysis['ID']=='Z549850']

In [140]:
# Patients > 18
print('No of Patients > 18 years that are discarded' ,len(df_analysis[df_analysis['AgeGroup'] > 18]))
#print('No of Patients < 2.5 yearas' ,len(df_analysis[df_analysis['Age'] < 2.5]))

No of Patients > 18 years that are discarded 1


In [141]:
#print('No of boys' ,len(df_analysis[df_analysis['Sex'] == 'Male']))
#print('No of girls' ,len(df_analysis[df_analysis['Sex'] == 'Female']))

In [142]:
# Create sex stratified and age filtered dfs
df_analysis['area_cm2'] = df_analysis['area_mm2']/100

df_analysis['smi'] = df_analysis['area_cm2']/((df_analysis['HEIGHT']/100)**2)

df_m = df_analysis[(df_analysis['Sex']=='Male') & (df_analysis['AgeGroup'] <= 18)] #& (df_analysis['Age'] > 3)]
print('Length of male df = ',len(df_m))
df_f = df_analysis[(df_analysis['Sex']=='Female') & (df_analysis['AgeGroup'] <= 18)] # & (df_analysis['Age'] > 3)]
print('Length of female df = ',len(df_f))

Length of male df =  1056
Length of female df =  1134


In [143]:
1056/1134

0.9312169312169312

In [144]:
pd. set_option('display.max_rows', 100) # or 1000
pd.set_option('display.max_colwidth', None)

### Figuring out distribution of slice thicknesses for AJR

In [145]:
df_halfmm = df_analysis[df_analysis['axials'].str.contains('_0.5')]
df_3mm = df_analysis[df_analysis['axials'].str.contains('_3.0')]
df_rest = df_analysis[~(df_analysis['ID'].isin(df_3mm['ID']) | df_analysis['ID'].isin(df_halfmm['ID']))]

In [146]:
print('0.5 mm studies: ', len(df_halfmm))
print('3 mm studies: ', len(df_3mm))
print('5 mm studies: ', len(df_rest))

0.5 mm studies:  88
3 mm studies:  50
5 mm studies:  2053


### Figuring out reasons for CT examination for the healthy patient dataset as requested by reviewers:

In [147]:
print(list(df_analysis))

['ID', 'WEIGHT', 'HEIGHT', 'BMI_AGE_P', 'BMI_CALC', 'Age', 'Sex', 'Race', 'AgeGroup', 'STUDYRESULT', 'IMPRESSION', 'area_mm2', 'sagittals', 'axials', 'area_cm2', 'smi']


In [148]:
# Create df with study result alone
df_SR = df_analysis[['ID','STUDYRESULT','IMPRESSION']]
df_SR.loc[:,'STUDYRESULT'] = df_SR['STUDYRESULT'].str.lower()
df_SR.loc[:,'IMPRESSION'] = df_SR['IMPRESSION'].str.lower()

df_SR_aps =  df_SR[df_SR['STUDYRESULT'].str.contains('appendicitis')]
print('Appendicitis patients: ', len(df_SR_aps))
print('Appendicitis patients % of total: ', len(df_SR_aps)*100/len(df_SR))

Appendicitis patients:  302
Appendicitis patients % of total:  13.783660429027842


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [149]:
df_SR_hyd =  df_SR[df_SR['STUDYRESULT'].str.contains('hydronephrosis')]
print('Hydronephrosis patients: ', len(df_SR_hyd))
print('Hydronephrosis patients % of total: ', len(df_SR_hyd)*100/len(df_SR))

Hydronephrosis patients:  79
Hydronephrosis patients % of total:  3.605659516202647


In [179]:
#display(df_SR_hyd)

In [151]:
indiglist = ['vomit','diarrhea','digestive disorder']
pattern = '|'.join(indiglist)
df_SR_ind = df_SR[df_SR['STUDYRESULT'].str.contains(pattern)]
print('Indigestion patients: ', len(df_SR_ind))
print('Indigestion patients % of total: ', len(df_SR_ind)*100/len(df_SR))

Indigestion patients:  280
Indigestion patients % of total:  12.779552715654953


In [152]:
traumalist = ['trauma','injury','mvc','mva','accident','bruising','fracture']
pattern = '|'.join(traumalist)
df_SR_tr = df_SR[df_SR['STUDYRESULT'].str.contains(pattern)]
print('Trauma patients: ', len(df_SR_tr))
print('Trauma patients % of total: ', len(df_SR_tr)*100/len(df_SR))

Trauma patients:  579
Trauma patients % of total:  26.42628936558649


In [153]:
df_SR_pain = df_SR[df_SR['STUDYRESULT'].str.contains('pain')]
print('Pain patients: ', len(df_SR_pain))
print('Pain patients % of total: ', len(df_SR_pain)*100/len(df_SR))

Pain patients:  1469
Pain patients % of total:  67.04701049748974


In [154]:
df_SR_pain_aps = df_SR_pain[df_SR_pain['ID'].isin(df_SR_aps['ID'])]
print('Pain and Appendicitis patients: ', len(df_SR_pain_aps))

Pain and Appendicitis patients:  230


In [155]:
df_SR_pain_tr = df_SR_pain[df_SR_pain['ID'].isin(df_SR_tr['ID'])]
print('Pain and Trauma patients: ', len(df_SR_pain_tr))

Pain and Trauma patients:  233


In [156]:
df_SR_tr_aps = df_SR_tr[df_SR_tr['ID'].isin(df_SR_aps['ID'])]
print('Trauma and Appendicitis patients: ', len(df_SR_tr_aps))

Trauma and Appendicitis patients:  10


In [157]:
df_SR_pain_aps_tr = df_SR_pain_aps[df_SR_pain_aps['ID'].isin(df_SR_pain_tr['ID'])]
print('Pain, Appendicitis and Trauma patients: ', len(df_SR_pain_aps_tr))

Pain, Appendicitis and Trauma patients:  7


In [158]:
df_no_pain = df_SR[~df_SR["ID"].isin(df_SR_pain["ID"])]
df_no_pain_aps = df_no_pain[~df_no_pain["ID"].isin(df_SR_aps["ID"])]
df_no_pain_aps_tr = df_no_pain_aps[~df_no_pain_aps["ID"].isin(df_SR_tr["ID"])]
print('Other patients: ', len(df_no_pain_aps_tr))

Other patients:  307


In [159]:
display(df_no_pain_aps_tr)

Unnamed: 0,ID,STUDYRESULT,IMPRESSION
1989,Z1929867,"clinical history: 2-month-old, ropa. comparison: skeletal survey 1/20/2018 procedure comments: ct of the abdomen and pelvis was performed with intravenous contrast. findings: lower thorax: diffuse groundglass opacities at the lung bases . liver and biliary system: normal. spleen: normal. pancreas: not well visualized. no gross abnormalities. adrenal glands: normal. kidneys, ureters, and bladder: normal. increased medullary pyramidal enhancement bilaterally bowel: normal. appendix: the appendix is not identified. peritoneal cavity: no free fluid. uterus and ovaries: no gross abnormality. vasculature: normal. lymph nodes: normal. abdominal wall: normal. osseous structures: normal.",1. dense medullary pyramids. this may relate reflect pooling of contrast versus nephrocalcinosis. the former is favored. 2. no traumatic abnormality.
1995,Z1929500,"clinical history: 2-month-old male with portal venous gas seen on doppler ultrasound of liver, also with hematochezia. ct was performed to assess the intestinal wall and to evaluate for an av malformation. comparison: ultrasound and radiographs of the abdomen 1/12/2018 procedure comments: ct of the abdomen and pelvis was performed with intravenous contrast. findings: lower thorax: mild dependent changes are seen at the lung bases. mild lobular air trapping in both lower lobes. liver and biliary system: normal. no portal venous gas is noted. spleen: normal. pancreas: normal. adrenal glands: a punctate calcification is seen in the right adrenal gland. kidneys, ureters, and bladder: the right kidney appears to be a non-dilated duplex system. bowel: normal. i see no findings to suggest bowel ischemia. appendix: the appendix is identified and is normal. peritoneal cavity: no free fluid. vasculature: normal. lymph nodes: normal. abdominal wall: normal. osseous structures: normal.",1. there is no evidence of portal venous gas. 2. no findings to suggest an arteriovenous malformation. 3. punctate calcification in the right adrenal gland is likely due to the sequela of neonatal hemorrhage. 4. suspected duplex right renal collecting system without pelvocaliectasis or ureterectasis. radiology fellow yinan li discussed the above findings with dr. asai on 1/13/2018 11:54 am in person.
1795,Z1947621,"clinical history: 1 day of brown vomit and nonbloody diarrhea, distended colon on axr, guarding on abd exam, ill appearing. history in epic confirmed with mom. comparison: upper gi 3/1/2018 procedure comments: ct of the abdomen and pelvis was performed with intravenous contrast. findings: examination is mildly degraded by patient motion. lower thorax: normal. liver and biliary system: normal. spleen: normal. pancreas: normal. adrenal glands: normal. kidneys, ureters, and bladder: normal. motion artifact particularly limits evaluation of the left kidney, but it appears grossly normal. bowel: multiple dilated, fluid-filled loops of small bowel favored to represent distal jejunum and proximal ileum are seen in the mid abdomen. the most prominent loop of small bowel is seen in the upper right and mid abdomen and likely corresponds to a previously noted distended small bowel loop on the upper gi series from march 2018. there is a relatively smooth transition from this most likely related loop of bowel to normal caliber ileum in the right upper quadrant. no findings of closed loop obstruction. bowel wall enhances normally. the colon is normal in appearance. a moderate amount of stool seen throughout the colon and rectum. appendix: the appendix is identified and is normal. peritoneal cavity: no free fluid. vasculature: normal. lymph nodes: normal. abdominal wall: normal. osseous structures: normal.","1. dilated fluid-filled mid small bowel corresponding with previously noted distended small bowel loops on upper gi series from march 2018. given persistent appearance, findings may be secondary to ileal stenosis, congenital band, or dysfunctional bowel resulting in partial or functional obstruction. 2. normal colon with moderate stool throughout. 3. normal appendix in the right lower quadrant."
1794,Z1973497,"clinical history: 5m with elevated lfts after fall down 17 stairs. per mom, fall down stairs today. comparison: none. procedure comments: ct of the abdomen and pelvis was performed with intravenous contrast. findings: lower thorax: normal. liver and biliary system: normal. spleen: normal. pancreas: normal. adrenal glands: normal. kidneys, ureters, and bladder: normal. bowel: normal. appendix: the appendix is not identified. peritoneal cavity: no free fluid. vasculature: normal. lymph nodes: normal. abdominal wall: normal. osseous structures: normal.",normal ct of the abdomen and pelvis.
1731,Z1864582,"clinical history: 22 mo male with recurrent ileo colic intussusception. history in epic confirmed with anes nurse eric. comparison: none procedure comments: ct of the abdomen and pelvis was performed with intravenous contrast. findings: lower thorax: dependent atelectasis lung bases. liver and biliary system: normal. spleen: normal. pancreas: normal. adrenal glands: normal. kidneys, ureters, and bladder: normal. bowel: there is thickening of the terminal presently 6 in meters in length. moderate stool is visible in the colon. appendix: the appendix is identified and is normal. peritoneal cavity: no free fluid. vasculature: normal. lymph nodes: there numerous lymph nodes in the right lower quadrant measuring up to 6 mm in short axis diameter. abdominal wall: normal. osseous structures: normal.",findings suggest ileitis with right lower quadrant likely reactive lymphadenopathy.
...,...,...,...
950,Z627309,"clinical history: 18-year-old with moderate hydronephrosis. outside read. comparison: 6/1/2007. procedure comments: ct of the abdomen was performed with intravenous and oral contrast. examination was performed at adams county regional medical center on 11/19/2010 and is submitted for interpretation on 12/2/2010. findings: lower thorax: normal. liver and biliary system: normal. spleen: the spleen is surgically absent. numerous small splenules are noted in the left upper quadrant. pancreas: normal adrenal glands: the right adrenal gland is normal. the left adrenal gland is not seen. kidneys and ureters: the left kidney is surgically absent. there is mild dilation of the right renal pelvis which appears similar to prior examinations. the renal parenchyma enhances normally. there is no perinephric stranding or fluid collection. bowel: normal. appendix: normal peritoneal cavity: no free fluid. vasculature: normal. lymph nodes: normal. abdominal wall: diastasis of the musculature at the midline anterior abdominal wall is noted, related to prior surgery. there is ill-defined soft tissue density within the subcutaneous fat at the right inferior aspect of the defect, best seen on image 48 of series 3. no oral contrast is seen extending into this region. loops of small bowel are present abutting the diastatic abdominal wall, though there is no frank herniation. osseous structures: normal.",impression: 1. post operative findings from left nephrectomy and splenectomy related to prior trauma. 2. postoperative diastasis of the abdominal wall without bowel herniation. 3. ill-defined soft tissue density in the right anterior subcutaneous fat. findings are nonspecific and could be postsurgical scarring versus infectious/inflammatory in etiology. 4. moderate pelvocaliectasis of the right kidney which is similar to prior exams.
1797,Z1058129,"clinical history: 18 yo m with peritoneal signs on belly exam diffusely and hx urethral strictures and repairs. renal and bladder u/s with only mild r urothelial thickening yesterday in ed.. comparison: same day abdominal radiograph and renal ultrasound 9/20/2018 procedure comments: ct of the abdomen and pelvis was performed with intravenous contrast. findings: lower thorax: normal. liver and biliary system: normal. spleen: normal. pancreas: normal. adrenal glands: normal. kidneys, ureters, and bladder: the kidneys are normal with no evidence of hydronephrosis or obstruction. delayed phase images demonstrate symmetric opacification of the renal collecting systems and bilateral ureters throughout their course with no evidence of a defect or obstruction. the bladder is normal.. bowel: there may be a tiny hiatal hernia. there is a moderate amount of stool seen throughout the colon and fecalized small bowel contents within the lower abdomen. appendix: the appendix is identified and is normal. peritoneal cavity: no free fluid or free fluid. a calcification is seen within the deep left pelvis, likely representing a phlebolith. vasculature: normal. lymph nodes: scattered nonenlarged mesenteric lymph nodes are seen. abdominal wall: normal. osseous structures: there are multilevel schmorl's nodes throughout the lower thoracic and lumbar spine.",essentially normal ct of the abdomen and pelvis.
787,Z283402,"clinical history: 18 yo male with continued weight loss.. comparison: none procedure comments: ct of the abdomen and pelvis was performed with intravenous and oral contrast. findings: lower thorax: normal. liver and biliary system: normal. spleen: normal. pancreas: normal. adrenal glands: normal. kidneys, ureters, and bladder: normal. bowel: normal. appendix: the appendix is identified and is normal. peritoneal cavity: no free fluid. vasculature: normal. lymph nodes: normal. abdominal wall: normal. osseous structures: normal.",impression: normal ct of the abdomen and pelvis.
1271,Z855060,"clinical history: 18-year-old with one week of nausea and vomiting. comparison: none procedure comments: ct of the abdomen and pelvis was performed with intravenous contrast. patient was not able to tolerate oral contrast. findings: lower thorax: normal. liver and biliary system: normal. gallbladder is normal. spleen: normal. pancreas: normal. adrenal glands: normal. kidneys, ureters, and bladder: normal. bowel: normal. appendix: the appendix is identified and is normal. peritoneal cavity: no free fluid. vasculature: normal. lymph nodes: normal. abdominal wall: normal. osseous structures: normal.","impression: normal ct of the abdomen and pelvis. specifically, the appendix is normal and no alternative etiology for patient's symptoms identified"


In [160]:
accounted = len(df_SR_aps) + len(df_SR_tr) + len(df_SR_pain)
unaccounted = len(df_SR) - accounted
print('accounted ' , accounted, ' unaccounted: ', unaccounted)

accounted  2350  unaccounted:  -159


### Identify split of patients in the Other Race Group for AJR

In [161]:
df_analysis_2ab = df_analysis[df_analysis['Age'] >= 2]
df_others = df_analysis_2ab.loc[(df_analysis_2ab['Race']!='Black or African American') & (df_analysis_2ab['Race']!='White'),['ID','Race']]

In [162]:
df_analysis_2ab['Race'].unique()

array(['White', 'Black or African American', 'Hispanic/Latino', 'Asian',
       'Unknown', 'Patient Refused', 'American Indian and Alaska Native',
       'Other', 'Native Hawaiian and Other Pacific Islander',
       'Middle Eastern'], dtype=object)

In [163]:
df_others.describe()

Unnamed: 0,ID,Race
count,137,137
unique,137,8
top,Z841643,Other
freq,1,60


In [164]:
# Male proportion # values from Table 1
b_m = 190*100/1043
o_m = 61*100/1043
w_m  = 792*100/1043
print('Male split -  Black: ', b_m ,'% Others: ', o_m,'% White: ',w_m)
# female proportion
b_f = 137*100/1125
o_f = 76*100/1125
w_f  = 912*100/1125
print('FeMale split -  Black: ', b_f ,'% Others: ', o_f,'% White: ',w_f)

Male split -  Black:  18.21668264621285 % Others:  5.848513902205178 % White:  75.93480345158197
FeMale split -  Black:  12.177777777777777 % Others:  6.7555555555555555 % White:  81.06666666666666


### Total mean age and std age for AJR

In [187]:
df_m['Age'].describe()

count    1043.000000
mean       11.516991
std         4.450749
min         2.002243
25%         7.894481
50%        12.163706
75%        15.274682
max        18.901457
Name: Age, dtype: float64

In [186]:
df_f['Age'].describe()

count    1125.000000
mean       12.997769
std         4.073768
min         2.046210
25%        10.451020
50%        14.125752
75%        16.172369
max        18.944104
Name: Age, dtype: float64

In [185]:
df_analysis_2ab['Age'].describe()

count    2169.000000
mean       12.288486
std         4.323549
min         2.002243
25%         9.126282
50%        13.223206
75%        15.832188
max        19.013558
Name: Age, dtype: float64

## Population statistics in comparison to CDC/WHO data

In [165]:
#Step3: Loop over each unique date in the studies dataframe df_F
def get_agewise_stats(df_in):
    uniqueages = df_in.AgeGroup.unique() 
    column_names = ['AgeGroup','Age_IQR','median_area_cm2','median_SMI','samplesize']
    df_C = pd.DataFrame(columns = column_names)
    #display(df_C)
    for age in uniqueages:
        df_l = df_in[df_in['AgeGroup']==age]
        #mean_ar = np.mean(df_l['area_cm2'])
        median_ar = np.median(df_l['area_cm2'])
        median_SMI = np.nanmedian(df_l['smi'])
        q75, q25 = np.percentile(df_l['Age'], [75 ,25])
        age_iqr = q75 - q25
        samplesize = len(df_l)
        #print([age,mean_ar,median_ar,samplesize])
        df = pd.DataFrame([[age,age_iqr,median_ar,median_SMI,samplesize]],columns=column_names)
        #display(df)
        df_C = pd.concat([df_C,df])
    return df_C

In [166]:
print('Count statistics stratified by gender and age group')
print('(M - Male) (F-Female)')    
df_f_agestats = get_agewise_stats(df_f).add_suffix('_F')
df_m_agestats = get_agewise_stats(df_m).add_suffix('_M')

display(pd.concat([df_f_agestats,df_m_agestats],axis=1))
df_f_agestats.to_csv(output+'/Female_summary.csv',index=False)
df_m_agestats.to_csv(output+'/Male_summary.csv',index=False)

Count statistics stratified by gender and age group
(M - Male) (F-Female)


Unnamed: 0,AgeGroup_F,Age_IQR_F,median_area_cm2_F,median_SMI_F,samplesize_F,AgeGroup_M,Age_IQR_M,median_area_cm2_M,median_SMI_M,samplesize_M
0,0.0,0.084796,18.652997,57.867612,6,0.0,0.206801,19.83962,57.837493,11
0,1.0,0.455484,29.410056,41.829557,3,1.0,0.000251,32.862968,47.651174,2
0,2.0,0.421892,33.67111,40.082557,19,2.0,0.299689,35.466114,42.537933,35
0,3.0,0.312682,38.022487,38.67864,22,3.0,0.525317,39.725392,40.104204,39
0,4.0,0.404026,39.722193,35.444461,17,4.0,0.408744,47.247928,39.364745,34
0,5.0,0.561457,46.824046,35.704454,27,5.0,0.517024,51.085545,39.372846,45
0,6.0,0.325437,46.8351,33.450946,39,6.0,0.445364,51.776582,36.786229,52
0,13.0,0.496507,92.737925,36.146844,102,7.0,0.553034,58.549062,36.833798,60
0,7.0,0.56882,51.911274,33.515501,38,8.0,0.423236,63.356002,36.107026,49
0,8.0,0.62031,56.929601,33.441124,48,9.0,0.500686,70.845328,36.812588,44


### Since n for age = 0 and 1 are less than 10, process only ages 2 and above.

In [167]:
print('NO of Females before filtering pats < 2yrs: ', len(df_f))
print('NO of Males before filtering pats < 2yrs: ', len(df_m))

NO of Females before filtering pats < 2yrs:  1134
NO of Males before filtering pats < 2yrs:  1056


In [168]:
df_f = df_f[df_f['Age'] >= 2]
df_m = df_m[df_m['Age'] >= 2]     

In [169]:
print('NO of Females after filtering pats < 2yrs: ', len(df_f))
print('NO of Males after filtering pats < 2yrs: ', len(df_m))

NO of Females after filtering pats < 2yrs:  1125
NO of Males after filtering pats < 2yrs:  1043


### race split for table 1

In [171]:
df_f['Race'].unique()

array(['White', 'Black or African American', 'Hispanic/Latino',
       'Patient Refused', 'American Indian and Alaska Native', 'Unknown',
       'Other', 'Asian', 'Native Hawaiian and Other Pacific Islander',
       'Middle Eastern'], dtype=object)

In [180]:
print('Female whites: ', len(df_f[df_f['Race'].str.contains('White')]))
print('Female blacks: ', len(df_f[df_f['Race'].str.contains('Black')]))
print('Female Asians: ', len(df_f[df_f['Race'].str.contains('Asian')]))
print('Female Hawaiian: ', len(df_f[df_f['Race'].str.contains('Hawaiian')]))
print('Female Alaskan: ', len(df_f[df_f['Race'].str.contains('Alaska')]))
print('Female Others: ', len(df_f[df_f['Race'].str.contains('Other')]))
print('Female MEs: ', len(df_f[df_f['Race'].str.contains('Middle')]))
print('Female unknowns: ', len(df_f[df_f['Race'].str.contains('Unknown')]))
print('Female hispanics: ', len(df_f[df_f['Race'].str.contains('Hispanic')]))
print('Female refused: ', len(df_f[df_f['Race'].str.contains('Refused')]))

Female whites:  912
Female blacks:  137
Female Asians:  9
Female Hawaiian:  3
Female Alaskan:  4
Female Others:  35
Female MEs:  1
Female unknowns:  12
Female hispanics:  14
Female refused:  1


In [182]:
print('male whites: ', len(df_m[df_m['Race'].str.contains('White')]))
print('male blacks: ', len(df_m[df_m['Race'].str.contains('Black')]))
print('male Asians: ', len(df_m[df_m['Race'].str.contains('Asian')]))
print('male Hawaiian: ', len(df_m[df_m['Race'].str.contains('Hawaiian')]))
print('male Alaskan: ', len(df_m[df_m['Race'].str.contains('Alaska')]))
print('male Others: ', len(df_m[df_m['Race'].str.contains('Other')]))
print('male MEs: ', len(df_m[df_m['Race'].str.contains('Middle')]))
print('male unknowns: ', len(df_m[df_m['Race'].str.contains('Unknown')]))
print('male hispanics: ', len(df_m[df_m['Race'].str.contains('Hispanic')]))
print('male refused: ', len(df_m[df_m['Race'].str.contains('Refused')]))

male whites:  792
male blacks:  190
male Asians:  11
male Hawaiian:  2
male Alaskan:  2
male Others:  30
male MEs:  0
male unknowns:  10
male hispanics:  6
male refused:  2


## Sample Patients for Manuscript Figures

In [None]:
def oddeven(x):
    if x & 1:
        return 'odd'
    else:
        return 'even'

In [None]:
# Sample 3 year old male with median sma
med_sma = 39.73 # From Table above
df_3_m = df_m[(df_m['AgeGroup']==3) & (df_m['area_cm2'].between(np.floor(med_sma),np.ceil(med_sma)))].sort_values(by = 'area_cm2',ascending = True)
patid = df_3_m.loc[df_3_m['area_cm2']==df_3_m['area_cm2'].median()]['ID'].iloc[0]
sma  = df_3_m.loc[df_3_m['area_cm2']==df_3_m['area_cm2'].median()]['area_cm2'].iloc[0]
smi  = df_3_m.loc[df_3_m['area_cm2']==df_3_m['area_cm2'].median()]['smi'].iloc[0]
print('Median Male patient 3 year old: ', patid, ' with SMA: ', round(sma,2), ' and SMI: ', round(smi,2))

In [None]:
# Sample 7 year old female with median sma
med_sma = 51.91 # From Table above
df_7_f = df_f[(df_f['AgeGroup']==7) & (df_f['area_cm2'].between(np.floor(med_sma),np.ceil(med_sma)))].sort_values(by = 'area_cm2',ascending = True)
if len(df_7_f) > 2:
    patid = df_7_f.loc[df_7_f['area_cm2']==df_7_f['area_cm2'].median()]['ID'].iloc[0]
    sma  = df_7_f.loc[df_7_f['area_cm2']==df_7_f['area_cm2'].median()]['area_cm2'].iloc[0]
    smi  = df_7_f.loc[df_7_f['area_cm2']==df_7_f['area_cm2'].median()]['smi'].iloc[0]
else:
    patid = df_7_f['ID'].iloc[0]
    sma  = df_7_f['area_cm2'].iloc[0]
    smi  = df_7_f['smi'].iloc[0]
print('Median FeMale patient 7 year old: ', patid, ' with SMA: ', round(sma,2), ' and SMI: ', round(smi,2))

In [None]:
# Sample 11 year old male with median sma
med_sma_l = 91
med_sma_h = 93 # From Table above
df_11_m = df_m[(df_m['AgeGroup']==11) & (df_m['area_cm2'].between(np.floor(med_sma_l),np.ceil(med_sma_h)))].sort_values(by = 'area_cm2',ascending = True)
#display(df_11_m)
ind = int(np.floor(np.median(list(range(0,len(df_11_m))))))
patid = df_11_m['ID'].iloc[ind]
sma  = df_11_m['area_cm2'].iloc[ind]
smi  = df_11_m['smi'].iloc[ind]
print('Median Male patient 11 year old: ', patid, ' with SMA: ', round(sma,2), ' and SMI: ', round(smi,2))

In [None]:
# Sample  16 year old female with median sma
med_sma_l = 98
med_sma_h = 99 # From Table above
df_16_f = df_f[(df_f['AgeGroup']==16) & (df_f['area_cm2'].between(np.floor(med_sma_l),np.ceil(med_sma_h)))].sort_values(by = 'area_cm2',ascending = True)
#display(df_16_f)
ind = int(np.floor(np.median(list(range(0,len(df_16_f))))))
patid = df_16_f['ID'].iloc[ind]
sma  = df_16_f['area_cm2'].iloc[ind]
smi  = df_16_f['smi'].iloc[ind]
print('Median Female patient 16 year old: ', patid, ' with SMA: ', round(sma,2), ' and SMI: ', round(smi,2))

In [None]:
# Sample 10 year old male at 0.05, 0.25, 0.75 and 0.95 sma
q05 = 54.627
q05_l = 54
q05_h = 57
df_tmp = df_m[(df_m['AgeGroup']==10) & (df_m['area_cm2'].between(np.floor(q05_l),np.ceil(q05_h)))].sort_values(by = 'area_cm2',ascending = True)
print(len(df_tmp))
ind = int(np.floor(np.min(list(range(0,len(df_tmp))))))
patid = df_tmp['ID'].iloc[ind]
sma  = df_tmp['area_cm2'].iloc[ind]
smi  = df_tmp['smi'].iloc[ind]
print('Q 0.05 Male patient 10 year old: ', patid, ' with SMA: ', round(sma,2), ' and SMI: ', round(smi,2))

q25 = 62.832310335898384
df_tmp = df_m[(df_m['AgeGroup']==10) & (df_m['area_cm2'].between(np.floor(q25),np.ceil(q25)))].sort_values(by = 'area_cm2',ascending = True)
ind = int(np.floor(np.median(list(range(0,len(df_tmp))))))
patid = df_tmp['ID'].iloc[ind]
sma  = df_tmp['area_cm2'].iloc[ind]
smi  = df_tmp['smi'].iloc[ind]
print('Q 0.25 Male patient 10 year old: ', patid, ' with SMA: ', round(sma,2), ' and SMI: ', round(smi,2))

q50 = 69.98443148678662
df_tmp = df_m[(df_m['AgeGroup']==10) & (df_m['area_cm2'].between(np.floor(q50),np.ceil(q50)))].sort_values(by = 'area_cm2',ascending = True)
ind = int(np.floor(np.median(list(range(0,len(df_tmp))))))
patid = df_tmp['ID'].iloc[ind]
sma  = df_tmp['area_cm2'].iloc[ind]
smi  = df_tmp['smi'].iloc[ind]
print('Q 0.50 Male patient 10 year old: ', patid, ' with SMA: ', round(sma,2), ' and SMI: ', round(smi,2))

q75 = 76.36021542634919
df_tmp = df_m[(df_m['AgeGroup']==10) & (df_m['area_cm2'].between(np.floor(q75),np.ceil(q75)))].sort_values(by = 'area_cm2',ascending = True)
ind = int(np.floor(np.median(list(range(0,len(df_tmp))))))
patid = df_tmp['ID'].iloc[ind]
sma  = df_tmp['area_cm2'].iloc[ind]
smi  = df_tmp['smi'].iloc[ind]
print('Q 0.75 Male patient 10 year old: ', patid, ' with SMA: ', round(sma,2), ' and SMI: ', round(smi,2))

q95 = 91.43285125961786
q95_l = 90
q95_h = 92
df_tmp = df_m[(df_m['AgeGroup']==10) & (df_m['area_cm2'].between(np.floor(q95_l),np.ceil(q95_h)))].sort_values(by = 'area_cm2',ascending = True)
ind = int(np.floor(np.max(list(range(0,len(df_tmp))))))
patid = df_tmp['ID'].iloc[ind]
sma  = df_tmp['area_cm2'].iloc[ind]
smi  = df_tmp['smi'].iloc[ind]
print('Q 0.95 Male patient 10 year old: ', patid, ' with SMA: ', round(sma,2), ' and SMI: ', round(smi,2))

In [None]:
# Sample 2 patients with manual correction from df_man, minor and major variations.
df_man['relativearea_diff'] = abs(df_man['area_mm2'] - df_man['area_mm2_Elan'])/df_man['area_mm2']
df_man = df_man.sort_values(by=['relativearea_diff'],ascending=True)
#display(df_man)

# manual1 Z1762020
# manual2 Z981182
display(df_analysis[df_analysis['ID']=='Z1762020'])
display(df_analysis[df_analysis['ID']=='Z5745'])

### Calculate Z-scores for bmi, height and weight w.r.t to age from WHO/CDC data

In [97]:
1125+1043

2168

In [None]:
#pip install pygrowup
from pygrowup import Calculator
from pygrowup import helpers

calculator = Calculator(include_cdc=True)

def percentile_for_zscore(zscore):
    return stats.norm.cdf(zscore)

def zscore_for_percentile(p):
    return stats.norm.ppf(p)

def add_Z_scores(row):
    try:
        global calculator
        valid_age = row['Age']*12
        valid_gender = helpers.get_good_sex(row['Sex'])
        if (np.isnan(row['HEIGHT'])):
            row['height_age_Zscore'] = row["HEIGHT"]
        else:
            row['height_age_Zscore'] = float(calculator.lhfa(row['HEIGHT'],valid_age,valid_gender))
        
        if (np.isnan(row['WEIGHT'])):
            row['weight_age_Zscore'] = row["WEIGHT"]
        else:
            row['weight_age_Zscore'] = float(calculator.wfa(row['WEIGHT'],valid_age,valid_gender))
            
        if (np.isnan(row['BMI_AGE_P'])):
            row['bmi_age_Zscore'] = row['BMI_AGE_P']
        else:
            row['bmi_age_Zscore'] = zscore_for_percentile(row['BMI_AGE_P']/100)
    except:
        print(row)
        raise
    return row


def add_Zpercentile_scores(row):
    try:
        if (np.isnan(row['weight_age_Zscore'])):
            row['weight_age_Zpercentile'] = row["weight_age_Zscore"]
        else:
            row['weight_age_Zpercentile'] = 100*percentile_for_zscore(row["weight_age_Zscore"])
        
        if (np.isnan(row['height_age_Zscore'])):
            row['height_age_Zpercentile'] = row["height_age_Zscore"]
        else:
            row['height_age_Zpercentile'] = 100*percentile_for_zscore(row["height_age_Zscore"])
    except:
        print(row)
        raise
    return row

In [None]:
df_m = df_m.apply(add_Z_scores,axis=1)
df_m = df_m.rename({'BMI_AGE_P':'bmi_age_Zpercentile'},axis=1)
#df_m.weight_age_Zscore = df_m.weight_age_Zscore.astype(float)
#df_m.height_age_Zscore = df_m.height_age_Zscore.astype(float)
#df_m.bmi_age_Zscore = df_m.bmi_age_Zscore.astype(float)
df_m = df_m.apply(add_Zpercentile_scores,axis=1)

In [None]:
df_f = df_f.apply(add_Z_scores,axis=1)
df_f = df_f.rename({'BMI_AGE_P':'bmi_age_Zpercentile'},axis=1)
#df_m.weight_age_Zscore = df_m.weight_age_Zscore.astype(float)
#df_m.height_age_Zscore = df_m.height_age_Zscore.astype(float)
#df_m.bmi_age_Zscore = df_m.bmi_age_Zscore.astype(float)
df_f = df_f.apply(add_Zpercentile_scores,axis=1)

In [None]:
# Compute descriptive statistics on the dataframe
def calculate_stats(df):
    num_list = ['BMI_CALC','WEIGHT','HEIGHT','Age','area_cm2','smi','height_age_Zscore','height_age_Zpercentile',
                'weight_age_Zscore','weight_age_Zpercentile','bmi_age_Zscore','bmi_age_Zpercentile']
    df_num = df[num_list]
    df_d = df_num.describe()
    df_whites = df.loc[df['Race']=='White',num_list].describe()
    df_blacks = df.loc[df['Race']=='Black or African American',num_list].describe()
    df_others = df.loc[(df['Race']!='Black or African American') & (df['Race']!='White'),num_list].describe()
    
    df_whites = df_whites.add_suffix('_W')
    df_blacks = df_blacks.add_suffix('_B')
    df_others = df_others.add_suffix('_O')
    return pd.concat([df_d, df_whites,df_blacks,df_others], axis=1)

In [None]:
df_m_stats= calculate_stats(df_m).T.round(decimals=2)
df_f_stats = calculate_stats(df_f).T.round(decimals=2)

In [None]:
print("Population Statistics for Male")
display(df_m_stats.head(12))
df_f_stats.to_csv(output+'/Female_summary_statistics.csv',index=True)
df_m_stats.to_csv(output+'/Male_summary_statistics.csv',index=True)

In [None]:
print("Population Statistics for Female")
display(df_f_stats.head(12))

## Correlation metrics

In [None]:
from scipy.stats import pearsonr
import pandas as pd

def calculate_pvalues(df):
    # https://stackoverflow.com/questions/25571882/pandas-columns-correlation-with-statistical-significance
    df = df.dropna()._get_numeric_data()
    dfcols = pd.DataFrame(columns=df.columns)
    pvalues = dfcols.transpose().join(dfcols, how='outer')
    for r in df.columns:
        for c in df.columns:
            pvalues[r][c] = round(pearsonr(df[r], df[c])[1], 9)
    return pvalues

# Function to calculate Pearson's correlation coefficient (r) with confidence intervals in python!
# https://zhiyzuo.github.io/Pearson-Correlation-CI-in-Python/
from scipy import stats
def pearsonr_ci(x,y,alpha=0.05):
    ''' calculate Pearson correlation along with the confidence interval using scipy and numpy
    Parameters
    ----------
    x, y : iterable object such as a list or np.array
      Input for correlation calculation
    alpha : float
      Significance level. 0.05 by default
    Returns
    -------
    r : float
      Pearson's correlation coefficient
    pval : float
      The corresponding p value
    lo, hi : float
      The lower and upper bound of confidence intervals
    '''

    r, p = stats.pearsonr(x,y)
    r_z = np.arctanh(r)
    se = 1/np.sqrt(x.size-3)
    z = stats.norm.ppf(1-alpha/2)
    lo_z, hi_z = r_z-z*se, r_z+z*se
    lo, hi = np.tanh((lo_z, hi_z))
    return r, p, lo, hi

def calculate_pearsonr_with_ci_pval(df):
    # https://stackoverflow.com/questions/25571882/pandas-columns-correlation-with-statistical-significance
    df = df.dropna()._get_numeric_data()
    dfcols = pd.DataFrame(columns=df.columns)
    pearson_r = dfcols.transpose().join(dfcols, how='outer')
    for r in df.columns:
        for c in df.columns:
            cor, p, lo, hi = pearsonr_ci(df[r],df[c])
            pearson_r[r][c] = str(round(cor,2)) + " (" + str(round(lo,2)) + " - " + str(round(hi,2)) + "), p = " + '{:0.2e}'.format(p)
    return pearson_r

In [None]:
print('Pearson with CI and P-values')
df_m_cor_p = calculate_pearsonr_with_ci_pval(df_m[['WEIGHT','HEIGHT','Age','BMI_CALC','bmi_age_Zpercentile','area_cm2','smi']])
display(df_m_cor_p)
df_m_cor_p.to_csv(output+'/Male_pearsonr_with_ci.csv',index=True)

In [None]:
print('Correlation Map Male')
display(df_m[['WEIGHT','HEIGHT','Age','BMI_CALC','bmi_age_Zpercentile','area_cm2','smi']].corr().round(3))
df_m_cor = df_m[['WEIGHT','HEIGHT','Age','BMI_CALC','bmi_age_Zpercentile','area_cm2','smi']].corr().round(3)
df_m_cor.to_csv(output+'/Male_correlation_statistics.csv',index=True)

In [None]:
print('P-values for Correlation map - male, P < 0.05 are statistically significant')
df_m_cor_p = calculate_pvalues(df_m[['WEIGHT','HEIGHT','Age','BMI_CALC','area_cm2','smi']])
display(df_m_cor_p)
df_m_cor_p.to_csv(output+'/Male_correlation_pvalues.csv',index=True)

In [None]:
print('Pearson with CI and P-values')
df_f_cor_p = calculate_pearsonr_with_ci_pval(df_f[['WEIGHT','HEIGHT','Age','BMI_CALC','bmi_age_Zpercentile','area_cm2','smi']])
display(df_f_cor_p)
df_f_cor_p.to_csv(output+'/Female_pearsonr_with_ci.csv',index=True)

In [None]:
print('Correlation Map Female')
display(df_f[['WEIGHT','HEIGHT','Age','BMI_CALC','bmi_age_Zpercentile','area_cm2','smi']].corr().round(3))
df_f_cor = df_f[['WEIGHT','HEIGHT','Age','BMI_CALC','bmi_age_Zpercentile','area_cm2','smi']].corr().round(3)
df_f_cor.to_csv(output+'/Female_correlation_statistics.csv',index=True)

In [None]:
print('P-values for Correlation map - female, P < 0.05 are statistically significant')
df_f_cor_p = calculate_pvalues(df_f[['WEIGHT','HEIGHT','Age','BMI_CALC','area_cm2','smi']])
display(df_f_cor_p)
df_f_cor_p.to_csv(output+'/Female_correlation_pvalues.csv',index=True)

## Quantile Regression with Splines

In [None]:
# Get Restricted Cubic splines by transforming independent variable (age)
from statsmodels.regression.quantile_regression import QuantReg as QR
# Reference: 
from patsy import dmatrix
from patsy import cr
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
#Must be activated
pandas2ri.activate()
utils = importr('utils')
#utils.chooseCRANmirror(ind=1)
#utils.install_packages('quantreg')
qr_package=importr('quantreg')

In [None]:
def get_knots(x,nknots):
    # Knot locations are set based on this article
    #https://support.sas.com/resources/papers/proceedings16/5621-2016.pdf
    if nknots == 3:
        return [x.quantile(0.1),x.quantile(0.5),x.quantile(0.9)]
    elif nknots == 4:
        return [x.quantile(0.05),x.quantile(0.35),x.quantile(0.65),x.quantile(0.95)]
    elif nknots == 5:
        return [x.quantile(0.05),x.quantile(0.275),x.quantile(0.5),x.quantile(0.725),x.quantile(0.95)]

In [None]:
def fit_bs_models(model,qlist):
    res = [model.fit(q=q) for q in qlist] 
    return res

def get_transformed_x(x,knots,degrees):
    transformed_x = dmatrix("bs(train, knots=" + knots +", degree= "+ str(degrees) 
                    +",include_intercept=False)", {"train": x},return_type='dataframe')
    return transformed_x

def find_points_above(model,x_data,y_data):
    y_m = model.predict(x_data)
    points = y_data > y_m
    outlier_count = sum(points)
    return outlier_count, points

def find_points_below(model,x_data,y_data):
    y_m = model.predict(x_data)
    points = y_data < y_m
    outlier_count = sum(points)
    return outlier_count, points

In [None]:
units_dict = {'WEIGHT':'Weight (Kg)','Age':'Age (Years)', 'HEIGHT': 'Height (cm)', 'area_cm2': 'SMA (cm^2)', 'smi': 'SMI (cm^2/m^2)',
             'BMI_CALC': ' BMI', 'bmi_age_Zpercentile': 'BMI for Age (Z-percentile)'}

# Gets design matrix command for restricted cubic splines (RCS)
def get_dm_command_RCS(DF,knots):
    lb = knots[0]
    ub = knots[-1]
    internal = knots[1:-1] # Internal knots
    
    command_head = "cr(train,df = " +  str(DF) + ", knots="

    command_mid  = ""
    internal_str=  list(map(str,internal))
    #print(internal_str)
    if len(internal_str)==1:
        command_mid = internal_str[0]
    else:
        command_mid = "(" + internal_str[0] 
        for i in internal_str[1:]:
            command_mid = command_mid + ',' + i    
        command_mid = command_mid + ')'

    command_tail = ", lower_bound=" + str(lb) +", upper_bound=" + str(ub) + ") -1"
    command = command_head + command_mid + command_tail
    return command

def get_aic_rpy2(dm,df,response_var,q=0.5):
    # Change colnames
    colnames = []
    for i in range(len(list(dm))):
        ind = i+1
        colnames.append('x' + str(ind))
        
    #print(colnames)    
    dm.columns = colnames

    # Add Jitter, Required for R QR
    df_R = robjects.r['as.matrix'](dm)
    df_R  = robjects.r['jitter'](df_R)
    dm_jitter = dm.copy()

    # Add response var to df for Quantile Reg in Rpy2
    dm_jitter.iloc[:,:] = df_R
    dm_jitter[response_var] = df[response_var].values


    # QR Command
    qr_head = response_var + ' ~ '
    qr_tail = colnames[0]
    for i in colnames[1:]:
        qr_tail = qr_tail + '+' + i
    qr_command = qr_head + qr_tail
    #print('qr_command:', qr_command)

    # Perform QR and find AIC
    qr_mod = qr_package.rq(qr_command,data=dm_jitter, tau=q)
    AIC = qr_package.AIC_rq(qr_mod)[0]

    return AIC

def spline_model_optimizer_cr(df,response_var,predictor_var='Age',nknots=[3,4,5]):
    # Storage parameters
    model_dict = {}
    
    # Akaike defined by Elan
    akaike_dict = {}
    mean_akaike = {}
    akaike_dict_25 = {}
    akaike_dict_75 = {}
    
    # AIC from R for quantreg models
    aic_dict = {}
    mean_aic = {}
    aic_dict_25 = {}
    aic_dict_75 = {}

    # Maximum Likelihood calculated by Elan
    L_dict = {}
    L_dict_25 = {}
    L_dict_75 = {}
    
    knots_dict = {} 
    model_keys = []
    
    # Knot constraints
    min_knot = 3
    max_knot = 15 + 1 # [+1 for python]
    min_knot_diff = 3
    
    # Iterate over all knots
    for nknot in nknots :
        knots = get_knots(np.floor(df[predictor_var]),nknot)

        # Book-keeping
        mkey = 'nknots_'+ str(nknot)
        model_keys.append(mkey)
        knots_dict[mkey] = knots

        #print('knots: ', knots)
        command = get_dm_command_RCS(nknot,knots)
       # print(command)
        transformed_age = dmatrix(command, {"train": df[predictor_var].values}, return_type='dataframe')
       
        # Fit qr model
        qr_model = QR(df[response_var].values,transformed_age.values)
        model_dict[mkey] = qr_model
        
        # Calculate AIC from R qr function
        aic_dict[mkey] = get_aic_rpy2(transformed_age,df,response_var,q=0.5)
        aic_dict_25[mkey] = get_aic_rpy2(transformed_age,df,response_var,q=0.25)
        aic_dict_75[mkey] = get_aic_rpy2(transformed_age,df,response_var,q=0.75)
        
        # Calculate mean aic
        mean_aic[mkey] = (aic_dict[mkey] + aic_dict_25[mkey] + aic_dict_75[mkey])/3
            
    # Find best model
    best_model_raic= min(mean_aic, key=mean_aic.get)
    return model_dict[best_model_raic],knots_dict[best_model_raic]

def plot_bs_qr_cr(dfs,mlist,knots,sexes=['Male','Female'],predictor='Age',response='area_cm2',xstart=0, ystart = 0):
    x = np.linspace(dfs[0][predictor].values.min(),dfs[0][predictor].values.max(),70)
    #age_range = dfs[0][predictor].values.max() - dfs[0][predictor].values.min()
    #x_df = np.linspace(dfs[0][predictor].values.min(),dfs[0][predictor].values.max(),int(age_range*2))
    x_df = np.arange(2.0, 19.5, 0.5)
    ymax = max(max(dfs[0][response]),max(dfs[0][response]))
    plt.figure(figsize = (14,7),linewidth=2)
    
    results_df = pd.DataFrame(columns = ['Age','0.05','0.25','0.50','0.75','0.95','0.05','0.25','0.50','0.75','0.95'])
    results_df['Age'] =  x_df
        #fig, ax = plt.subplots(figsize=(8, 6))
    for n in range(len(mlist)):
        df = dfs[n]
        models = mlist[n]
        
        command = get_dm_command_RCS(len(knots[n]),knots[n])
        transformed_x = dmatrix(command, {"train": x}, return_type='dataframe') 
        transformed_x_df = dmatrix(command, {"train": x_df}, return_type='dataframe') 
        
        ax = plt.subplot(1,2,n+1)
        for i in range(len(models)):
            y = models[i].predict(transformed_x)
            y_df = models[i].predict(transformed_x_df)
            if n == 0:
                results_df.iloc[:,i+1] = y_df
            else:
                results_df.iloc[:,i+6] = y_df
                
            ax.plot(x, y, linestyle='dotted',label='q =' + str(round(models[i].q,2)),linewidth=3)
          
        # Plot outliers in red
        x_data = df[predictor].values
        tr_x_data = dmatrix(command, {"train": x_data}, return_type='dataframe')
        y_data = df[response].values
        _,outliers_above = find_points_above(models[-1],tr_x_data,y_data)
        _,outliers_below = find_points_below(models[0],tr_x_data,y_data)
       # print('Number of outliers > 0.90: ', sum(outliers_above))
       # print('Number of outliers < 0.10: ', sum(outliers_below))
        outliers = outliers_above | outliers_below
        
        print('Total outliers: ', sum(outliers), 'Total points: ', len(outliers))
        
        normals = ~ outliers
        ax.scatter(x_data[normals], y_data[normals],facecolor=None, edgecolor = 'k', alpha=.05)
        ax.scatter(x_data[outliers], y_data[outliers],facecolor=None, edgecolor = 'r', alpha=.25)
        
       #ax.scatter(df[predictor], df[response],facecolor=None, edgecolor = 'k', alpha=.1)
       # print(np.ceil(max(df[predictor].values)))
        ax.set_xlim((xstart, np.ceil(max(df[predictor].values))),auto=True)
        
        print('xstart: ',xstart)
        print('ystart: ',ystart)
        ax.set_ylim(ystart, ymax,auto=True)
        legend = ax.legend()
        if np.ceil(max(df[predictor].values)) > 20:
            xtick_r = 20
        else:
            xtick_r = 1
        
        ax.set_xticks(np.arange(xstart,np.ceil(max(df[predictor].values)),xtick_r))
        
        if predictor == "BMI_CALC":
            ax.set_xlim(xstart, np.ceil(max(df[predictor].values)),auto=True)
            xtick_r = 2
            ax.set_xticks(np.arange(xstart,np.ceil(max(df[predictor].values)),xtick_r))
        
        ax.set_yticks(np.arange(ystart,ymax,20))
        ax.set_xlabel(units_dict[predictor], fontsize=18)
        ax.set_ylabel(units_dict[response], fontsize=18)
        ax.set_title(sexes[n], fontsize=18)
    title = response +'_vs_'+predictor+'.png'
    plt.tight_layout()
    plt.savefig(os.path.join(output,title), dpi = 300)
    
    
    return results_df

def plot_bs_qr_wcdc_cr(dfs,mlist,knots,cdc_df,sexes=['Male','Female'],predictor='Age',response='area_cm2',xstart=0, ystart=0):
    x = np.linspace(dfs[0][predictor].values.min(),dfs[0][predictor].values.max(),70)
    ymax = max(max(dfs[0][response]),max(dfs[0][response]))
    plt.figure(figsize = (14,7),linewidth=2)
    
    colors = ['r','g','c','m','b']
    #fig, ax = plt.subplots(figsize=(8, 6))
    for n in range(len(mlist)):
        df = dfs[n]
        models = mlist[n]
        
        cdc_df_sex = cdc_df[cdc_df['Sex']==n+1]
        
        command = get_dm_command_RCS(len(knots[n]),knots[n])
        # print(command)
        transformed_x = dmatrix(command, {"train": x}, return_type='dataframe')
        ax = plt.subplot(1,2,n+1)
        
        for i in range(len(models)):
            y = models[i].predict(transformed_x)
            ax.plot(x, y, linestyle='dotted',color=colors[i],label='q =' + str(round(models[i].q,2)),linewidth=3)
            ax.plot(cdc_df_sex.iloc[:,1],cdc_df_sex.iloc[:,i+2],color=colors[i],label='cdc, q =' + str(round(models[i].q,2)),linewidth=2)

        ax.scatter(df[predictor], df[response],facecolor=None, edgecolor = 'k', alpha=.1)
        ax.set_xlim((xstart, np.ceil(max(df[predictor].values))))
        print('xstart: ',xstart)
        print('ystart: ',ystart)
        ax.set_ylim(ystart, ymax,auto=True)
        legend = ax.legend()
        if np.ceil(max(df[predictor].values)) > 20:
            xtick_r = 20
        else:
            xtick_r = 1
        ax.set_xticks(np.arange(xstart,np.ceil(max(df[predictor].values)),xtick_r))
        ax.set_yticks(np.arange(ystart,ymax,10))
        ax.set_xlabel(units_dict[predictor], fontsize=18)
        ax.set_ylabel(units_dict[response], fontsize=18)
        ax.set_title(sexes[n], fontsize=18)
    title = response +'_vs_'+predictor+'.png'
    plt.tight_layout()
    plt.savefig(os.path.join(output,title), dpi = 300)  

In [None]:
# Import cdc data
# 2 to 20years
#infile  = '/tf/smipipeline/growth_curves/data/csv/wtage220.csv'
infile  = '/home/jupyteruser/smipipeline/growth_curves/data/csv/wtage220.csv'
df_cdc_wt= pd.read_csv(infile, index_col=False)
df_cdc_wt['Age'] = df_cdc_wt['Agemos']/12
df_cdc_wt = df_cdc_wt[['Sex','Age','P10','P25','P50','P75','P90']]

# infants to 3 years
#infile2  = '/tf/smipipeline/growth_curves/data/csv/wtageinf.csv'
infile2  = '/home/jupyteruser/smipipeline/growth_curves/data/csv/wtageinf.csv'
df_cdc_wt2= pd.read_csv(infile2, index_col=False)
df_cdc_wt2['Age'] = df_cdc_wt2['Agemos']/12
df_cdc_wt2 = df_cdc_wt2[['Sex','Age','P10','P25','P50','P75','P90']]

# keep only till 2 years
df_cdc_wt2 = df_cdc_wt2[df_cdc_wt2['Age']<2]

# Merge
df_cdc_wt = pd.concat([df_cdc_wt,df_cdc_wt2],ignore_index=True)
df_cdc_wt = df_cdc_wt.sort_values(by=['Age'])

response='WEIGHT'
cdc_df = df_cdc_wt
# Get optimal model config
df_m_nonull = df_m.dropna(subset=[response])
df_f_nonull = df_f.dropna(subset=[response])
model_m,knots_m= spline_model_optimizer_cr(df_m_nonull, response)
model_f,knots_f = spline_model_optimizer_cr(df_f_nonull, response)


# Fit the model for different quantiles
fit_models_m = fit_bs_models(model_m,qlist=[0.05,0.25,0.50,0.75,0.95])
fit_models_f = fit_bs_models(model_f,qlist=[0.05,0.25,0.50,0.75,0.95])

In [None]:
# Plot the results
print('Population comparison to cdc data on ' + response + ' using quantile regression')

cdc_df = cdc_df[cdc_df['Age']>=2]

plot_bs_qr_wcdc_cr([df_m_nonull,df_f_nonull],
           [fit_models_m,fit_models_f],
           knots=[knots_m,knots_f],
           cdc_df = cdc_df,
           response=response)

In [None]:
# Import cdc data
# 2 to 20years
infile  = '/home/jupyteruser/smipipeline/growth_curves/data/csv/statage220.csv'
df_cdc_wt= pd.read_csv(infile, index_col=False)
df_cdc_wt['Age'] = df_cdc_wt['Agemos']/12
df_cdc_wt = df_cdc_wt[['Sex','Age','P10','P25','P50','P75','P90']]
# infants to 3 years
infile2  = '/home/jupyteruser/smipipeline/growth_curves/data/csv/lenageinf.csv'
df_cdc_wt2= pd.read_csv(infile2, index_col=False)
df_cdc_wt2['Age'] = df_cdc_wt2['Agemos']/12
df_cdc_wt2 = df_cdc_wt2[['Sex','Age','P10','P25','P50','P75','P90']]

# keep only till 2 years
df_cdc_wt2 = df_cdc_wt2[df_cdc_wt2['Age']<2]

# Merge
df_cdc_wt = pd.concat([df_cdc_wt,df_cdc_wt2],ignore_index=True)
df_cdc_wt = df_cdc_wt.sort_values(by=['Age'])

response='HEIGHT'
cdc_df = df_cdc_wt
# Get optimal model config
df_m_nonull = df_m.dropna(subset=[response])
df_f_nonull = df_f.dropna(subset=[response])
model_m,knots_m = spline_model_optimizer_cr(df_m_nonull, response)
model_f,knots_f = spline_model_optimizer_cr(df_f_nonull, response)


# Fit the model for different quantiles
fit_models_m = fit_bs_models(model_m,qlist=[0.05,0.25,0.50,0.75,0.95])
fit_models_f = fit_bs_models(model_f,qlist=[0.05,0.25,0.50,0.75,0.95])

In [None]:
# Plot the results
if cdc_df is not None:
    print('Population comparison to cdc data on ' + response + ' using quantile regression')
    cdc_df = cdc_df[cdc_df['Age']>=2]
    
    plot_bs_qr_wcdc_cr([df_m_nonull,df_f_nonull],
               [fit_models_m,fit_models_f],
               knots=[knots_m,knots_f],
               cdc_df = cdc_df,
               response=response, ystart=60)

In [None]:
# Import cdc data
# 2 to 20years
infile  = '/home/jupyteruser/smipipeline/growth_curves/data/csv/bmiage220.csv'
df_cdc_wt= pd.read_csv(infile, index_col=False)
df_cdc_wt['Age'] = df_cdc_wt['Agemos']/12
df_cdc_wt = df_cdc_wt[['Sex','Age','P10','P25','P50','P75','P90']]

response='BMI_CALC'
cdc_df = df_cdc_wt
# Get optimal model config
df_m_nonull = df_m.dropna(subset=[response])
df_f_nonull = df_f.dropna(subset=[response])
model_m,knots_m = spline_model_optimizer_cr(df_m_nonull, response)
model_f,knots_f = spline_model_optimizer_cr(df_f_nonull, response)

# Fit the model for different quantiles
fit_models_m = fit_bs_models(model_m,qlist=[0.05,0.25,0.50,0.75,0.95])
fit_models_f = fit_bs_models(model_f,qlist=[0.05,0.25,0.50,0.75,0.95])

In [None]:
# Plot the results
if cdc_df is not None:
    cdc_df = cdc_df[cdc_df['Age']>=2]
    print('Population comparison to cdc data on ' + response + ' using quantile regression')
    plot_bs_qr_wcdc_cr([df_m_nonull,df_f_nonull],
               [fit_models_m,fit_models_f],
               knots=[knots_m,knots_f],
               cdc_df = cdc_df,
               response=response, ystart=10)

In [None]:
response='area_cm2'

# Get optimal model config
model_m,knots_m = spline_model_optimizer_cr(df_m, response)
model_f,knots_f = spline_model_optimizer_cr(df_f, response)


# Fit the model for different quantiles
fit_models_m = fit_bs_models(model_m,qlist=[0.05,0.25,0.50,0.75,0.95])
fit_models_f = fit_bs_models(model_f,qlist=[0.05,0.25,0.50,0.75,0.95])

In [None]:
knots_m

In [None]:
knots_f

In [None]:
# Plot the results
print('Quantile Regression with Splines for Skeletal Muscle Area (CCHMC data)')
results_df = plot_bs_qr_cr([df_m,df_f],
           [fit_models_m,fit_models_f],
           knots=[knots_m,knots_f],
           response=response)
results_df.to_csv(os.path.join(output,'age_sma_table.csv'))

In [None]:
response='smi'

df_m_nonull = df_m.dropna(subset=[response])
df_f_nonull = df_f.dropna(subset=[response])

# Get optimal model config
model_m,knots_m = spline_model_optimizer_cr(df_m_nonull, response)
model_f,knots_f = spline_model_optimizer_cr(df_f_nonull, response)

# Fit the model for different quantiles
fit_models_m = fit_bs_models(model_m,qlist=[0.05,0.25,0.50,0.75,0.95])
fit_models_f = fit_bs_models(model_f,qlist=[0.05,0.25,0.50,0.75,0.95])

In [None]:
knots_m

In [None]:
knots_f

In [None]:
# Plot the results
print('Quantile Regression with Splines for SMI [CCHMC data]')
results_df = plot_bs_qr_cr([df_m_nonull,df_f_nonull],
           [fit_models_m,fit_models_f],
           knots=[knots_m,knots_f],
           response=response)
results_df.to_csv(os.path.join(output,'age_smi_table.csv'))

In [None]:
response='area_cm2'
predictor='WEIGHT'

df_m_nonull = df_m.dropna(subset=[response,predictor])
df_f_nonull = df_f.dropna(subset=[response,predictor])

# Get optimal model config
model_m,knots_m = spline_model_optimizer_cr(df_m_nonull,response,predictor_var=predictor)
model_f,knots_f = spline_model_optimizer_cr(df_f_nonull,response,predictor_var=predictor)

# Fit the model for different quantiles
fit_models_m = fit_bs_models(model_m,qlist=[0.05,0.25,0.50,0.75,0.95])
fit_models_f = fit_bs_models(model_f,qlist=[0.05,0.25,0.50,0.75,0.95])

In [None]:
# Plot the results
print('Quantile Regression with Splines for SMA [CCHMC data] vs Weight')
plot_bs_qr_cr([df_m_nonull,df_f_nonull],
           [fit_models_m,fit_models_f],
           knots=[knots_m,knots_f],
           response=response, predictor=predictor,xstart=10)

In [None]:
response='smi'
predictor='WEIGHT'

df_m_nonull = df_m.dropna(subset=[response,predictor])
df_f_nonull = df_f.dropna(subset=[response,predictor])

# Get optimal model config
model_m,knots_m = spline_model_optimizer_cr(df_m_nonull,response, predictor_var=predictor)
model_f,knots_f = spline_model_optimizer_cr(df_f_nonull,response, predictor_var=predictor)

# Fit the model for different quantiles
fit_models_m = fit_bs_models(model_m,qlist=[0.05,0.25,0.50,0.75,0.95])
fit_models_f = fit_bs_models(model_f,qlist=[0.05,0.25,0.50,0.75,0.95])

In [None]:
# Plot the results
print('Quantile Regression with Splines for SMI [CCHMC data] vs Weight')
plot_bs_qr_cr([df_m_nonull, df_f_nonull],
           [fit_models_m, fit_models_f],
           knots=[knots_m, knots_f],
           response=response, predictor=predictor, xstart = 10)

In [None]:
response='area_cm2'
predictor='HEIGHT'

df_m_nonull = df_m.dropna(subset=[response,predictor])
df_f_nonull = df_f.dropna(subset=[response,predictor])

# Get optimal model config
model_m,knots_m = spline_model_optimizer_cr(df_m_nonull, response, predictor_var=predictor)
model_f,knots_f = spline_model_optimizer_cr(df_f_nonull, response, predictor_var=predictor)

# Fit the model for different quantiles
fit_models_m = fit_bs_models(model_m,qlist=[0.05,0.25,0.50,0.75,0.95])
fit_models_f = fit_bs_models(model_f,qlist=[0.05,0.25,0.50,0.75,0.95])

In [None]:
# Plot the results
print('Quantile Regression with Splines for SMA [CCHMC data] vs HEIGHT')
plot_bs_qr_cr([df_m_nonull,df_f_nonull],
           [fit_models_m,fit_models_f],
           knots=[knots_m,knots_f],
           response=response,predictor=predictor, xstart=80)

In [None]:
response='smi'
predictor='HEIGHT'

df_m_nonull = df_m.dropna(subset=[response,predictor])
df_f_nonull = df_f.dropna(subset=[response,predictor])

# Get optimal model config
model_m,knots_m = spline_model_optimizer_cr(df_m_nonull, response,predictor_var=predictor)
model_f,knots_f = spline_model_optimizer_cr(df_f_nonull, response,predictor_var=predictor)

# Fit the model for different quantiles
fit_models_m = fit_bs_models(model_m,qlist=[0.05,0.25,0.50,0.75,0.95])
fit_models_f = fit_bs_models(model_f,qlist=[0.05,0.25,0.50,0.75,0.95])

In [None]:
# Plot the results
print('Quantile Regression with Splines for SMI [CCHMC data] vs HEIGHT')
plot_bs_qr_cr([df_m_nonull,df_f_nonull],
           [fit_models_m,fit_models_f],
           knots=[knots_m,knots_f],
           response=response, predictor=predictor, xstart=80)

In [None]:
response='area_cm2'
predictor='bmi_age_Zpercentile'

df_m_nonull = df_m.dropna(subset=[response,predictor])
df_f_nonull = df_f.dropna(subset=[response,predictor])

# Get optimal model config
model_m,knots_m = spline_model_optimizer_cr(df_m_nonull,response, predictor_var=predictor)
model_f,knots_f = spline_model_optimizer_cr(df_f_nonull,response, predictor_var=predictor)

# Fit the model for different quantiles
fit_models_m = fit_bs_models(model_m,qlist=[0.05,0.25,0.50,0.75,0.95])
fit_models_f = fit_bs_models(model_f,qlist=[0.05,0.25,0.50,0.75,0.95])

In [None]:
# Plot the results
print('Quantile Regression with Splines for SMA [CCHMC data] vs BMI percentile')
plot_bs_qr_cr([df_m_nonull, df_f_nonull],
           [fit_models_m, fit_models_f],
           knots=[knots_m, knots_f],
           response=response, predictor=predictor)

In [None]:
response='smi'
predictor='bmi_age_Zpercentile'

df_m_nonull = df_m.dropna(subset=[response,predictor])
df_f_nonull = df_f.dropna(subset=[response,predictor])

# Get optimal model config
model_m,knots_m = spline_model_optimizer_cr(df_m_nonull, response,predictor_var=predictor)
model_f,knots_f = spline_model_optimizer_cr(df_f_nonull, response,predictor_var=predictor)

# Fit the model for different quantiles
fit_models_m = fit_bs_models(model_m,qlist=[0.05,0.25,0.50,0.75,0.95])
fit_models_f = fit_bs_models(model_f,qlist=[0.05,0.25,0.50,0.75,0.95])

In [None]:
knots_f

In [None]:
# Plot the results
print('Quantile Regression with Splines for SMI [CCHMC data] vs BMI percentile')
plot_bs_qr_cr([df_m_nonull, df_f_nonull],
           [fit_models_m, fit_models_f],
           knots=[knots_m, knots_f],
           response=response, predictor=predictor, ystart = 10)

In [None]:
response='area_cm2'
predictor='BMI_CALC'

df_m_nonull = df_m.dropna(subset=[response,predictor])
df_f_nonull = df_f.dropna(subset=[response,predictor])

# Get optimal model config
model_m,knots_m = spline_model_optimizer_cr(df_m_nonull, response,predictor_var=predictor)
model_f,knots_f = spline_model_optimizer_cr(df_f_nonull, response,predictor_var=predictor)

# Fit the model for different quantiles
fit_models_m = fit_bs_models(model_m, qlist=[0.05,0.25,0.50,0.75,0.95])
fit_models_f = fit_bs_models(model_f, qlist=[0.05,0.25,0.50,0.75,0.95])

In [None]:
# Plot the results
print('Quantile Regression with Splines for Skeletal Muscle Index [CCHMC data] vs BMI')
plot_bs_qr_cr([df_m_nonull, df_f_nonull],
           [fit_models_m,fit_models_f],
           knots=[knots_m, knots_f],
           response=response, predictor=predictor, xstart=12)

In [None]:
response='smi'
predictor='BMI_CALC'

df_m_nonull = df_m.dropna(subset=[response,predictor])
df_f_nonull = df_f.dropna(subset=[response,predictor])

# Get optimal model config
model_m,knots_m = spline_model_optimizer_cr(df_m_nonull, response,predictor_var=predictor)
model_f,knots_f = spline_model_optimizer_cr(df_f_nonull, response,predictor_var=predictor)

# Fit the model for different quantiles
fit_models_m = fit_bs_models(model_m,qlist=[0.05,0.25,0.50,0.75,0.95])
fit_models_f = fit_bs_models(model_f,qlist=[0.05,0.25,0.50,0.75,0.95])

In [None]:
# Plot the results
print('Quantile Regression with Splines for Skeletal Muscle Index [CCHMC data] vs BMI')
plot_bs_qr_cr([df_m_nonull,df_f_nonull],
           [fit_models_m, fit_models_f],
           knots=[knots_m, knots_f],
           response=response,predictor=predictor, xstart=12, ystart=10)