In [1]:
Output = ('/Users/alexis/Library/CloudStorage/OneDrive-UniversityofNorthCarolinaatChapelHill/CEMALB_DataAnalysisPM/Projects/P1012. Cytokine Scoring/P1012.3. Analyses/P1012.3.1. Allostatic Load Calculation/Output')
cur_date = "051324"

library(readxl)
library(openxlsx)
library(tidyverse)

# reading in file
cytokine_df = data.frame(read_excel("Input/Allostatic_Mediator_Data_050824.xlsx", sheet = 2)) 
bp_df = data.frame(read_excel("Input/Allostatic_Mediator_Data_050824.xlsx", sheet = 4)) 
subject_info_df = data.frame(read_excel("Input/Subject_Info_050824.xlsx", sheet = 2))

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.3     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.3     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
“Expecting numeric in D1077 / R1077C4: got 'NA'”
“Expecting numeric in D1080 / R1080C4: got 'NA'”
“Expecting numeric in D1083 / R1083C4: got 'NA'”
“Expecting numeric in 

In [10]:
# creating 1 df
full_df = inner_join(subject_info_df, cytokine_df) %>%
    # filtering for subjects who have blood pressure measurements
    filter(Subject_ID %in% unique(bp_df$Subject_ID))
head(full_df)

[1m[22mJoining with `by = join_by(Subject_ID)`


Unnamed: 0_level_0,Study,Original_Subject_Number,Subject_Number,Group,Subject_ID,Sex,Age,Race,Category,Variable,Value
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>
1,TCORS LAIV,39,5,CS,CS_5,M,21,W,AL Biomarker,Cortisol,116.602
2,TCORS LAIV,39,5,CS,CS_5,M,21,W,AL Biomarker,Noradrenaline,6214.28
3,TCORS LAIV,39,5,CS,CS_5,M,21,W,AL Biomarker,Hba1c,8901.778
4,TCORS LAIV,39,5,CS,CS_5,M,21,W,AL Biomarker,Fibrinogen,1106446.956
5,TCORS LAIV,39,5,CS,CS_5,M,21,W,AL Biomarker,CRP,896782.493
6,TCORS LAIV,39,5,CS,CS_5,M,21,W,Cytokine,IP10,123.031


Goal is to generate allostatic loading scores which is an index of atherosclerotic risk protection from these allostatic load biomarkers and to compare based on tobacco smoke exposure and race as potential covariates.

In [11]:
# creating all dfs to be analyzed
nonsmoker_df = full_df %>%
    filter(Group == "NS")
smoker_df = full_df %>%
    filter(Group == "CS")

# splitting the df based on group and race, so to make the code more efficient
split_allostatic_df = full_df %>%
    group_by(Group, Race) %>%
    # splitting the df based on those variables
    group_split()

black_nonsmoker_df = split_allostatic_df[[3]]
black_smoker_df = split_allostatic_df[[1]]
white_nonsmoker_df = split_allostatic_df[[4]]
white_smoker_df = split_allostatic_df[[2]]

# viewing one of the dataframes 
head(black_nonsmoker_df)

Study,Original_Subject_Number,Subject_Number,Group,Subject_ID,Sex,Age,Race,Category,Variable,Value
<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>
Woodsie,1,37,NS,NS_37,F,21,B,AL Biomarker,Cortisol,106.118
Woodsie,1,37,NS,NS_37,F,21,B,AL Biomarker,Noradrenaline,3939.929
Woodsie,1,37,NS,NS_37,F,21,B,AL Biomarker,Hba1c,13432.363
Woodsie,1,37,NS,NS_37,F,21,B,AL Biomarker,Fibrinogen,1885217.243
Woodsie,1,37,NS,NS_37,F,21,B,AL Biomarker,CRP,1003162.185
Woodsie,1,37,NS,NS_37,F,21,B,Cytokine,IP10,90.615


In [12]:
mediator_score = function(df){
    # """
    # Creating a scoring function for each mediator.
    # :param (input): initial df (df)
    # :output: df containing the variable (biomarker) name, subject ID, and score
    # """
    
    # creating an empty df to store values
    score_df = data.frame()
    
    # getting all variable names for loop to iterate through
    mediators = unique(df$Variable)
    
    for (i in 1:length(mediators)){

        # filtering df for each mediator
        filtered_df = df %>%
            filter(Variable == mediators[i])
        
        # now iterating through each value of the filtered_df
        for (j in 1:length(filtered_df$Value)){

            # score = (mediator value - mediator min)/ (mediator max - mediator min)
            mediator_score_formula = (filtered_df$Value[j] - min(filtered_df$Value))/(max(filtered_df$Value) - min(filtered_df$Value))

            # storing mediator, subject id, and score
            values_vector = cbind(mediators[i], filtered_df$Subject_ID[j], mediator_score_formula)
            score_df = rbind(score_df, values_vector)
        }
    }
    
    # renaming columns
    colnames(score_df) = c("Variable", "Subject_ID", "Mediator_Score")
    
    # for some reason the Mediator_Score is a character type, so changing to a numeric
    score_df$Mediator_Score = as.numeric(score_df$Mediator_Score)
    
    return(score_df)
}

In [13]:
# calling function
NS_mediator_score_df = mediator_score(nonsmoker_df)
CS_mediator_score_df = mediator_score(smoker_df)
black_NS_mediator_score_df = mediator_score(black_nonsmoker_df)
black_CS_mediator_score_df = mediator_score(black_smoker_df)
white_NS_mediator_score_df = mediator_score(white_nonsmoker_df)
white_CS_mediator_score_df = mediator_score(white_smoker_df)

head(NS_mediator_score_df)

Unnamed: 0_level_0,Variable,Subject_ID,Mediator_Score
Unnamed: 0_level_1,<chr>,<chr>,<dbl>
1,Cortisol,NS_37,0.19949402
2,Cortisol,NS_38,1.0
3,Cortisol,NS_39,0.09488444
4,Cortisol,NS_40,0.0
5,Cortisol,NS_41,0.80006861
6,Cortisol,NS_49,0.2279705


In [14]:
allostatic_score = function(mediator_score_df, Group, Covariate){
    # """
    # Creating a scoring function for allostatic load.
    # :param (input): mediator score df (mediator_score_df), smoking group of subjects, race of subjects
    # :output: df containing the group, covariate (race) either black or white, and allostatic score
    # """
    
    # creating a vector for variables that increase atherosclerotic risk
    allostatic_load_biomarkers = c('Cortisol','Noradrenaline','Hba1c','Fibrinogen','CRP')
    
    # filtering df for these allostatic load biomarkers
    load_df = mediator_score_df %>%
        filter(Variable %in% allostatic_load_biomarkers) 
    
    # filtering for HDL
    HDL_df = mediator_score_df %>%
        filter(Variable == "HDL") %>%
        select(-Variable)
    
    # summing the mediator scores for biomarkers that increase atherosclerotic risk
    biomarker_load_df = load_df %>%
        group_by(Subject_ID) %>%
        summarize(Mediator_Score_Sum = sum(Mediator_Score))
    
    # allostatic score = sum of scores of all load biomarkers for a subject - individual subject HDL score
    # storing this value in the df as another col
    allostatic_load_df = data.frame(Subject_ID = biomarker_load_df$Subject_ID, 
                                    Allostatic_Load = biomarker_load_df$Mediator_Score_Sum - HDL_df$Mediator_Score)
    
    # putting these values into a vector and adding to the df
    score_df = cbind(Group, Covariate, allostatic_load_df)
    
    return(score_df)
}

In [15]:
# calling function
# these first two lines of code have "NA" values for race, since we weren't looking at race as a covariate until 
# the dfs were stratified later by race
NS_allostatic_score_df = allostatic_score(NS_mediator_score_df, "NS", NA)
CS_allostatic_score_df = allostatic_score(CS_mediator_score_df, "CS", NA)
black_NS_allostatic_score_df = allostatic_score(black_NS_mediator_score_df, "NS", "B")
black_CS_allostatic_score_df = allostatic_score(black_CS_mediator_score_df, "CS", "B")
white_NS_allostatic_score_df = allostatic_score(white_NS_mediator_score_df, "NS", "W")
white_CS_allostatic_score_df = allostatic_score(white_CS_mediator_score_df, "CS", "W")

# viewing one of the outputs
head(NS_allostatic_score_df)

Unnamed: 0_level_0,Group,Covariate,Subject_ID,Allostatic_Load
Unnamed: 0_level_1,<chr>,<lgl>,<chr>,<dbl>
1,NS,,NS_37,1.127845
2,NS,,NS_38,2.3206
3,NS,,NS_39,1.282577
4,NS,,NS_40,1.012015
5,NS,,NS_41,2.045814
6,NS,,NS_49,1.953981


In [16]:
# creating 1 df for the allostatic scores
# using the rbind function to combine by rows
allostatic_score_df = rbind(NS_allostatic_score_df, CS_allostatic_score_df, black_NS_allostatic_score_df, 
                            black_CS_allostatic_score_df, white_NS_allostatic_score_df, 
                            white_CS_allostatic_score_df) %>%
    # sorting the df from highest to lowest for easier viewing
    arrange(-Allostatic_Load)

head(allostatic_score_df)

Unnamed: 0_level_0,Group,Covariate,Subject_ID,Allostatic_Load
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>
1,NS,B,NS_50,3.075079
2,CS,B,CS_24,2.982104
3,CS,,CS_32,2.902104
4,NS,W,NS_49,2.873066
5,CS,W,CS_31,2.832494
6,NS,W,NS_38,2.81519


In [17]:
# exporting results
write.xlsx(allostatic_score_df, paste0(Output,"/", "Allostatic_Load_BP_Subjects_", cur_date, ".xlsx"), rowNames = FALSE)

ADD WORDS ABOUT THREE DIFFERENT METHODS

# Expression Change Summation (ECS)

sum of absolute FC, but what am I taking the fC between?

https://github.com/UNC-CEMALB/P1005_Miscellaneous-Analyses/blob/main/P1005.2.%20Allostatic%20Load/P1005.2.3.1.%20Allostatic%20Load%20Calculation/Allostatic%20Load%20Calculation.ipynb