In [None]:
library(data.table)
library(dglm)
library(dplyr)
library(qqman)

# Functions

In [None]:
#' Run a per-variant double generalized linear model (DGLM) scan.
#'
#' The phenotype is rank-based inverse-normal transformed, then every variant
#' column is fitted with `dglm()`, using the same linear predictor
#' (`genetic_sex + baseline_age + <variant> + PC1 + PC2 + PC3 + PCD4`) for both
#' the mean and the dispersion sub-model.
#'
#' A column is treated as a variant when its ORIGINAL name contains ":" or
#' "rs". Detection happens before ":" is rewritten to "_" (needed so the name
#' can be used in a formula); detecting afterwards would never match ":".
#' The phenotype and the fixed covariates are never treated as variants.
#'
#' @param df data.frame (or data.table, e.g. from `fread()`) holding the
#'   phenotype, the covariates named above and the variant columns.
#' @param phenotype Name of the phenotype column (string).
#' @param max_missing Variants with this many or more missing genotypes are
#'   skipped. Defaults to 10000, the previously hard-coded threshold.
#' @return data.frame with one row per fitted variant and the columns
#'   `Mean.Estimate`, `Mean.Std.Error`, `Mean.t.value`, `Mean.Pr...t..`,
#'   `Dispersion.Estimate`, `Dispersion.Std.Error`, `Dispersion.t.value`,
#'   `Dispersion.Pr...t..`, `SNP`. When no variant is fitted, a zero-row
#'   data.frame with the same columns is returned.
load_and_process_tsv <- function(df, phenotype, max_missing = 10000) {
    covariates <- c("genetic_sex", "baseline_age", "PC1", "PC2", "PC3", "PCD4")

    # Flag variant columns on the raw names, THEN make the names formula-safe.
    raw_names <- colnames(df)
    is_variant <- grepl(":", raw_names, fixed = TRUE) |
        grepl("rs", raw_names, fixed = TRUE)
    colnames(df) <- gsub(":", "_", raw_names)
    snp_cols <- setdiff(colnames(df)[is_variant], c(phenotype, covariates))

    # Rank-based inverse-normal transform of the non-missing phenotype values.
    df <- df[complete.cases(df[[phenotype]]), ]
    df[[phenotype]] <- qnorm((rank(df[[phenotype]], na.last = "keep") - 0.5) /
                             sum(!is.na(df[[phenotype]])))

    # Result schema shared by the empty and the populated case. The
    # "Pr...t.." names are what data.frame() produced from "Pr(>|t|)" before.
    empty_results <- data.frame(Mean.Estimate = numeric(),
                                Mean.Std.Error = numeric(),
                                Mean.t.value = numeric(),
                                Mean.Pr...t.. = numeric(),
                                Dispersion.Estimate = numeric(),
                                Dispersion.Std.Error = numeric(),
                                Dispersion.t.value = numeric(),
                                Dispersion.Pr...t.. = numeric(),
                                SNP = character())

    # Collect one row per variant in a list; bind once at the end instead of
    # growing a data.frame inside the loop.
    fits <- vector("list", length(snp_cols))
    for (i in seq_along(snp_cols)) {
        col <- snp_cols[[i]]
        genotype <- df[[col]]

        # Skip poorly genotyped variants before doing any further work.
        if (sum(is.na(genotype)) >= max_missing) {
            next
        }

        # Non-syntactic names (e.g. starting with a digit) must be back-quoted
        # to be parseable inside a formula.
        term <- if (identical(make.names(col), col)) col else paste0("`", col, "`")
        formula <- paste0("~genetic_sex+baseline_age+", term, "+PC1+PC2+PC3+PCD4")
        model <- paste0(phenotype, formula)

        # NOTE(review): both `> 0` filters are kept from the original code.
        # After the inverse-normal transform the phenotype is centred on 0, so
        # `phenotype > 0` keeps only about half of the samples, and
        # `genotype > 0` drops every sample coded 0 -- confirm this is intended.
        keep <- !is.na(genotype) & genotype > 0 & df[[phenotype]] > 0
        col_df <- df[keep, ]

        out <- dglm(as.formula(model), as.formula(model), data = col_df)
        mean_coefs <- summary(out)$coefficients
        disp_coefs <- summary(out$dispersion.fit)$coefficients

        # The coefficient row may be labelled with or without back-quotes; it
        # is absent when the variant term could not be estimated.
        coef_row <- intersect(c(col, term), rownames(disp_coefs))
        if (length(coef_row) == 0L) {
            next
        }
        coef_row <- coef_row[[1]]
        mean_row <- mean_coefs[coef_row, ]
        disp_row <- disp_coefs[coef_row, ]

        fits[[i]] <- data.frame(
            Mean.Estimate = unname(mean_row["Estimate"]),
            Mean.Std.Error = unname(mean_row["Std. Error"]),
            Mean.t.value = unname(mean_row["t value"]),
            Mean.Pr...t.. = unname(mean_row["Pr(>|t|)"]),
            Dispersion.Estimate = unname(disp_row["Estimate"]),
            Dispersion.Std.Error = unname(disp_row["Std. Error"]),
            Dispersion.t.value = unname(disp_row["t value"]),
            Dispersion.Pr...t.. = unname(disp_row["Pr(>|t|)"]),
            SNP = col
        )
    }

    fits <- Filter(Negate(is.null), fits)
    if (length(fits) == 0L) {
        return(empty_results)
    }
    do.call(rbind, fits)
}

In [None]:
#' Run the DGLM variant scan for one input file and save its outputs.
#'
#' Reads `input_filename` with `fread()`, drops rows that are missing the
#' phenotype or any model covariate, runs `load_and_process_tsv()`, writes a
#' QQ plot of the dispersion p-values to a PDF, and writes the results table
#' (sorted by dispersion p-value) as a tab-separated file.
#'
#' @param input_filename Path of the delimited input file.
#' @param pheno Name of the phenotype column in the input file.
#' @param qqplot_filename Path of the PDF to write the QQ plot to.
#' @param results_filename Path of the tab-separated results file to write.
#' @return The results data.frame, invisibly. When no variant was fitted, a
#'   warning is raised, the QQ plot is skipped and the (empty) table is still
#'   written.
dglm_gene_trait_pair <- function(input_filename, pheno, qqplot_filename, results_filename) {
    df <- fread(input_filename)
    # Keep only rows with a complete phenotype and complete covariates.
    df <- na.omit(df, cols = c("genetic_sex",
                               "baseline_age",
                               "PC1",
                               "PC2",
                               "PC3",
                               "PCD4",
                               pheno))
    results_df <- load_and_process_tsv(df, pheno)

    # Nothing to plot or sort: report it instead of failing inside qq()/arrange().
    if (nrow(results_df) == 0L) {
        warning("No variants were fitted for '", pheno, "' in ", input_filename,
                "; skipping the QQ plot.", call. = FALSE)
        write.table(results_df, results_filename, sep = "\t", row.names = FALSE)
        return(invisible(results_df))
    }

    pdf(qqplot_filename)
    # Always close the PDF device, even when plotting fails.
    tryCatch(
        qq(results_df$Dispersion.Pr...t..),
        finally = dev.off()
    )

    results_df <- results_df %>% arrange(Dispersion.Pr...t..)
    write.table(results_df, results_filename, sep = "\t", row.names = FALSE)
    invisible(results_df)
}

# Examples

In [None]:
# Example run: scan the pnpla3_ast_dglm.tsv input for the `ast` phenotype.
dglm_gene_trait_pair(
    input_filename = "pnpla3_ast_dglm.tsv",
    pheno = "ast",
    qqplot_filename = "pnpla3_ast_dglm_disperson_qqplot.pdf",
    results_filename = "pnpla3_ast_dglm_results.tsv"
)

In [None]:
# Example run: scan the hfe_hgb_dglm.tsv input for the `hgb` phenotype.
dglm_gene_trait_pair(
    input_filename = "hfe_hgb_dglm.tsv",
    pheno = "hgb",
    qqplot_filename = "hfe_hgb_dglm_disperson_qqplot.pdf",
    results_filename = "hfe_hgb_dglm_results.tsv"
)