# Differentiel gene expression

Andreas Svendsen

In [None]:
library(data.table)
library(limma)
library(edgeR)
library(broom)

For this project we will just look at the CYP enzymes in the dataset

Import the filtered and normalised data

In [None]:
dge_data_norm <- readRDS("outputs/data_processed/dge_data_norm.rds")

In [None]:
keep_indices <- grep("^CYP", dge_data_norm$genes$symbol)

cyp_data <- dge_data_norm[keep_indices, keep.lib.sizes = FALSE]
cyp_data

An object of class "DGEList"
$counts
          S1     S2     S5     S6     S7     S8    S11    S12    S13    S14
10537 244845 249729 199006 425694  91101  74426 204959 153888  37562  24714
3275  145411 139719 130302 270922 153799 100256 166322 139129 139223 131577
7606  114762 117381  97862 216134  71868  53548  98347  78604  59463  52498
7973   70586  83376  68178 143783  76489  51577  78570  56730  38888  33020
17385  49761  56129  41159  82353  10675   8117  50186  41031  40425  36299
         S15    S16   S17    S18    S19    S20
10537 802526 883195 61170  88579 436836 166170
3275  245926 250821 74993 129204 326703 116288
7606  173919 186999 28389  46772 204208  70775
7973   76718  73618 24882  43597 165762  55073
17385 129983 146769  5595   8262 113872  42412
42 more rows ...

$samples
   group lib.size norm.factors       treatment sample_name replicate
S1     1   802098    0.9784508 Vehicle_control    AS0018_1         1
S2     1   844460    1.0028307 Vehicle_control    AS0018_2  

When using the basic poisson regression model we assume that counts are poisson distribted with a mean equal to the variance. This model does not inherently account for library size or composition effect.

In [None]:
# cyp_data is the subset of genes we want to investigate
counts <- cyp_data$counts
genes <- rownames(counts)
samples <- colnames(counts)

# Creating a long-format data frame
data_long <- data.frame(gene = rep(genes, each = length(samples)),
                        sample = rep(samples, times = length(genes)),
                        count = as.vector(counts),
                        treatment = cyp_data$samples$treatment) 

In [None]:
# Select the first gene
first_gene <- unique(data_long$gene)[1]
first_gene_data <- subset(data_long, gene == first_gene)

# For each cytokine treatment, create a binary variable that is 1 for that specific cytokine treatment and 0 for the vehicle control.

first_gene_data$treatment_IL_6_01_ng_ml <- as.numeric(first_gene_data$treatment == "01_ng_ml_IL_6")
first_gene_data$treatment_IL_6_10_ng_ml <- as.numeric(first_gene_data$treatment == "10_ng_ml_IL_6")
first_gene_data$treatment_IL_1B_01_ng_ml <- as.numeric(first_gene_data$treatment == "01_ng_ml_IL_1B")
first_gene_data$treatment_IL_1B_10_ng_ml <- as.numeric(first_gene_data$treatment == "10_ng_ml_IL_1B")

In [None]:
# Poisson models for each cytokine treatment
poisson_model_IL_6_01_ng_ml <- glm(count ~ treatment_IL_6_01_ng_ml, family = poisson(), data = first_gene_data)
poisson_model_IL_6_10_ng_ml <- glm(count ~ treatment_IL_6_10_ng_ml, family = poisson(), data = first_gene_data)
poisson_model_IL_1B_01_ng_ml <- glm(count ~ treatment_IL_1B_01_ng_ml, family = poisson(), data = first_gene_data)
poisson_model_IL_1B_10_ng_ml <- glm(count ~ treatment_IL_1B_10_ng_ml, family = poisson(), data = first_gene_data)

In [None]:
summary(poisson_model_IL_6_01_ng_ml)


Call:
glm(formula = count ~ treatment_IL_6_01_ng_ml, family = poisson(), 
    data = first_gene_data)

Coefficients:
                         Estimate Std. Error z value Pr(>|z|)    
(Intercept)             10.669258   0.001289    8279   <2e-16 ***
treatment_IL_6_01_ng_ml  0.767586   0.002656     289   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1016536  on 15  degrees of freedom
Residual deviance:  944330  on 14  degrees of freedom
AIC: 944524

Number of Fisher Scoring iterations: 6


Call:
glm(formula = count ~ treatment_IL_6_10_ng_ml, family = poisson(), 
    data = first_gene_data)

Coefficients:
                         Estimate Std. Error z value Pr(>|z|)    
(Intercept)             10.835898   0.001186 9139.33   <2e-16 ***
treatment_IL_6_10_ng_ml -0.288519   0.003813  -75.67   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1016536  on 15  degrees of freedom
Residual deviance: 1010347  on 14  degrees of freedom
AIC: 1010541

Number of Fisher Scoring iterations: 6


Call:
glm(formula = count ~ treatment_IL_1B_01_ng_ml, family = poisson(), 
    data = first_gene_data)

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)              10.878530   0.001161  9373.5   <2e-16 ***
treatment_IL_1B_01_ng_ml -0.853444   0.004846  -176.1   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1016536  on 15  degrees of freedom
Residual deviance:  976359  on 14  degrees of freedom
AIC: 976553

Number of Fisher Scoring iterations: 5


Call:
glm(formula = count ~ treatment_IL_1B_10_ng_ml, family = poisson(), 
    data = first_gene_data)

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)              10.883578   0.001158  9401.5   <2e-16 ***
treatment_IL_1B_10_ng_ml -0.945326   0.005048  -187.3   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1016536  on 15  degrees of freedom
Residual deviance:  969605  on 14  degrees of freedom
AIC: 969800

Number of Fisher Scoring iterations: 5

Creating a summary table:

In [None]:
# Creating a summary data frame
summary_data <- 
  data.frame(
    Treatment = 
      c("IL-6 01 ng/ml", "IL-6 10 ng/ml", "IL-1B 01 ng/ml", "IL-1B 10 ng/ml"
        ),
    Log_Expected_Count_Intercept = 
      c(coef(poisson_model_IL_6_01_ng_ml)["(Intercept)"],
        coef(poisson_model_IL_6_10_ng_ml)["(Intercept)"],
        coef(poisson_model_IL_1B_01_ng_ml)["(Intercept)"],
        coef(poisson_model_IL_1B_10_ng_ml)["(Intercept)"]
        ),
    Coefficient = 
      c(coef(poisson_model_IL_6_01_ng_ml)["treatment_IL_6_01_ng_ml"],
        coef(poisson_model_IL_6_10_ng_ml)["treatment_IL_6_10_ng_ml"],
        coef(poisson_model_IL_1B_01_ng_ml)["treatment_IL_1B_01_ng_ml"],
        coef(poisson_model_IL_1B_10_ng_ml)["treatment_IL_1B_10_ng_ml"]),
    P_Value = c(summary(poisson_model_IL_6_01_ng_ml)$coefficients["treatment_IL_6_01_ng_ml", "Pr(>|z|)"],
                summary(poisson_model_IL_6_10_ng_ml)$coefficients["treatment_IL_6_10_ng_ml", "Pr(>|z|)"],
                summary(poisson_model_IL_1B_01_ng_ml)$coefficients["treatment_IL_1B_01_ng_ml", "Pr(>|z|)"],
                summary(poisson_model_IL_1B_10_ng_ml)$coefficients["treatment_IL_1B_10_ng_ml", "Pr(>|z|)"]
                )
)

# Calculating actual expected counts and fold changes
summary_data$Expected_Count = exp(summary_data$Log_Expected_Count_Intercept)
summary_data$Fold_Change = exp(summary_data$Coefficient)

row.names(summary_data) <- NULL

# Print the summary table
print(summary_data)

       Treatment Log_Expected_Count_Intercept Coefficient P_Value
1  IL-6 01 ng/ml                     10.66926   0.7675856       0
2  IL-6 10 ng/ml                     10.83590  -0.2885188       0
3 IL-1B 01 ng/ml                     10.87853  -0.8534440       0
4 IL-1B 10 ng/ml                     10.88358  -0.9453265       0
  Expected_Count Fold_Change
1       43013.00   2.1545579
2       50812.50   0.7493727
3       53025.57   0.4259454
4       53293.93   0.3885527