# Load, tidy and save data

Andreas Svendsen

Preamble

- Load our count matrix, gene annotation, and sample information.
- Tidy the data.
- Create our DGEList object.
- Save the processed data in the output data folder.

#### Libraries

In [None]:
library(data.table)
library(RColorBrewer)
library(limma)
library(edgeR)


#### Load data

In [None]:
# Read the raw count expression matrix from disk.
# Each row is a gene and each column is a sample.
expression_raw <- fread(input = "inputs/data_raw/expression.txt")

# Show a preview of the loaded table.
expression_raw


         S1   S2   S5   S6   S7   S8  S11  S12  S13  S14  S15  S16 S17  S18
    1: 1327 1322 1105 2569 2696 1513 1597 1107 1681 1751 1709 1447 944 1324
    2:    0    0    0    2    0    2    0    0    1    0    0    0   1    0
    3:  545  607  478 1041  694  480  644  530  620  583  866  933 569  843
    4:  414  482  351  778  786  529  443  429  689  684  544  544 190  411
    5:   73   49   42  117  116   83   73   53   79  118   84   93  34   65
   ---                                                                     
62706:    0    0    0    0    0    0    0    0    0    0    0    0   0    0
62707:    0    0    0    0    0    0    0    0    0    0    0    0   0    0
62708:    0    0    0    0    0    0    0    0    0    0    0    0   0    0
62709:    6   27    6   10   11    2    8    4    6    3   14    4  14   20
62710:    0    6    1    6   12    5    2    5    3    9    8   10   1    6
        S19  S20
    1: 3266 1193
    2:    8    1
    3: 1509  587
    4:  820  293
   

          treatment sample_name replicate group
 1: Vehicle_control    AS0018_1         1     1
 2: Vehicle_control    AS0018_2         2     1
 3:   01_ng_ml_IL_6    AS0018_5         1     2
 4:   01_ng_ml_IL_6    AS0018_6         2     2
 5:   10_ng_ml_IL_6    AS0018_7         1     3
 6:   10_ng_ml_IL_6    AS0018_8         2     3
 7:  01_ng_ml_IL_1B   AS0018_11         1     4
 8:  01_ng_ml_IL_1B   AS0018_12         2     4
 9:  10_ng_ml_IL_1B   AS0018_13         1     5
10:  10_ng_ml_IL_1B   AS0018_14         2     5
11:             Rif   AS0018_15         1     6
12:             Rif   AS0018_16         2     6
13:             Abe   AS0018_17         1     7
14:             Abe   AS0018_18         2     7
15:          Medium   AS0018_19         1     8
16:          Medium   AS0018_20         2     8

           ENSEMBLE_ID          symbol
    1: ENSG00000000003          TSPAN6
    2: ENSG00000000005            TNMD
    3: ENSG00000000419            DPM1
    4: ENSG00000000457           SCYL3
    5: ENSG00000000460        C1orf112
   ---                                
62706: ENSG00000291313 ENSG00000291313
62707: ENSG00000291314 ENSG00000291314
62708: ENSG00000291315 ENSG00000291315
62709: ENSG00000291316 ENSG00000291316
62710: ENSG00000291317         TMEM276

#### Tidy data

In [None]:
# Convert the count data to a matrix. This is mostly by convention, but also
# for performance reasons; edgeR does accept data frames nowadays.
expression_raw <- as.matrix(expression_raw)

# Build the color scheme for the group_color column of the sample info:
# one color per unique treatment.
n_treatments <- length(unique(sample_info$treatment))
color_scheme <-
  brewer.pal(
    n = n_treatments, # n colors = unique treatments
    name = "Paired" # Palette name to get the hexadecimal color codes from.
  )

# Add the hexadecimal code for black and put it first in the vector.
color_scheme <- append(color_scheme, values = "#000000", after = 0)
# Prepending black made the vector one entry too long, so keep only the first
# n_treatments colors. Using the treatment count instead of a fixed index
# keeps this correct when the number of treatments is not 8.
color_scheme <- head(color_scheme, n = n_treatments)
# brewer.pal() cannot return more colors than the palette holds ("Paired" has
# 12), so fail early if we ended up with fewer colors than treatments.
stopifnot(length(color_scheme) == n_treatments)
color_scheme # One color code per treatment


[1] "#000000" "#A6CEE3" "#1F78B4" "#B2DF8A" "#33A02C" "#FB9A99" "#E31A1C"
[8] "#FDBF6F"

Vehicle_control   01_ng_ml_IL_6   10_ng_ml_IL_6  01_ng_ml_IL_1B  10_ng_ml_IL_1B 
      "#000000"       "#A6CEE3"       "#1F78B4"       "#B2DF8A"       "#33A02C" 
            Rif             Abe          Medium 
      "#FB9A99"       "#E31A1C"       "#FDBF6F" 

          treatment sample_name replicate group group_color
 1: Vehicle_control    AS0018_1         1     1     #000000
 2: Vehicle_control    AS0018_2         2     1     #000000
 3:   01_ng_ml_IL_6    AS0018_5         1     2     #A6CEE3
 4:   01_ng_ml_IL_6    AS0018_6         2     2     #A6CEE3
 5:   10_ng_ml_IL_6    AS0018_7         1     3     #1F78B4
 6:   10_ng_ml_IL_6    AS0018_8         2     3     #1F78B4
 7:  01_ng_ml_IL_1B   AS0018_11         1     4     #B2DF8A
 8:  01_ng_ml_IL_1B   AS0018_12         2     4     #B2DF8A
 9:  10_ng_ml_IL_1B   AS0018_13         1     5     #33A02C
10:  10_ng_ml_IL_1B   AS0018_14         2     5     #33A02C
11:             Rif   AS0018_15         1     6     #FB9A99
12:             Rif   AS0018_16         2     6     #FB9A99
13:             Abe   AS0018_17         1     7     #E31A1C
14:             Abe   AS0018_18         2     7     #E31A1C
15:          Medium   AS0018_19         1     8     #FDBF6F
16:          Medium   AS0018_20         

Classes 'data.table' and 'data.frame':  16 obs. of  5 variables:
 $ treatment  : Factor w/ 8 levels "Vehicle_control",..: 1 1 2 2 3 3 4 4 5 5 ...
 $ sample_name: chr  "AS0018_1" "AS0018_2" "AS0018_5" "AS0018_6" ...
 $ replicate  : int  1 2 1 2 1 2 1 2 1 2 ...
 $ group      : Factor w/ 8 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
 $ group_color: Factor w/ 8 levels "#000000","#A6CEE3",..: 1 1 2 2 3 3 4 4 5 5 ...
 - attr(*, ".internal.selfref")=<externalptr> 

           ENSEMBLE_ID          symbol                       id_symbol
    1: ENSG00000000003          TSPAN6          ENSG00000000003;TSPAN6
    2: ENSG00000000005            TNMD            ENSG00000000005;TNMD
    3: ENSG00000000419            DPM1            ENSG00000000419;DPM1
    4: ENSG00000000457           SCYL3           ENSG00000000457;SCYL3
    5: ENSG00000000460        C1orf112        ENSG00000000460;C1orf112
   ---                                                                
62706: ENSG00000291313 ENSG00000291313 ENSG00000291313;ENSG00000291313
62707: ENSG00000291314 ENSG00000291314 ENSG00000291314;ENSG00000291314
62708: ENSG00000291315 ENSG00000291315 ENSG00000291315;ENSG00000291315
62709: ENSG00000291316 ENSG00000291316 ENSG00000291316;ENSG00000291316
62710: ENSG00000291317         TMEM276         ENSG00000291317;TMEM276

#### Create DGEList

In [None]:
# Bundle the count matrix, the gene annotation and the sample information
# into a single edgeR DGEList object.
dge_data <-
  DGEList(
    counts = expression_raw,
    samples = sample_info,
    genes = annotation_info
  )


#### Save the DGEList

In [None]:
# Save the DGEList as an RDS file in the processed-data output folder.
dge_path <- "outputs/data_processed/dge_data.rds"

# saveRDS() does not create missing directories, so make sure the output
# folder exists first. This is a silent no-op when it is already there.
dir.create(dirname(dge_path), recursive = TRUE, showWarnings = FALSE)

saveRDS(
  object = dge_data,
  file = dge_path
)


In [None]:
# Remove every object that ls() lists in the current environment, now that the
# DGEList has been saved. ls() skips names starting with a dot by default, and
# rm() does not detach the packages loaded with library() or reset options().
# NOTE(review): this is therefore not equivalent to a fresh R session; restart
# the kernel if a truly clean state is needed.
rm(list = ls())
