In [6]:
source("load_libraries.R")

“Don't need to call dbFetch() for statements, only for queries”

*
*  Package WGCNA 1.63 loaded.
*
*    Important note: It appears that your system supports multi-threading,
*    but it is not enabled within WGCNA in R. 
*    To allow multi-threading within WGCNA with all available cores, use 
*
*          allowWGCNAThreads()
*
*    within R. Use disableWGCNAThreads() to disable threading if necessary.
*    Alternatively, set the following environment variable on your system:
*
*          ALLOW_WGCNA_THREADS=<number_of_processors>
*
*    for example 
*
*          ALLOW_WGCNA_THREADS=4
*
*    To set the environment variable in linux bash shell, type 
*
*           export ALLOW_WGCNA_THREADS=4
*
*     before running R. Other operating systems or shells will
*     have a similar command to achieve the same aim.
*


Allowing multi-threading with up to 4 threads.
[1] "preparing gene to GO mapping data..."
[1] "preparing IC data..."
[1] "preparing gene to GO mapping data..."
[1] "preparing IC data..."
[1] "preparing gene to GO mapping data..."
[1] "prepar

In [7]:
source("functions.R")

Load the gene length

In [1]:
# Read the gene length table: the file has no header row and its first column
# supplies the row names.
gene_length <- read.table(
  "../data/gene_length.tabular",
  header = FALSE,
  row.names = 1
)
# Store the table as read, before any filtering or renaming.
save(gene_length, file = "../results/dge/gene_length.RData")

Load the count table

In [2]:
# Read the tab-separated count table: the first line holds the column names
# and the first column supplies the row names.
counts <- read.table(
  "../data/counts.tabular",
  sep = "\t",
  header = TRUE,
  row.names = 1
)

Remove the columns with spurious data:
- SPF_8w_F_2_2: low mapping rate (70.9%) and assignment rate (31.3%)
- SPF_8w_F_5_2: low assignment rate (53.5%)
- SPF_8w_M_1_2: low number of assigned reads (14.4)
- SPF_52w_F_1_2: questionable, still to be confirmed (currently kept: it is not in the removal list below)

In [3]:
# Sample columns to drop from the count table.
to_remove <- c(
  "SPF_8w_F_2_2",
  "SPF_8w_F_5_2",
  "SPF_8w_M_1_2"
)
# Keep every column whose name is not in the removal list.
counts <- counts[, !(names(counts) %in% to_remove)]

Remove the rows whose total count across all samples is at most 1 (this includes the rows with only zeros)

In [4]:
# Number of rows before filtering.
nrow(counts)
# Keep only the rows whose total count over all columns is greater than 1.
counts <- counts[rowSums(counts) > 1, ]
# Number of rows after filtering.
nrow(counts)

Rename the genes to make sure they can be mapped to Entrez Gene Identifiers

In [8]:
# Lookup tables built from the org.Mm.eg annotation objects.
# NOTE(review): get_list() is not defined in this notebook (presumably it
# comes from functions.R) and is assumed to return a named list; confirm.

# Entrez Gene Identifier -> gene name (and description)
eg2name <- get_list(org.Mm.egGENENAME)
# Gene symbol -> Entrez Gene Identifier
symbol2eg <- get_list(org.Mm.egSYMBOL2EG)
# RefSeq -> Entrez Gene Identifier
refseq2eg <- get_list(org.Mm.egREFSEQ2EG)
# Gene symbol -> gene name (and description), chaining the two lookups above
symbol2name <- sapply(
  names(symbol2eg),
  function(symbol) eg2name[symbol2eg[[symbol]]]
)
head(symbol2name)
# Entrez Gene Identifier -> KEGG pathways
eg2kegg <- as.list(org.Mm.egPATH)

In [9]:
# get refseq ids for genes
refseq = sapply(rownames(counts), function(x) return(symbol2eg[[x]])) 
length(refseq[sapply(refseq, is.null)])
to_change = names(refseq[sapply(refseq, is.null)])

In [10]:
# Apply search_name() to every gene name that had no Entrez match.
# NOTE(review): search_name() is not defined in this notebook (presumably it
# comes from functions.R); the cell below treats its result as the candidate
# replacement name(s) for each gene. Confirm.
changes <- sapply(X = to_change, FUN = search_name)

In [11]:
# Select the unambiguous replacement names out of `changes`.

# TRUE for entries that came back with more than one candidate.
# lengths() always returns an integer vector, so this also works when
# `changes` is empty (sapply() would return list() and `!doubled` would fail).
doubled <- lengths(changes) > 1
# Entries with at most one candidate.
single <- changes[!doubled]
# TRUE where the candidate is already used as a row name of the count table.
# NOTE: this variable shares its name with base::duplicated(); the name is
# kept for compatibility and the function call below is qualified explicitly.
duplicated <- single %in% rownames(counts)
non_duplicated <- single[!duplicated]
# TRUE where the same candidate already appeared earlier in the list.
duplicated_2 <- base::duplicated(non_duplicated)
# Final set: single candidate, not yet a row name, first occurrence only.
replacements <- non_duplicated[!duplicated_2]

In [12]:
# Fix the names of some genes: those for which no Entrez id was found and for
# which a new name was identified.
# Manual example of such a fix:
#rownames(counts)[rownames(counts) == "1700112E06Rik"] = "Lrmda"

# Flatten the replacements into a vector (old names are kept as its names).
replacements <- unlist(replacements)
# Start from an identity mapping: current row name -> current row name.
new_names <- setNames(rownames(counts), rownames(counts))
# Overwrite the entries that have a replacement.
new_names[names(replacements)] <- replacements
head(new_names)

In [13]:
# Align the gene lengths with the rows of the count table. This must happen
# before the count table is renamed, because the lookup uses the old names.
gene_length <- gene_length[rownames(counts), ]
# Apply the new gene names to both objects.
# NOTE(review): names(gene_length) assumes the row subset above collapsed to a
# plain vector (i.e. a single data column in gene_length); confirm.
names(gene_length) <- new_names
rownames(counts) <- new_names
# Only the count table is written to disk here.
save(counts, file = "../results/dge/prepared_counts.RData")

In [14]:
# Repeat the symbol -> Entrez lookup on the renamed genes.
refseq_check <- sapply(rownames(counts), function(x) symbol2eg[[x]])
# Number of genes that still have no Entrez match after renaming. The subset
# is taken from refseq_check itself, not from the pre-renaming `refseq`.
length(refseq_check[sapply(refseq_check, is.null)])

Extract the metadata

In [15]:
# Read the file description table; the first column supplies the row names.
file_desc <- read.csv("../data/file_description.csv", row.names = 1)
# Drop the last 6 rows of the table.
file_desc <- head(file_desc, n = -6)
# Drop the row of sample SPF_8w_F_2.
file_desc <- file_desc[!(rownames(file_desc) %in% c("SPF_8w_F_2")), ]
# Normalise the Lane column: " & " becomes "_", then any remaining space
# becomes the character "1".
# NOTE(review): the reason for mapping a space to "1" is not visible here; confirm.
file_desc$Lane <- gsub(" ", "1", gsub(" & ", "_", file_desc$Lane))
head(file_desc)

Unnamed: 0,Sample.name.prefix,Group,Age,Gender,Project.id,Lane,Replicate,Name.in.project,X..Reads
GF_52w_M_1,GF_52w_M,GF,52w,M,Project_148,2,1,Sample_Mg_GF1_old,35713942.0
GF_8w_M_1,GF_8w_M,GF,8w,M,Project_148,2,1,Sample_Mg_GF1_young,25919398.0
GF_52w_M_2,GF_52w_M,GF,52w,M,Project_148,2,2,Sample_Mg_GF2_old,29752263.0
GF_8w_M_2,GF_8w_M,GF,8w,M,Project_148,2,2,Sample_Mg_GF2_young,24133081.0
GF_52w_M_3,GF_52w_M,GF,52w,M,Project_148,3,3,Sample_Mg_GF3_old,26395568.0
GF_8w_M_3,GF_8w_M,GF,8w,M,Project_148,1,3,Sample_Mg_GF3_young,33992780.0


In [16]:
# Split every sample column name on "_" into its fields: one row per sample,
# one column per field.
metadata <- t(sapply(sapply(colnames(counts), strsplit, split = "_"), unlist))
# Sample key made of the first four fields, named by the full column name;
# used below to index the rows of file_desc.
short_names <- setNames(
  paste(metadata[, 1], metadata[, 2], metadata[, 3], metadata[, 4], sep = "_"),
  rownames(metadata)
)
# Drop the fifth field, label the remaining ones and switch to a data frame.
metadata <- metadata[, -5]
colnames(metadata) <- c("type", "age", "gender", "replicate")
metadata <- as.data.frame(metadata)
metadata[["replicate"]] <- NULL
# Project of each sample, taken from file_desc with the "Project_" prefix
# stripped; the bare id "148" is rewritten as "S148".
metadata$project <- gsub(
  "Project_", "",
  paste(unlist(file_desc[short_names, "Project.id"]))
)
metadata$project[metadata$project == "148"] <- "S148"
metadata

Unnamed: 0,type,age,gender,project
GF_104w_F_1_2,GF,104w,F,S264
GF_104w_F_2_2,GF,104w,F,S264
GF_104w_F_3_2,GF,104w,F,S264
GF_104w_M_1_2,GF,104w,M,S264
GF_104w_M_2_2,GF,104w,M,S264
GF_52w_M_1_2,GF,52w,M,S148
GF_52w_M_2_2,GF,52w,M,S148
GF_52w_M_3_2,GF,52w,M,S148
GF_52w_M_4_2,GF,52w,M,S148
GF_8w_M_1_2,GF,8w,M,S148


In [17]:
# Write the sample metadata table to disk.
save(list = "metadata", file = "../results/dge/metadata.RData")