# Data input, cleaning and pre-processing

## Loading expression data

In [2]:
# Load WGCNA and make sure strings read from files stay character vectors
# rather than being converted to factors.
library(WGCNA)
options(stringsAsFactors = FALSE)
# Read the female and male liver tables (paths are relative to the working directory).
femData = read.csv("LiverFemale3600.csv")
maleData = read.csv("LiverMale3600.csv")

In [6]:
# Inspect the dimensions and column names of the female table.
dim(femData)
names(femData)

In [32]:
# Number of data sets and the labels used for them in plot titles.
nSets = 2
setLabels = c("Female liver", "Male liver")
shortLabels = c("Female", "Male")

# Build the multi-set expression structure: one list(data = <data frame>) per
# set, in the order female, male. Each table is transposed so that samples are
# rows and features are columns.
multiExpr = lapply(list(femData, maleData), function(rawSet) {
    # Columns 1-8 of the raw table are excluded from the expression data
    # (presumably annotation columns -- confirm against the CSV header).
    exprData = as.data.frame(t(rawSet[-(1:8)]))
    names(exprData) = rawSet$substanceBXH          # feature (column) names
    rownames(exprData) = names(rawSet)[-(1:8)]     # sample (row) names
    list(data = exprData)                          # element name is "data"
})
# checkSets() returns the summary (nGenes, nSamples, ...) used further below.
exprSize = checkSets(multiExpr)

In [33]:
# Display the summary returned by checkSets() for the multi-set structure.
exprSize

## Rudimentary data cleaning and outlier removal

In [34]:
# Check that all genes and samples have sufficiently low numbers of missing values.
gsg = goodSamplesGenesMS(multiExpr, verbose = 3)
gsg$allOK

# Act on the result of the check. If anything was flagged, drop the flagged
# genes from every set (so all sets keep the same columns) and the flagged
# samples from their own set, then refresh the size summary.
# When gsg$allOK is TRUE this whole section does nothing.
if (!gsg$allOK) {
    if (any(!gsg$goodGenes)) {
        message("Removing genes: ",
                paste(names(multiExpr[[1]]$data)[!gsg$goodGenes], collapse = ", "))
    }
    for (set in seq_len(nSets)) {
        badSamples = !gsg$goodSamples[[set]]
        if (any(badSamples)) {
            message("In set ", setLabels[set], " removing samples: ",
                    paste(rownames(multiExpr[[set]]$data)[badSamples], collapse = ", "))
        }
        # drop = FALSE keeps a data frame even if a single column/row remains
        multiExpr[[set]]$data =
            multiExpr[[set]]$data[gsg$goodSamples[[set]], gsg$goodGenes, drop = FALSE]
    }
    # The dimensions changed, so recompute the summary used by later steps.
    exprSize = checkSets(multiExpr)
}

 Flagging genes and samples with too many missing values...
  ..step 1
   ..bad gene count: 0, bad sample counts: 0, 0


In [35]:
# Cluster the samples of each set hierarchically (method = "average") on the
# distances between their expression profiles; one tree per set.
sampleTrees = lapply(seq_len(nSets), function(set) {
    hclust(dist(multiExpr[[set]]$data), method = "average")
})

In [41]:
# Draw the sample dendrogram of every set into one PDF, stacked in a
# 2-row, 1-column layout.
pdf(file = "Plots_SampleClustering.pdf", width = 12, height = 12)
par(mfrow = c(2, 1), mar = c(0, 4, 2, 0))
for (set in seq_len(nSets)) {
    plotTitle = paste("Sample clustering on all genes in", setLabels[set])
    plot(sampleTrees[[set]], main = plotTitle, xlab = "", sub = "", cex = 0.7)
}
dev.off() # close the PDF device

In [45]:
# The sample clustering shows outlying sample(s), so choose a height at which
# to cut each tree.
# Choose the "base" cut height for the female data set
baseHeight = 16
# The male cut height is the base height scaled by the ratio of sample counts
# (male / female). Both entries are derived from baseHeight so that changing
# it above is enough to move both cut lines.
cutHeights = c(baseHeight, baseHeight*exprSize$nSamples[2]/exprSize$nSamples[1])
# Re-plot the dendrograms including the cut lines. This writes to the same
# file name as the plot without cut lines and therefore replaces it.
pdf(file = "Plots_SampleClustering.pdf", width = 12, height = 12)
par(mfrow=c(2,1))
par(mar = c(0, 4, 2, 0))
for (set in seq_len(nSets)){
    plot(sampleTrees[[set]],
         main = paste("Sample clustering on all genes in", setLabels[set]),
         xlab="", sub="", cex = 0.7)
    # Red horizontal line marks where this set's tree will be cut
    abline(h=cutHeights[set], col = "red")
}
dev.off()

In [48]:
# Outlier removal: in each set keep only the samples that end up in cluster 1
# when the sample tree is cut at that set's cut height.
for (set in seq_len(nSets)){
    # Find clusters cut by the line
    clusterLabels = cutreeStatic(sampleTrees[[set]], cutHeight = cutHeights[set])
    # Keep the largest one (labeled by the number 1)
    keep = (clusterLabels==1)
    # drop = FALSE guarantees the result is still a data frame
    multiExpr[[set]]$data = multiExpr[[set]]$data[keep, , drop = FALSE]
}
collectGarbage();

# Check the size of the leftover data
exprSize = checkSets(multiExpr)
exprSize

In [47]:
# Logical vector over the samples of set 1 (female): TRUE where the sample is
# assigned to cluster 1 at the female cut height, i.e. the same test the
# outlier-removal loop uses to decide which samples to keep.
cutreeStatic(sampleTrees[[1]], cutHeight = cutHeights[1])==1

## Loading clinical trait data

In [49]:
# Read the clinical trait table, then inspect its column names and dimensions.
traitData = read.csv("ClinicalTraits.csv")
names(traitData)
dim(traitData)

In [52]:
# Remove columns that hold information we do not need: first discard columns
# 16 and 31 of the raw table, then keep column 2 and columns 11 to 36 of what
# remains (positions in the second step refer to the already reduced table).
allTraits = traitData[, -c(16, 31)][, c(2, 11:36)]
# Inspect the trimmed trait table.
dim(allTraits)
names(allTraits)

In [62]:
# Form a multi-set structure that will hold the clinical traits.
# Traits[[set]]$data gets one row per sample of multiExpr[[set]]$data, in the
# same order, so that expression rows and trait rows line up.
Traits = vector(mode="list", length = nSets)
for (set in seq_len(nSets)){
    setSamples = rownames(multiExpr[[set]]$data)
    # For every expression sample, find the corresponding row of allTraits
    traitRows = match(setSamples, allTraits$Mice)
    # A sample without a trait record yields NA here, which would otherwise
    # only surface as a "missing values in 'row.names'" error when the row
    # names are assigned. Fail early and name the offending samples instead.
    if (anyNA(traitRows)) {
        stop("No clinical trait record for sample(s) in set ", setLabels[set], ": ",
             paste(setSamples[is.na(traitRows)], collapse = ", "), call. = FALSE)
    }
    # Column 1 of allTraits becomes the row names; the rest are the traits
    Traits[[set]] = list(data = allTraits[traitRows, -1])
    rownames(Traits[[set]]$data) = allTraits[traitRows, 1]
}
collectGarbage()
# Define data set dimensions
nGenes = exprSize$nGenes
nSamples = exprSize$nSamples

In [63]:
# Display the per-set sample counts taken from exprSize.
nSamples

In [65]:
# Write the expression data, trait data, dimensions and labels to one RData file.
save(list = c("multiExpr", "Traits", "nGenes", "nSamples",
              "setLabels", "shortLabels", "exprSize"),
     file = "Consensus-dataInput.RData")