# Final Ensemble Prediction Model

In [1]:
library(randomForest)


randomForest 4.7-1

Type rfNews() to see new features/changes/bug fixes.



In [2]:
# Load the layer-1 model scores and the preprocessed training labels.
# NOTE(review): load() restores objects under the names they were saved with.
# The rest of this script relies on these files supplying `pos_genes`,
# `neg_genes`, `string_preds`, `bs_preds` and `integrated_preds` -- confirm
# which file provides which object.
load("rData/training_labels.Rdata")
load("models/String_score.Rdata")
load("models/Brainspan_score.Rdata")
load("models/integrated_score.Rdata")

# Load metadata containing various predictors and summary statistics scores,
# including DAWN, DAMAGES, and Krishnan et al. scores. The first CSV column is
# used as the row names.
# `FALSE` is spelled out: `F` is an ordinary variable that could be masked,
# e.g. by an object restored by the load() calls above.
metadata <- read.csv(
  "data/composite_table.csv",
  stringsAsFactors = FALSE,
  row.names = 1
)

# Reorder the layer-1 score tables so their rows follow the rows of `metadata`.
string_preds <- string_preds[rownames(metadata), ]
bs_preds <- bs_preds[rownames(metadata), ]

# Prepend the "TRUE"-column scores of the STRING and BrainSpan models to the
# other predictors; the two new columns become columns 1 and 2 of `metadata`.
metadata <- cbind(
  data.frame(
    String_score = string_preds[rownames(metadata), "TRUE"],
    BrainSpan_score = bs_preds[rownames(metadata), "TRUE"]
  ),
  metadata[rownames(metadata), ]
)


# Define the training data: keep only genes that carry a positive or negative
# label, and drop columns 3-9 (gene identifiers, etc.) so that only predictors
# remain. na.roughfix() imputes the remaining missing values.
# NOTE(review): rows are selected via `metadata$ensembl_string`, while the
# labels below are derived from the row names -- this presumably relies on both
# holding the same gene identifiers; confirm.
metadata_train <- na.roughfix(
  metadata[metadata$ensembl_string %in% c(pos_genes, neg_genes), -(3:9)]
)

# TRUE for positive genes, FALSE otherwise (factor levels "FALSE"/"TRUE").
labels <- as.factor(rownames(metadata_train) %in% pos_genes)

# Train the final ensemble model.
# The seed is fixed so the forest is reproducible.
set.seed(43775)
final_model <- randomForest(
  y = labels,
  x = metadata_train,
  importance = TRUE, # spelled out: `T` is a reassignable variable, not a constant
  do.trace = 10,     # print the running OOB error every 10 trees
  strata = labels,
  # Stratified sampling: 76 cases drawn from each class for every tree.
  # NOTE(review): 76 presumably matches the number of positive genes; update
  # it if the label set changes.
  sampsize = c(76, 76)
)

# Build the test set: every gene that is in neither the positive nor the
# negative label list. As for the training set, columns 3-9 (gene identifiers,
# etc.) are dropped and missing values are imputed with na.roughfix().
metadata_test <- na.roughfix(
  metadata[
    !is.element(metadata$ensembl_string, c(pos_genes, neg_genes)),
    -(3:9)
  ]
)

# Class probabilities from the final model for the unlabelled genes.
metadata_preds <- predict(
  object = final_model,
  newdata = metadata_test,
  type = "prob"
)

# Stack the model's vote matrix for the training genes on top of the
# predictions for the test genes, giving one score row per gene.
metadata_score <- rbind(final_model$votes, metadata_preds)

# Put the final prediction scores (OOB votes for training genes, predicted
# probabilities for the rest) and the integrated STRING+BrainSpan scores in
# front of the metadata columns, in the row order of `metadata`.
# NOTE(review): data.frame() uses check.names = TRUE by default, so the second
# column is emitted as `STRING.BrainSpan_RF`, not `STRING+BrainSpan_RF`. Left
# unchanged here so that the CSV header stays the same for downstream readers.
final_data <- cbind(
  data.frame(
    final_prediction_scores = metadata_score[rownames(metadata), "TRUE"],
    "STRING+BrainSpan_RF" = integrated_preds[rownames(metadata), "TRUE"]
  ),
  metadata
)

# Save the scores as a csv file, without row names and without quoting.
# `FALSE` is spelled out because `F` is a reassignable variable.
# NOTE(review): with quote = FALSE, any text field containing a comma would
# shift columns in the output -- confirm the metadata has no such fields.
write.csv(
  final_data,
  file = "prediction_scores.csv",
  quote = FALSE,
  row.names = FALSE
)




ntree      OOB      1      2
   10:   6.51%  5.30% 22.67%
   20:   4.55%  3.50% 18.42%
   30:   4.09%  3.40% 13.16%
   40:   4.00%  3.30% 13.16%
   50:   4.09%  3.40% 13.16%
   60:   3.81%  3.10% 13.16%
   70:   3.90%  3.10% 14.47%
   80:   4.18%  3.30% 15.79%
   90:   3.90%  3.30% 11.84%
  100:   4.28%  3.60% 13.16%
  110:   4.09%  3.50% 11.84%
  120:   4.09%  3.50% 11.84%
  130:   4.18%  3.60% 11.84%
  140:   4.09%  3.50% 11.84%
  150:   4.37%  3.70% 13.16%
  160:   4.37%  3.70% 13.16%
  170:   4.37%  3.80% 11.84%
  180:   4.46%  3.80% 13.16%
  190:   4.09%  3.50% 11.84%
  200:   4.28%  3.70% 11.84%
  210:   4.37%  3.70% 13.16%
  220:   4.65%  4.00% 13.16%
  230:   4.46%  3.70% 14.47%
  240:   4.74%  4.00% 14.47%
  250:   4.65%  4.00% 13.16%
  260:   4.55%  3.80% 14.47%
  270:   4.46%  3.70% 14.47%
  280:   4.65%  3.90% 14.47%
  290:   4.55%  3.80% 14.47%
  300:   4.46%  3.70% 14.47%
  310:   4.46%  3.80% 13.16%
  320:   4.55%  3.80% 14.47%
  330:   4.55%  3.80% 14.47%
  340:   4.65%