# Layer 1 Model to generate STRING prediction scores

In [1]:
library(randomForest)

randomForest 4.7-1

Type rfNews() to see new features/changes/bug fixes.



In [2]:
# Load the pre-processed data files. The code below expects these .Rdata
# files to provide `shortest_path`, `pos_genes` and `neg_genes`.
load("rData/String_network.Rdata")
load("rData/training_labels.Rdata")

# Input features: every row of `shortest_path` whose row name is a labelled
# gene (positive or negative). `drop = FALSE` keeps the two-dimensional shape
# even if only a single row matches.
is_labelled <- rownames(shortest_path) %in% c(pos_genes, neg_genes)
x_train <- shortest_path[is_labelled, , drop = FALSE]
# Labels: TRUE for positive genes, FALSE for all other labelled genes.
y_train <- as.factor(rownames(x_train) %in% pos_genes)

# Balanced sampling: each tree draws the same number of examples from every
# class, sized by the smallest class. This replaces a hard-coded c(77, 77).
# NOTE(review): the recorded OOB trace implies the minority class has 77
# members, so this should reproduce the earlier setting - confirm that
# min(table(y_train)) is still 77 if the training labels change.
class_sampsize <- rep(min(table(y_train)), nlevels(y_train))

# Train a simple random forest model, limiting ntree to 500 for efficient
# results. Importance and proximity measures are computed alongside the fit,
# and OOB error is traced every 10 trees.
set.seed(2176)
string_model <- randomForest(
  x = x_train,
  y = y_train,
  importance = TRUE,
  strata = y_train,
  sampsize = class_sampsize,
  ntree = 500,
  do.trace = 10,
  proximity = TRUE
)


ntree      OOB      1      2
   10:  28.78% 26.80% 54.55%
   20:  24.42% 22.90% 44.16%
   30:  23.40% 21.70% 45.45%
   40:  23.68% 22.10% 44.16%
   50:  23.40% 21.50% 48.05%
   60:  21.91% 20.00% 46.75%
   70:  21.08% 19.30% 44.16%
   80:  20.61% 18.70% 45.45%
   90:  19.78% 18.00% 42.86%
  100:  19.87% 18.00% 44.16%
  110:  19.96% 18.20% 42.86%
  120:  19.87% 18.10% 42.86%
  130:  19.50% 18.20% 36.36%
  140:  19.31% 17.80% 38.96%
  150:  19.59% 18.00% 40.26%
  160:  18.48% 16.70% 41.56%
  170:  18.85% 17.30% 38.96%
  180:  18.57% 16.80% 41.56%
  190:  18.85% 17.10% 41.56%
  200:  18.29% 16.60% 40.26%
  210:  18.57% 16.90% 40.26%
  220:  18.85% 17.20% 40.26%
  230:  18.66% 17.10% 38.96%
  240:  19.13% 17.20% 44.16%
  250:  19.22% 17.50% 41.56%
  260:  19.41% 17.70% 41.56%
  270:  19.31% 17.60% 41.56%
  280:  19.87% 18.20% 41.56%
  290:  19.78% 18.00% 42.86%
  300:  19.68% 17.90% 42.86%
  310:  19.50% 17.70% 42.86%
  320:  19.78% 18.10% 41.56%
  330:  19.59% 18.10% 38.96%
  340:  19.78%

In [3]:
# Stratify by the combination of the true label and the label predicted by
# the current model (`:` on two factors yields their interaction). The field
# is spelled out as `predicted` instead of relying on `$` partial matching
# of `pred`.
strata <- string_model$y:string_model$predicted
# Sample the same number of examples from every stratum, sized by the
# smallest one. `nlevels(strata)` replaces a hard-coded 4.
sampsize <- rep(min(table(strata)), nlevels(strata))

# Refit the forest repeatedly, each time keeping only the variables that the
# previous forest actually used in at least one split, until no unused
# variable remains. `strata` and `sampsize` stay fixed across iterations.
# NOTE(review): the loop has no iteration cap; it ends only once a fit uses
# every variable it was given.
set.seed(679)
is_used <- varUsed(string_model) > 0
while (!all(is_used)) {
  kept_vars <- rownames(string_model$importance)[is_used]
  string_model <- randomForest(
    # drop = FALSE keeps a two-dimensional input even if one variable is left
    x = x_train[, kept_vars, drop = FALSE],
    y = y_train,
    importance = TRUE,
    strata = strata,
    sampsize = sampsize,
    ntree = 500,
    do.trace = 100,
    proximity = TRUE
  )

  # Usage counts of the new forest: reported here and reused as the loop test
  is_used <- varUsed(string_model) > 0
  print(paste(sum(!is_used), "out of", nrow(string_model$importance), "variables unused"))
}

# Get class probabilities for every gene that was not part of the training
# set, then append the stored `votes` of the model for the training genes
# (spelled out instead of the partially matched `vote`).
is_unlabelled <- !rownames(shortest_path) %in% rownames(x_train)
string_preds <- predict(
  string_model,
  shortest_path[is_unlabelled, , drop = FALSE],
  type = "prob"
)
string_preds <- rbind(string_preds, string_model$votes)
# Rank genes by the second column, highest first.
# NOTE(review): column 2 is presumably the "TRUE" (positive) class, since the
# labels are a factor built from a logical vector - confirm.
string_preds <- string_preds[order(string_preds[, 2], decreasing = TRUE), , drop = FALSE]

# Save the model together with the ranked predictions
save(string_model, string_preds, file = "models/STRING_score.Rdata")
print("Model saved")

ntree      OOB      1      2
  100:  11.51%  7.40% 64.94%
  200:  10.12%  6.00% 63.64%
  300:   9.29%  4.90% 66.23%
  400:   9.29%  4.80% 67.53%
  500:   9.29%  4.80% 67.53%
[1] "2315 out of 8043 variables unused"
ntree      OOB      1      2
  100:  10.96%  7.20% 59.74%
  200:   9.66%  5.10% 68.83%
  300:   9.10%  4.50% 68.83%
  400:   9.19%  4.50% 70.13%
  500:   8.73%  4.20% 67.53%
[1] "929 out of 5728 variables unused"
ntree      OOB      1      2
  100:  10.12%  5.80% 66.23%
  200:   9.56%  5.50% 62.34%
  300:   9.19%  5.10% 62.34%
  400:   9.38%  5.00% 66.23%
  500:   9.29%  5.00% 64.94%
[1] "541 out of 4799 variables unused"
ntree      OOB      1      2
  100:   9.29%  5.50% 58.44%
  200:   7.61%  4.00% 54.55%
  300:   7.80%  3.90% 58.44%
  400:   7.52%  3.70% 57.14%
  500:   7.99%  4.10% 58.44%
[1] "348 out of 4258 variables unused"
ntree      OOB      1      2
  100:   9.10%  5.30% 58.44%
  200:   9.66%  5.20% 67.53%
  300:   8.91%  4.50% 66.23%
  400:   8.36%  4.10% 63.64%
  

[1] "Model saved"
