# Data Wrangling

The purpose of this notebook is to compile the data from each machine learning algorithm into a format that is easier to analyze.

In this notebook we:
- Read in the files
- Normalize any data that still needs to be
- Combine data into a large super matrix for each data set
- Write super matrix out to a file

## Read in each output file
### Logistic Regression files

In [1]:
#logistic regression files
log.reg.coef.10k = read.table("log_reg/LogRegCoef_10k.txt", sep="\t", header=T)
log.reg.coef.5k = read.table("log_reg/LogRegCoef_5k.txt", sep="\t", header=T)
log.reg.coef.2.5k = read.table("log_reg/LogRegCoef_2.5k.txt", sep="\t", header=T)
log.reg.coef.1k = read.table("log_reg/LogRegCoef_1k.txt", sep="\t", header=T)
log.reg.coef.100 = read.table("log_reg/LogRegCoef_100.txt", sep="\t", header=T)
log.reg.acc.int.10k = read.table("log_reg/LogRegAcc_Int_10k.txt", sep="\t", header=T)
log.reg.acc.int.5k = read.table("log_reg/LogRegAcc_Int_5k.txt", sep="\t", header=T)
log.reg.acc.int.2.5k = read.table("log_reg/LogRegAcc_Int_2.5k.txt", sep="\t", header=T)
log.reg.acc.int.1k = read.table("log_reg/LogRegAcc_Int_1k.txt", sep="\t", header=T)
log.reg.acc.int.100 = read.table("log_reg/LogRegAcc_Int_100.txt", sep="\t", header=T)

#sanity check
# print("Logistic regression")
# dim(log.reg.acc.int.10k)
# dim(log.reg.acc.int.5k)
# dim(log.reg.acc.int.2.5k)
# dim(log.reg.acc.int.1k)
# dim(log.reg.acc.int.100)
# dim(log.reg.coef.10k)
# dim(log.reg.coef.5k)
# dim(log.reg.coef.2.5k)
# dim(log.reg.coef.1k)
# dim(log.reg.coef.100)
log.reg.acc.int.10k
log.reg.acc.int.5k
log.reg.acc.int.2.5k
log.reg.acc.int.1k
log.reg.acc.int.100
# log.reg.coef.10k[, 1:6]
# log.reg.coef.5k[, 1:6]
# log.reg.coef.2.5k[, 1:6]
# log.reg.coef.1k[, 1:6]
# log.reg.coef.100[, 1:6]

Unnamed: 0,Accuracy,Intercept
Fold 1_10k,1.0,-786.4671
Fold 2_10k,1.0,1802.1233
Fold 3_10k,1.0,-351.5635
Fold 4_0k,0.9714286,-840.7232
Fold 5_10k,1.0,300.9593


Unnamed: 0,Accuracy,Intercept
Fold 1_5k,0.9722222,641.0724
Fold 2_5k,0.9714286,1456.1149
Fold 3_5k,0.9428571,521.6163
Fold 4_5k,0.9428571,-1198.7052
Fold 5_5k,0.9722222,-295.9289


Unnamed: 0,Accuracy,Intercept
Fold 1_2.5k,0.9722222,1838.3546
Fold 2_2.5k,0.9428571,4548.3233
Fold 3_2.5k,0.9714286,497.3577
Fold 4_2.5k,0.9428571,-914.9638
Fold 5_2.5k,0.9722222,148918.8378


Unnamed: 0,Accuracy,Intercept
Fold 1_1k,1.0,2886.3864
Fold 2_1k,0.9714286,-578.1678
Fold 3_1k,1.0,8913.0326
Fold 4_1k,0.9428571,675.7159
Fold 5_1k,0.9722222,-404.6812


Unnamed: 0,Accuracy,Intercept
Fold 1_100,0.9444444,34.83226
Fold 2_100,0.9714286,240.93207
Fold 3_100,0.9714286,48.26701
Fold 4_100,0.9714286,107.74293
Fold 5_100,0.9444444,119.8455


### LASSO files

In [2]:
#lasso files
lasso.coef.10k = read.table("lasso/LassoCoef_10k.txt", sep="\t", header=T)
lasso.coef.5k = read.table("lasso/LassoCoef_5k.txt", sep="\t", header=T)
lasso.coef.2.5k = read.table("lasso/LassoCoef_2.5k.txt", sep="\t", header=T)
lasso.coef.1k = read.table("lasso/LassoCoef_1k.txt", sep="\t", header=T)
lasso.coef.100 = read.table("lasso/LassoCoef_100.txt", sep="\t", header=T)
lasso.acc.int.10k = read.table("lasso/LassoAcc_Int_10k.txt", sep="\t", header=T)
lasso.acc.int.5k = read.table("lasso/LassoAcc_Int_5k.txt", sep="\t", header=T)
lasso.acc.int.2.5k = read.table("lasso/LassoAcc_Int_2.5k.txt", sep="\t", header=T)
lasso.acc.int.1k = read.table("lasso/LassoAcc_Int_1k.txt", sep="\t", header=T)
lasso.acc.int.100 = read.table("lasso/LassoAcc_Int_100.txt", sep="\t", header=T)

#lasso files
# print("lasso")
# dim(lasso.coef.10k)
# dim(lasso.coef.5k)
# dim(lasso.coef.2.5k)
# dim(lasso.coef.1k)
# dim(lasso.coef.100)
# lasso.coef.10k[, 1:6]
# lasso.coef.5k[, 1:6]
# lasso.coef.2.5k[, 1:6]
# lasso.coef.1k[, 1:6]
# lasso.coef.100[, 1:6]

In [3]:
min(lasso.coef.10k[1,])
max(lasso.coef.10k[1,])

### Decision and Extra Tree files

In [4]:
#decision tree files
decision.tree.feature.importance.100 = read.csv("trees/DecisionTrees/100_decisiontree_feature_importance_all.csv", sep=",", dec=".")
extra.tree.feature.importance.100 = read.csv("trees/ExtraTrees/100_extra_tree_feature_importance_all.csv", sep=",", dec=".")
decision.tree.accuracy.100 = read.csv("trees/100_Accuracy.csv", sep=",", dec=".")
#trim decision tree data
dimnames(decision.tree.feature.importance.100)[[1]] <- decision.tree.feature.importance.100[,1]
decision.tree.feature.importance.100 <- decision.tree.feature.importance.100[, -1]
dimnames(extra.tree.feature.importance.100)[[1]] <- extra.tree.feature.importance.100[,1]
extra.tree.feature.importance.100 <- extra.tree.feature.importance.100[, -1]
dimnames(decision.tree.accuracy.100)[[1]] <- decision.tree.accuracy.100[,1]
decision.tree.accuracy.100 <- decision.tree.accuracy.100[, -1]
#fix column names for unity across data
rownames(decision.tree.feature.importance.100) = rownames(log.reg.coef.100)
rownames(extra.tree.feature.importance.100) = rownames(log.reg.coef.100)
rownames(decision.tree.accuracy.100) = rownames(log.reg.coef.100)

#decision tree files
decision.tree.feature.importance.1k = read.csv("trees/DecisionTrees/1k_decisiontree_feature_importance_all.csv", sep=",", dec=".")
extra.tree.feature.importance.1k = read.csv("trees/ExtraTrees/1k_extra_tree_feature_importance_all.csv", sep=",", dec=".")
decision.tree.accuracy.1k = read.csv("trees/1k_Accuracy.csv", sep=",", dec=".")
#trim decision tree data
dimnames(decision.tree.feature.importance.1k)[[1]] <- decision.tree.feature.importance.1k[,1]
decision.tree.feature.importance.1k <- decision.tree.feature.importance.1k[, -1]
dimnames(extra.tree.feature.importance.1k)[[1]] <- extra.tree.feature.importance.1k[,1]
extra.tree.feature.importance.1k <- extra.tree.feature.importance.1k[, -1]
dimnames(decision.tree.accuracy.1k)[[1]] <- decision.tree.accuracy.1k[,1]
decision.tree.accuracy.1k <- decision.tree.accuracy.1k[, -1]
#fix column names for unity across data
rownames(decision.tree.feature.importance.1k) = rownames(log.reg.coef.1k)
rownames(extra.tree.feature.importance.1k) = rownames(log.reg.coef.1k)
rownames(decision.tree.accuracy.1k) = rownames(log.reg.coef.1k)

#decision tree files
decision.tree.feature.importance.2.5k = read.csv("trees/DecisionTrees/2.5k_decisiontree_feature_importance_all.csv", sep=",", dec=".")
extra.tree.feature.importance.2.5k = read.csv("trees/ExtraTrees/2.5k_extra_tree_feature_importance_all.csv", sep=",", dec=".")
decision.tree.accuracy.2.5k = read.csv("trees/2.5k_Accuracy.csv", sep=",", dec=".")
#trim decision tree data
dimnames(decision.tree.feature.importance.2.5k)[[1]] <- decision.tree.feature.importance.2.5k[,1]
decision.tree.feature.importance.2.5k <- decision.tree.feature.importance.2.5k[, -1]
dimnames(extra.tree.feature.importance.2.5k)[[1]] <- extra.tree.feature.importance.2.5k[,1]
extra.tree.feature.importance.2.5k <- extra.tree.feature.importance.2.5k[, -1]
dimnames(decision.tree.accuracy.2.5k)[[1]] <- decision.tree.accuracy.2.5k[,1]
decision.tree.accuracy.2.5k <- decision.tree.accuracy.2.5k[, -1]
#fix column names for unity across data
rownames(decision.tree.feature.importance.2.5k) = rownames(log.reg.coef.2.5k)
rownames(extra.tree.feature.importance.2.5k) = rownames(log.reg.coef.2.5k)
rownames(decision.tree.accuracy.2.5k) = rownames(log.reg.coef.2.5k)

#decision tree files
decision.tree.feature.importance.5k = read.csv("trees/DecisionTrees/5k_decisiontree_feature_importance_all.csv", sep=",", dec=".")
extra.tree.feature.importance.5k = read.csv("trees/ExtraTrees/5k_extra_tree_feature_importance_all.csv", sep=",", dec=".")
decision.tree.accuracy.5k = read.csv("trees/5k_Accuracy.csv", sep=",", dec=".")
#trim decision tree data
dimnames(decision.tree.feature.importance.5k)[[1]] <- decision.tree.feature.importance.5k[,1]
decision.tree.feature.importance.5k <- decision.tree.feature.importance.5k[, -1]
dimnames(extra.tree.feature.importance.5k)[[1]] <- extra.tree.feature.importance.5k[,1]
extra.tree.feature.importance.5k <- extra.tree.feature.importance.5k[, -1]
dimnames(decision.tree.accuracy.5k)[[1]] <- decision.tree.accuracy.5k[,1]
decision.tree.accuracy.5k <- decision.tree.accuracy.5k[, -1]
#fix column names for unity across data
rownames(decision.tree.feature.importance.5k) = rownames(log.reg.coef.5k)
rownames(extra.tree.feature.importance.5k) = rownames(log.reg.coef.5k)
rownames(decision.tree.accuracy.5k) = rownames(log.reg.coef.5k)

#decision tree files
decision.tree.feature.importance.10k = read.csv("trees/DecisionTrees/10k_decisiontree_feature_importance_all.csv", sep=",", dec=".")
extra.tree.feature.importance.10k = read.csv("trees/ExtraTrees/10k_extra_tree_feature_importance_all.csv", sep=",", dec=".")
decision.tree.accuracy.10k = read.csv("trees/10k_Accuracy.csv", sep=",", dec=".")
#trim decision tree data
dimnames(decision.tree.feature.importance.10k)[[1]] <- decision.tree.feature.importance.10k[,1]
decision.tree.feature.importance.10k <- decision.tree.feature.importance.10k[, -1]
dimnames(extra.tree.feature.importance.10k)[[1]] <- extra.tree.feature.importance.10k[,1]
extra.tree.feature.importance.10k <- extra.tree.feature.importance.10k[, -1]
dimnames(decision.tree.accuracy.10k)[[1]] <- decision.tree.accuracy.10k[,1]
decision.tree.accuracy.10k <- decision.tree.accuracy.10k[, -1]
#fix column names for unity across data
rownames(decision.tree.feature.importance.10k) = rownames(log.reg.coef.10k)
rownames(extra.tree.feature.importance.10k) = rownames(log.reg.coef.10k)
rownames(decision.tree.accuracy.10k) = rownames(log.reg.coef.10k)

#sanity check
# dim(decision.tree.feature.importance)
# dim(extra.tree.feature.importance)
# dim(decision.tree.accuracy)
# decision.tree.accuracy
# decision.tree.feature.importance[, 1:6]
# extra.tree.feature.importance[, 1:6]

### Neural Network files

In [7]:
#neural net files
neural.net.unnormalized.10k = read.csv("nn/unnormalized_values_10k.csv", sep=",", dec=".")
neural.net.unnormalized.5k = read.csv("nn/unnormalized_values_5k.csv", sep=",", dec=".")
neural.net.unnormalized.2.5k = read.csv("nn/unnormalized_values_2.5k.csv", sep=",", dec=".")
neural.net.unnormalized.1k = read.csv("nn/unnormalized_values_1k.csv", sep=",", dec=".")
neural.net.unnormalized.100 = read.csv("nn/unnormalized_values_100.csv", sep=",", dec=".")
neural.net.normalized.10k = read.csv("nn/normalized_values_10k.csv", sep=",", dec=".")
neural.net.normalized.5k = read.csv("nn/normalized_values_5k.csv", sep=",", dec=".")
neural.net.normalized.2.5k = read.csv("nn/normalized_values_2.5k.csv", sep=",", dec=".")
neural.net.normalized.1k = read.csv("nn/normalized_values_1k.csv", sep=",", dec=".")
neural.net.normalized.100 = read.csv("nn/normalized_values_100.csv", sep=",", dec=".")
#trim neural net files
dimnames(neural.net.unnormalized.10k)[[1]] <- neural.net.unnormalized.10k[,1]
neural.net.unnormalized.10k <- neural.net.unnormalized.10k[, -1]
dimnames(neural.net.unnormalized.5k)[[1]] <- neural.net.unnormalized.5k[,1]
neural.net.unnormalized.5k <- neural.net.unnormalized.5k[, -1]
dimnames(neural.net.unnormalized.2.5k)[[1]] <- neural.net.unnormalized.2.5k[,1]
neural.net.unnormalized.2.5k <- neural.net.unnormalized.2.5k[, -1]
dimnames(neural.net.unnormalized.1k)[[1]] <- neural.net.unnormalized.1k[,1]
neural.net.unnormalized.1k <- neural.net.unnormalized.1k[, -1]
dimnames(neural.net.unnormalized.100)[[1]] <- neural.net.unnormalized.100[,1]
neural.net.unnormalized.100 <- neural.net.unnormalized.100[, -1]
dimnames(neural.net.normalized.10k)[[1]] <- neural.net.normalized.10k[,1]
neural.net.normalized.10k <- neural.net.normalized.10k[, -1]
dimnames(neural.net.normalized.5k)[[1]] <- neural.net.normalized.5k[,1]
neural.net.normalized.5k <- neural.net.normalized.5k[, -1]
dimnames(neural.net.normalized.2.5k)[[1]] <- neural.net.normalized.2.5k[,1]
neural.net.normalized.2.5k <- neural.net.normalized.2.5k[, -1]
dimnames(neural.net.normalized.1k)[[1]] <- neural.net.normalized.1k[,1]
neural.net.normalized.1k <- neural.net.normalized.1k[, -1]
dimnames(neural.net.normalized.100)[[1]] <- neural.net.normalized.100[,1]
neural.net.normalized.100 <- neural.net.normalized.100[, -1]

#fix row names for unity across data
rownames(neural.net.unnormalized.10k) = rownames(log.reg.coef.10k)
rownames(neural.net.unnormalized.5k) = rownames(log.reg.coef.10k)
rownames(neural.net.unnormalized.2.5k) = rownames(log.reg.coef.10k)
rownames(neural.net.unnormalized.1k) = rownames(log.reg.coef.10k)
rownames(neural.net.unnormalized.100) = rownames(log.reg.coef.10k)
rownames(neural.net.normalized.10k) = rownames(log.reg.coef.10k)
rownames(neural.net.normalized.5k) = rownames(log.reg.coef.10k)
rownames(neural.net.normalized.2.5k) = rownames(log.reg.coef.10k)
rownames(neural.net.normalized.1k) = rownames(log.reg.coef.10k)
rownames(neural.net.normalized.100) = rownames(log.reg.coef.10k)

In [None]:
#sanity check
# dim(neural.net.unnormalized.10k)
# dim(neural.net.unnormalized.5k)
# dim(neural.net.unnormalized.2.5k)
# dim(neural.net.unnormalized.1k)
# dim(neural.net.normalized.10k)
# dim(neural.net.normalized.5k)
# dim(neural.net.normalized.2.5k)
# dim(neural.net.normalized.1k)

# neural.net.unnormalized.10k[, 1:6]
# neural.net.normalized.10k[, 1:6]
# neural.net.unnormalized.5k[, 1:6]
# neural.net.normalized.5k[, 1:6]
# neural.net.unnormalized.2.5k[, 1:6]
# neural.net.normalized.2.5k[, 1:6]
# neural.net.unnormalized.1k[, 1:6]
# neural.net.normalized.1k[, 1:6]

#min(neural.net.normalized.10k[1,])
#max(neural.net.normalized.10k[1,])
#min(neural.net.normalized.5k[1,])
#max(neural.net.normalized.5k[1,])
#min(neural.net.normalized.2.5k[1,])
#max(neural.net.normalized.2.5k[1,])
#min(neural.net.normalized.1k[1,])
#max(neural.net.normalized.1k[1,])
#min(neural.net.normalized.100[1,])
#max(neural.net.normalized.100[1,])

In [8]:
nn.acc = read.csv("nn/Accuracies.csv", sep=",", dec=".", header=F)
dimnames(nn.acc)[[1]] <- nn.acc[,1]
nn.acc <- nn.acc[,-1]
dimnames(nn.acc)[[2]] <- c("Fold1", "Fold2", "Fold3", "Fold4", "Fold5")

“incomplete final line found by readTableHeader on 'nn/Accuracies.csv'”

In [9]:
nn.acc <- t(nn.acc)
nn.acc

Unnamed: 0,10k,5k,2.5k,1k,100
Fold1,0.8611,0.8889,0.9167,0.8889,0.6667
Fold2,0.9167,0.8333,0.9167,0.8611,0.6667
Fold3,0.8571,0.9143,0.9429,0.9714,0.7143
Fold4,0.8571,0.7143,0.7714,0.8857,0.7714
Fold5,0.9143,0.8286,0.9143,0.9429,0.7429


# Normalize data

### Logistic Regression data

In [30]:
#normalize log reg coefficients
log.reg.coef.norm.10k = matrix(ncol = ncol(log.reg.coef.10k))
log.reg.coef.norm.5k = matrix(ncol = ncol(log.reg.coef.5k))
log.reg.coef.norm.2.5k = matrix(ncol = ncol(log.reg.coef.2.5k))
log.reg.coef.norm.1k = matrix(ncol = ncol(log.reg.coef.1k))
log.reg.coef.norm.100 = matrix(ncol = ncol(log.reg.coef.100))

colnames(log.reg.coef.norm.10k) = colnames(log.reg.coef.10k)
colnames(log.reg.coef.norm.5k) = colnames(log.reg.coef.5k)
colnames(log.reg.coef.norm.2.5k) = colnames(log.reg.coef.2.5k)
colnames(log.reg.coef.norm.1k) = colnames(log.reg.coef.1k)
colnames(log.reg.coef.norm.100) = colnames(log.reg.coef.100)

for (i in 1:nrow(log.reg.coef.10k)) {
    #normalize
    temp.10k = (log.reg.coef.10k[i,] - min(log.reg.coef.10k[i,], na.rm=T)) / (max(log.reg.coef.10k[i,], na.rm=T) - min(log.reg.coef.10k[i,], na.rm=T))
    temp.5k = (log.reg.coef.5k[i,] - min(log.reg.coef.5k[i,], na.rm=T)) / (max(log.reg.coef.5k[i,], na.rm=T) - min(log.reg.coef.5k[i,], na.rm=T))
    temp.2.5k = (log.reg.coef.2.5k[i,] - min(log.reg.coef.2.5k[i,], na.rm=T)) / (max(log.reg.coef.2.5k[i,], na.rm=T) - min(log.reg.coef.2.5k[i,], na.rm=T))
    temp.1k = (log.reg.coef.1k[i,] - min(log.reg.coef.1k[i,], na.rm=T)) / (max(log.reg.coef.1k[i,], na.rm=T) - min(log.reg.coef.1k[i,], na.rm=T))
    temp.100 = (log.reg.coef.100[i,] - min(log.reg.coef.100[i,], na.rm=T)) / (max(log.reg.coef.100[i,], na.rm=T) - min(log.reg.coef.100[i,], na.rm=T))

    log.reg.coef.norm.10k = rbind(log.reg.coef.norm.10k, temp.10k)
    log.reg.coef.norm.5k = rbind(log.reg.coef.norm.5k, temp.5k)
    log.reg.coef.norm.2.5k = rbind(log.reg.coef.norm.2.5k, temp.2.5k)
    log.reg.coef.norm.1k = rbind(log.reg.coef.norm.1k, temp.1k)
    log.reg.coef.norm.100 = rbind(log.reg.coef.norm.100, temp.100)
    #remove NAs
    log.reg.coef.norm.10k[is.na(log.reg.coef.norm.10k)] = 0
    log.reg.coef.norm.5k[is.na(log.reg.coef.norm.5k)] = 0
    log.reg.coef.norm.2.5k[is.na(log.reg.coef.norm.2.5k)] = 0
    log.reg.coef.norm.1k[is.na(log.reg.coef.norm.1k)] = 0
    log.reg.coef.norm.100[is.na(log.reg.coef.norm.100)] = 0
}
#remove empty first row
log.reg.coef.norm.10k = log.reg.coef.norm.10k[-1,]
log.reg.coef.norm.5k = log.reg.coef.norm.5k[-1,]
log.reg.coef.norm.2.5k = log.reg.coef.norm.2.5k[-1,]
log.reg.coef.norm.1k = log.reg.coef.norm.1k[-1,]
log.reg.coef.norm.100 = log.reg.coef.norm.100[-1,]

In [31]:
#sanity check
log.reg.coef.norm.10k[, 1:20]
log.reg.coef.norm.5k[, 1:20]
log.reg.coef.norm.2.5k[, 1:20]
log.reg.coef.norm.1k[, 1:20]
log.reg.coef.norm.100[, 1:20]

Unnamed: 0,ENSG00000000003.13,ENSG00000000971.14,ENSG00000001084.9,ENSG00000001167.13,ENSG00000001461.15,ENSG00000001561.6,ENSG00000001631.13,ENSG00000002330.12,ENSG00000002586.16,ENSG00000002587.8,ENSG00000002746.13,ENSG00000002933.6,ENSG00000003137.7,ENSG00000003393.13,ENSG00000003436.13,ENSG00000003509.14,ENSG00000004059.9,ENSG00000004139.12,ENSG00000004399.11,ENSG00000004455.15
Fold 1_10k,0.4994409,0.5274549,0.3859824,0.9290671,0.4490394,0.500201,0.3390382,0.5275156,0.67284726,0.4434763,0.5624553,0.6114496,0.4713236,0.4775565,0.5005781,0.7960521,0.2515649,0.3868422,0.2614419,0.3917603
Fold 2_10k,0.4263767,0.4122517,0.2910652,0.5115296,0.4144736,0.4632628,0.4909473,0.2310611,0.29059612,0.4002197,0.4253242,0.4462235,0.4172884,0.459047,0.3754011,0.0,0.5997677,0.3847155,0.4534601,0.5153158
Fold 3_10k,0.5768463,0.5889738,0.6822579,0.3128962,0.7553332,0.5185495,0.6014722,0.7655831,0.59560386,0.5964962,0.5855597,0.6062623,0.6363694,0.7009712,0.6915449,0.6349556,0.8331848,0.6453888,0.5964337,0.6064758
Fold 4_10k,0.4400798,0.4662038,0.3702467,0.6008274,0.0,0.4780815,0.264538,0.4891172,0.09536906,0.422629,0.4684774,0.3923205,0.4681259,0.460312,0.3080675,0.6484615,0.1593153,0.4575572,0.4363684,1.0
Fold 5_10k,0.4796359,0.6330226,0.4204075,0.6494225,0.6703715,0.6014687,0.3283774,0.8233051,0.71959132,0.5833577,0.5954208,0.5701529,0.637019,0.5485111,0.6286785,0.8783345,0.793295,0.5915237,0.4758804,0.6005783


Unnamed: 0,ENSG00000000003.13,ENSG00000000971.14,ENSG00000001084.9,ENSG00000001561.6,ENSG00000002586.16,ENSG00000002933.6,ENSG00000003137.7,ENSG00000003436.13,ENSG00000004139.12,ENSG00000004399.11,ENSG00000004455.15,ENSG00000004766.14,ENSG00000004777.17,ENSG00000004799.7,ENSG00000004809.12,ENSG00000004848.6,ENSG00000004939.12,ENSG00000005007.11,ENSG00000005073.5,ENSG00000005108.14
Fold 1_5k,0.6053645,0.513817,0.4840389,0.557828,0.5455676,0.4917138,0.5921454,0.5564652,0.5117079,0.4101564,0.4490027,0.5674893,0.7512071,0.5353813,0.6917303,0.558994,0.4476393,0.181767,0.536626,0.4961511
Fold 2_5k,0.5309788,0.5060045,0.8515418,0.5472157,0.1764962,0.466985,0.5761677,0.4797196,0.656515,0.6907061,0.5123496,0.4856171,0.3222161,0.557405,0.5729631,0.5710292,0.5295678,0.4665622,0.5425818,0.5143167
Fold 3_5k,0.6208883,0.6483652,0.7393011,0.5801801,0.6247011,0.6430368,0.6649823,0.6124789,0.6981501,0.6784318,1.0,0.806881,0.5884054,0.6452388,0.6726056,0.6201156,0.6240645,0.8831926,0.6035835,0.6058598
Fold 4_5k,0.4522677,0.5067145,0.4495335,0.4956459,0.3705868,0.5432256,0.3494214,0.5487531,0.5697358,0.2886969,0.2060035,0.4046327,0.3422141,0.4376511,0.5832821,0.5590124,0.4060708,0.6222456,0.5248528,0.4572076
Fold 5_5k,0.6507551,0.5726229,0.4809076,0.6215375,0.5560941,0.5817704,0.6674687,0.6542518,0.6161791,1.0,0.6077177,0.6044042,0.1637445,0.5288212,0.851179,0.5060423,0.5685179,0.6108091,0.5701453,0.5086609


Unnamed: 0,ENSG00000000003.13,ENSG00000000971.14,ENSG00000001561.6,ENSG00000002586.16,ENSG00000003137.7,ENSG00000003436.13,ENSG00000004139.12,ENSG00000004399.11,ENSG00000004455.15,ENSG00000005073.5,ENSG00000005108.14,ENSG00000005156.10,ENSG00000005249.11,ENSG00000005381.7,ENSG00000005421.7,ENSG00000005513.9,ENSG00000006377.10,ENSG00000006432.14,ENSG00000006451.6,ENSG00000006468.12
Fold 1_2.5k,0.4220371,0.4750274,0.4467388,0.4724761,0.6266091,0.3769483,0.4254832,0.3771947,0.2436932,0.5068703,0.5082256,1.0,0.439381,0.4574567,0.4429057,0.4897046,0.547562,0.4154736,0.47498705,0.4462874
Fold 2_2.5k,0.5936591,0.5672079,0.5658191,0.5866972,0.5670364,0.7258142,0.7366582,0.7259072,0.7653611,0.6522516,0.6257363,0.466545,0.6036079,0.6113381,0.6290323,0.6265631,0.619494,0.6458134,0.7607545,0.6127722
Fold 3_2.5k,0.3319382,0.3299018,0.3555429,0.3527267,0.377292,0.3436786,0.395622,0.1333212,0.4014651,0.3392464,0.354712,0.2919756,0.2446203,0.3604483,0.3687523,0.310336,0.2718733,0.3335353,0.2810264,0.3352786
Fold 4_2.5k,0.4269907,0.4010694,0.4007951,0.3332993,0.3777309,0.3347019,0.2374468,0.2662313,0.2052907,0.3830349,0.3911044,0.0,0.209688,0.424406,0.3775664,0.2664814,0.4213099,0.4068061,0.24694766,0.3656488
Fold 5_2.5k,0.6564436,0.5070464,0.6832262,0.6491977,0.7280068,0.4970034,0.4821789,0.7531879,0.6050838,0.6479001,0.571291,0.933872,0.6572351,0.5799052,0.5648421,0.6215738,0.5785203,0.4650544,0.05110899,0.5853929


Unnamed: 0,ENSG00000000003.13,ENSG00000002586.16,ENSG00000003436.13,ENSG00000004139.12,ENSG00000004399.11,ENSG00000005073.5,ENSG00000005108.14,ENSG00000005156.10,ENSG00000005249.11,ENSG00000005381.7,ENSG00000006468.12,ENSG00000006638.10,ENSG00000007516.12,ENSG00000008056.11,ENSG00000008086.9,ENSG00000008283.14,ENSG00000008394.11,ENSG00000008735.13,ENSG00000010318.18,ENSG00000010319.5
Fold 1_1k,0.252592,0.6317432,0.4530985,0.2984767,0.6614434,0.4508005,0.456851,0.2942222,0.3566816,0.5154703,0.4453261,0.568282,0.484431,0.4764001,0.4807465,0.6868754,0.3082709,0.3067339,0.415993,0.4987222
Fold 2_1k,0.5464902,0.7191469,0.4027855,0.468456,0.6813172,0.5372777,0.5252488,0.2814707,0.5462807,0.5339204,0.5844556,0.5218946,0.5382171,0.4683313,0.5413507,0.5574185,0.5222279,0.5762052,0.289428,0.4985716
Fold 3_1k,0.546131,0.3546847,0.5287073,0.4531102,0.5325464,0.5795498,0.5903829,0.3998043,0.6564899,0.5447056,0.5569162,0.5347383,0.5957364,0.6035083,0.5623146,0.7583129,0.5797754,0.5387416,0.8494478,0.5128347
Fold 4_1k,0.6511066,0.6988648,0.6830919,0.6510005,0.8052289,0.6007256,0.5948158,0.6741632,0.6898402,0.5083228,0.7057426,0.8126108,0.5372568,0.5792388,0.6174308,0.3991658,0.6505726,0.7972005,0.8000794,0.7437565
Fold 5_1k,0.619475,0.4718451,0.6725339,0.8636829,0.5227287,0.5692662,0.6095043,0.8048277,0.5713803,0.5834707,0.6008172,0.6239302,0.5580048,0.6541584,0.6941636,0.5252786,0.5480947,0.7952108,0.662311,0.6889299


Unnamed: 0,ENSG00000002586.16,ENSG00000005381.7,ENSG00000007516.12,ENSG00000019991.14,ENSG00000037280.14,ENSG00000065054.12,ENSG00000067177.13,ENSG00000078399.14,ENSG00000078596.9,ENSG00000086205.15,ENSG00000087495.15,ENSG00000091490.9,ENSG00000091972.17,ENSG00000095917.12,ENSG00000102595.17,ENSG00000105996.6,ENSG00000105997.21,ENSG00000106004.4,ENSG00000106006.6,ENSG00000106538.8
Fold 1_100,0.7987645,0.5084224,0.6139006,0.359747,0.5199825,0.5694849,0.6330103,0.7295394,0.5483658,0.5907715,0.6366393,0.6228821,0.7343958,0.36806,0.440147,0.2764969,0.4334992,0.5460777,0.5414356,0.5588657
Fold 2_100,0.1874249,0.5271933,0.6604543,0.7330306,0.628494,0.7137958,0.7398743,0.7286017,0.4340336,0.611081,0.6915663,0.4977816,0.7458195,0.4401846,0.5196532,0.5126534,0.5398147,0.6195331,0.595409,0.7700192
Fold 3_100,0.9917636,0.4677845,0.7023231,0.4246376,0.5853013,0.6193991,0.7926308,0.7558438,0.7138872,0.6112045,0.4271214,0.4648041,0.2694926,0.470909,0.7198792,0.4130275,0.6559305,0.6576499,0.884037,0.5321159
Fold 4_100,0.4894729,0.7240266,0.5226974,0.5170355,0.7178933,0.5264698,0.7616211,0.8439485,0.4778866,0.6822926,0.6127339,0.4044966,0.5750775,0.2807518,0.6176341,0.3335894,0.1773369,0.5422904,0.7996896,0.7320262
Fold 5_100,0.1420281,0.3457552,0.2087507,0.5164748,0.7947932,0.5053051,0.6056154,0.6664705,0.4699334,0.5856916,0.476559,0.4345257,0.4732986,0.3815911,0.5181959,0.1146886,0.4579166,0.552698,0.4843384,0.4108316


In [32]:
min(log.reg.coef.norm.10k[1,])
max(log.reg.coef.norm.10k[1,])

### LASSO data

In [33]:
#normalize lasso reg coefficients
lasso.coef.norm.10k = matrix(ncol = ncol(lasso.coef.10k))
lasso.coef.norm.5k = matrix(ncol = ncol(lasso.coef.5k))
lasso.coef.norm.2.5k = matrix(ncol = ncol(lasso.coef.2.5k))
lasso.coef.norm.1k = matrix(ncol = ncol(lasso.coef.1k))
lasso.coef.norm.100 = matrix(ncol = ncol(lasso.coef.100))

colnames(lasso.coef.norm.10k) = colnames(lasso.coef.10k)
colnames(lasso.coef.norm.5k) = colnames(lasso.coef.5k)
colnames(lasso.coef.norm.2.5k) = colnames(lasso.coef.2.5k)
colnames(lasso.coef.norm.1k) = colnames(lasso.coef.1k)
colnames(lasso.coef.norm.100) = colnames(lasso.coef.100)

for (i in 1:nrow(lasso.coef.10k)) {
    #normalize
    temp.10k = (lasso.coef.10k[i,] - min(lasso.coef.10k[i,], na.rm=T)) / (max(lasso.coef.10k[i,], na.rm=T) - min(lasso.coef.10k[i,], na.rm=T))
    temp.5k = (lasso.coef.5k[i,] - min(lasso.coef.5k[i,], na.rm=T)) / (max(lasso.coef.5k[i,], na.rm=T) - min(lasso.coef.5k[i,], na.rm=T))
    temp.2.5k = (lasso.coef.2.5k[i,] - min(lasso.coef.2.5k[i,], na.rm=T)) / (max(lasso.coef.2.5k[i,], na.rm=T) - min(lasso.coef.2.5k[i,], na.rm=T))
    temp.1k = (lasso.coef.1k[i,] - min(lasso.coef.1k[i,], na.rm=T)) / (max(lasso.coef.1k[i,], na.rm=T) - min(lasso.coef.1k[i,], na.rm=T))
    temp.100 = (lasso.coef.100[i,] - min(lasso.coef.100[i,], na.rm=T)) / (max(lasso.coef.100[i,], na.rm=T) - min(lasso.coef.100[i,], na.rm=T))

    lasso.coef.norm.10k = rbind(lasso.coef.norm.10k, temp.10k)
    lasso.coef.norm.5k = rbind(lasso.coef.norm.5k, temp.5k)
    lasso.coef.norm.2.5k = rbind(lasso.coef.norm.2.5k, temp.2.5k)
    lasso.coef.norm.1k = rbind(lasso.coef.norm.1k, temp.1k)
    lasso.coef.norm.100 = rbind(lasso.coef.norm.100, temp.100)
    #remove NAs
    lasso.coef.norm.10k[is.na(lasso.coef.norm.10k)] = 0
    lasso.coef.norm.5k[is.na(lasso.coef.norm.5k)] = 0
    lasso.coef.norm.2.5k[is.na(lasso.coef.norm.2.5k)] = 0
    lasso.coef.norm.1k[is.na(lasso.coef.norm.1k)] = 0
    lasso.coef.norm.100[is.na(lasso.coef.norm.100)] = 0
}
#remove empty first row
lasso.coef.norm.10k = lasso.coef.norm.10k[-1,]
lasso.coef.norm.5k = lasso.coef.norm.5k[-1,]
lasso.coef.norm.2.5k = lasso.coef.norm.2.5k[-1,]
lasso.coef.norm.1k = lasso.coef.norm.1k[-1,]
lasso.coef.norm.100 = lasso.coef.norm.100[-1,]

In [34]:
#sanity check
lasso.coef.norm.10k[, 1:20]
lasso.coef.norm.5k[, 1:20]
lasso.coef.norm.2.5k[, 1:20]
lasso.coef.norm.1k[, 1:20]
lasso.coef.norm.100[, 1:20]

Unnamed: 0,ENSG00000000003.13,ENSG00000000971.14,ENSG00000001084.9,ENSG00000001167.13,ENSG00000001461.15,ENSG00000001561.6,ENSG00000001631.13,ENSG00000002330.12,ENSG00000002586.16,ENSG00000002587.8,ENSG00000002746.13,ENSG00000002933.6,ENSG00000003137.7,ENSG00000003393.13,ENSG00000003436.13,ENSG00000003509.14,ENSG00000004059.9,ENSG00000004139.12,ENSG00000004399.11,ENSG00000004455.15
Fold 1_10k,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262,0.4785262
Fold 2_10k,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637,0.6834637
Fold 3_10k,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271,0.4821271
Fold 4_10k,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588,0.5458588
Fold 5_10k,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754,0.7895754


Unnamed: 0,ENSG00000000003.13,ENSG00000000971.14,ENSG00000001084.9,ENSG00000001561.6,ENSG00000002586.16,ENSG00000002933.6,ENSG00000003137.7,ENSG00000003436.13,ENSG00000004139.12,ENSG00000004399.11,ENSG00000004455.15,ENSG00000004766.14,ENSG00000004777.17,ENSG00000004799.7,ENSG00000004809.12,ENSG00000004848.6,ENSG00000004939.12,ENSG00000005007.11,ENSG00000005073.5,ENSG00000005108.14
Fold 1_5k,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334,0.4217334
Fold 2_5k,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586,0.5652586
Fold 3_5k,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438,0.4525438
Fold 4_5k,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613,0.5488613
Fold 5_5k,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.8108638,0.736739


Unnamed: 0,ENSG00000000003.13,ENSG00000000971.14,ENSG00000001561.6,ENSG00000002586.16,ENSG00000003137.7,ENSG00000003436.13,ENSG00000004139.12,ENSG00000004399.11,ENSG00000004455.15,ENSG00000005073.5,ENSG00000005108.14,ENSG00000005156.10,ENSG00000005249.11,ENSG00000005381.7,ENSG00000005421.7,ENSG00000005513.9,ENSG00000006377.10,ENSG00000006432.14,ENSG00000006451.6,ENSG00000006468.12
Fold 1_2.5k,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436
Fold 2_2.5k,0.5634031,0.5634031,0.5634031,0.5634031,0.5634031,0.5634031,0.5634031,0.5634031,0.5634031,0.5634031,0.5634031,0.5634031,0.5634031,0.5634031,0.5634031,0.530475,0.5634031,0.5634031,0.5634031,0.5634031
Fold 3_2.5k,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463
Fold 4_2.5k,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079
Fold 5_2.5k,0.750921,0.750921,0.7962521,0.750921,0.750921,0.750921,0.750921,0.750921,0.750921,0.750921,0.7132181,0.750921,0.750921,0.750921,0.750921,0.7195216,0.750921,0.750921,0.750921,0.750921


Unnamed: 0,ENSG00000000003.13,ENSG00000002586.16,ENSG00000003436.13,ENSG00000004139.12,ENSG00000004399.11,ENSG00000005073.5,ENSG00000005108.14,ENSG00000005156.10,ENSG00000005249.11,ENSG00000005381.7,ENSG00000006468.12,ENSG00000006638.10,ENSG00000007516.12,ENSG00000008056.11,ENSG00000008086.9,ENSG00000008283.14,ENSG00000008394.11,ENSG00000008735.13,ENSG00000010318.18,ENSG00000010319.5
Fold 1_1k,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222,0.5261222
Fold 2_1k,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629,0.580629
Fold 3_1k,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.5625802,0.6972589
Fold 4_1k,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.9506135
Fold 5_1k,0.8144165,0.8144165,0.8144165,0.8144165,0.8144165,0.8144165,0.7524256,0.8144165,0.8144165,0.8144165,0.8144165,0.8144165,0.8144165,0.8144165,0.8144165,0.8144165,0.8144165,0.8144165,0.8144165,0.8144165


Unnamed: 0,ENSG00000002586.16,ENSG00000005381.7,ENSG00000007516.12,ENSG00000019991.14,ENSG00000037280.14,ENSG00000065054.12,ENSG00000067177.13,ENSG00000078399.14,ENSG00000078596.9,ENSG00000086205.15,ENSG00000087495.15,ENSG00000091490.9,ENSG00000091972.17,ENSG00000095917.12,ENSG00000102595.17,ENSG00000105996.6,ENSG00000105997.21,ENSG00000106004.4,ENSG00000106006.6,ENSG00000106538.8
Fold 1_100,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269,0.6810269
Fold 2_100,0.7479444,0.7479444,0.7479444,0.7479444,0.7479444,0.7479444,0.7479444,0.7479444,0.7479444,0.7479444,0.7479444,0.7479444,0.7479444,0.7463666,0.7479444,0.7479444,0.7479444,0.9235086,0.7479444,0.7479444
Fold 3_100,0.4572114,0.4572114,0.4572114,0.4572114,0.4572114,0.4572114,0.4572114,0.598019,0.4572114,0.4572114,0.4572114,0.4572114,0.4572114,0.4572114,0.4572114,0.4572114,0.4572114,0.5087868,0.6583213,0.4572114
Fold 4_100,0.5882367,0.5882367,0.5882367,0.5882367,0.5882367,0.5882367,0.5882367,0.6827697,0.5882367,0.5882367,0.5882367,0.5882367,0.5882367,0.5882367,0.5882367,0.5882367,0.5882367,0.6542006,0.5882367,0.5882367
Fold 5_100,0.7897975,0.7897975,0.7897975,0.7897975,0.7897975,0.7897975,0.801179,0.7897975,0.7897975,0.7897975,0.7897975,0.7897975,0.7897975,0.7897975,0.7897975,0.7897975,0.7897975,1.0,0.7897975,0.7897975


In [37]:
min(lasso.coef.norm.5k[1,])
max(lasso.coef.norm.5k[1,])

### Decision Tree data

In [38]:
decision.tree.importance.norm.100 = matrix(ncol = ncol(decision.tree.feature.importance.100))
colnames(decision.tree.importance.norm.100) = colnames(decision.tree.feature.importance.100)
extra.tree.importance.norm.100 = matrix(ncol = ncol(extra.tree.feature.importance.100))
colnames(extra.tree.importance.norm.100) = colnames(extra.tree.feature.importance.100)

decision.tree.importance.norm.1k = matrix(ncol = ncol(decision.tree.feature.importance.1k))
colnames(decision.tree.importance.norm.1k) = colnames(decision.tree.feature.importance.1k)
extra.tree.importance.norm.1k = matrix(ncol = ncol(extra.tree.feature.importance.1k))
colnames(extra.tree.importance.norm.1k) = colnames(extra.tree.feature.importance.1k)

decision.tree.importance.norm.2.5k = matrix(ncol = ncol(decision.tree.feature.importance.2.5k))
colnames(decision.tree.importance.norm.2.5k) = colnames(decision.tree.feature.importance.2.5k)
extra.tree.importance.norm.2.5k = matrix(ncol = ncol(extra.tree.feature.importance.2.5k))
colnames(extra.tree.importance.norm.2.5k) = colnames(extra.tree.feature.importance.2.5k)

decision.tree.importance.norm.5k = matrix(ncol = ncol(decision.tree.feature.importance.5k))
colnames(decision.tree.importance.norm.5k) = colnames(decision.tree.feature.importance.5k)
extra.tree.importance.norm.5k = matrix(ncol = ncol(extra.tree.feature.importance.5k))
colnames(extra.tree.importance.norm.5k) = colnames(extra.tree.feature.importance.5k)

decision.tree.importance.norm.10k = matrix(ncol = ncol(decision.tree.feature.importance.10k))
colnames(decision.tree.importance.norm.10k) = colnames(decision.tree.feature.importance.10k)
extra.tree.importance.norm.10k = matrix(ncol = ncol(extra.tree.feature.importance.10k))
colnames(extra.tree.importance.norm.10k) = colnames(extra.tree.feature.importance.10k)

for (i in 1:nrow(decision.tree.feature.importance.10k)) {

    #normalize
    temp.100 = (decision.tree.feature.importance.100[i,] - min(decision.tree.feature.importance.100[i,], na.rm=T)) / (max(decision.tree.feature.importance.100[i,], na.rm=T) - min(decision.tree.feature.importance.100[i,], na.rm=T))
    temp2.100 = (extra.tree.feature.importance.100[i,] - min(extra.tree.feature.importance.100[i,], na.rm=T)) / (max(extra.tree.feature.importance.100[i,], na.rm=T) - min(extra.tree.feature.importance.100[i,], na.rm=T))
    temp.1k = (decision.tree.feature.importance.1k[i,] - min(decision.tree.feature.importance.1k[i,], na.rm=T)) / (max(decision.tree.feature.importance.1k[i,], na.rm=T) - min(decision.tree.feature.importance.1k[i,], na.rm=T))
    temp2.1k = (extra.tree.feature.importance.1k[i,] - min(extra.tree.feature.importance.1k[i,], na.rm=T)) / (max(extra.tree.feature.importance.1k[i,], na.rm=T) - min(extra.tree.feature.importance.1k[i,], na.rm=T))
    temp.2.5k = (decision.tree.feature.importance.2.5k[i,] - min(decision.tree.feature.importance.2.5k[i,], na.rm=T)) / (max(decision.tree.feature.importance.2.5k[i,], na.rm=T) - min(decision.tree.feature.importance.2.5k[i,], na.rm=T))
    temp2.2.5k = (extra.tree.feature.importance.2.5k[i,] - min(extra.tree.feature.importance.2.5k[i,], na.rm=T)) / (max(extra.tree.feature.importance.2.5k[i,], na.rm=T) - min(extra.tree.feature.importance.2.5k[i,], na.rm=T))
    temp.5k = (decision.tree.feature.importance.5k[i,] - min(decision.tree.feature.importance.5k[i,], na.rm=T)) / (max(decision.tree.feature.importance.5k[i,], na.rm=T) - min(decision.tree.feature.importance.5k[i,], na.rm=T))
    temp2.5k = (extra.tree.feature.importance.5k[i,] - min(extra.tree.feature.importance.5k[i,], na.rm=T)) / (max(extra.tree.feature.importance.5k[i,], na.rm=T) - min(extra.tree.feature.importance.5k[i,], na.rm=T))
    temp.10k = (decision.tree.feature.importance.10k[i,] - min(decision.tree.feature.importance.10k[i,], na.rm=T)) / (max(decision.tree.feature.importance.10k[i,], na.rm=T) - min(decision.tree.feature.importance.10k[i,], na.rm=T))
    temp2.10k = (extra.tree.feature.importance.10k[i,] - min(extra.tree.feature.importance.10k[i,], na.rm=T)) / (max(extra.tree.feature.importance.10k[i,], na.rm=T) - min(extra.tree.feature.importance.10k[i,], na.rm=T))

    decision.tree.importance.norm.100 = rbind(decision.tree.importance.norm.100, temp.100)
    extra.tree.importance.norm.100 = rbind(extra.tree.importance.norm.100, temp2.100)
    decision.tree.importance.norm.1k = rbind(decision.tree.importance.norm.1k, temp.1k)
    extra.tree.importance.norm.1k = rbind(extra.tree.importance.norm.1k, temp2.1k)
    decision.tree.importance.norm.2.5k = rbind(decision.tree.importance.norm.2.5k, temp.2.5k)
    extra.tree.importance.norm.2.5k = rbind(extra.tree.importance.norm.2.5k, temp2.2.5k)
    decision.tree.importance.norm.5k = rbind(decision.tree.importance.norm.5k, temp.5k)
    extra.tree.importance.norm.5k = rbind(extra.tree.importance.norm.5k, temp2.5k)
    decision.tree.importance.norm.10k = rbind(decision.tree.importance.norm.10k, temp.10k)
    extra.tree.importance.norm.10k = rbind(extra.tree.importance.norm.10k, temp2.10k)
    
    #remove NAs
    decision.tree.feature.importance.100[is.na(decision.tree.feature.importance.100)] = 0
    extra.tree.feature.importance.100[is.na(extra.tree.feature.importance.100)] = 0
    decision.tree.feature.importance.1k[is.na(decision.tree.feature.importance.1k)] = 0
    extra.tree.feature.importance.1k[is.na(extra.tree.feature.importance.1k)] = 0
    decision.tree.feature.importance.2.5k[is.na(decision.tree.feature.importance.2.5k)] = 0
    extra.tree.feature.importance.2.5k[is.na(extra.tree.feature.importance.2.5k)] = 0
    decision.tree.feature.importance.5k[is.na(decision.tree.feature.importance.5k)] = 0
    extra.tree.feature.importance.5k[is.na(extra.tree.feature.importance.5k)] = 0
    decision.tree.feature.importance.10k[is.na(decision.tree.feature.importance.10k)] = 0
    extra.tree.feature.importance.10k[is.na(extra.tree.feature.importance.10k)] = 0
}

#remove first rows
decision.tree.importance.norm.100 = decision.tree.importance.norm.100[-1,]
extra.tree.importance.norm.100 = extra.tree.importance.norm.100[-1,]
decision.tree.importance.norm.1k = decision.tree.importance.norm.1k[-1,]
extra.tree.importance.norm.1k = extra.tree.importance.norm.1k[-1,]
decision.tree.importance.norm.2.5k = decision.tree.importance.norm.2.5k[-1,]
extra.tree.importance.norm.2.5k = extra.tree.importance.norm.2.5k[-1,]
decision.tree.importance.norm.5k = decision.tree.importance.norm.5k[-1,]
extra.tree.importance.norm.5k = extra.tree.importance.norm.5k[-1,]
decision.tree.importance.norm.10k = decision.tree.importance.norm.10k[-1,]
extra.tree.importance.norm.10k = extra.tree.importance.norm.10k[-1,]

#sanity check
decision.tree.importance.norm.10k[, 1:6]
extra.tree.importance.norm.10k[, 1:6]

Unnamed: 0,ENSG00000000003.13,ENSG00000000971.14,ENSG00000001084.9,ENSG00000001167.13,ENSG00000001461.15,ENSG00000001561.6
Fold 1_10k,0,0,0,0,0,0
Fold 2_10k,0,0,0,0,0,0
Fold 3_10k,0,0,0,0,0,0
Fold 4_10k,0,0,0,0,0,0
Fold 5_10k,0,0,0,0,0,0


Unnamed: 0,ENSG00000000003.13,ENSG00000000971.14,ENSG00000001084.9,ENSG00000001167.13,ENSG00000001461.15,ENSG00000001561.6
Fold 1_10k,0,0,0,0,0,0
Fold 2_10k,0,0,0,0,0,0
Fold 3_10k,0,0,0,0,0,0
Fold 4_10k,0,0,0,0,0,0
Fold 5_10k,0,0,0,0,0,0


In [41]:
max(decision.tree.importance.norm.10k[1,])
min(decision.tree.importance.norm.10k[1,])

### Neural Network Data

Note: Ends up being unncessary since Edgar normalized the data separately. :)

In [42]:
#normalize log reg coefficients
nn.coef.norm.10k = matrix(ncol = ncol(neural.net.unnormalized.10k))
nn.coef.norm.5k = matrix(ncol = ncol(neural.net.unnormalized.5k))
nn.coef.norm.2.5k = matrix(ncol = ncol(neural.net.unnormalized.2.5k))
nn.coef.norm.1k = matrix(ncol = ncol(neural.net.unnormalized.1k))
nn.coef.norm.100 = matrix(ncol = ncol(neural.net.unnormalized.100))

colnames(nn.coef.norm.10k) = colnames(neural.net.unnormalized.10k)
colnames(nn.coef.norm.5k) = colnames(neural.net.unnormalized.5k)
colnames(nn.coef.norm.2.5k) = colnames(neural.net.unnormalized.2.5k)
colnames(nn.coef.norm.1k) = colnames(neural.net.unnormalized.1k)
colnames(nn.coef.norm.100) = colnames(neural.net.unnormalized.100)

for (i in 1:nrow(neural.net.unnormalized.10k)) {
    #normalize
    temp.10k = (neural.net.unnormalized.10k[i,] - min(neural.net.unnormalized.10k[i,], na.rm=T)) / (max(neural.net.unnormalized.10k[i,], na.rm=T) - min(neural.net.unnormalized.10k[i,], na.rm=T))
    temp.5k = (neural.net.unnormalized.5k[i,] - min(neural.net.unnormalized.5k[i,], na.rm=T)) / (max(neural.net.unnormalized.5k[i,], na.rm=T) - min(neural.net.unnormalized.5k[i,], na.rm=T))
    temp.2.5k = (neural.net.unnormalized.2.5k[i,] - min(neural.net.unnormalized.2.5k[i,], na.rm=T)) / (max(neural.net.unnormalized.2.5k[i,], na.rm=T) - min(neural.net.unnormalized.2.5k[i,], na.rm=T))
    temp.1k = (neural.net.unnormalized.1k[i,] - min(neural.net.unnormalized.1k[i,], na.rm=T)) / (max(neural.net.unnormalized.1k[i,], na.rm=T) - min(neural.net.unnormalized.1k[i,], na.rm=T))
    temp.100 = (neural.net.unnormalized.100[i,] - min(neural.net.unnormalized.100[i,], na.rm=T)) / (max(neural.net.unnormalized.100[i,], na.rm=T) - min(neural.net.unnormalized.100[i,], na.rm=T))

    nn.coef.norm.10k = rbind(nn.coef.norm.10k, temp.10k)
    nn.coef.norm.5k = rbind(nn.coef.norm.5k, temp.5k)
    nn.coef.norm.2.5k = rbind(nn.coef.norm.2.5k, temp.2.5k)
    nn.coef.norm.1k = rbind(nn.coef.norm.1k, temp.1k)
    nn.coef.norm.100 = rbind(nn.coef.norm.100, temp.100)
    #remove NAs
    nn.coef.norm.10k[is.na(nn.coef.norm.10k)] = 0
    nn.coef.norm.5k[is.na(nn.coef.norm.5k)] = 0
    nn.coef.norm.2.5k[is.na(nn.coef.norm.2.5k)] = 0
    nn.coef.norm.1k[is.na(nn.coef.norm.1k)] = 0
    nn.coef.norm.100[is.na(nn.coef.norm.100)] = 0
}
#remove empty first row
nn.coef.norm.10k = nn.coef.norm.10k[-1,]
nn.coef.norm.5k = nn.coef.norm.5k[-1,]
nn.coef.norm.2.5k = nn.coef.norm.2.5k[-1,]
nn.coef.norm.1k = nn.coef.norm.1k[-1,]
nn.coef.norm.100 = nn.coef.norm.100[-1,]

In [43]:
min(nn.coef.norm.10k[1,])
max(nn.coef.norm.10k[1,])

# Combine outputs into super matrix

## Accuracy super matrix

### 10k super matrix

In [44]:
accuracy.super.matrix.10k = matrix(nrow=5, data=0)
rownames(accuracy.super.matrix.10k) = rownames(log.reg.acc.int.10k)

#add logistic regression accuracy to matrix
accuracy.super.matrix.10k = cbind(accuracy.super.matrix.10k, log.reg.acc.int.10k[,1])

#add lasso regression accuracy to matrix
accuracy.super.matrix.10k = cbind(accuracy.super.matrix.10k, lasso.acc.int.10k[,1])

#add decision tree accuracy to matrix
accuracy.super.matrix.10k = cbind(accuracy.super.matrix.10k, decision.tree.accuracy.10k[1])
accuracy.super.matrix.10k = cbind(accuracy.super.matrix.10k, decision.tree.accuracy.10k[2])

#add neural net accuracy to matrix
accuracy.super.matrix.10k = cbind(accuracy.super.matrix.10k, nn.acc[,1])

#name columns
colnames(accuracy.super.matrix.10k) = c("Fold #", "LogReg Accuracy", "Lasso MSE", 
                                        "DecTree Accuracy", "ExTree Accuracy", "NN Val Accuracy")

#remove first row
accuracy.super.matrix.10k = accuracy.super.matrix.10k[,-1]

In [45]:
#sanity check
 accuracy.super.matrix.10k

Unnamed: 0,LogReg Accuracy,Lasso MSE,DecTree Accuracy,ExTree Accuracy,NN Val Accuracy
Fold 1_10k,1.0,0.4752527,83.33333,86.11111,0.8611
Fold 2_10k,1.0,0.2902739,77.77778,88.88889,0.9167
Fold 3_10k,1.0,0.2611277,80.0,97.14286,0.8571
Fold 4_0k,0.9714286,0.3159924,88.57143,82.85714,0.8571
Fold 5_10k,1.0,0.35659,91.42857,85.71429,0.9143


### 5k super matrix

In [46]:
accuracy.super.matrix.5k = matrix(nrow=5, data=0)
rownames(accuracy.super.matrix.5k) = rownames(log.reg.acc.int.5k)

#add logistic regression accuracy to matrix
accuracy.super.matrix.5k = cbind(accuracy.super.matrix.5k, log.reg.acc.int.5k[,1])

#add lasso regression accuracy to matrix
accuracy.super.matrix.5k = cbind(accuracy.super.matrix.5k, lasso.acc.int.5k[,1])

#add decision tree accuracy to matrix
accuracy.super.matrix.5k = cbind(accuracy.super.matrix.5k, decision.tree.accuracy.5k[1])
accuracy.super.matrix.5k = cbind(accuracy.super.matrix.5k, decision.tree.accuracy.5k[2])

#add neural net accuracy to matrix
accuracy.super.matrix.5k = cbind(accuracy.super.matrix.5k, nn.acc[,2])

#name columns
colnames(accuracy.super.matrix.5k) = c("Fold #", "LogReg Accuracy", "Lasso MSE", 
                                       "DecTree Accuracy", "ExTree Accuracy", "NN Val Accuracy")
                                    

#remove first row
accuracy.super.matrix.5k = accuracy.super.matrix.5k[,-1]

#sanity check
 accuracy.super.matrix.5k

Unnamed: 0,LogReg Accuracy,Lasso MSE,DecTree Accuracy,ExTree Accuracy,NN Val Accuracy
Fold 1_5k,0.9722222,0.4752527,83.33333,88.88889,0.8889
Fold 2_5k,0.9714286,0.2902739,77.77778,88.88889,0.8333
Fold 3_5k,0.9428571,0.2611277,88.57143,88.57143,0.9143
Fold 4_5k,0.9428571,0.3159924,88.57143,88.57143,0.7143
Fold 5_5k,0.9722222,0.35659,97.14286,94.28571,0.8286


### 2.5k super matrix

In [47]:
accuracy.super.matrix.2.5k = matrix(nrow=5, data=0)
rownames(accuracy.super.matrix.2.5k) = rownames(log.reg.acc.int.2.5k)

#add logistic regression accuracy to matrix
accuracy.super.matrix.2.5k = cbind(accuracy.super.matrix.2.5k, log.reg.acc.int.2.5k[,1])

#add lasso regression accuracy to matrix
accuracy.super.matrix.2.5k = cbind(accuracy.super.matrix.2.5k, lasso.acc.int.2.5k[,1])

#add decision tree accuracy to matrix
accuracy.super.matrix.2.5k = cbind(accuracy.super.matrix.2.5k, decision.tree.accuracy.2.5k[1])
accuracy.super.matrix.2.5k = cbind(accuracy.super.matrix.2.5k, decision.tree.accuracy.2.5k[2])

#add neural net accuracy to matrix
accuracy.super.matrix.2.5k = cbind(accuracy.super.matrix.2.5k, nn.acc[,3])

#name columns
colnames(accuracy.super.matrix.2.5k) = c("Fold #", "LogReg Accuracy", "Lasso MSE", 
                                         "DecTree Accuracy", "ExTree Accuracy", "NN Val Accuracy")
                                    

#remove first row
accuracy.super.matrix.2.5k = accuracy.super.matrix.2.5k[,-1]

#sanity check
 accuracy.super.matrix.2.5k

Unnamed: 0,LogReg Accuracy,Lasso MSE,DecTree Accuracy,ExTree Accuracy,NN Val Accuracy
Fold 1_2.5k,0.9722222,0.4752527,80.55556,86.11111,0.9167
Fold 2_2.5k,0.9428571,0.2902739,69.44444,94.44444,0.9167
Fold 3_2.5k,0.9714286,0.2611277,85.71429,94.28571,0.9429
Fold 4_2.5k,0.9428571,0.3159924,80.0,88.57143,0.7714
Fold 5_2.5k,0.9722222,0.35659,94.28571,88.57143,0.9143


### 1k super matrix

In [48]:
accuracy.super.matrix.1k = matrix(nrow=5, data=0)
rownames(accuracy.super.matrix.1k) = rownames(log.reg.acc.int.1k)

#add logistic regression accuracy to matrix
accuracy.super.matrix.1k = cbind(accuracy.super.matrix.1k, log.reg.acc.int.1k[,1])

#add lasso regression accuracy to matrix
accuracy.super.matrix.1k = cbind(accuracy.super.matrix.1k, lasso.acc.int.1k[,1])

#add decision tree accuracy to matrix
accuracy.super.matrix.1k = cbind(accuracy.super.matrix.1k, decision.tree.accuracy.1k[1])
accuracy.super.matrix.1k = cbind(accuracy.super.matrix.1k, decision.tree.accuracy.1k[2])

#add neural net accuracy to matrix
accuracy.super.matrix.1k = cbind(accuracy.super.matrix.1k, nn.acc[,4])

#name columns
colnames(accuracy.super.matrix.1k) = c("Fold #", "LogReg Accuracy", "Lasso MSE", 
                                       "DecTree Accuracy", "ExTree Accuracy", "NN Val Accuracy")
                                    

#remove first row
accuracy.super.matrix.1k = accuracy.super.matrix.1k[,-1]

#sanity check
 accuracy.super.matrix.1k

Unnamed: 0,LogReg Accuracy,Lasso MSE,DecTree Accuracy,ExTree Accuracy,NN Val Accuracy
Fold 1_1k,1.0,0.4752527,86.11111,88.88889,0.8889
Fold 2_1k,0.9714286,0.2902739,75.0,83.33333,0.8611
Fold 3_1k,1.0,0.2611277,82.85714,91.42857,0.9714
Fold 4_1k,0.9428571,0.3159924,88.57143,94.28571,0.8857
Fold 5_1k,0.9722222,0.35659,85.71429,97.14286,0.9429


### 100 super matrix

In [49]:
accuracy.super.matrix.100 = matrix(nrow=5, data=0)
rownames(accuracy.super.matrix.100) = rownames(log.reg.acc.int.100)

#add logistic regression accuracy to matrix
accuracy.super.matrix.100 = cbind(accuracy.super.matrix.100, log.reg.acc.int.100[,1])

#add lasso regression accuracy to matrix
accuracy.super.matrix.100 = cbind(accuracy.super.matrix.100, lasso.acc.int.100[,1])

#add decision tree accuracy to matrix
accuracy.super.matrix.100 = cbind(accuracy.super.matrix.100, decision.tree.accuracy.100[1])
accuracy.super.matrix.100 = cbind(accuracy.super.matrix.100, decision.tree.accuracy.100[2])

#add neural net accuracy to matrix
accuracy.super.matrix.100 = cbind(accuracy.super.matrix.100, nn.acc[,5])

#name columns
colnames(accuracy.super.matrix.100) = c("Fold #", "LogReg Accuracy", "Lasso MSE", 
                                        "DecTree Accuracy", "ExTree Accuracy", "NN Val Accuracy")
                                    

#remove first row
accuracy.super.matrix.100 = accuracy.super.matrix.100[,-1]

#sanity check
 accuracy.super.matrix.100

Unnamed: 0,LogReg Accuracy,Lasso MSE,DecTree Accuracy,ExTree Accuracy,NN Val Accuracy
Fold 1_100,0.9444444,0.4752527,88.88889,88.88889,0.6667
Fold 2_100,0.9714286,0.2902739,80.55556,86.11111,0.6667
Fold 3_100,0.9714286,0.2611277,82.85714,97.14286,0.7143
Fold 4_100,0.9714286,0.3159924,91.42857,88.57143,0.7714
Fold 5_100,0.9444444,0.35659,88.57143,94.28571,0.7429


## Coefficients Super Matrix

### 10k gene super matrix

In [51]:
# double check dimensions
# dim(log.reg.coef)
# dim(decision.tree.feature.importance)
# dim(extra.tree.feature.importance)

coef.super.matrix.10k = matrix(ncol = ncol(log.reg.coef.10k), data = 0)
colnames(coef.super.matrix.10k) = colnames(log.reg.coef.10k)

#add log reg coef to super matrix
for(i in 1:nrow(log.reg.coef.10k)) {
    coef.super.matrix.10k = rbind(coef.super.matrix.10k, log.reg.coef.norm.10k[i,]) 
}

for(i in 1:nrow(lasso.coef.10k)) {
    coef.super.matrix.10k = rbind(coef.super.matrix.10k, lasso.coef.norm.10k[i,]) 
}

#add decision tree important features to decision tree
for(i in 1:nrow(decision.tree.feature.importance.10k)) {
    coef.super.matrix.10k = rbind(coef.super.matrix.10k, decision.tree.importance.norm.10k[i,]) 
}

#add extra tree important features to decision tree
for(i in 1:nrow(extra.tree.feature.importance.10k)) {
    coef.super.matrix.10k = rbind(coef.super.matrix.10k, extra.tree.importance.norm.10k[i,]) 
}

# add important features for neural net
for(i in 1:nrow(neural.net.normalized.10k)) {
    coef.super.matrix.10k = rbind(coef.super.matrix.10k, neural.net.normalized.10k[i,]) 
}

#remove first row
coef.super.matrix.10k = coef.super.matrix.10k[-1, ]

#fix row names
rownames(coef.super.matrix.10k) = c("LogReg_Fold1", "LogReg_Fold2", "LogReg_Fold3", "LogReg_Fold4", "LogReg_Fold5",
                               "Lasso_Fold1", "Lasso_Fold2", "Lasso_Fold3", "Lasso_Fold4", "Lasso_Fold5",
                               "DecTree_Fold1", "DecTree_Fold2", "DecTree_Fold3", "DecTree_Fold4", "DecTree_Fold5",
                               "ExTree_Fold1", "ExTree_Fold2", "ExTree_Fold3", "ExTree_Fold4", "ExTree_Fold5",
                               "NNnorm_Fold1", "NNnorm_Fold2", "NNnorm_Fold3", "NNnorm_Fold4", "NNnorm_Fold5")

#sanity check
# dim(coef.super.matrix.10k)
# coef.super.matrix.10k[, 1:6]

### 5k gene super matrix

In [52]:
coef.super.matrix.5k = matrix(ncol = ncol(log.reg.coef.5k), data = 0)
colnames(coef.super.matrix.5k) = colnames(log.reg.coef.5k)

#add log reg coef to super matrix
for(i in 1:nrow(log.reg.coef.5k)) {
    coef.super.matrix.5k = rbind(coef.super.matrix.5k, log.reg.coef.norm.5k[i,]) 
}

for(i in 1:nrow(lasso.coef.5k)) {
    coef.super.matrix.5k = rbind(coef.super.matrix.5k, lasso.coef.norm.5k[i,]) 
}

#add decision tree important features to decision tree
for(i in 1:nrow(decision.tree.feature.importance.5k)) {
    coef.super.matrix.5k = rbind(coef.super.matrix.5k, decision.tree.importance.norm.5k[i,]) 
}

#add extra tree important features to decision tree
for(i in 1:nrow(extra.tree.feature.importance.5k)) {
    coef.super.matrix.5k = rbind(coef.super.matrix.5k, extra.tree.importance.norm.5k[i,]) 
}

# add important features for neural net
for(i in 1:nrow(neural.net.normalized.5k)) {
    coef.super.matrix.5k = rbind(coef.super.matrix.5k, neural.net.normalized.5k[i,]) 
}

#remove first row
coef.super.matrix.5k = coef.super.matrix.5k[-1, ]

#rename rows
rownames(coef.super.matrix.5k) = c("LogReg_Fold1", "LogReg_Fold2", "LogReg_Fold3", "LogReg_Fold4", "LogReg_Fold5",
                                   "Lasso_Fold1", "Lasso_Fold2", "Lasso_Fold3", "Lasso_Fold4", "Lasso_Fold5",
                                   "DecTree_Fold1", "DecTree_Fold2", "DecTree_Fold3", "DecTree_Fold4", "DecTree_Fold5",
                                   "ExTree_Fold1", "ExTree_Fold2", "ExTree_Fold3", "ExTree_Fold4", "ExTree_Fold5",
                                   "NNnorm_Fold1", "NNnorm_Fold2", "NNnorm_Fold3", "NNnorm_Fold4", "NNnorm_Fold5")

In [53]:
#sanity check
 dim(coef.super.matrix.5k)
 coef.super.matrix.5k[, 1:6]

Unnamed: 0,ENSG00000000003.13,ENSG00000000971.14,ENSG00000001084.9,ENSG00000001561.6,ENSG00000002586.16,ENSG00000002933.6
LogReg_Fold1,0.6053645,0.513817,0.4840389,0.557828026,0.54556764,0.4917138
LogReg_Fold2,0.5309788,0.5060045,0.8515418,0.547215711,0.17649615,0.466985
LogReg_Fold3,0.6208883,0.6483652,0.7393011,0.58018011,0.62470107,0.6430368
LogReg_Fold4,0.4522677,0.5067145,0.4495335,0.495645932,0.37058682,0.5432256
LogReg_Fold5,0.6507551,0.5726229,0.4809076,0.621537452,0.55609411,0.5817704
Lasso_Fold1,0.4217334,0.4217334,0.4217334,0.421733415,0.42173341,0.4217334
Lasso_Fold2,0.5652586,0.5652586,0.5652586,0.565258591,0.56525859,0.5652586
Lasso_Fold3,0.4525438,0.4525438,0.4525438,0.45254382,0.45254382,0.4525438
Lasso_Fold4,0.5488613,0.5488613,0.5488613,0.548861343,0.54886134,0.5488613
Lasso_Fold5,0.8108638,0.8108638,0.8108638,0.810863839,0.81086384,0.8108638


### 2.5k gene super matrix

In [54]:
coef.super.matrix.2.5k = matrix(ncol = ncol(log.reg.coef.2.5k), data = 0)
colnames(coef.super.matrix.2.5k) = colnames(log.reg.coef.2.5k)

#add log reg coef to super matrix
for(i in 1:nrow(log.reg.coef.2.5k)) {
    coef.super.matrix.2.5k = rbind(coef.super.matrix.2.5k, log.reg.coef.norm.2.5k[i,]) 
}

for(i in 1:nrow(lasso.coef.2.5k)) {
    coef.super.matrix.2.5k = rbind(coef.super.matrix.2.5k, lasso.coef.norm.2.5k[i,]) 
}

#add decision tree important features to decision tree
for(i in 1:nrow(decision.tree.feature.importance.2.5k)) {
    coef.super.matrix.2.5k = rbind(coef.super.matrix.2.5k, decision.tree.importance.norm.2.5k[i,]) 
}

#add extra tree important features to decision tree
for(i in 1:nrow(extra.tree.feature.importance.2.5k)) {
    coef.super.matrix.2.5k = rbind(coef.super.matrix.2.5k, extra.tree.importance.norm.2.5k[i,]) 
}

# add important features for neural net
for(i in 1:nrow(neural.net.normalized.2.5k)) {
    coef.super.matrix.2.5k = rbind(coef.super.matrix.2.5k, neural.net.normalized.2.5k[i,]) 
}


#remove first row
coef.super.matrix.2.5k = coef.super.matrix.2.5k[-1, ]

#rename rows
rownames(coef.super.matrix.2.5k) = c("LogReg_Fold1", "LogReg_Fold2", "LogReg_Fold3", "LogReg_Fold4", "LogReg_Fold5",
                                     "Lasso_Fold1", "Lasso_Fold2", "Lasso_Fold3", "Lasso_Fold4", "Lasso_Fold5",
                                     "DecTree_Fold1", "DecTree_Fold2", "DecTree_Fold3", "DecTree_Fold4", "DecTree_Fold5",
                                     "ExTree_Fold1", "ExTree_Fold2", "ExTree_Fold3", "ExTree_Fold4", "ExTree_Fold5",
                                     "NNnorm_Fold1", "NNnorm_Fold2", "NNnorm_Fold3", "NNnorm_Fold4", "NNnorm_Fold5")

In [55]:
#sanity check
 dim(coef.super.matrix.2.5k)
 coef.super.matrix.2.5k[, 1:6]

Unnamed: 0,ENSG00000000003.13,ENSG00000000971.14,ENSG00000001561.6,ENSG00000002586.16,ENSG00000003137.7,ENSG00000003436.13
LogReg_Fold1,0.4220371,0.4750274,0.4467388,0.4724761,0.6266091,0.3769483
LogReg_Fold2,0.5936591,0.5672079,0.5658191,0.5866972,0.5670364,0.7258142
LogReg_Fold3,0.3319382,0.3299018,0.3555429,0.3527267,0.377292,0.3436786
LogReg_Fold4,0.4269907,0.4010694,0.4007951,0.3332993,0.3777309,0.3347019
LogReg_Fold5,0.6564436,0.5070464,0.6832262,0.6491977,0.7280068,0.4970034
Lasso_Fold1,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436,0.4069436
Lasso_Fold2,0.5634031,0.5634031,0.5634031,0.5634031,0.5634031,0.5634031
Lasso_Fold3,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463,0.4216463
Lasso_Fold4,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079,0.5413079
Lasso_Fold5,0.750921,0.750921,0.7962521,0.750921,0.750921,0.750921


### 1k gene super matrix

In [56]:
coef.super.matrix.1k = matrix(ncol = ncol(log.reg.coef.1k), data = 0)
colnames(coef.super.matrix.1k) = colnames(log.reg.coef.1k)

#add log reg coef to super matrix
for(i in 1:nrow(log.reg.coef.1k)) {
    coef.super.matrix.1k = rbind(coef.super.matrix.1k, log.reg.coef.norm.1k[i,]) 
}

for(i in 1:nrow(lasso.coef.1k)) {
    coef.super.matrix.1k = rbind(coef.super.matrix.1k, lasso.coef.norm.1k[i,]) 
}

#add decision tree important features to decision tree
for(i in 1:nrow(decision.tree.feature.importance.1k)) {
    coef.super.matrix.1k = rbind(coef.super.matrix.1k, decision.tree.importance.norm.1k[i,]) 
}

#add extra tree important features to decision tree
for(i in 1:nrow(extra.tree.feature.importance.1k)) {
    coef.super.matrix.1k = rbind(coef.super.matrix.1k, extra.tree.importance.norm.1k[i,]) 
}

# add important features for neural net
for(i in 1:nrow(neural.net.normalized.1k)) {
    coef.super.matrix.1k = rbind(coef.super.matrix.1k, neural.net.normalized.1k[i,]) 
}

#remove first row
coef.super.matrix.1k = coef.super.matrix.1k[-1, ]

#rename rows
rownames(coef.super.matrix.1k) = c("LogReg_Fold1", "LogReg_Fold2", "LogReg_Fold3", "LogReg_Fold4", "LogReg_Fold5",
                                   "Lasso_Fold1", "Lasso_Fold2", "Lasso_Fold3", "Lasso_Fold4", "Lasso_Fold5",
                                   "DecTree_Fold1", "DecTree_Fold2", "DecTree_Fold3", "DecTree_Fold4", "DecTree_Fold5",
                                   "ExTree_Fold1", "ExTree_Fold2", "ExTree_Fold3", "ExTree_Fold4", "ExTree_Fold5",
                                   "NNnorm_Fold1", "NNnorm_Fold2", "NNnorm_Fold3", "NNnorm_Fold4", "NNnorm_Fold5")

In [57]:
#sanity check
 dim(coef.super.matrix.1k)
 coef.super.matrix.1k[, 1:6]

Unnamed: 0,ENSG00000000003.13,ENSG00000002586.16,ENSG00000003436.13,ENSG00000004139.12,ENSG00000004399.11,ENSG00000005073.5
LogReg_Fold1,0.252592,0.63174319,0.45309848,0.2984767,0.66144338,0.45080045
LogReg_Fold2,0.5464902,0.71914688,0.40278551,0.468456,0.68131716,0.5372777
LogReg_Fold3,0.546131,0.35468475,0.52870728,0.4531102,0.53254638,0.57954985
LogReg_Fold4,0.6511066,0.69886484,0.6830919,0.6510005,0.80522893,0.60072555
LogReg_Fold5,0.619475,0.4718451,0.67253394,0.8636829,0.52272873,0.56926622
Lasso_Fold1,0.5261222,0.52612218,0.52612218,0.5261222,0.52612218,0.52612218
Lasso_Fold2,0.580629,0.58062897,0.58062897,0.580629,0.58062897,0.58062897
Lasso_Fold3,0.5625802,0.56258025,0.56258025,0.5625802,0.56258025,0.56258025
Lasso_Fold4,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633,0.6388633
Lasso_Fold5,0.8144165,0.81441652,0.81441652,0.8144165,0.81441652,0.81441652


### 100 gene super matrix

In [58]:
coef.super.matrix.100 = matrix(ncol = ncol(log.reg.coef.100), data = 0)
colnames(coef.super.matrix.100) = colnames(log.reg.coef.100)

#add log reg coef to super matrix
for(i in 1:nrow(log.reg.coef.100)) {
    coef.super.matrix.100 = rbind(coef.super.matrix.100, log.reg.coef.norm.100[i,]) 
}

for(i in 1:nrow(lasso.coef.100)) {
    coef.super.matrix.100 = rbind(coef.super.matrix.100, lasso.coef.norm.100[i,]) 
}

#add decision tree important features to decision tree
for(i in 1:nrow(decision.tree.feature.importance.100)) {
    coef.super.matrix.100 = rbind(coef.super.matrix.100, decision.tree.importance.norm.100[i,]) 
}

#add extra tree important features to decision tree
for(i in 1:nrow(extra.tree.feature.importance.100)) {
    coef.super.matrix.100 = rbind(coef.super.matrix.100, extra.tree.importance.norm.100[i,]) 
}

# add important features for neural net
for(i in 1:nrow(neural.net.normalized.100)) {
    coef.super.matrix.100 = rbind(coef.super.matrix.100, neural.net.normalized.norm.100[i,]) 
}

#remove first row
coef.super.matrix.100 = coef.super.matrix.100[-1, ]

#rename rows
rownames(coef.super.matrix.100) = c("LogReg_Fold1", "LogReg_Fold2", "LogReg_Fold3", "LogReg_Fold4", "LogReg_Fold5",
                                    "Lasso_Fold1", "Lasso_Fold2", "Lasso_Fold3", "Lasso_Fold4", "Lasso_Fold5",
                                    "DecTree_Fold1", "DecTree_Fold2", "DecTree_Fold3", "DecTree_Fold4", "DecTree_Fold5",
                                    "ExTree_Fold1", "ExTree_Fold2", "ExTree_Fold3", "ExTree_Fold4", "ExTree_Fold5",
                                    "NNnorm_Fold1", "NNnorm_Fold2", "NNnorm_Fold3", "NNnorm_Fold4", "NNnorm_Fold5")

ERROR: Error in rbind(coef.super.matrix.100, neural.net.normalized.norm.100[i, : object 'neural.net.normalized.norm.100' not found


In [None]:
#sanity check
 dim(coef.super.matrix.100)
 coef.super.matrix.100[, 1:6]

In [None]:
for (i in 1:nrow(coef.super.matrix.10k)) {
    #remove NAs
    coef.super.matrix.10k[is.na(coef.super.matrix.10k)] = 0
}
for (i in 1:nrow(coef.super.matrix.5k)) {
    #remove NAs
    coef.super.matrix.5k[is.na(coef.super.matrix.5k)] = 0
}
for (i in 1:nrow(coef.super.matrix.2.5k)) {
    #remove NAs
    coef.super.matrix.2.5k[is.na(coef.super.matrix.2.5k)] = 0
}
for (i in 1:nrow(coef.super.matrix.1k)) {
    #remove NAs
    coef.super.matrix.1k[is.na(coef.super.matrix.1k)] = 0
}
for (i in 1:nrow(coef.super.matrix.100)) {
    #remove NAs
    coef.super.matrix.100[is.na(coef.super.matrix.100)] = 0
}

### Write to file

In [None]:
#write.table(coefficients.10k, file="../Data_Analysis/log_reg/LogRegCoef_10k.txt", append=FALSE, sep="\t", dec=".", row.names=TRUE, col.names=TRUE)
write.table(coef.super.matrix.10k, file="super_matrix/coef_10k.txt", append=FALSE, sep="\t", dec=".", row.names=TRUE, col.names=TRUE)
write.table(coef.super.matrix.5k, file="super_matrix/coef_5k.txt", append=FALSE, sep="\t", dec=".", row.names=TRUE, col.names=TRUE)
write.table(coef.super.matrix.2.5k, file="super_matrix/coef_2.5k.txt", append=FALSE, sep="\t", dec=".", row.names=TRUE, col.names=TRUE)
write.table(coef.super.matrix.1k, file="super_matrix/coef_1k.txt", append=FALSE, sep="\t", dec=".", row.names=TRUE, col.names=TRUE)
write.table(coef.super.matrix.100, file="super_matrix/coef_100.txt", append=FALSE, sep="\t", dec=".", row.names=TRUE, col.names=TRUE)

In [None]:
write.table(accuracy.super.matrix.10k, file="super_matrix/acc_10k.txt", append=FALSE, sep="\t", dec=".", row.names=TRUE, col.names=TRUE)
write.table(accuracy.super.matrix.5k, file="super_matrix/acc_5k.txt", append=FALSE, sep="\t", dec=".", row.names=TRUE, col.names=TRUE)
write.table(accuracy.super.matrix.2.5k, file="super_matrix/acc_2.5k.txt", append=FALSE, sep="\t", dec=".", row.names=TRUE, col.names=TRUE)
write.table(accuracy.super.matrix.1k, file="super_matrix/acc_1k.txt", append=FALSE, sep="\t", dec=".", row.names=TRUE, col.names=TRUE)
write.table(accuracy.super.matrix.100, file="super_matrix/acc_100.txt", append=FALSE, sep="\t", dec=".", row.names=TRUE, col.names=TRUE)