In [1]:
# Silence warnings for the rest of the session by setting a negative `warn` level.
options(warn = -1)

In [2]:
library(openxlsx)

In [3]:
# Load the challenge sample sheet, starting at row 2 of the workbook.
# NOTE(review): the preview below still shows a header-like first data row
# ("S No.", "Data Type", ...) - confirm the sheet layout.
sample_info <- read.xlsx(
  xlsxFile = "CAMDA_Challange_dataset_filenames.xlsx",
  startRow = 2
)

In [4]:
# Keep columns 1-4 and 6 of the sheet and give them short, explicit names.
sample_info <- sample_info[, c(1:4, 6)]
colnames(sample_info) <- c("ID", "Category", "Label", "MCF7", "PC3")

In [5]:
# Preview the first six rows of the sample sheet.
head(sample_info, n = 6L)

ID,Category,Label,MCF7,PC3
S No.,Data Type,Class Label,Compound .cel file name,Compound .cel file name
1,Training,1,5500024030403071907255.C05,5500024031723100807775.C05
2,Training,1,5500024030403071907253.A09,5500024031723100807771.A09
3,Training,0,5500024030402071707279.B01,5500024031723100807776.B01
4,Training,1,5500024030403071907257.G04,5500024030700072107992.G04
5,Training,0,'5500024037496121008324.E08,'5500024037498121108438.E02


## Cleaning columns.

In the columns, there are some extra characters, for example a leading apostrophe (') and trailing spaces.
In simple terms, we remove these extra characters.
gsub replaces every match of the given pattern.

In [6]:
# Drop the trailing space that follows "Training" in the Category column.
sample_info$Category <- gsub("Training ", "Training", sample_info$Category)

# Strip the leading apostrophe from the CEL file names of both cell lines.
sample_info[c("MCF7", "PC3")] <- lapply(
  sample_info[c("MCF7", "PC3")],
  function(cel_names) gsub("^\'", "", cel_names)
)

In [7]:
# Keep only the rows flagged as training data.
# `%in%` never returns NA, so rows with a missing Category are dropped
# instead of being turned into all-NA rows (which `==` would produce).
sample_train <- sample_info[sample_info$Category %in% "Training", ]
head(sample_train)

Unnamed: 0,ID,Category,Label,MCF7,PC3
2,1,Training,1,5500024030403071907255.C05,5500024031723100807775.C05
3,2,Training,1,5500024030403071907253.A09,5500024031723100807771.A09
4,3,Training,0,5500024030402071707279.B01,5500024031723100807776.B01
5,4,Training,1,5500024030403071907257.G04,5500024030700072107992.G04
6,5,Training,0,5500024037496121008324.E08,5500024037498121108438.E02
7,6,Training,0,5500024032848101507997.F03,5500024035736031208613.F03


In [8]:
# Encode the class label of the training samples as a factor:
# 1 is shown as "Positive" and 0 as "Negative".
Label <- factor(
  sample_train$Label,
  levels = c(1, 0),
  labels = c("Positive", "Negative")
)

In [9]:
# needed to do the partition

library(caret)


Loading required package: lattice
Loading required package: ggplot2


In [10]:
# Pick 60% of the labelled samples for training; the remaining 40% are for
# testing. The fixed seed makes the draw reproducible.
set.seed(2018)
inTraining <- createDataPartition(
  y = Label,
  times = 1,
  p = 0.6,
  list = FALSE
)
head(inTraining)

Resample1
1
4
6
7
8
9


In [11]:
library(magrittr)

# these libraries are needed for visualization
library(dplyr)
library(kableExtra)
library(knitr)
library(formattable)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



In [55]:
# Render the sample sheet as a striped, scrollable HTML table.
# Category cells: white text on blue for "Training", on dark gray otherwise.
# Label cells: dark gray text when missing, red for 1, green for anything else.
sample_info %>%
  as.data.frame() %>%
  mutate(
    Category = cell_spec(
      Category,
      color = "white",
      background = ifelse(Category == "Training", "Blue", "darkgray")
    ),
    Label = cell_spec(
      Label,
      color = ifelse(
        is.na(Label),
        "darkgray",
        ifelse(Label == 1, "Red", "Green")
      )
    )
  ) %>%
  kable(format = "html", align = "c", escape = FALSE) %>%
  kable_styling(bootstrap_options = "striped") %>%
  scroll_box(height = "300px")


Setting cell_spec format as html
Setting cell_spec format as html


<div style="border: 1px solid #ddd; padding: 5px; overflow-y: scroll; height:300px; "><table class="table table-striped" style="margin-left: auto; margin-right: auto;">
 <thead>
  <tr>
   <th style="text-align:center;"> ID </th>
   <th style="text-align:center;"> Category </th>
   <th style="text-align:center;"> Label </th>
   <th style="text-align:center;"> MCF7 </th>
   <th style="text-align:center;"> PC3 </th>
  </tr>
 </thead>
<tbody>
  <tr>
   <td style="text-align:center;"> S No. </td>
   <td style="text-align:center;"> <span style="     color: white;border-radius: 4px; padding-right: 4px; padding-left: 4px; background-color: darkgray;">Data Type </span> </td>
   <td style="text-align:center;"> <span style="     color: Green;">Class Label</span> </td>
   <td style="text-align:center;"> Compound .cel file name </td>
   <td style="text-align:center;"> Compound .cel file name </td>
  </tr>
  <tr>
   <td style="text-align:center;"> 1 </td>
   <td style="text-align:center;"> <span sty

## PC3 Data Setup

In [15]:
# for unzipping the file
library(R.utils)

In [16]:
# unzip all files

# Decompress the bzip2-compressed PC3 CEL file of every training sample.
# `count` ends up holding the number of archives that were decompressed.
count <- 0

# seq_len() gives an empty sequence when sample_train has no rows, whereas
# 1:nrow() would iterate over c(1, 0).
for (i in seq_len(nrow(sample_train))) {
    tmp_PC3_cel <- sample_train$PC3[i]

    file_handle <- paste0("PC3/", tmp_PC3_cel, ".CEL.bz2")
    tmp_file <- paste0("PC3/", tmp_PC3_cel, ".CEL.tmp")

    if (file.exists(file_handle)) {
        # Remove a temporary file left behind by an earlier, interrupted run.
        if (file.exists(tmp_file)) {
            file.remove(tmp_file)
        }
        # Unzip the bz2 archive.
        bunzip2(file_handle)

        count <- count + 1
    } else {
        cat(paste0("There is no cel file named ", file_handle, "\n"))
    }
}

print(count)

[1] 190


In [18]:
library(limma)
library(affy)

In [19]:
# Start all three accumulators empty:
#   missing_drug           - IDs of samples whose CEL file cannot be found
#   Final_matrix_PC3       - expression columns of the first array type seen
#   Final_matrix_PC3_type2 - expression columns of a second array type
missing_drug <- Final_matrix_PC3 <- Final_matrix_PC3_type2 <- NULL

In [20]:
# Normalise every PC3 training CEL file with RMA, one file at a time, and sort
# the resulting expression column into one of two matrices by its probe (row)
# count. Samples whose CEL file is missing are recorded in `missing_drug`.
# seq_len() keeps the loop from running when sample_train has no rows.
for (i in seq_len(nrow(sample_train))) {
    tmp_PC3_cel <- sample_train$PC3[i]
    file_handle_2 <- paste0("PC3/", tmp_PC3_cel, ".CEL")

    # Guard clause: record the sample ID and move on when the file is absent.
    if (!file.exists(file_handle_2)) {
        cat(paste0("There is no cel file named ", file_handle_2, "\n"))
        missing_drug <- cbind(missing_drug, sample_train$ID[i])
        next
    }

    data_rma <- just.rma(file_handle_2)  # rma normalization
    data_exp <- exprs(data_rma)

    # The first array seen defines "type 1"; the first array with a different
    # row count defines "type 2". Anything else is reported and skipped.
    if (is.null(nrow(Final_matrix_PC3)) || nrow(Final_matrix_PC3) == nrow(data_exp)) {
        Final_matrix_PC3 <- cbind(Final_matrix_PC3, data_exp)
        print("Number 1")
    } else if (is.null(nrow(Final_matrix_PC3_type2)) || nrow(Final_matrix_PC3_type2) == nrow(data_exp)) {
        Final_matrix_PC3_type2 <- cbind(Final_matrix_PC3_type2, data_exp)
        print("Number 2")
    } else {
        cat("Cannot match any existed ArrayType!\n")
    }
}

also installing the dependencies 'bit', 'prettyunits', 'bit64', 'blob', 'IRanges', 'DBI', 'RSQLite', 'S4Vectors', 'AnnotationDbi'



package 'bit' successfully unpacked and MD5 sums checked
package 'prettyunits' successfully unpacked and MD5 sums checked
package 'bit64' successfully unpacked and MD5 sums checked
package 'blob' successfully unpacked and MD5 sums checked
package 'IRanges' successfully unpacked and MD5 sums checked
package 'DBI' successfully unpacked and MD5 sums checked
package 'RSQLite' successfully unpacked and MD5 sums checked
package 'S4Vectors' successfully unpacked and MD5 sums checked
package 'AnnotationDbi' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\Acer\AppData\Local\Temp\Rtmpak3bJ6\downloaded_packages


installing the source package 'hthgu133acdf'




[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"


installing the source package 'hgu133acdf'



Attaching package: 'hgu133acdf'

The following objects are masked from 'package:hthgu133acdf':

    i2xy, xy2i



[1] "Number 2"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 2"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 2"
[1] "Number 1"
[1] "Number 2"
[1] "Number 1"
[1] "Number 1"
[1] "Number 2"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 2"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 2"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Number 1"
[1] "Numbe

In [21]:
# Merge the two per-array-type expression matrices on the probes they share.
common_probe <- intersect(rownames(Final_matrix_PC3), rownames(Final_matrix_PC3_type2))

if (is.null(Final_matrix_PC3) || is.null(Final_matrix_PC3_type2)) {
  # Only one array type (or none) was seen: there is nothing to intersect.
  Final_matrix_PC3_2 <- NULL
  data_PC3 <- cbind(Final_matrix_PC3, Final_matrix_PC3_type2)
} else {
  # Restrict BOTH matrices to the shared probes so their rows line up even when
  # the first array type has probes the second lacks. drop = FALSE keeps a
  # single-column matrix from collapsing into a plain vector.
  Final_matrix_PC3_2 <- Final_matrix_PC3_type2[common_probe, , drop = FALSE]
  data_PC3 <- cbind(
    Final_matrix_PC3[common_probe, , drop = FALSE],
    Final_matrix_PC3_2
  )
}

In [35]:
# Export the combined probe-by-sample expression matrix to CSV.
write.csv(x = data_PC3, file = "pc3_data.csv")

In [27]:
# Report the size of the combined expression matrix, then show its first six rows.
dim(data_PC3)
head(data_PC3, n = 6L)

Unnamed: 0,5500024031723100807775.C05.CEL,5500024031723100807771.A09.CEL,5500024031723100807776.B01.CEL,5500024030700072107992.G04.CEL,5500024037498121108438.E02.CEL,5500024035736031208613.F03.CEL,5500024031723100807771.F10.CEL,5500024024213121906564.B01.CEL,5500024035736031208613.H04.CEL,5500024035736031208613.D01.CEL,...,EC2004070116AA.CEL,610611110806.B08.CEL,EC2004060209AA.CEL,610611110806.C02.CEL,610611110806.C08.CEL,610611110806.G07.CEL,EC2004070117AA.CEL,EC2005030717AA.CEL,610611110806.F06.CEL,610611110806.H12.CEL
1007_s_at,9.714717,10.007076,9.787519,9.741467,7.086778,8.743882,9.222144,9.877804,9.029211,8.992003,...,9.268386,9.120865,8.522663,8.677258,8.102172,9.262965,9.316974,8.558927,9.398136,9.60684
1053_at,8.885926,9.581804,8.980832,9.246175,6.117286,7.893153,8.8895,7.867493,7.965054,7.978511,...,8.164277,7.153838,7.060515,5.809041,5.4496,6.31485,8.259587,7.082478,6.501114,6.956225
117_at,5.523915,6.103792,6.041133,5.269015,3.514054,4.332979,5.577933,4.331766,4.795756,4.962872,...,7.334316,4.832424,6.460355,4.04992,3.253301,4.804609,6.698009,6.243701,4.808187,5.687255
121_at,7.035597,6.790657,6.940681,6.830067,4.248243,6.201345,5.908835,6.74403,6.075953,6.52099,...,9.464658,6.907116,8.506009,6.70178,5.720865,7.178489,9.399568,8.232961,7.238113,7.594323
1255_g_at,4.502539,4.963646,5.158741,4.592952,3.014705,4.034152,4.60487,3.857908,4.09459,4.374287,...,5.12254,3.163814,3.9186,2.904824,2.361652,3.759726,4.996464,3.714551,3.265466,3.490424
1294_at,5.212404,5.762783,6.072555,4.987831,3.359254,4.69599,5.350683,4.323544,4.474941,4.876034,...,7.328417,4.652221,6.664024,4.565203,3.444351,4.820261,7.234724,6.572356,5.117041,5.914645


In [30]:
# Bind the class label to each sample (one row per sample after transposing).
# The columns of data_PC3 are grouped by array type, not kept in sample_train
# order, so each label is looked up by CEL file name rather than by position.
# Positional binding would pair samples with the wrong labels.
pc3_cel_names <- paste0(sample_train$PC3, ".CEL")
label_idx <- match(colnames(data_PC3), pc3_cel_names)
if (anyNA(label_idx)) {
  stop(
    "Some columns of data_PC3 do not match a PC3 file name in sample_train",
    call. = FALSE
  )
}

# Assigning the column directly also keeps the CEL file names as row names.
data_train_PC3 <- as.data.frame(t(data_PC3))
data_train_PC3$Label <- Label[label_idx]
dim(data_train_PC3)

In [40]:
# Export the labelled sample-by-probe table to CSV.
write.csv(x = data_train_PC3, file = "pc3_data_labels.csv")

In [48]:
# Show the first ten samples: the first five columns plus columns 22275-22278.
data_train_PC3[seq_len(10), c(1:5, 22275:22278)]

1007_s_at,1053_at,117_at,121_at,1255_g_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at,Label
9.714717,8.885926,5.523915,7.035597,4.502539,4.614942,4.537976,4.664314,Positive
10.007076,9.581804,6.103792,6.790657,4.963646,5.422156,4.883759,5.329954,Positive
9.787519,8.980832,6.041133,6.940681,5.158741,5.523111,5.046227,5.481926,Negative
9.741467,9.246175,5.269015,6.830067,4.592952,4.583175,4.377955,4.94105,Positive
7.086778,6.117286,3.514054,4.248243,3.014705,2.878322,2.82576,3.208905,Negative
8.743882,7.893153,4.332979,6.201345,4.034152,4.219357,3.946508,3.944669,Negative
9.222144,8.8895,5.577933,5.908835,4.60487,5.047445,4.733086,5.282131,Positive
9.877804,7.867493,4.331766,6.74403,3.857908,3.861522,3.770792,3.89171,Positive
9.029211,7.965054,4.795756,6.075953,4.09459,4.410633,4.260362,4.460469,Positive
8.992003,7.978511,4.962872,6.52099,4.374287,4.571669,4.42913,4.696386,Positive


In [36]:
# Store the labelled training table together with its sample sheet.
save(data_train_PC3, sample_train, file = "GDILI_traindata_PC3.RData")

In [None]:
## Original data, without the transpose and without the labels.

In [37]:
# Store the untransposed, unlabelled expression matrix with the sample sheet.
save(
  data_PC3,
  sample_train,
  file = "GDILI_traindata_PC3_without_transpose_label.RData"
)