<font color='blue'>Cell 1: Importing libraries

In [1]:
install.packages('deepnet')
library(deepnet)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



<font color='blue'>Cell 2: Read in the dataset

In [2]:
 
## Instructions for the user:
# Make sure you have downloaded the file "converted_data.csv" onto your computer.
# 1) Click the file icon on the left bar of the Google Colab interface.
# 2) Click the 'Upload to session storage' button.
# 3) Find the "converted_data.csv" file on your computer and upload it to the
#    session storage.

# Read the uploaded CSV file into a data frame.
data <- read.csv("converted_data.csv")
# Show the dimensions (rows, columns) of the loaded data.
dim(data)
# Print a per-column summary of the loaded data.
summary(data)

       X          Protein          AA.Sequence        SST3.Sequence     
 Min.   :   0   Length:7117        Length:7117        Length:7117       
 1st Qu.:1779   Class :character   Class :character   Class :character  
 Median :3558   Mode  :character   Mode  :character   Mode  :character  
 Mean   :3558                                                           
 3rd Qu.:5337                                                           
 Max.   :7116                                                           
     Length        B.Count          C.Count            H.Count      
 Min.   :  29   Min.   :0.0000   Min.   :0.007042   Min.   :0.0000  
 1st Qu.:  93   1st Qu.:0.1084   1st Qu.:0.307210   1st Qu.:0.1765  
 Median : 159   Median :0.2669   Median :0.362745   Median :0.3484  
 Mean   : 205   Mean   :0.2665   Mean   :0.376588   Mean   :0.3568  
 3rd Qu.: 274   3rd Qu.:0.3913   3rd Qu.:0.425926   3rd Qu.:0.4900  
 Max.   :1419   Max.   :0.8243   Max.   :1.000000   Max.   :0.9930  
      

<font color='blue'>Cell 3: Remove proteins with non-standard amino acids

In [3]:
# Splits a dataset into the rows whose amino-acid sequence is standard and the
# indices of the rows that were rejected.
#
# A sequence counts as non-standard when it contains an 'X' or a ';'.
# The sequence is read from the third column of `crude_data`.
#
# Args:
#   crude_data: data frame whose third column holds the amino-acid sequences.
#
# Returns:
#   A list of two elements:
#     [[1]] `crude_data` without the non-standard rows,
#     [[2]] the integer indices of the rows that were removed (empty if none).
removeNonStandardAA <- function(crude_data){
        sequences <- as.character(crude_data[[3]])
        # Literal (non-regex) search for either marker, evaluated on the whole
        # column at once instead of row by row.
        is_nonstd <- grepl('X', sequences, fixed = TRUE) |
                grepl(';', sequences, fixed = TRUE)
        aa_nonstd <- which(is_nonstd)
        # Select the rows to keep with a logical mask. Negative indexing
        # (`crude_data[-aa_nonstd, ]`) would drop EVERY row when nothing was
        # flagged, because an empty negative index selects zero rows.
        std_list <- crude_data[!is_nonstd, , drop = FALSE]
        x <- list(std_list, aa_nonstd)
        return(x)
}
# Dataset dimensions before filtering
dim(data)
# Keep only the first element of the returned list (the filtered data frame)
data <- removeNonStandardAA(data)[[1]]
# Dataset dimensions after the non-standard rows have been removed
dim(data)

<font color='blue'>Cell 4: Data set verification

In [4]:
# Checks a dataset for missing values and reports where they are.
#
# R represents a missing cell as NA; a single cell of a data frame is never
# NULL, so the check has to use is.na() rather than is.null().
#
# Args:
#   data: data frame (or matrix) to inspect.
#
# Side effects:
#   Prints one line per missing cell (row and column index), or a confirmation
#   message when nothing is missing.
verifyDatasets <- function(data){
        # Locate every missing cell in one pass. Each row of `missing_cells`
        # is a (row, column) pair, listed column by column.
        missing_cells <- which(is.na(data), arr.ind = TRUE)
        all_good <- nrow(missing_cells) == 0
        for (m in seq_len(nrow(missing_cells))){
                # Trailing newline keeps each report on its own line
                cat('Null Data at row: ',missing_cells[m,1],' and column: ',missing_cells[m,2],'\n')
        }
        if (all_good){
                print("Dataset is complete. There are no missing values")
        }
}
# Run the verification on the current dataset
verifyDatasets(data)


[1] "Dataset is complete. There are no missing values"


<font color='blue'>Cell 5: Data set splitting function

In [5]:
# Randomly subsamples a dataset and splits the subsample into training and
# testing sets.
#
# Args:
#   data:       data frame to split.
#   limit:      fraction (0 to 1) of the rows of `data` to keep.
#   train_size: fraction (0 to 1) of the kept rows to use for training.
#
# Returns:
#   A list of two data frames: [[1]] the training rows, [[2]] the testing rows.
#   Fractional row counts are rounded down.
splitDataset <- function(data, limit, train_size){
        stopifnot(limit >= 0, limit <= 1, train_size >= 0, train_size <= 1)
        # reduce the number of rows of data
        n_keep <- floor(limit*nrow(data))
        data <- data[sample.int(nrow(data), n_keep), , drop = FALSE]
        # number of training rows, based on the train_size fraction
        n_train <- floor(nrow(data)*train_size)
        training_indices <- sample.int(nrow(data), n_train)
        # Mark the training rows with a logical mask. Negative indexing
        # (`data[-training_indices, ]`) would return an EMPTY test set when no
        # training rows were drawn, instead of every row.
        in_training <- seq_len(nrow(data)) %in% training_indices
        training_data <- data[training_indices, , drop = FALSE]
        testing_data <- data[!in_training, , drop = FALSE]
        # returns a list of the two datasets
        x <- list(training_data,testing_data)
        return(x)
}

<font color='blue'>Cell 6: Creating the amino acid encoding alphabet

In [6]:
# One-letter codes of the 20 standard amino acids, followed by the extra
# 'NULL' symbol.
aa_codes <- c(
        'A','C','D','E','F','G','H','I','K','L',
        'M','N','P','Q','R','S','T','V','W','Y',
        'NULL'
)
# One-hot encoding table: an identity matrix (1 on the diagonal, 0 elsewhere)
# whose rows and columns are both labelled with the codes, so the column named
# after a code is that code's one-hot vector.
aa_alphabet <- diag(length(aa_codes))
dimnames(aa_alphabet) <- list(aa_codes, aa_codes)
aa_alphabet

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,⋯,N,P,Q,R,S,T,V,W,Y,NULL
A,1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
C,0,1,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
D,0,0,1,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
E,0,0,0,1,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
F,0,0,0,0,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
G,0,0,0,0,0,1,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
H,0,0,0,0,0,0,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
I,0,0,0,0,0,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0
K,0,0,0,0,0,0,0,0,1,0,⋯,0,0,0,0,0,0,0,0,0,0
L,0,0,0,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0


<font color='blue'>Cell 7: Creating the secondary structure encoding alphabet


In [7]:
# Codes of the three secondary-structure classes.
sst3_codes <- c('B','C','H')
# One-hot encoding table for the structure classes: an identity matrix whose
# rows and columns are labelled with the codes.
sst3_alphabet <- diag(length(sst3_codes))
dimnames(sst3_alphabet) <- list(sst3_codes, sst3_codes)
sst3_alphabet


Unnamed: 0,B,C,H
B,1,0,0
C,0,1,0
H,0,0,1


<font color='blue'>Cell 8: Amino acid encoding and windowing function

In [8]:
# One-hot encodes every amino-acid sequence of a dataset with a sliding window.
#
# Each residue of each sequence yields one output row. The row is the
# concatenation of the one-hot codes of the `window_size` residues in the
# window that starts floor(window_size/2) positions before that residue.
# Positions that fall outside the sequence are encoded with the 'NULL' code.
#
# Args:
#   data_set:    data frame whose third column holds the amino-acid sequences.
#   window_size: number of residues per window (e.g. 17).
#   alphabet:    one-hot encoding matrix with one named column per residue
#                code plus a 'NULL' column; defaults to the global
#                `aa_alphabet`.
#
# Returns:
#   A numeric matrix with one row per residue (all sequences stacked in order)
#   and window_size * nrow(alphabet) columns.
aaEncode <- function(data_set, window_size, alphabet = aa_alphabet){
        # length of one residue code; taken from the alphabet instead of a
        # hard-coded 21 so the windowing always agrees with the row length
        code_length <- nrow(alphabet)
        row_length <- window_size*code_length
        # null size is half of the window size (rounded down)
        null_size <- floor(window_size/2)
        null_pad <- rep('NULL', null_size)
        sequences <- as.character(data_set[[3]])

        # Encodes a single sequence into a (residues x row_length) matrix
        encodeSequence <- function(sequence){
                residues <- unlist(strsplit(sequence, ""))
                unknown <- setdiff(residues, colnames(alphabet))
                if (length(unknown) > 0){
                        stop("Unknown amino acid code(s): ",
                             paste(unknown, collapse = ", "), call. = FALSE)
                }
                # One column per position: padding, the residues, padding
                padded <- alphabet[, c(null_pad, residues, null_pad), drop = FALSE]
                x_data <- matrix(0, nrow = length(residues), ncol = row_length)
                # Windowing: flatten `window_size` consecutive code columns
                # into one row per residue
                for (k in seq_along(residues)){
                        x_data[k,] <- as.vector(padded[, k:(k+window_size-1), drop = FALSE])
                }
                x_data
        }

        encoded <- lapply(sequences, encodeSequence)
        if (length(encoded) == 0){
                return(matrix(0, nrow = 0, ncol = row_length))
        }
        # Stack all sequences once instead of growing the result with rbind()
        # inside the loop
        do.call(rbind, encoded)
}

<font color='blue'>Cell 9: Secondary Structure encoding function

In [9]:
# One-hot encodes the secondary-structure sequence of every row of a dataset.
#
# Args:
#   data_set: data frame whose fourth column holds the structure sequences
#             (one character per residue).
#   alphabet: one-hot encoding matrix with one named column per structure
#             code; defaults to the global `sst3_alphabet`.
#
# Returns:
#   A numeric matrix with one row per residue (all sequences stacked in order)
#   and nrow(alphabet) columns, without dimnames.
sst3Encode <- function(data_set, alphabet = sst3_alphabet){
        structures <- as.character(data_set[[4]])

        # Encodes a single structure sequence into a (residues x codes) matrix
        encodeStructure <- function(structure){
                states <- unlist(strsplit(structure, ""))
                unknown <- setdiff(states, colnames(alphabet))
                if (length(unknown) > 0){
                        stop("Unknown secondary structure code(s): ",
                             paste(unknown, collapse = ", "), call. = FALSE)
                }
                # Selecting the code columns for all residues at once gives a
                # (codes x residues) matrix; transpose it to one row per residue
                unname(t(alphabet[, states, drop = FALSE]))
        }

        encoded <- lapply(structures, encodeStructure)
        if (length(encoded) == 0){
                return(matrix(0, nrow = 0, ncol = nrow(alphabet)))
        }
        # Stack all sequences once instead of growing the result with rbind()
        # inside the loop
        do.call(rbind, encoded)
}

<font color='blue'>Cell 10: Running the train/test splitting function

In [10]:
# NOTE: splitDataset() draws rows with sample(), so the split differs between
# runs unless a random seed is set beforehand.
data_frac <- 0.01     # fraction of the total data to use for the training and test sets combined
train_frac <-  0.7    # fraction of the used data to put in the training set

# splitDataset() returns a list: [[1]] training rows, [[2]] testing rows
new_data <- splitDataset(data, data_frac, train_frac)
train_data <- new_data[[1]]
test_data <- new_data[[2]]

<font color='blue'>Cell 11: Encoding the training/testing input and output data

In [11]:
window_size <- 17  # A window size of 17 allows 8 amino acids before and 8 after the current amino acid.
# Inputs: windowed one-hot encoding of the amino-acid sequences
x_train <- aaEncode(train_data,window_size) 
x_test <- aaEncode(test_data,window_size)

# Targets: one-hot encoding of the secondary-structure sequences
y_train <- sst3Encode(train_data)
y_test <- sst3Encode(test_data)



<font color='blue'>Cell 12: Implement the neural net train function

In [12]:
# Train the neural net
# Number of units in each hidden layer (here a single layer of 6 units)
hidden_layers <- c(6)
# Fit the network on the encoded training data for 5 epochs, using the 'sigm'
# activation function and a softmax output
nn <- nn.train(x_train, y_train, hidden = c(hidden_layers),activationfun = 'sigm', numepochs = 5, output = "softmax")

# The number of input, hidden, and output units, respectively
nn$size


<font color='blue'>Cell 13: Make Predictions using the trained Neural net 

In [13]:
# Predict the class scores for the encoded test inputs
prediction <- nn.predict(nn,x_test)
# The predictions and the encoded targets should have the same dimensions
dim(prediction)
dim(y_test)
# First and last row of the preview shown below.
# NOTE(review): these variables shadow the names of base::min and base::max;
# calls such as min(x) still resolve to the functions, but renaming the
# variables would be clearer.
# NOTE(review): the preview assumes the test set has at least 15 encoded rows.
min <- 1
max <- 15
# Side-by-side preview: rounded predicted scores next to the actual one-hot
# targets, with blank separator columns that carry the two headings
results <- cbind("",round(prediction[min:max,],2),"",y_test[min:max,])
colnames(results)<-c('Predictions:','B','C','H','Actual:', 'B','C','H')
results

Predictions:,B,C,H,Actual:,B.1,C.1,H.1
,0.12,0.83,0.05,,0,1,0
,0.28,0.65,0.07,,1,0,0
,0.64,0.24,0.12,,1,0,0
,0.76,0.15,0.09,,1,0,0
,0.79,0.09,0.13,,1,0,0
,0.79,0.1,0.1,,1,0,0
,0.69,0.24,0.08,,1,0,0
,0.47,0.41,0.13,,1,0,0
,0.15,0.57,0.28,,0,1,0
,0.16,0.71,0.13,,0,1,0


<font color='blue'>Cell 14: Creating the Confusion Matrix

In [14]:
# Class labels, in the column order of the one-hot encoded targets
class_labels <- c('B','C','H')
# Predicted class of each row = index of the column with the highest score
predicted_values <- apply(prediction, 1, which.max)
# Observed class of each row, obtained the same way from the one-hot targets
observed_values <- apply(y_test, 1, which.max)
# Create the table from factors with fixed levels, so it is always 3 x 3 and
# correctly labelled. Relabelling a plain table with
# `rownames(table) <- c('B','C','H')` fails with a length mismatch whenever a
# class is never predicted (or never observed).
table <- table(
        predicted_values = factor(predicted_values,
                                  levels = seq_along(class_labels),
                                  labels = class_labels),
        observed_values = factor(observed_values,
                                 levels = seq_along(class_labels),
                                 labels = class_labels)
)

table
# Normalizes a confusion matrix row by row.
#
# Every cell is divided by the total of its row, so each row with a non-zero
# total sums to 1 (before rounding). A row whose total is 0 yields NaN.
#
# Args:
#   cm: confusion matrix (a two-dimensional table or numeric matrix).
#
# Returns:
#   An object of the same shape and dimnames as `cm`, holding the row
#   proportions rounded to 2 decimal places.
normalizeCM <- function(cm){
        # Nothing to normalize in an empty matrix
        if (length(cm) == 0){
                return(cm)
        }
        # margin = 1 divides every cell by its row total in one vectorized
        # step, instead of recomputing the row sum for each cell
        cm_norm <- prop.table(cm, margin = 1)
        return(round(cm_norm, 2))
}
# Print a blank line, the heading, and a line break in a single call
cat("\n", "Normalized Confusion Matrix: ", "\n", sep = "")
# Show the row-normalized confusion matrix
normalizeCM(table)

                

                observed_values
predicted_values    B    C    H
               B  467  148  323
               C  521 1410  724
               H  149  105  831


Normalized Confusion Matrix: 


                observed_values
predicted_values    B    C    H
               B 0.50 0.16 0.34
               C 0.20 0.53 0.27
               H 0.14 0.10 0.77