In [1]:
#run first
import matplotlib.pyplot as plt
import sklearn.metrics as skm

import docopt
import numpy as np
from Bio import SeqIO
import copy
from NNfxns import neural_network
from NNfxns import train


In [2]:
#run second

"""
-Make a neuralnet class
-define my functions
  -autoencoder and its parameters
  -initial values/input
  -vectorization
  -expected values
  -output
  -backwards propagation
  -feedforward propagation
  -weights/bias
  -sigmoid function
  -sigmoid derivative

"""
# Make a class to allow self reference
class neural_network():
    """Fully connected input -> hidden -> output network with sigmoid units.

    The network is trained one example at a time: call
    ``setin_n_exp_values`` to load a DNA string and its target, then
    ``forward_propogation``, ``backward_propogation`` and ``weight_bias_renew``.

    Attributes set by the methods below:
        input_vector / input_with_bias: encoded input, without / with a trailing 1.
        hidden_neur_output / hidden_output_bias: hidden activations, without / with a trailing 1.
        out_neur_output: output activations (also readable as ``output_layer_output``).
        bias_mat1 / bias_mat2: weight matrices with the bias weights as the last row.
        cnx_matrix_1 / cnx_matrix_2: the same weights without the bias row.
        errors_mat1 / errors_mat2: the most recent weight updates (already scaled
            by ``-learning_rate``), same shapes as ``bias_mat1`` / ``bias_mat2``.
    """

    def __init__(self, input_neur=8, hidden_neur=3, output_neur=8):
        """Create a network with the given layer sizes and random starting weights.

        Args:
            input_neur: number of input neurons (4 per nucleotide of the input string).
            hidden_neur: number of hidden neurons.
            output_neur: number of output neurons.
        """
        self.input_neur = input_neur
        self.hidden_neur = hidden_neur
        self.output_neur = output_neur

        # Starting weights are drawn from a standard normal and divided by 10
        # (standard deviation 0.1); they are small but not bounded.
        self.cnx_matrix_1 = np.random.randn(self.input_neur, self.hidden_neur) / 10
        self.cnx_matrix_2 = np.random.randn(self.hidden_neur, self.output_neur) / 10

        # Activations from the most recent forward pass.
        self.input_vector = None
        self.hidden_neur_output = None
        self.out_neur_output = None

        # Bias-augmented inputs and weight matrices (filled in by
        # setin_n_exp_values / forward_propogation / initialize_values).
        self.input_with_bias = None
        self.hidden_output_bias = None
        self.bias_mat1 = None
        self.bias_mat2 = None

        # Diagonal matrices holding the sigmoid derivative of each layer's output.
        self.hidden_dx_matrix = np.zeros([self.hidden_neur, self.hidden_neur])
        self.output_dx_matrix = np.zeros([self.output_neur, self.output_neur])

        # Weight updates computed by the most recent backward pass.
        self.errors_mat1 = None
        self.errors_mat2 = None

        # Step size applied to every weight update; a large value trains quickly
        # but may settle on a suboptimal set of weights.
        self.learning_rate = 1

        # One-hot encoding of each nucleotide as four 0/1 inputs.
        self.make_mulah = {'A': '0001',
                           'C': '0010',
                           'T': '0100',
                           'G': '1000'
                           }

        # Target output for the currently loaded example.
        self.expected_values = None

    @property
    def output_layer_output(self):
        """Read-only alias of ``out_neur_output``.

        The training/testing functions in this file read the network output
        under this name, so it must resolve to the same array.
        """
        return self.out_neur_output

    def initialize_values(self):
        """Build ``bias_mat1`` / ``bias_mat2`` by appending a row of ones (the
        initial bias weights) to the current connection matrices."""
        self.bias_mat1 = np.vstack([self.cnx_matrix_1, np.ones([1, self.hidden_neur])])
        self.bias_mat2 = np.vstack([self.cnx_matrix_2, np.ones([1, self.output_neur])])

    def setin_n_exp_values(self, dnaIN, autoencoder=True, negative=True):
        """Load one example: encode ``dnaIN`` and set the matching target.

        Args:
            dnaIN: iterable of nucleotide characters (keys of ``make_mulah``).
            autoencoder: when true the target is the encoded input itself.
            negative: only used when ``autoencoder`` is false; a negative
                example has target 0, a positive example target 1.

        Raises:
            KeyError: if ``dnaIN`` contains a character that is not A, C, T or G.
        """
        self._construct_input_vector(dnaIN)
        self._set_expected_values(autoencoder, negative)
        # The trailing 1 multiplies the bias row of bias_mat1.
        self.input_with_bias = np.append(self.input_vector, [1])

    def _construct_input_vector(self, dnaIN):
        """Encode ``dnaIN`` as a flat float vector, four 0/1 values per base."""
        bits = [float(bit) for base in dnaIN for bit in self.make_mulah[base]]
        self.input_vector = np.asarray(bits)

    def _set_expected_values(self, autoencoder=True, negative=True):
        """Set ``expected_values``: the input itself (autoencoder), or 0 / 1 for
        a negative / positive classification example."""
        if autoencoder:
            self.expected_values = self.input_vector
        else:
            self.expected_values = 0 if negative else 1

    def forward_propogation(self):
        """Run the loaded input through both layers.

        Stores the hidden and output activations and, for each layer, a
        diagonal matrix with the sigmoid derivative of every activation.
        """
        # np.asarray(...).ravel() keeps the activations 1-D even when a weight
        # matrix was loaded as a 1-D array or an np.matrix.
        hidden_net = np.asarray(np.dot(self.input_with_bias, self.bias_mat1), dtype=float).ravel()
        self.hidden_neur_output = 1.0 / (1.0 + np.exp(-hidden_net))

        # The activations are already sigmoid outputs, so the derivative is
        # a * (1 - a); running them through the sigmoid again would be wrong.
        self.hidden_dx_matrix = np.diag(self.hidden_neur_output * (1.0 - self.hidden_neur_output))

        # The trailing 1 multiplies the bias row of bias_mat2.
        self.hidden_output_bias = np.append(self.hidden_neur_output, [1])

        output_net = np.asarray(np.dot(self.hidden_output_bias, self.bias_mat2), dtype=float).ravel()
        self.out_neur_output = 1.0 / (1.0 + np.exp(-output_net))
        self.output_dx_matrix = np.diag(self.out_neur_output * (1.0 - self.out_neur_output))

    def backward_propogation(self):
        """Compute the weight updates for the example of the last forward pass.

        Writes ``errors_mat1`` / ``errors_mat2``: the gradient of the squared
        error with respect to each weight, multiplied by ``-learning_rate``.
        """
        deviations = np.asarray(self.out_neur_output - self.expected_values, dtype=float)

        # Error signal of each output neuron.
        out_neur_errors = np.dot(self.output_dx_matrix, deviations)

        # Error signal of each hidden neuron, propagated back through the
        # (bias-free) hidden -> output weights.
        hidden_neur_errors = np.dot(
            np.dot(self.hidden_dx_matrix, np.asarray(self.cnx_matrix_2)), out_neur_errors)
        hidden_neur_errors = np.asarray(hidden_neur_errors, dtype=float).ravel()

        # update[i, j] = -learning_rate * (input i of the layer) * (error of neuron j)
        self.errors_mat2 = -self.learning_rate * np.outer(self.hidden_output_bias, out_neur_errors)
        self.errors_mat1 = -self.learning_rate * np.outer(self.input_with_bias, hidden_neur_errors)

    def weight_bias_renew(self):
        """Apply the stored updates and refresh the bias-free weight matrices."""
        self.bias_mat1 = self.bias_mat1 + self.errors_mat1
        self.bias_mat2 = self.bias_mat2 + self.errors_mat2

        # Everything except the last (bias) row.
        self.cnx_matrix_1 = self.bias_mat1[:-1]
        self.cnx_matrix_2 = self.bias_mat2[:-1]

    def _sigmoid_function(self, input):
        """Return the sigmoid 1 / (1 + e^-input) of a scalar as a float."""
        return float(1 / (1 + np.exp(-input)))

    def _sigmoid_function_derivative(self, input):
        """Return the sigmoid derivative at the scalar pre-activation ``input``."""
        activation = self._sigmoid_function(input)
        return float(activation * (1 - activation))

In [1]:
#run third
"""
Training the neural net

What can be used:
     autoencoder
     testme <splits> <sampling>
     test
Arguments:
    autoencoder
        Run ze autoencoder
    testme
        Do the rap1 learning task including cross-validation
    test
        Classify test data and output to tsv file type
    <splits>
        Number of splits to make for cross-valitation (k-fold!)
    <sampling>
        Sampling method for neuralnet training input
        (slide) Go over each sequence in 17 nucleotide sliding frame (the length of the positives/test binding sites)
        (space) Each sequence is cut into 17 nucleotide bits for inputs (the binary bits that are useable by our model)
"""

def _kfold_partitions(items, k):
    """Split ``items`` into ``k`` consecutive, nearly equal slices."""
    # From http://stackoverflow.com/questions/3352737/python-randomly-partition-a-list-into-n-nearly-equal-parts
    step = len(items) / float(k)
    return [items[int(round(step * i)): int(round(step * (i + 1)))] for i in range(k)]


def _negative_windows(site, sampling, width):
    """Yield ``width``-long string windows of a negative record.

    'slide' yields every overlapping window; 'space' yields consecutive,
    non-overlapping windows.
    """
    if sampling == 'slide':
        starts = range(len(site) - (width - 1))
    else:
        starts = range(0, (len(site) // width) * width, width)
    for start in starts:
        yield str(site.seq[start:start + width])


def _train_on(neuralnet, sequence, negative):
    """Run one full training step (forward, backward, weight update) on ``sequence``."""
    neuralnet.setin_n_exp_values(sequence, autoencoder=False, negative=negative)
    neuralnet.forward_propogation()
    neuralnet.backward_propogation()
    neuralnet.weight_bias_renew()


def _updates_converged(neuralnet, tolerance=0.00000000001):
    """Return True when the last weight updates of BOTH layers are within
    ``tolerance`` of zero (largest update tiny and positive, or smallest update
    tiny and negative). Returns False if no update has been computed yet."""
    errors_1 = getattr(neuralnet, 'errors_mat1', None)
    errors_2 = getattr(neuralnet, 'errors_mat2', None)
    if errors_1 is None or errors_2 is None:
        return False

    def _tiny(errors):
        return (0 < errors.max() < tolerance) or (-tolerance < errors.min() < 0)

    return _tiny(errors_1) and _tiny(errors_2)


def testme(splits=None, sampling=None, max_negative_validation_sites=1):
    """
    Train a 68-23-1 neural network on RAP1 binding sites with k-fold cross-validation.

    Network:
        * Input layer with 17*4 nodes (17 nucleotides per site, 4 one-hot inputs
          per nucleotide) + bias
        * Hidden layer with 23 nodes + bias
        * One output node (0 = negative, 1 = positive)

    Training, for each fold:
        * Negative sequences are read from the .fa file and cut into 17-base
          windows ('slide': every overlapping window, 'space': consecutive
          non-overlapping windows); each window is trained with expected 0.
        * Windows that appear in the positive list are skipped; windows with
          'C' at positions 4, 5 and 9 are printed and skipped.
        * To keep negatives and positives balanced, after as many negative
          windows as there are positive training sites, every positive training
          site is trained once with expected 1.
        * Training stops early once the weight updates of both layers are
          vanishingly small.

    After each fold the held-out sites are scored, and the weight matrices of
    the fold with the greatest separation between the average positive and
    average negative score are written to cnx_matrix_1.csv / cnx_matrix_2.csv.

    Args:
        splits: number of cross-validation folds (>= 2). When None, the value of
            '<splits>' in a module-level ``args`` mapping is used if present,
            otherwise 5.
        sampling: 'slide' or 'space'. When None, the value of '<sampling>' in a
            module-level ``args`` mapping is used if present, otherwise 'space'.
        max_negative_validation_sites: how many held-out negative records are
            scored per fold. The default of 1 keeps the original behaviour
            (only the first record is scored); pass None to score all of them.

    Raises:
        ValueError: if ``splits`` is smaller than 2 or ``sampling`` is unknown.
    """
    window_width = 17

    # Explicit arguments win; otherwise fall back to docopt-style ``args`` when
    # the file is run as a command-line script, and finally to notebook defaults.
    cli_args = globals().get('args') or {}
    if splits is None:
        splits = cli_args.get('<splits>') or 5
    if sampling is None:
        sampling = cli_args.get('<sampling>') or 'space'
    splits = int(splits)
    if splits < 2:
        raise ValueError("splits must be at least 2, got {}".format(splits))
    if sampling not in ('slide', 'space'):
        raise ValueError("sampling must be 'slide' or 'space', got {!r}".format(sampling))

    # Positive binding sites, one per line (blank lines are ignored).
    with open('data/rap1-lieb-positives.txt') as pos_file:
        pos = [line.strip() for line in pos_file if line.strip()]
    pos_lookup = set(pos)

    # Negative sequences.
    neg = list(SeqIO.parse('data/yeast-upstream-1k-negative.fa', 'fasta'))

    neg_split_up = _kfold_partitions(neg, splits)
    pos_split_up = _kfold_partitions(pos, splits)

    # Largest (average positive - average negative) score seen so far.
    separation = 0
    for index in range(splits):
        # Everything except fold ``index`` is training data.
        neg_site_training = [seq for fold, partition in enumerate(neg_split_up)
                             if fold != index for seq in partition]
        neg_cross_validation_set = neg_split_up[index]
        pos_site_training = [seq for fold, partition in enumerate(pos_split_up)
                             if fold != index for seq in partition]
        pos_cross_validation_set = pos_split_up[index]

        print("Training on the training set...")

        neuralnet = neural_network(68, 23, 1)
        neuralnet.initialize_values()

        # Negative windows trained since the positives were last replayed.
        pos_counter = 0
        # Progress counter: records in 'slide' mode, windows in 'space' mode.
        counter = 0

        for site in neg_site_training:
            for window in _negative_windows(site, sampling, window_width):
                if window not in pos_lookup:
                    if not (window[4] == 'C' and window[5] == 'C' and window[9] == 'C'):
                        _train_on(neuralnet, window, negative=True)
                        pos_counter += 1
                    else:
                        print(window)

                # Replay all positives once per len(pos_site_training) negatives.
                if pos_counter == len(pos_site_training):
                    for pos_site in pos_site_training:
                        _train_on(neuralnet, pos_site, negative=False)
                    pos_counter = 0

                if sampling == 'space':
                    counter += 1

            if sampling == 'slide':
                counter += 1
                print("Training set: {}/{} completed...".format(counter, len(neg_site_training)))

            if _updates_converged(neuralnet):
                print("Stop criterion met after {} iterations".format(counter))
                break

        print("Performing Cross-validation")

        pos_list = []
        neg_list = []

        print("Negative cross-validation set...")
        if max_negative_validation_sites is None:
            neg_sites_to_score = neg_cross_validation_set
        else:
            neg_sites_to_score = neg_cross_validation_set[:max_negative_validation_sites]
        counter = 0
        for site in neg_sites_to_score:
            for start in range(len(site) - (window_width - 1)):
                neuralnet.setin_n_exp_values(str(site.seq[start:start + window_width]),
                                             autoencoder=False, negative=True)
                neuralnet.forward_propogation()
                neg_list.append(neuralnet.output_layer_output)
            counter += 1
            print("Negative cross-validation: {}/{} completed...".format(counter, len(neg_cross_validation_set)))

        print("Positive cross-validation set...")
        for site in pos_cross_validation_set:
            neuralnet.setin_n_exp_values(site, autoencoder=False, negative=False)
            neuralnet.forward_propogation()
            pos_list.append(neuralnet.output_layer_output)

        if not pos_list or not neg_list:
            # Happens when there are more folds than sites; nothing to average.
            print("Fold {} has an empty validation set; skipping its scores".format(index))
            continue

        pos_avg = sum(pos_list) / len(pos_list)
        neg_avg = sum(neg_list) / len(neg_list)
        print('Positive avg: {}'.format(pos_avg))
        print('Negative avg: {}'.format(neg_avg))
        print(neuralnet.bias_mat1)
        print(neuralnet.bias_mat2)

        # Keep the weight matrices of the fold that separates the classes best.
        fold_separation = float(np.ravel(pos_avg - neg_avg)[0])
        if fold_separation > separation:
            np.savetxt('cnx_matrix_1.csv', neuralnet.bias_mat1, delimiter=',')
            np.savetxt('cnx_matrix_2.csv', neuralnet.bias_mat2, delimiter=',')
            separation = fold_separation


# A simple definition of the autoencoder that uses those same neuralnet parameters
def autoencoder():
    """Train a default-sized network to reproduce the encoded sequence 'GA'.

    Repeats forward pass, backward pass and weight update until the latest
    weight updates of either layer are within 0.00001 of zero, then prints the
    network output.
    """
    net = neural_network()
    net.setin_n_exp_values('GA', autoencoder=True)
    net.initialize_values()

    tolerance = 0.00001

    while True:
        net.forward_propogation()
        net.backward_propogation()
        net.weight_bias_renew()

        top_1, bottom_1 = net.errors_mat1.max(), net.errors_mat1.min()
        top_2, bottom_2 = net.errors_mat2.max(), net.errors_mat2.min()

        # A layer counts as settled when its largest update is tiny and
        # positive, or its smallest update is tiny and negative.
        layer_1_settled = (0 < top_1 < tolerance) or (-tolerance < bottom_1 < 0)
        layer_2_settled = (0 < top_2 < tolerance) or (-tolerance < bottom_2 < 0)

        # Stop as soon as either layer has settled.
        if layer_1_settled or layer_2_settled:
            break

    print(net.output_layer_output)

def test():
    """Score every sequence in data/rap1-lieb-test.txt with the saved weights.

    Loads the weight matrices from cnx_matrix_1.csv and cnx_matrix_2.csv into a
    68-23-1 network and writes one ``<sequence>\\t<score>`` line per input
    sequence to neuralnet_predictions.txt. Blank input lines are skipped.
    """
    neuralnet = neural_network(68, 23, 1)
    # ndmin=2 keeps the single-column output-layer matrix two-dimensional;
    # without it np.loadtxt squeezes that file to a 1-D array.
    neuralnet.bias_mat1 = np.loadtxt('cnx_matrix_1.csv', delimiter=',', ndmin=2)
    neuralnet.bias_mat2 = np.loadtxt('cnx_matrix_2.csv', delimiter=',', ndmin=2)

    # Both files are closed even if scoring a sequence raises.
    with open('data/rap1-lieb-test.txt') as test_sequences, \
            open('neuralnet_predictions.txt', 'w') as neuralnet_outputs:
        for line in test_sequences:
            test_seq = line.strip()
            if not test_seq:
                # An empty line would produce an empty input vector.
                continue
            neuralnet.setin_n_exp_values(test_seq)
            neuralnet.forward_propogation()
            neuralnet_outputs.write('{}\t{}\n'.format(test_seq, neuralnet.output_layer_output[0]))

# Command-line entry point: parse the usage text with docopt and run every
# command that was selected.
# NOTE(review): this guard compares against 'train', not '__main__', so the
# body is skipped both in a notebook and under `python train.py` — confirm the
# intended way of launching this file before changing it.
if __name__ == 'train':
    import docopt
    import numpy as np
    from Bio import SeqIO
    import copy
    from .neuralnetfxns import neural_network

    # Kept at module level because testme() reads it as a global.
    args = docopt.docopt(__doc__)

    # Same order as the usage text: autoencoder, testme, test.
    for command, runner in (('autoencoder', autoencoder),
                            ('testme', testme),
                            ('test', test)):
        if args[command]:
            runner()



In [4]:
# Run 4th: train the RAP1 classifier with k-fold cross-validation
# (testme is defined in the training cell above).
testme()

In [5]:
# Run 5th: score the sequences in data/rap1-lieb-test.txt using the weight
# files cnx_matrix_1.csv / cnx_matrix_2.csv and write neuralnet_predictions.txt.
test()

In [4]:
import numpy as np

# 8x8 identity matrix: row i is the one-hot vector with a 1 in position i.
# The builtin `int` is used as the dtype because the `np.int` alias was
# removed from NumPy.
training_set = np.eye(8, dtype=int)
training_set

array([[1, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 1]])