# Transcription Factor Classifier

In [1]:
from nn.io import read_text_file
from nn.io import read_fasta_file
from nn.preprocess import sample_seqs
from nn.preprocess import one_hot_encode_seqs
import numpy as np
from nn.nn import NeuralNetwork
from sklearn.model_selection import train_test_split



### Use the 'read_text_file' function from preprocess.py to read in the 137 positive Rap1 motif examples

In [2]:
rap1_postive = read_text_file("data/rap1-lieb-positives.txt")

### Use the 'read_fasta_file' function to read in all the negative examples from all 1kb upstream in yeast.

In [3]:
rap1_negative = read_text_file("data/yeast-upstream-1k-negative.fa")

In [4]:
## exploring the data 
len(rap1_postive[1]) ## each sequence has 17 
len(rap1_negative[1]) ## For the naegative they have 60 nucleotides
len(rap1_postive) ## 137 sequences in total 
len(rap1_negative) ## 56908 sequences in total 

56908

### Implement a sampling scheme in the 'sample_seq' function in the preprocess.py file

In [5]:
sampled_seqs,sampled_labels = sample_seqs(rap1_postive,rap1_negative)

### Explain in your jupyter notebook why chose the sampling scheme that you did.

*Answer*: I decided to use the following sample scheme. First I sub-sample the negative sequences, I settled on keeping a radio 1:3, given that we should't have a extreme imbalance in the classification of our sequences but we still want sufficient information. Finally I wanted to have a standart input so I resolved on all my sequences being 17 nucleotides (the len of the positive seqs)

### Generate a training and a validation set for training your classifier.

In [6]:
X_train, X_test, y_train, y_test = train_test_split(sampled_seqs, sampled_labels, test_size=0.33, random_state=2)

### One hot encode your training and validation sets

In [7]:
onecode = (one_hot_encode_seqs(X_train))
X_train = np.stack(onecode,axis=0) 

onecode = (one_hot_encode_seqs(X_test))
X_test = np.stack(onecode,axis=0) 

### Train your neural network!

In [19]:
nn_ar = [{'input_dim': 68, 'output_dim': 16, 'activation' : 'Sigmoid'},
         {'input_dim': 16, 'output_dim': 1, 'activation' : 'Sigmoid'}]
lr = .0001
seed = 2
batch_size = 30
epochs = 50
loss_function = "cross_entropy"

TF_network = NeuralNetwork(nn_ar,lr, seed, batch_size,epochs,loss_function)


In [20]:
TF_network.fit(X_train, y_train, X_test, y_test)

([0.7790664276887858,
  0.5195811038207605,
  0.6938054676016341,
  0.5212170424043678,
  0.7272196750454635,
  0.5256244307014363,
  0.7594751886369507,
  0.531525837609593,
  0.7867350558321544,
  0.5373014369426881,
  0.8070529857477198,
  0.5419014123792834,
  0.8207152400564283,
  0.5450861593706426,
  0.8292411606266744,
  0.5470989604379695,
  0.8343061794622988,
  0.5483014276434852,
  0.8372252426601824,
  0.5489962211426536,
  0.8388777168190068,
  0.5493900335405666,
  0.8398036204621178,
  0.5496108316635968,
  0.8403194164827062,
  0.5497338731627106,
  0.8406058215097252,
  0.5498022063113894,
  0.8407645659905859,
  0.5498400846862358,
  0.8408524644203272,
  0.5498610594345089,
  0.8409011076340893,
  0.5498726672504568,
  0.8409280186311422,
  0.549879089171986,
  0.8409429041295055,
  0.5498826414129917,
  0.8409511370906667,
  0.5498846061174523,
  0.8409556903892408,
  0.5498856927143626,
  0.8409582085513144,
  0.5498862936482058,
  0.8409596011767003,
  0.54988662

(4, 1)

(4, 1)