# Pipeline

In [1]:
import sys
import pandas as pd
import subprocess
from tqdm import tqdm
import tensorflow as tf  # Make sure version 1.3.0 is installed
import numpy as np
import os
# Imports the mLSTM babbler model, for unirep vector generation.
from utils_2 import uniprotRetrieve
from unirep import babbler64 as babbler
# Imports the neural network, for classification.
import shallow_nn as nn
# Imports tools for sequence query and parsing.
from Bio import Entrez, SeqIO

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [12]:
""" Imports unirep and classifier models """

def unirep_model():
    # Where model weights are stored.
    MODEL_WEIGHT_PATH = "./64_weights"
    # Generates the model.
    batch_size = 12
    b = babbler(batch_size=batch_size, model_path=MODEL_WEIGHT_PATH)
    return b

def load_classifier():
    """
    Loads the classifier model.
    """
    # Imports model weights and biases.
    weights = np.load('weights.npy', allow_pickle=True)
    biases = np.load('biases.npy', allow_pickle=True)
    # Generates a classifier with given weights and biases.
    classifier = nn.ShallowNetwork([64, 22, 2])
    classifier.weights = weights
    classifier.biases = biases
    return classifier

In [3]:
""" Function to scale the target with same factor as training """

def scale_vector(vector):
    """
    Apply the scaling factor used on the training set in the laerning phase
    to the vector to be predicted.
    """
    # Standard deviation for each feature.
    scale_factor = np.load('scale_factor.npy', allow_pickle=True)
    # Mean for each feature
    mean_vector = np.load('scale_mean.npy', allow_pickle=True)
    # Scales the new vector using training parameters.
    return (vector - mean_vector) / scale_factor

In [4]:
""" Query part """

def query_protein(query_terms, email=None):
    """
    Queries the protein database and returns the ID's of the hit.

    An interface allows to show all the hits from the given query, and to select
    one of them to get it's sequence.
    """
    # Query phase.
    # Email used to be contacted in case of abuse.
    Entrez.email = email
    # Terms to be searched.
    query = query_terms
    # Queries and parse the results. 
    handle = Entrez.esearch(db="protein", term=query, limit=10)
    records = Entrez.read(handle)
    # List of hits ID's
    id_list = records['IdList']
    handle.close()
    # Selection phase, user is invited to select the hit of interest.
    select = -2
    # Prints informations about the hits.
    for i, each_id in enumerate(id_list):
        fasta = Entrez.efetch(db="protein", id=each_id, rettype="fasta")
        fasta_record = SeqIO.read(fasta, "fasta")
        print(f'{i}: {each_id}| {fasta_record.description}')
    # Selection phase.
    # -2 means no correct input has been entered.
    while select == -2:
        # Asks user to give an integer input, and checks if this integer is
        # a valid index for the list of hits.
        select = int(input("Enter desired sequence number. Type -1 to leave. "))
        if select not in range(len(id_list)):
            # If value is -1, exit the program successfully.
            if select == -1:
                raise ValueError("End of query")
            # If value is not valid, prints an error message, and asks again.
            select = -2
            print("Wrong number")
    return id_list[select]

In [5]:
""" Retrieves sequence from ID """

def get_sequence(prot_id):
    """
    Gets the amino-acid sequence of the protein of interest.
    """
    fasta = Entrez.efetch(db="protein", id=prot_id, rettype="fasta")
    fasta_record = SeqIO.read(fasta, "fasta")
    return fasta_record.seq

In [18]:
""" Main function """

def unirep_vectorize(model, classifier, protein_id='556503394',
                     query_terms=False, email='ancnudde@ulb.ac.be'):
    """
    Queries protein database, then uses the mLSTM model to generate the unirep 
    vector.
    """
    translate_prediction = {0: 'Periplasmic', 1:'Cytoplasmic'}
    # If query term is entered, queries the database.
    if query_terms:
        query_id = query_protein(query_terms, email)
    # If protein ID is given, skips query.
    elif protein_id:
        query_id = protein_id
    # Retrieves the sequence from the ID.
    sequence = get_sequence(query_id)
    # Get UniRep vector from the sequence.
    vector = model.get_rep(sequence)[0]
    # Scales the vector.
    scaled_vector = scale_vector(vector).reshape(-1, 1)
    # Makes prediction.
    prediction = classifier.predict(scaled_vector)
    return translate_prediction[prediction]

First, we generate the unirep and the neural network models to be used.

In [10]:
uni = unirep_model()
net = load_classifier()

<shallow_nn.ShallowNetwork object at 0x7ff8ab16e860>


Then, we call the main function. The first possible way to use it is to provide a protein identifier. This way, it just retrieves the sequence of the protein associated with this ID, vectorize it, scales the vector and use it as input of the classifier.

In [13]:
unirep_vectorize(uni, net, protein_id='556503394')

Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.


1


'Cytoplasmic'

Or we can use query terms. This will list entries corresponding to the query, and ask the user to enter the number corresponding to the choosen entry. Then, it retrieves the ID of the protein, and follows the same process as before.

In [24]:
unirep_vectorize(uni, net, query_terms='10 kda chaperonin lysobacter')

0: 1823210566| WP_166293826.1 chaperonin GroEL [Lysobacter sp. HDW10]
1: 1823038960| WP_166209686.1 chaperonin GroEL [Lysobacter sp. YJ15]
2: 1698423802| WP_141622416.1 chaperonin GroEL [Lysobacter alkalisoli]
3: 1597780029| WP_133479912.1 chaperonin GroEL [Lysobacter segetis]
4: 1821134034| QIK80510.1 chaperonin GroEL [Lysobacter sp. HDW10]
5: 1796062818| WP_159016573.1 chaperonin GroEL [Lysobacter sp. CHu50b-3-2]
6: 1796030061| WP_158984181.1 chaperonin GroEL [Lysobacter panacisoli]
7: 1795771114| WP_158732255.1 chaperonin GroEL [Lysobacter sp. SYSU H10001]
8: 1425913211| WP_112927536.1 chaperonin GroEL [Lysobacter oculi]
9: 1783784838| WP_156639548.1 chaperonin GroEL [Lysobacter sp. HX-5-24]
10: 1740894305| WP_149353281.1 chaperonin GroEL [Lysobacter sp. UKS-15]
11: 1718403133| WP_144814963.1 chaperonin GroEL [Lysobacter ruishenii]
12: 1713065202| WP_143879244.1 chaperonin GroEL [Lysobacter lycopersici]
13: 1698144575| WP_141516746.1 chaperonin GroEL [Lysobacter aestuarii]
14: 16981

'Cytoplasmic'

In [19]:
""" Vibrio serine endoprotease, Periplasmic """
unirep_vectorize(uni, net, protein_id='1353621482')

'Periplasmic'

In [22]:
""" Acinetobacter albensis Tol-Pal system protein, Periplasmic """
unirep_vectorize(uni, net, protein_id='1842549391')

'Periplasmic'

In [25]:
""" Lysobacter 10 kda chaperonin, Cytoplasmic """
unirep_vectorize(uni, net, protein_id='1823210566')

'Cytoplasmic'