In [512]:
import requests
import ast
import pandas as pd
import Bio
from Bio import Entrez
from Bio import SeqIO
from time import sleep
from orffinder import orffinder
import optipyzer
from Bio.Restriction.Restriction import RestrictionBatch
from IPython.display import clear_output

In [515]:
def getUniprotInfo(id) -> list:
    """
    Look up an entry through the UniProtKB search endpoint.

    Parameters
    ----------
    id : str
        Query string sent to UniProt (typically an accession).

    Returns
    -------
    list
        ``[uniprotValue, potential_ids, sequence_length]`` taken from the
        first search hit: the amino-acid sequence, the list of
        ``uniProtKBCrossReferences`` entries (``None`` when the hit carries
        none) and the sequence length.

    Raises
    ------
    IndexError
        If the search returns no results.
    requests.HTTPError
        If UniProt answers with an error status.
    """
    params = {
        'query': id,
        # UniProt documents `fields` as one comma-separated value.
        'fields': 'xref_refseq,sequence',
    }

    response = requests.get(
        'https://rest.uniprot.org/uniprotkb/search',
        params=params,
        timeout=30,  # never hang a batch run on a stalled connection
    )
    response.raise_for_status()

    # The body is JSON, not a Python literal: ast.literal_eval fails on
    # true/false/null, so decode it (once) with the JSON parser.
    hits = response.json().get('results') or []
    if not hits:
        raise IndexError(f'UniProt returned no results for query {id!r}')

    entry = hits[0]
    uniprotValue = entry['sequence']['value']
    sequence_length = entry['sequence']['length']
    # Absent cross-references become None, which matchAmino checks for.
    potential_ids = entry.get('uniProtKBCrossReferences')
    return [uniprotValue, potential_ids, sequence_length]

In [516]:
def matchAmino(uniprotValue, potential_ids) -> list:
    """
    Find the nucleotide record whose CDS translation equals the UniProt sequence.

    Parameters
    ----------
    uniprotValue : str
        Amino-acid sequence to match.
    potential_ids : list of dict or None
        Cross-reference entries; the accession fetched from ``nuccore`` is
        read from ``entry['properties'][0]['value']``.

    Returns
    -------
    list
        ``[accession, record, translation]`` for the first candidate that has
        a CDS feature whose ``translation`` qualifier equals ``uniprotValue``.
        Returns ``0`` when ``potential_ids`` is None and ``None`` when no
        candidate matches.
    """
    if potential_ids is None:
        print('No potential ids provided.')
        return 0

    # Set the contact address once, and only when nothing else configured one.
    if not Entrez.email:
        Entrez.email = 'andrewbrenth@gmail.com'

    total = len(potential_ids)
    for count, candidate in enumerate(potential_ids, start=1):
        test = candidate['properties'][0]['value']
        print(f'Checking {count} of {total} potential matches')
        print(f'Trying to match [{test}]...')

        handle = Entrez.efetch(db='nuccore', id=test, rettype='gb', retmode='text')
        try:
            sequence = SeqIO.read(handle, "genbank")
        finally:
            handle.close()

        # Gather every CDS translation of THIS record. Records without a CDS
        # (or with a CDS lacking a translation) simply contribute nothing,
        # instead of raising or reusing the previous candidate's value.
        translations = [
            feature.qualifiers['translation'][0]
            for feature in sequence.features
            if feature.type == 'CDS' and feature.qualifiers.get('translation')
        ]

        if uniprotValue in translations:
            print(f'Match found with [{test}]')
            return [test, sequence, uniprotValue]

        # Pause between consecutive fetches; nothing to wait for after the last one.
        if count < total:
            print('Not a match.')
            sleep(10)

    return None

In [517]:
def matchTranslation(orfs, uniprotValue, sequence) -> str:
    """
    Return the nucleotides of the first ORF that translates to ``uniprotValue``.

    Parameters
    ----------
    orfs : sequence of dict
        ORF loci from the orffinder utility, indexable by position; each
        entry provides ``'start'`` and ``'end'``.

    uniprotValue : str
        Amino-acid sequence from UniProt.

    sequence : record
        Object whose ``.seq`` can be sliced and whose slices offer
        ``translate()``.

    Returns
    -------
    str or None
        The slice ``sequence.seq[start-1:end-1]`` as a string for the first
        ORF whose translation equals ``uniprotValue + '*'`` (i.e. including
        the stop symbol), or ``None`` when no ORF matches.
    """
    target = uniprotValue + '*'

    for index in range(len(orfs)):
        start = orfs[index]['start']
        end = orfs[index]['end']
        candidate = sequence.seq[start - 1:end - 1]
        if target == str(candidate.translate()):
            # Values are pulled into locals so the f-string needs no nested
            # quotes (same-quote nesting only parses on Python 3.12+).
            print(f'Matched ORF at {start - 1} to {end}')
            return str(candidate)

    return None

In [518]:
def optimizeSlice(slice) -> str:
    """
    Codon-optimize a DNA sequence for E. coli through the optipyzer API.

    Parameters
    ----------
    slice : str
        Nucleotide string of the matched ORF, as returned by
        matchTranslation.

    Returns
    -------
    str
        The ``'optimized_sd'`` entry of the optipyzer result.

    Raises
    ------
    ValueError
        If ``slice`` is None or empty. matchTranslation returns None when no
        ORF matched; without this guard ``str(None)`` would be submitted as
        the literal sequence "None".
    """
    if slice is None or not str(slice).strip():
        raise ValueError('No DNA sequence to optimize')

    api = optipyzer.API()
    gblock = str(slice)

    result = api.optimize(
        seq=gblock,
        seq_type="dna",
        weights={"e_coli": 1}
    )

    return result['optimized_sd']


In [519]:
def findSites(optimizedCodon) -> dict:
    """
    Report which restriction-enzyme recognition sites occur in a sequence.

    Parameters
    ----------
    optimizedCodon : str
        E. coli codon-optimized DNA sequence from optimizeSlice().

    Returns
    -------
    dict
        ``{'ins': [...], 'outs': [...]}`` listing the enzyme names whose
        recognition site is present / absent. A site counts as present when
        it occurs on either strand, i.e. when the site or its reverse
        complement is a substring of the sequence (case-insensitive).
    """
    results = {'ins': [], 'outs': []}
    enzymes = ['BsaI', 'BbsI', 'BsmBI', 'Esp3I', 'SapI']
    batch = RestrictionBatch()
    for name in enzymes:
        batch.add(name)

    # IUPAC complement table. The recognition sites are not palindromes, so
    # a site sitting on the opposite strand shows up here only as its
    # reverse complement and must be searched for explicitly.
    complement = str.maketrans('ACGTRYKMBDHVSWN', 'TGCAYRMKVHDBSWN')
    target = str(optimizedCodon).upper()

    for name in enzymes:
        site = str(batch.get(name).site).upper()
        reverse_site = site.translate(complement)[::-1]
        if site in target or reverse_site in target:
            results['ins'].append(name)
            print(f'{name} in sequence')
        else:
            results['outs'].append(name)
            print(f'{name} NOT in sequence')

    return results
    

In [525]:
def main(df) -> pd.DataFrame:
    """
    Run the lookup, matching, ORF and codon-optimization steps for every id.

    Parameters
    ----------
    df : pd.DataFrame
        Table (e.g. read from a csv) with a ``uniprot_id`` column.

    Returns
    -------
    pd.DataFrame
        One row per input id, in input order, with the columns
        ``uniprotValue``, ``sequence_length``, ``nih_id``, ``full_sequence``,
        ``amino_sequence``, ``orf``, ``optimized_codon``, ``enzymes`` and
        ``uniprotId``. Steps that could not be completed for an id are left
        as ``pd.NA``.
    """
    columns = [
        'uniprotValue',
        'sequence_length',
        'nih_id',
        'full_sequence',
        'amino_sequence',
        'orf',
        'optimized_codon',
        'enzymes',
        # Appended last so existing column positions stay the same; without
        # it a row cannot be traced back to the id that produced it.
        'uniprotId',
    ]
    records = []
    total = len(df.uniprot_id)

    for count, uid in enumerate(df.uniprot_id, start=1):
        clear_output(wait=True)
        print(f'Checking {count} of {total}')

        # Start from an all-missing row and fill in whatever succeeds, so
        # every exit path leaves exactly one complete row behind.
        record = dict.fromkeys(columns, pd.NA)
        record['uniprotId'] = uid
        records.append(record)

        try:
            uValue, potential_ids, length = getUniprotInfo(uid)
        except Exception as err:
            # One bad id must not abort the whole batch.
            print(f'UniProt lookup failed for {uid}: {err}')
            continue
        record['uniprotValue'] = uValue
        record['sequence_length'] = length

        try:
            match = matchAmino(uValue, potential_ids)
        except Exception as err:
            # `Exception`, not a bare except, so Ctrl-C still stops the run.
            print(f'Matching failed for {uid}: {err}')
            match = None
        if not match:
            # matchAmino signals "nothing found" with 0 or None.
            continue
        matchedId, sequence, amino = match
        record['nih_id'] = matchedId
        record['full_sequence'] = str(sequence.seq)
        record['amino_sequence'] = amino

        orfs = orffinder.getORFs(sequence, minimum_length=length, remove_nested=True)
        orf_slice = matchTranslation(orfs, uValue, sequence)
        if orf_slice is None:
            # No ORF reproduced the protein: there is nothing to optimize.
            print('No matching ORF found.')
            continue
        record['orf'] = orf_slice

        try:
            codon = optimizeSlice(orf_slice)
            enzyme_results = findSites(optimizedCodon=codon)
        except ValueError:
            print('Something\'s wrong with the codon')
            continue
        record['optimized_codon'] = codon
        record['enzymes'] = enzyme_results

    # dtype=object keeps mixed cells (strings, dicts, pd.NA) untouched.
    result = pd.DataFrame(records, columns=columns, dtype=object)

    clear_output(wait=True)
    print('Done')
    return result


In [None]:
# Input table for main(), which iterates over its `uniprot_id` column.
df = pd.read_csv('uniprot_ids.csv')

In [526]:
# main() requires the input table; calling it without one raises TypeError.
result = main(df)

Done


In [528]:
# Write the assembled results table to disk (pandas also writes the row index by default).
result.to_csv('results.csv')