In [43]:
import requests
import ast
import pandas as pd
import Bio
from Bio import Entrez
from Bio import SeqIO
from time import sleep
from time import monotonic
from orffinder import orffinder
import optipyzer
from Bio.Restriction.Restriction import RestrictionBatch
from IPython.display import clear_output
import re
import json
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os   

In [44]:
df = pd.read_csv('results.csv')
df = df.drop('Unnamed: 0', axis=1)

In [81]:
def optimizeSlice(slice) -> str:
    """
    Parameters
    ----------
    slice : str
    
    get string of appropriate ORF section from matchTranslation
    """

    api = optipyzer.API()
    gblock = str(slice)

    result = api.optimize(
        seq=gblock,
        seq_type="dna",
        weights={"e_coli": 1}
    )

    optimized = result['optimized_sd']
    return optimized

def findSites(optimizedCodon) -> dict:
    """
    Parameters
    ----------
    optimizedCodon : str
    
    Provider E. Coli optimized codon from optimizedSlice(). Will return dictionary
    with whether enzyme sites are present or not.
    """
    results = {'ins':[], 'outs':[]}
    enzymes = ['BsaI', 'BbsI', 'BsmBI', 'Esp3I', 'SapI']
    batch = RestrictionBatch()
    for i in enzymes:
        batch.add(i)

    for i in enzymes:
        enzyme = batch.get(i)
        if enzyme.site in optimizedCodon:
            results['ins'].append(i)
            print(f'{i} in sequence')
        else:
            results['outs'].append(i)
            print(f'{i} NOT in sequence')

    return results
    


def remove_enzyme(enzyme, sequence):
    res_one = findSites(sequence)
    remove_enz = enzyme

    if remove_enz in res_one['outs']:
        print(f'{remove_enz} removed')
        return sequence
    else:
        #Return False when removal fails
        return False
    
def iterateOptimizer(enzyme, sequence):
    sequence = optimizeSlice(df['orf'][0])

    if not remove_enzyme(enzyme, sequence):
        'Enzyyme not removed... retrying.'
        for i in range(10):
            sequence = optimizeSlice(df['orf'][0])
            if remove_enzyme(enzyme, sequence):
                print(f'Optimized after {i+1} attempts.')
                return sequence
    else:
        return sequence
    
def findSites(optimizedCodon) -> dict:
    """
    Parameters
    ----------
    optimizedCodon : str
    
    Provider E. Coli optimized codon from optimizedSlice(). Will return dictionary
    with whether enzyme sites are present or not.
    """
    results = {'ins':[], 'outs':[]}
    enzymes = ['BsaI', 'BbsI', 'BsmBI', 'Esp3I', 'SapI']
    batch = RestrictionBatch()
    for i in enzymes:
        batch.add(i)

    for i in enzymes:
        enzyme = batch.get(i)
        if enzyme.site in optimizedCodon:
            results['ins'].append(i)
            print(f'{i} in sequence')
        else:
            results['outs'].append(i)
            print(f'{i} NOT in sequence')

    return results

In [83]:
opt = []
for i in df.index:
    if df.loc[i, 'optimized_codon'].startswith('ATG'):
        new_codon = iterateOptimizer('Esp3I', df.loc[i, 'optimized_codon'])

        df.loc[i, 'optimized_codon'] = new_codon

BsaI in sequence
BbsI NOT in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
Esp3I removed
BsaI in sequence
BbsI NOT in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
Esp3I removed
BsaI NOT in sequence
BbsI in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
Esp3I removed
BsaI NOT in sequence
BbsI NOT in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
Esp3I removed
BsaI NOT in sequence
BbsI NOT in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
Esp3I removed
BsaI NOT in sequence
BbsI NOT in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
Esp3I removed
BsaI NOT in sequence
BbsI NOT in sequence
BsmBI in sequence
Esp3I in sequence
SapI NOT in sequence
BsaI NOT in sequence
BbsI in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
Esp3I removed
Optimized after 1 attempts.
BsaI NOT in sequence
BbsI NOT in sequence


In [68]:
enzymes = []
for i in df.optimized_codon:
    enzymes.append(findSites(i))

BsaI in sequence
BbsI NOT in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
BsaI in sequence
BbsI NOT in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
BsaI NOT in sequence
BbsI NOT in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
BsaI NOT in sequence
BbsI NOT in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
BsaI NOT in sequence
BbsI NOT in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
BsaI NOT in sequence
BbsI NOT in sequence
BsmBI in sequence
Esp3I in sequence
SapI NOT in sequence
BsaI NOT in sequence
BbsI NOT in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
BsaI NOT in sequence
BbsI in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
BsaI in sequence
BbsI in sequence
BsmBI NOT in sequence
Esp3I NOT in sequence
SapI NOT in sequence
BsaI NOT in sequence
BbsI NOT in sequence
BsmBI NOT in sequence
E

In [75]:

df['enzymes'] = pd.DataFrame([enzymes], index=['enzymes']).transpose()

In [77]:
for i in df.index:
    if pd.isna(df.loc[i,'enzymes']):
        continue
    
    cols = df.loc[i, 'enzymes']['ins'] + df.loc[i, 'enzymes']['outs']
    cols.sort()

for i in cols:
    df[i] = pd.NA

for i in df.index:
    if pd.isna(df.loc[i, 'enzymes']):
        continue

    for j in df.loc[i, 'enzymes']['ins']:
        df.loc[i, j] = True

    for k in df.loc[i, 'enzymes']['outs']:
        df.loc[i, k] = False

df = df.drop('enzymes', axis=1)

In [79]:
df.to_csv('results.csv')