In [1]:
#import modules
import numpy as np
import pandas as pd
import glob
import re
from collections import Counter
from Bio.Seq import Seq

#Snapgene module
from snapgene_reader import snapgene_file_to_dict, snapgene_file_to_seqrecord

#Function
def string_set(string_list):
    """Return the set of strings that are not contained in a different string.

    An entry is dropped when it occurs as a substring of some other,
    non-equal entry of ``string_list``; equal entries never eliminate each
    other.  ``string_list`` is scanned once per entry, so it must be a
    re-iterable collection such as a list.
    """
    survivors = set()
    for candidate in string_list:
        for other in string_list:
            if candidate != other and candidate in other:
                # Swallowed by a longer string: not maximal.
                break
        else:
            survivors.add(candidate)
    return survivors

def list_sort(curr_list):
    """Sort rows by their first item and return them as a DataFrame.

    ``curr_list`` is sorted in place (ascending on element 0 of each row),
    then wrapped in a DataFrame with the columns ``fragment_number`` and
    ``dna_sequence``.
    """
    def first_item(row):
        return row[0]

    curr_list.sort(key=first_item)
    column_names = ['fragment_number', 'dna_sequence']
    frame = pd.DataFrame(curr_list, columns=column_names)
    return frame

def sim_score(seq1, seq2):
    """Count the positions at which ``seq1`` and ``seq2`` hold equal items.

    The two sequences are compared pairwise from the start.  When their
    lengths differ only the shared leading part is compared, so a shorter
    ``seq2`` no longer raises ``IndexError`` and the result does not depend
    on argument order.

    Returns the number of matching positions as an int.
    """
    return sum(left == right for left, right in zip(seq1, seq2))

def small_consecutive_frag(len_list):
    """Flag neighbouring fragments that are both shorter than 80.

    ``len_list`` holds fragment lengths in order.  The list is treated as
    circular: the last entry is also checked against the first.  Lists with
    two or fewer entries are never flagged.

    Returns 0 when some adjacent pair is entirely below 80, otherwise 1.
    """
    total = len(len_list)
    if total <= 2:
        return 1

    for position in range(total):
        neighbour = (position + 1) % total  # wraps the last entry back to the first
        if len_list[position] < 80 and len_list[neighbour] < 80:
            return 0
    return 1

def extract_fragments(features, plasmid_seq):
    """Collect the numbered fragment features of a plasmid.

    A feature is used when its name contains 'fragment' or 'Fragment'; what
    is left of the name after removing that word must parse as an integer
    (the fragment number).  The span is read from the first value of the
    feature's first segment, which must look like 'start-end'.  One is
    subtracted from the start so the pair can be used as a slice; when the
    start is not before the end the span wraps past the end of
    ``plasmid_seq`` back to its beginning.

    Returns a list of ``[fragment_number, sequence]`` pairs in feature order.
    """
    fragments = []
    for feature in features:
        name = feature['name']
        for tag in ('fragment', 'Fragment'):
            if tag not in name:
                continue
            span = next(iter(feature['segments'][0].values()))
            start_text, end_text = span.split('-')
            start = int(start_text) - 1
            end = int(end_text)
            if start < end:
                fragment_seq = plasmid_seq[start:end]
            else:
                # Span crosses the origin of the circular sequence.
                fragment_seq = plasmid_seq[start:] + plasmid_seq[:end]
            fragments.append([int(name.replace(tag, '')), fragment_seq])
    return fragments


def merge_short_fragments(fragments, min_len=80):
    """Fold fragments shorter than ``min_len`` into their two neighbours.

    Fragments are visited in order and treated as circular.  A short
    fragment is split at ``len // 2``; the first part is appended to the
    previous fragment and the rest is prepended to the next one.  Lengths
    are re-evaluated as the walk proceeds, so a fragment that has already
    received bases is judged on its new length.  A lone fragment has no
    neighbour and is left unmerged.

    Returns a new list holding only the fragments whose final length is at
    least ``min_len``; the input list is not modified.
    """
    merged = list(fragments)
    total = len(merged)
    for index in range(total):
        piece = merged[index]
        if len(piece) >= min_len or total < 2:
            continue
        cut = len(piece) // 2
        before = index - 1            # -1 selects the last fragment for index 0
        after = (index + 1) % total   # wraps to the first fragment at the end
        merged[before] = merged[before] + piece[:cut]
        merged[after] = piece[cut:] + merged[after]
    return [piece for piece in merged if len(piece) >= min_len]


def junction_windows(sequences, flank=40):
    """Return, for every fragment, the window spanning the joint before it.

    Window ``i`` is the last ``flank`` bases of fragment ``i - 1`` followed
    by the first ``flank`` bases of fragment ``i``; fragment 0 is joined to
    the last fragment.
    """
    return [
        sequences[index - 1][-flank:] + sequences[index][:flank]
        for index in range(len(sequences))
    ]


def has_early_repeat(sequence, min_repeat=15, window=60):
    """Tell whether a maximal repeated substring starts within ``window``.

    Every substring of at least ``min_repeat`` characters is counted (by
    start position, so overlapping copies count).  Those seen more than
    once are reduced with ``string_set`` to the ones not contained in a
    longer repeat, and the result is True when the leftmost occurrence of
    any of them starts before index ``window``.

    NOTE: the number of substrings grows quadratically with the sequence
    length, so long fragments are expensive in time and memory.
    """
    seq_len = len(sequence)
    substring_counts = Counter(
        sequence[start:end]
        for start in range(seq_len - min_repeat + 1)
        for end in range(start + min_repeat, seq_len + 1)
    )
    repeated = [sub for sub, seen in substring_counts.items() if seen > 1]
    # str.find gives the leftmost occurrence; if that one is outside the
    # window, every other occurrence is too.
    return any(sequence.find(sub) < window for sub in string_set(repeated))


# Check every SnapGene file in the working directory and print the findings.
for snap_file in glob.glob("*.dna"):
    dictionary = snapgene_file_to_dict(snap_file)
    # Strip only the final extension so names containing dots stay intact.
    filename = snap_file.rsplit('.', 1)[0]

    # Fragments ordered by their number, upper-cased for comparison.
    df = list_sort(extract_fragments(dictionary['features'], dictionary['seq']))
    fragment_seqs = [fragment.upper() for fragment in df.dna_sequence]

    # Consecutive Small Fragments
    if small_consecutive_frag([len(fragment) for fragment in fragment_seqs]) == 0:
        print('Please redesign ' + filename + ' plasmid to avoid consecutive fragments of less than 80 bp')

    sequences = merge_short_fragments(fragment_seqs)

    # Duplicate Guide/Recognition Junctions
    junctions = junction_windows(sequences)
    for first in range(len(junctions)):
        for second in range(first + 1, len(junctions)):
            if sim_score(junctions[first], junctions[second]) > 60:
                # Junctions are reported with 1-based numbers.
                print('Junction ' + str(first + 1) + ' is very similar to Junction ' + str(second + 1) + '. Finding the guides for ' + filename + ' plasmid might be difficult')

    # Repeated Fragments for Primer Non-specific binding
    # Each fragment is scanned on both strands; one hit is enough to flag
    # the plasmid.
    pcr_issue = False
    for fragment in sequences:
        reverse_strand = str(Seq(fragment).reverse_complement())
        if has_early_repeat(fragment) or has_early_repeat(reverse_strand):
            pcr_issue = True
            break

    if pcr_issue:
        print(filename + ' plasmid can be problematic in PCR')
    else:
        print(filename + ' plasmid PCRs should work')

MZ21 plasmid PCRs should work
