In [1]:
# 2. Hard coded input data
# Each entry is one mini-chromosome DNA sequence; the comment gives its codon-by-codon translation.
input_data = [
    'ATGGAACAAGAATGA',  # START-E-Q-E-STOP
    'ATGATTTAAATGATCTAAATGATTTAA',  # START-I-STOP repeated three times
    'CATATGATTATTTAAATCATGATTATTTAGGATATGGATATTTAGATT',  # three START...STOP runs with extra codons before, between and after
    'ATGATTATGTAA',  # START-I-START-STOP: a second START appears before any STOP
    'ATTATGTAA',  # I-START-STOP: START immediately followed by STOP
    'ATGCGTCGT'  # START-R-R: no STOP codon
]

In [2]:
# 3. Hard coded codon-to-protein dictionary (uses T instead of U since translation goes straight from DNA)
# Each key is a three-base DNA codon; each value is a one-element list holding an amino-acid letter, 'START' or 'STOP'.
# Assumption: ATG is always coded as START instead of START/methionine, since there is no clear heuristic for choosing between them given that not all sequences are real proteins

codon_to_protein_dict = {
    'TTT': ['F'],
    'TTC': ['F'],
    'TTA': ['L'],
    'TTG': ['L'],
    'CTT': ['L'],
    'CTC': ['L'],
    'CTA': ['L'],
    'CTG': ['L'],
    'ATT': ['I'],
    'ATC': ['I'],
    'ATA': ['I'],
    'ATG': ['START'],
    'GTT': ['V'],
    'GTC': ['V'],
    'GTA': ['V'],
    'GTG': ['V'],
    'TCT': ['S'],
    'TCC': ['S'],
    'TCA': ['S'],
    'TCG': ['S'],
    'CCT': ['P'],
    'CCC': ['P'],
    'CCA': ['P'],
    'CCG': ['P'],
    'ACT': ['T'],
    'ACC': ['T'],
    'ACA': ['T'],
    'ACG': ['T'],
    'GCT': ['A'],
    'GCC': ['A'],
    'GCA': ['A'],
    'GCG': ['A'],
    'TAT': ['Y'],
    'TAC': ['Y'],
    'TAA': ['STOP'],
    'TAG': ['STOP'],
    'CAT': ['H'],
    'CAC': ['H'],
    'CAA': ['Q'],
    'CAG': ['Q'],
    'AAT': ['N'],
    'AAC': ['N'],
    'AAA': ['K'],
    'AAG': ['K'],
    'GAT': ['D'],
    'GAC': ['D'],
    'GAA': ['E'],
    'GAG': ['E'],
    'TGT': ['C'],
    'TGC': ['C'],
    'TGA': ['STOP'],
    'TGG': ['W'],
    'CGT': ['R'],
    'CGC': ['R'],
    'CGA': ['R'],
    'CGG': ['R'],
    'AGT': ['S'],
    'AGC': ['S'],
    'AGA': ['R'],
    'AGG': ['R'],
    'GGT': ['G'],
    'GGC': ['G'],
    'GGA': ['G'],
    'GGG': ['G']
}

In [3]:
# Import library
import numpy as np

# Define methods
def get_codon(input_codon):
    '''
    Input: single three-base DNA codon (a key of codon_to_protein_dict)
    Output: first entry stored for that codon - an amino-acid letter, 'START' or 'STOP'
    Raises KeyError if the codon is not in the table
    '''
    # Table values are lists; only the first element is used
    return codon_to_protein_dict[input_codon][0]

def translate_dna(input_dna):
    '''
    Input: single mini-chromosome DNA sequence
    Output: translated protein sequence, one symbol per codon joined by '-'
    The sequence is upper-cased and checked before it is translated:
    its length must be a multiple of 3 and every base must occur in the codon table
    '''
    # Normalise case so lower-case bases are accepted
    dna = input_dna.upper()
    # The sequence has to split evenly into codons
    assert len(dna) % 3 == 0, 'DNA sequence is not a multiple of 3!'
    # Collect every character that appears in any codon key
    valid_bases = set(''.join(codon_to_protein_dict))
    assert all(base in valid_bases for base in dna), 'DNA sequence has non-codon bases!'

    # Translate codon by codon, then join the symbols with dashes
    symbols = []
    for codon_index in range(len(dna) // 3):
        first_base = 3 * codon_index
        symbols.append(get_codon(dna[first_base:first_base + 3]))
    return '-'.join(symbols)

def get_start_stop(input_prot_seq):
    '''
    Input: input protein sequence ('-' separated symbols)
    Output: list of [start_index, stop_index] pairs, in order of appearance
    A value of -1 marks the missing partner of an unmatched START or STOP
    '''
    pairs = []
    awaiting_stop = False
    for position, symbol in enumerate(input_prot_seq.split('-')):
        if symbol == 'START':
            # Every START opens a new pair, even if the previous one never saw a STOP
            pairs.append([position, -1])
            awaiting_stop = True
        elif symbol == 'STOP':
            if awaiting_stop:
                # Close the most recently opened pair
                pairs[-1][1] = position
            else:
                # STOP with no open START in front of it
                pairs.append([-1, position])
            awaiting_stop = False

    return pairs
        
def check_real_protein_pair(input_ss_pair):
    '''
    Input: [start_index, stop_index] pair
    Output: True when neither index is the -1 placeholder, False otherwise
    '''
    return -1 not in input_ss_pair

def get_protein_length(input_ss_pair):
    '''
    Input: [start_index, stop_index] pair
    Output: number of symbols strictly between start and stop,
            or -1 when either index is the -1 placeholder
    '''
    if -1 not in input_ss_pair:
        # Exclude both the START and the STOP position from the count
        return input_ss_pair[1] - input_ss_pair[0] - 1
    return -1
    
def check_DNA_failure(input_prot_seq):
    '''
    Input: translated protein sequence of a mini-chromosome
    Output: number of DNA failures
    A failure is a start/stop pair that is incomplete (unmatched START or STOP)
    or a complete pair with nothing between START and STOP
    '''
    return sum(
        1
        for pair in get_start_stop(input_prot_seq)
        if not check_real_protein_pair(pair) or get_protein_length(pair) == 0
    )

# 4. Compute values for DNA sequence
def analyze_sequence(input_prot_seq):
    '''
    Input: translated protein sequence of a mini-chromosome
    Output: Print outs of sequence analysis

    Prints the encoded proteins with their count and shortest/longest/mean length,
    the number of DNA failures, and the amount of non-coding DNA in base pairs.
    Only complete START...STOP pairs with at least one amino acid in between are
    treated as encoded proteins; an empty START-STOP pair is reported as a DNA
    failure and is left out of the protein statistics.
    '''
    # Get protein sequence in list format
    # ''.split('-') returns [''], which would be counted as one codon, so handle empty input explicitly
    prot_list = input_prot_seq.split('-') if input_prot_seq else []
    # Get start/end index
    start_stop_list = get_start_stop(input_prot_seq)
    # Keep only complete pairs that actually enclose amino acids
    encoded_pairs = [
        pair for pair in start_stop_list
        if check_real_protein_pair(pair) and get_protein_length(pair) > 0
    ]
    # Get length of real encoded proteins
    prot_length_list = [get_protein_length(pair) for pair in encoded_pairs]
    # Get number of encoded proteins
    num_encoded_prot = len(prot_length_list)
    # Get shortest and longest encoded protein
    if num_encoded_prot > 0:
        # 6. Get list of encoded protein sequence
        encoded_prot_list = ["".join(prot_list[pair[0]+1:pair[1]]) for pair in encoded_pairs]
        print(f"Test feature: encoding of a protein. This DNA encodes for {encoded_prot_list}. It has these properties:")
        print('Num encoded proteins: ' + str(num_encoded_prot) + ' proteins')
        print('Shortest protein: ' + str(min(prot_length_list)) + ' amino acids')
        print('Longest protein: ' + str(max(prot_length_list)) + ' amino acids')
        # Get average encoded protein length
        print('Mean protein length: ' + str(np.mean(prot_length_list)) + ' amino acids')
    else:
        print('No encoded protein in sequence!')
    # Get DNA failures
    dna_failures = check_DNA_failure(input_prot_seq)
    print('Number of DNA failures: ' + str(dna_failures) + ' times')
    # Get amount of non-coding DNA
    # Assumption: Start and End codons are included in coding DNA
    # Assumption: START and STOP codons are not coding if not encoded protein sequence in between
    coding_codons = sum(length + 2 for length in prot_length_list)
    print('Amount of non-coding DNA: ' + str(3*len(prot_list)-3*coding_codons) + ' base pairs')
    
# 5. Iterate through each mini-chromosome and analyze them
def trans_trans(input_data):
    '''
    Input: list of DNA sequences
    Output: Analyzed output of DNA sequences and translated proteins
    For each sequence, prints the raw DNA, its translation and the analysis,
    followed by a dashed separator line and an empty line
    '''
    separator = '-'*50
    for dna_seq in input_data:
        print('Input DNA Sequence: ' + dna_seq)
        # Translate first so the raw protein string can be shown before the analysis
        protein_seq = translate_dna(dna_seq)
        print('Raw Translated Protein Sequence: ' + protein_seq)
        analyze_sequence(protein_seq)
        print(separator)
        print('')

In [6]:
# Quick check with a single sequence that starts with two consecutive ATG codons
testing_data = ['ATGATGACGTAA']
# Alternative input containing 'z', a character that does not occur in any codon_to_protein_dict key
# testing_data = ['ATGATGACzTAA']
# Alternative input with a lower-case base; translate_dna upper-cases its input before translating
# testing_data = ['ATGATGAcGTAA']

trans_trans(testing_data)

Input DNA Sequence: ATGATGACGTAA
Raw Translated Protein Sequence: START-START-T-STOP
Test feature: encoding of a protein. This DNA encodes for ['T']. It has these properties:
Num encoded proteins: 1 proteins
Shortest protein: 1 amino acids
Longest protein: 1 amino acids
Mean protein length: 1.0 amino acids
Number of DNA failures: 1 times
Amount of non-coding DNA: 3 base pairs
--------------------------------------------------



In [5]:
# 7. Print out analysis
trans_trans(input_data)

Input DNA Sequence: ATGGAACAAGAATGA
Raw Translated Protein Sequence: START-E-Q-E-STOP
Test feature: encoding of a protein. This DNA encodes for ['EQE']. It has these properties:
Num encoded proteins: 1 proteins
Shortest protein: 3 amino acids
Longest protein: 3 amino acids
Mean protein length: 3.0 amino acids
Number of DNA failures: 0 times
Amount of non-coding DNA: 0 base pairs
--------------------------------------------------

Input DNA Sequence: ATGATTTAAATGATCTAAATGATTTAA
Raw Translated Protein Sequence: START-I-STOP-START-I-STOP-START-I-STOP
Test feature: encoding of a protein. This DNA encodes for ['I', 'I', 'I']. It has these properties:
Num encoded proteins: 3 proteins
Shortest protein: 1 amino acids
Longest protein: 1 amino acids
Mean protein length: 1.0 amino acids
Number of DNA failures: 0 times
Amount of non-coding DNA: 0 base pairs
--------------------------------------------------

Input DNA Sequence: CATATGATTATTTAAATCATGATTATTTAGGATATGGATATTTAGATT
Raw Translated Protei