In [1]:
# Import necessary packages

from Bio import SeqIO
from Bio.Data import CodonTable
import pandas as pd

In [2]:
# 1.
# Dr. X's file calling function
# I changed the name of the function from "get_sequences_from_file" to "get_seq" because the original name was long
# I also changed the name of the dictionary from "sequence_data_dict" to "seqdat_dict"

def get_seq(fasta_fn):
    """Read a multi-record FASTA file into a {species name: sequence} dictionary.

    Each record's description line is split on whitespace, and the second and
    third tokens (indices 1 and 2) are joined with a space to form the key.
    The value is the record's ``seq`` attribute as returned by SeqIO.

    If two records produce the same key, the later record replaces the earlier
    one. A description with fewer than three whitespace-separated tokens raises
    IndexError.
    """
    def _species_name(record):
        # Tokens 1 and 2 of the header are used as the species name; token 0 is skipped.
        tokens = record.description.split()
        return " ".join((tokens[1], tokens[2]))

    # SeqIO.parse yields one record per FASTA entry in the file.
    return {
        _species_name(record): record.seq
        for record in SeqIO.parse(fasta_fn, "fasta")
    }
 
#get_seq("bears_cytb.fasta")                                    # call get_seq function with the bear cytochrome b FASTA file as argument, importing sequence data

In [3]:
# 2. 
# Translation function utilizing codon table from FASTA file with multiple records
# Following outlines of pseudocode in sequence_translate.py

def translate_function(string_nucleotides):
    """Translate a DNA sequence using the vertebrate mitochondrial codon table.

    Codons are read in frame starting at the first base. Translation ends at
    the first in-frame stop codon (which is not translated) or at the end of
    the sequence, whichever comes first. One or two leftover bases at the end
    that do not form a complete codon are ignored.

    Parameters
    ----------
    string_nucleotides : str or Seq-like
        Nucleotide sequence. It is converted with str() and upper-cased before
        codon lookup, so lowercase input is accepted.

    Returns
    -------
    str
        The translated amino acids as one-letter codes (empty for empty input).

    Raises
    ------
    KeyError
        If a complete codon is neither a stop codon nor present in the table's
        forward_table (for example a codon containing an ambiguous base).
    """
    mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
    # Take the stop codons from the table itself rather than hard-coding them.
    stop_codons = set(mito_table.stop_codons)

    sequence = str(string_nucleotides).upper()
    # Only whole codons are translated; drop any 1-2 base remainder.
    usable_length = len(sequence) - len(sequence) % 3

    amino_acids = []
    # Step codon by codon. Looping once per base would run past the end of the
    # sequence whenever no stop codon is present.
    for start in range(0, usable_length, 3):
        codon = sequence[start:start + 3]
        if codon in stop_codons:
            break
        amino_acids.append(mito_table.forward_table[codon])
    return "".join(amino_acids)

#for key, value in (get_seq("bears_cytb.fasta")).items():                            # call get_seq to create dictionary with species name (key) and cytb sequence (value)
#    print(translate_function(value))                                                # call translate_function with the cytb sequence (value) as the argument and print

In [4]:
# 3.
# Translate using built-in Biopython capabilities - based on Dr. Friedburg's lecture "biopython_lecture-2018-11-07.pdf" on Slack

from Bio.Seq import Seq                                                             # import Seq object
from Bio.Alphabet import IUPAC                                                      # import IUPAC alphabets for Seq objects

def better_translate(seqobj_DNA):
    """Translate a DNA sequence object with its own ``translate`` method.

    seqobj_DNA must provide ``translate(table=..., to_stop=...)`` (for example
    a Biopython Seq). Table 2 is the vertebrate mitochondrial genetic code.
    With ``to_stop=True`` translation ends at the first in-frame stop codon
    and the stop symbol is not included in the result.

    Returns whatever ``seqobj_DNA.translate`` returns.
    """
    protein = seqobj_DNA.translate(table=2, to_stop=True)
    return protein
#for key, value in (get_seq("bears_cytb.fasta")).items():                            # call get_seq to create dictionary with species name (key) and cytb sequence (value)
#    print(better_translate(value))

In [15]:
# 4.
# Molecular Weight function

from Bio.SeqUtils.ProtParam import ProteinAnalysis                  # import necessary library

def mol_wt(string_aa):
    """Return the molecular weight of a protein given as an amino acid string.

    string_aa is passed to ProteinAnalysis (Bio.SeqUtils.ProtParam), and the
    value of that object's ``molecular_weight()`` method is returned unchanged.
    """
    # ProteinAnalysis wraps the sequence in an analysis object; it is not a Seq.
    return ProteinAnalysis(string_aa).molecular_weight()

#for key, value in (get_seq("bears_cytb.fasta")).items():            # call get_seq to create dictionary with species name (key) and cytb sequence (value)
#    cytc_aa = str(better_translate(value))                          # turn output of better_translate into a string
#    print(mol_wt(cytc_aa))                                          

42458.79919999999
42414.743499999975
42306.67349999998
42551.98999999998
42427.74389999999
42560.89100000001
42702.184499999996
42384.82659999999
42454.78729999998
