In [10]:
from Bio import Entrez
from Bio import SeqIO

EMAIL = 's2614533@ed.ac.uk'
Entrez.email = EMAIL

def get_content(sequence, nucleotide):
    return round(100 * sequence.count(nucleotide)/len(sequence),2)

def get_most_frequent_aminoacid(sequence):
    aminoacids = ['A','R','D','N','C','E','Q','G','H','I','L','K','M','F','P','S','T','W','Y','V']
    
    max_count = 0
    most_frequent_aminoacids = []

    for a in aminoacids:
        count = sequence.count(a)
        if count > max_count:
            most_frequent_aminoacids = [a]
            max_count = count
        elif count == max_count:
            most_frequent_aminoacids.append(a)
    
    return most_frequent_aminoacids

    
accession_ids = 'NM_033646.4, NM_004361.5, NM_001317214.3, NM_001362438.2'
handle = Entrez.efetch( db = 'Nucleotide', id = accession_ids, rettype = 'gb', retmode = 'text')
records = list(SeqIO.parse(handle, 'genbank')) # SeqIO.parse returns an iterator which can be used to iterate only once. Since we want to iterate twice, we need to use a list

print(f"{'Accession number':20}{'%G':10}{'%C':10}{'%T':10}{'%A':10}{'Length':10}")
for entry in records:
    sequence = entry.seq
    print(f"{entry.id:20}{get_content(sequence, 'G'):<10}{get_content(sequence, 'C'):<10}{get_content(sequence, 'T'):<10}{get_content(sequence, 'A'):<10}{len(sequence):<10}")


print("\n\n")
# Do we need to find translations to protein in all 6 reading frames?
for entry in records:
    print(f"Accession ID of the gene transcript: {entry.id}")
    for feature in entry.features:
        if feature.type == "CDS":
            print(f"Protein ID: {feature.qualifiers['protein_id']}")
            print(f"Location of the CDS = {feature.location}\n")
            current_sequence = feature.location.extract(entry).seq
            print('Protein Sequence')
            protein_sequence = current_sequence.translate(to_stop = True)
            print(protein_sequence)
            print(f"Length of the protein sequence = {len(protein_sequence)}") # outputs number one bigger than it should be
            print(f"Most frequent aminoacid(s): {get_most_frequent_aminoacid(protein_sequence)} \n")


Accession number    %G        %C        %T        %A        Length    
NM_033646.4         18.51     17.33     30.92     33.24     12126     
NM_004361.5         18.68     17.52     30.77     33.03     12136     
NM_001317214.3      22.04     22.07     27.36     28.53     3407      
NM_001362438.2      19.49     18.0      30.38     32.13     12938     



Accession ID of the gene transcript: NM_033646.4
Protein ID: ['NP_387450.1']
Location of the CDS = [325:2683](+)

Protein Sequence
MKLGKVEFCHFLQLIALFLCFSGMSQAELSRSRSKPYFQSGRSRTKRSWVWNQFFVLEEYMGSDPLYVGKLHSDVDKGDGSIKYILSGEGASSIFIIDENTGDIHATKRLDREEQAYYTLRAQALDRLTNKPVEPESEFVIKIQDINDNEPKFLDGPYTAGVPEMSPVGTSVVQVTATDADDPTYGNSARVVYSILQGQPYFSVEPKTGVIKTALPNMDREAKDQYLLVIQAKDMVGQNGGLSGTTSVTVTLTDVNDNPPRFPRRSYQYNVPESLPVASVVARIKAADADIGANAEMEYKIVDGDGLGIFKISVDKETQEGIITIQKELDFEAKTSYTLRIEAANKDADPRFLSLGPFSDTTTVKIIVEDVDEPPVFSSPLYPMEVSEATQVGNIIGTVAAHDPDSSNSPVRYSIDRNTDLERYFNIDANSGVITTAKSLDRETNAIHNITVLAMESQNPSQVGRGYVAITILDINDNAPEFAMDYETTVCENAQPGQVIQKISAVDKDEP

By searching the NCBI databases through NCBI website, we have found this information about human Calderin 7: 
Gene ID: 1005
Official Symbol: CDH7
Also known as: CDH7L1

https://www.ncbi.nlm.nih.gov/nuccore/NM_004361.5,NM_033646.4,NM_001317214.3,NM_001362438.2
4 different transcripts:
a) NM_033646.4      12126 bp
b) NM_004361.5      12136 bp
c) NM_001317214.3   3407 bp
d) NM_001362438.2   12938 bp 

In [15]:
# pairwise sequence alignment
from Bio import Entrez
from Bio import SeqIO
from Bio import pairwise2 as pw
from Bio import AlignIO
from Bio import Align as al

EMAIL = 's2614533@ed.ac.uk'
Entrez.email = EMAIL

mx = al.substitution_matrices.load('BLOSUM45')

shortest_seq = records[2].seq
longest_seq = records[3].seq
alignments = pw.align.localds(longest_seq,shortest_seq,mx, -10, -0.5)

In [17]:
print("Number of different alignments: ",len(alignments))
print("Score: ",alignments[0][2])

# print(pw.format_alignment(*alignments[0]))

alignment_fasta = \
">"+records[3].name+" "+records[3].description+"\n"+alignments[0][0] \
+"\n"+ \
">"+records[2].name+" "+records[2].description+"\n"+alignments[0][1]

# write it to a file
fh = open('calderin_alignment_blosum62.fa','w')
fh.write(alignment_fasta)
fh.close()

# read in the file using AlignIO
alignment = AlignIO.read('calderin_alignment_blosum62.fa', "fasta")

# convert to clustal
print(format(alignment,'clustal'))

Number of different alignments:  1000
Score:  19000.0
CLUSTAL X (1.81) multiple sequence alignment


NM_001362438                        AGTCTGCCCCGCGCGCGGAGCTGCGCGCACTGGGTCCCCAAGAGCCCGCG
NM_001317214                        AGTCTGCCCCGCGCGCGGAGCTGCGCGCACTGGGTCCCCAAGAGCCCGCG

NM_001362438                        GGCGTCCGGCAGCCGAGCGCACGTTCTTTCGGATGCACACGCCCGGGTCC
NM_001317214                        GGCGTCCGGCAGCCGAGCGCACGTTCTTTCGGATGCACACGCCCGGGTCC

NM_001362438                        CTGGCGTCTGACGCCGTGGGGAGGGCAGCGAGGCCCCAGGTGAGTGTGTC
NM_001317214                        CTGGCGTCTGACGCCGTGGGGAGGGCAGCGAGGCCC--------------

NM_001362438                        TGCCTGCGCGGGGCTGGGGAGGCGCCGCTGGGCAGATGTGCGCCCTTGCT
NM_001317214                        --------------------------------------------------

NM_001362438                        GTGCGCCTTTGGAAGCAGGACTAATCAGTGAGCGGAGAGCGGGCGGGGAT
NM_001317214                        --------------------------------------------------

NM_001362438            