In [None]:
from Bio import Entrez
from Bio import SeqIO

EMAIL = 's2614533@ed.ac.uk'
Entrez.email = EMAIL

def get_content(sequence, nucleotide):
    return round(100 * sequence.count(nucleotide)/len(sequence),2)

def get_most_frequent_aminoacid(sequence):
    aminoacids = ['A','R','D','N','C','E','Q','G','H','I','L','K','M','F','P','S','T','W','Y','V']
    frequency = {}
    for a in aminoacids:
        count = sequence.count(a)
        frequency[a] = count
    pass
    


accession_ids = 'NM_033646.4, NM_004361.5, NM_001317214.3, NM_001362438.2'
handle = Entrez.efetch( db = 'Nucleotide', id = accession_ids, rettype = 'gb', retmode = 'text')
records = list(SeqIO.parse(handle, 'genbank')) # SeqIO.parse returns an iterator which can be used to iterate only once. Since we want to iterate twice, we need to use a list

print(f"{'Accession number':20}{'%G':10}{'%C':10}{'%T':10}{'%A':10}")
for entry in records:
    sequence = entry.seq
    print(f"{entry.id:20}{get_content(sequence, 'G'):<10}{get_content(sequence, 'C'):<10}{get_content(sequence, 'T'):<10}{get_content(sequence, 'A'):<10}")

print("\n\n")
# Do we need to find translations to protein in all 6 reading frames?
for entry in records:
    print(f"Accession ID of the gene transcript: {entry.id}")
    for feature in entry.features:
        if feature.type == "CDS":
            print(f"Protein ID: {feature.qualifiers['protein_id']}")
            print(f"Location of the CDS = {feature.location}\n")
            current_sequence = feature.location.extract(entry).seq
            print('Protein Sequence')
            protein_sequence = current_sequence.translate()
            print(protein_sequence)
            print(f"Length of the protein sequence = {len(protein_sequence)}") # outputs number one bigger than it should be
            print(get_most_frequent_aminoacid(protein_sequence),"\n")


By searching the NCBI databases through NCBI website, we have found this information about human Calderin 7: 
Gene ID: 1005
Official Symbol: CDH7
Also known as: CDH7L1

https://www.ncbi.nlm.nih.gov/nuccore/NM_004361.5,NM_033646.4,NM_001317214.3,NM_001362438.2
4 different transcripts:
a) NM_033646.4      12126 bp
b) NM_004361.5      12136 bp
c) NM_001317214.3   3407 bp
d) NM_001362438.2   12938 bp 

In [39]:
# ex 1 trying to find accession IDs using code, not the website
handle = Entrez.esearch( db = 'gene', term = "CDH7[Gene] AND human")
record = Entrez.read(handle)
handle.close()
# doesn't work
print(len(record['IdList']))

handle = Entrez.efetch(db="gene", id=record['IdList'][:1], retmode="xml")
records = Entrez.read(handle)
handle.close()

# look at the first record by iterating through the keys of the dict
# NB there's a lot of information in here
for feature in list(records[0]):
    print(feature,':',records[0][feature])

# find it in the database using the above
# find the one corresponding to human (homo sapiens)
# fetch that one
# look at the record

# maybe hopeless, it is enough that we found the transcript usinf website

20
Entrezgene_track-info : {'Gene-track': {'Gene-track_geneid': '132504105', 'Gene-track_status': StringElement('0', attributes={'value': 'live'}), 'Gene-track_create-date': {'Date': {'Date_std': {'Date-std': {'Date-std_year': '2023', 'Date-std_month': '10', 'Date-std_day': '9'}}}}, 'Gene-track_update-date': {'Date': {'Date_std': {'Date-std': {'Date-std_year': '2023', 'Date-std_month': '10', 'Date-std_day': '10'}}}}}}
Entrezgene_type : 6
Entrezgene_source : {'BioSource': {'BioSource_genome': StringElement('1', attributes={'value': 'genomic'}), 'BioSource_origin': StringElement('1', attributes={'value': 'natural'}), 'BioSource_org': {'Org-ref': {'Org-ref_taxname': 'Lagenorhynchus albirostris', 'Org-ref_common': 'white-beaked dolphin', 'Org-ref_db': [{'Dbtag_db': 'taxon', 'Dbtag_tag': {'Object-id': {'Object-id_id': '27610'}}}], 'Org-ref_orgname': {'OrgName': {'OrgName_name': {'OrgName_name_binomial': {'BinomialOrgName': {'BinomialOrgName_genus': 'Lagenorhynchus', 'BinomialOrgName_species