In [27]:
# Import Biopython (the submodules must be imported explicitly: `import Bio` alone does not load them)
import Bio
from Bio import Entrez, SeqIO
# Import pandas
import pandas as pd
# Initialize data list (one row of summary values per sequence of the fasta file)
data = []
# Set the fasta file pathway
fasta_file = '229924765095.fasta'
# Set the email address
# NOTE(review): NCBI asks E-utilities users to identify themselves - fill in a real contact address before running
Entrez.email = ""


def _first_qualifier(qualifiers, key):
    """Return the first value stored under `key` in `qualifiers`, or None if the key is missing or empty."""
    values = qualifiers.get(key)
    return values[0] if values else None


def _nucleotide_percentages(sequence):
    """Return the (A, C, G, T, unknown) percentages of an upper-case sequence, rounded to 2 decimals.

    The unknown share is computed from the raw counts rather than from the
    rounded A/C/G/T percentages, so rounding residue can no longer show up as
    a spurious 0.01 (or -0.01) of unknown bases. For an empty sequence the
    percentages are undefined and five None values are returned.
    """
    length = len(sequence)
    if length == 0:
        return (None, None, None, None, None)
    counts = [sequence.count(base) for base in "ACGT"]
    known = [round(count / length * 100, 2) for count in counts]
    unknown = round((length - sum(counts)) / length * 100, 2)
    return tuple(known) + (unknown,)


# Iterate through the sequences of the fasta file (sequence numbers start at 1)
for sequence_number, fasta_record in enumerate(SeqIO.parse(fasta_file, "fasta"), start=1):
    # Get the accession number
    accession_number = fasta_record.id
    # Get the record in the requested format (GenBank flat file, plain text)
    handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="gb", retmode="text")
    try:
        # Parse the GenBank flat file returned by the above function
        genbank_record = SeqIO.read(handle, "gb")
    finally:
        # Close the handle even if parsing fails, so the connection is not leaked
        handle.close()
    # Get the qualifiers of the "source" feature (empty if the record has none)
    source_feature = next((feature for feature in genbank_record.features if feature.type == "source"), None)
    qualifiers = source_feature.qualifiers if source_feature is not None else {}
    # Get the sequence
    sequence = str(genbank_record.seq).upper()
    # Get the length of the sequence
    sequence_length = len(sequence)
    # Get the nucleotide composition
    (percentage_adenine, percentage_cytosine, percentage_guanine,
     percentage_thymine, percentage_unknown) = _nucleotide_percentages(sequence)
    # Get the name of the organism
    organism = _first_qualifier(qualifiers, 'organism')
    # Get the name of the molecule type
    molecule_type = _first_qualifier(qualifiers, 'mol_type')
    # Get the strain
    strain = _first_qualifier(qualifiers, 'strain')
    # Get the isolation source
    isolation_source = _first_qualifier(qualifiers, 'isolation_source')
    # Get the host
    host = _first_qualifier(qualifiers, 'host')
    # Get the country (newer GenBank records carry it under 'geo_loc_name' instead of 'country')
    country = _first_qualifier(qualifiers, 'country') or _first_qualifier(qualifiers, 'geo_loc_name')
    # Get the collection date
    collection_date = _first_qualifier(qualifiers, 'collection_date')
    # Save the data
    data.append([sequence_number, accession_number, sequence_length,
                 percentage_adenine, percentage_cytosine, percentage_guanine,
                 percentage_thymine, percentage_unknown, organism,
                 molecule_type, strain, isolation_source,
                 host, country, collection_date])
  

In [28]:
# Build the dataframe from the collected rows; the column names are given
# in the same order as the values stored in each row of `data`
df = pd.DataFrame(
    data=data,
    columns=[
        "SEQUENCE_NUMBER",
        "ACCESSION_NUMBER",
        "SEQUENCE_LENGTH",
        "PERCENTAGE_ADENINE",
        "PERCENTAGE_CYTOSINE",
        "PERCENTAGE_GUANINE",
        "PERCENTAGE_THYMINE",
        "PERCENTAGE_UNKNOWN",
        "ORGANISM",
        "MOLECULE_TYPE",
        "STRAIN",
        "ISOLATION_SOURCE",
        "HOST",
        "COUNTRY",
        "COLLECTION_DATE",
    ],
)
# Write the dataframe to an Excel workbook
df.to_excel(excel_writer="data.xlsx")