# Base Counting

## Without using str.count()

In [None]:
def base_counter(sequence):
    """Count how often each of the four DNA bases occurs in a sequence.

    Args:
        sequence: The DNA sequence to scan; it is iterated one character
            at a time.

    Returns:
        A dict with exactly the keys "A", "C", "G" and "T", each mapped to
        the number of times that base appears in ``sequence``.

    Characters other than the four upper-case bases (e.g. "N", lower-case
    letters or a stray newline) are ignored rather than raising KeyError,
    which matches what a str.count()-based count of the same sequence gives.
    """
    base_counts = {"A": 0, "C": 0, "G": 0, "T": 0}
    for base in sequence:
        # Only tally the four known bases; anything else is skipped.
        if base in base_counts:
            base_counts[base] += 1
    return base_counts

# Some tests: run base_counter on each sequence; the expected printout is noted beside it
for test_sequence in (
    "",          # Should print {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    "ACGT",      # Should print {'A': 1, 'C': 1, 'G': 1, 'T': 1}
    "TGCAACGT",  # Should print {'A': 2, 'C': 2, 'G': 2, 'T': 2}
    "GATTACA",   # Should print {'A': 3, 'C': 1, 'G': 1, 'T': 2}
):
    print(base_counter(test_sequence))

## Using str.count()

In [None]:
def base_counter(sequence):
    """Return a dict mapping each base "A", "C", "G", "T" to its count.

    Each count is obtained with ``sequence.count(base)``, so any other
    characters in ``sequence`` simply do not contribute to the result.
    """
    return {base: sequence.count(base) for base in ("A", "C", "G", "T")}

# Some tests: run base_counter on each sequence; the expected printout is noted beside it
for test_sequence in (
    "",          # Should print {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    "ACGT",      # Should print {'A': 1, 'C': 1, 'G': 1, 'T': 1}
    "TGCAACGT",  # Should print {'A': 2, 'C': 2, 'G': 2, 'T': 2}
    "GATTACA",   # Should print {'A': 3, 'C': 1, 'G': 1, 'T': 2}
):
    print(base_counter(test_sequence))

## Read a FASTA File

In [None]:
PATH_TO_FILE = "../Data/aroK.fasta"

with open(PATH_TO_FILE) as file:
    # read the first line (the FASTA header) and print it to the screen.
    print(file.readline())

    # read in the rest of the file line by line, dropping the surrounding
    # whitespace (including the trailing '\n') from every line.
    dna_lines = [line.strip() for line in file]

# a variable to store the DNA string: the pieces are joined once at the end
# instead of rebuilding the whole string for every line that is read.
dna = "".join(dna_lines)

# print out the DNA ...
print(dna)


In [None]:
# a function to calculate the GC content using base_counter
def gc_content(sequence):
    """Return the GC content of a sequence as a fraction between 0 and 1.

    The fraction is (C count + G count) divided by the total number of bases
    reported by ``base_counter``.

    Args:
        sequence: The DNA sequence, passed straight to ``base_counter``.

    Returns:
        The GC fraction as a float, or 0.0 when ``base_counter`` reports no
        bases at all (e.g. for an empty sequence), which would otherwise
        cause a division by zero.
    """
    counts = base_counter(sequence)
    total = sum(counts.values())
    if total == 0:
        # Nothing was counted, so there is no meaningful ratio to compute.
        return 0.0
    return (counts['C'] + counts['G']) / total

# Show the GC content of `dna` as a percentage (gc_content's result multiplied by 100).
print(f"GC content: {gc_content(dna) * 100}%")


## Translate DNA to Protein

In [None]:
# 1. Codon List

# Split the DNA into consecutive, non-overlapping triplets. The range stops
# after the last complete triplet, so when len(dna) is not a multiple of 3 the
# leftover 1 or 2 bases are dropped instead of being emitted as a partial
# "codon" that cannot be looked up in a codon table.
codons = [dna[i:i + 3] for i in range(0, len(dna) - len(dna) % 3, 3)]

print(codons)

In [None]:
# 2.1 Load the Translation Table

PATH_TO_TRANSLATION_TABLE = '../Data/codon_table_11.txt'

# Maps the first field of each line (the codon) to the second field (the amino acid).
translation_table = {}
with open(PATH_TO_TRANSLATION_TABLE) as file:
    for line in file:
        # split on any run of whitespace so repeated spaces or tabs between
        # the codon and the amino acid do not produce empty fields
        parts = line.split()
        if len(parts) < 2:
            continue  # skip blank or incomplete lines instead of failing on parts[1]
        translation_table[parts[0]] = parts[1]  # add the new codon to the table

print(translation_table)

In [None]:
# 2.2 Translate to Protein

# Look up every codon in the translation table and join the results once,
# rather than rebuilding the whole protein string for each codon.
# A codon missing from the table still raises KeyError.
protein = "".join(translation_table[codon] for codon in codons)

protein

In [None]:
# Save as FASTA file

OUTPUT_FILENAME = "my_protein.fasta"
LINE_LENGTH = 50  # how many amino acids for each line (not necessary but makes the file easier to read)

# remove the '*' (stop signal) from the end of the sequence, but only when it
# is actually there. A separate variable is used so that `protein` itself is
# left untouched and this cell gives the same file when it is run again.
protein_to_save = protein[:-1] if protein.endswith("*") else protein

with open(OUTPUT_FILENAME, 'w') as file:
    file.write(">aroK protein\n")
    # step through the sequence LINE_LENGTH amino acids at a time; stepping by
    # start index writes no empty extra line when the length is an exact
    # multiple of LINE_LENGTH.
    for start in range(0, len(protein_to_save), LINE_LENGTH):
        file.write(protein_to_save[start:start + LINE_LENGTH])
        file.write("\n")


You can compare your result with the correct protein sequence on GenBank [here](https://www.ncbi.nlm.nih.gov/protein/AAC36834.1?report=fasta).