In [None]:
# QCB455/COS551 Final Project Base Distributions
# Author: Supraj Gunda
# Produces figure 1 from the paper

# imports
import os

import matplotlib.pyplot as plt
import numpy as np
from Bio import SeqIO

In [None]:
# counts the bases in each of the fasta files; the returned list has shape (# of fasta files, 5), ordered [A, C, G, T, N]
def countBases(fastaFiles):
    """Tally A, C, G, T and N occurrences for every FASTA file.

    Each record's sequence is upper-cased before counting, so lower-case
    letters are counted together with their upper-case form. Symbols other
    than A/C/G/T/N are not tallied.

    Args:
        fastaFiles: iterable of FASTA inputs, each passed to ``SeqIO.parse``.

    Returns:
        A list with one entry per input file, in input order. Each entry is a
        five-element list of integer counts ordered ``[A, C, G, T, N]``.
    """
    bases = ("A", "C", "G", "T", "N")
    perFileCounts = []

    for path in fastaFiles:
        totals = [0] * len(bases)

        # a file may hold many records; accumulate across all of them
        for record in SeqIO.parse(path, "fasta"):
            # normalise case so "a" and "A" land in the same bucket
            seq = str(record.seq).upper()
            for idx, base in enumerate(bases):
                totals[idx] += seq.count(base)

        perFileCounts.append(totals)

    return perFileCounts


In [None]:
def plotBarChart(counts, fastaFiles, genomeLabels=None):
    """Draw a stacked bar chart of base counts, one bar per genome.

    Args:
        counts: one entry per genome, each a sequence of five counts ordered
            [A, C, G, T, N] (the layout returned by countBases).
        fastaFiles: the FASTA inputs the counts came from. Its length fixes
            the number of bars; its entries supply fallback tick labels.
        genomeLabels: optional x-axis tick labels, one per genome. When
            omitted, the original nine short names are used if exactly nine
            genomes are given; for any other number of genomes the labels are
            derived from the file names (base name without extension), so the
            label count always matches the tick count.

    Raises:
        ValueError: if ``counts`` or ``genomeLabels`` does not contain exactly
            one entry per element of ``fastaFiles``.
    """
    labels = ["A", "C", "G", "T", "N"]
    colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"]

    # short names used for the nine genomes of the original figure
    defaultGenomeLabels = [
        'Ash1',
        'ASM',
        'hg',
        'h38',
        'T2T',
        'mouse',
        'dog',
        'cattle',
        'pig']

    numGenomes = len(fastaFiles)
    if len(counts) != numGenomes:
        raise ValueError(
            "counts has %d entries but %d fasta files were given"
            % (len(counts), numGenomes))

    if genomeLabels is None:
        if numGenomes == len(defaultGenomeLabels):
            genomeLabels = defaultGenomeLabels
        else:
            # the fixed list cannot label a different number of bars; fall
            # back to the file names so added/removed genomes still plot
            genomeLabels = [
                os.path.splitext(os.path.basename(str(fastaFile)))[0]
                for fastaFile in fastaFiles]
    elif len(genomeLabels) != numGenomes:
        raise ValueError(
            "genomeLabels has %d entries but %d fasta files were given"
            % (len(genomeLabels), numGenomes))

    # regroup the per-genome counts into one list per nucleotide
    baseVals = {
        label: [base_count[i] for base_count in counts]
        for i, label in enumerate(labels)}

    x = np.arange(numGenomes)
    bottom = np.zeros(numGenomes)

    fig, ax = plt.subplots(figsize=(10, 6))

    # stack the segments: each base starts where the previous one ended
    for label, color in zip(labels, colors):
        ax.bar(x, baseVals[label], 0.8, label=label, color=color, bottom=bottom)
        bottom += baseVals[label]

    # labels
    ax.set_xlabel("Genomes", fontsize=15)
    # NOTE(review): counts are plotted unscaled; confirm the "(1e9)" in the
    # label still matches the axis when plotting much smaller inputs
    ax.set_ylabel("Base Counts (1e9)", fontsize=15)
    ax.set_title("Nucleotide Base Distribution Across Genomes", fontsize=20)
    ax.set_xticks(x)
    ax.set_xticklabels(genomeLabels)

    # making layout better for viewing in papers
    ax.legend(title="Bases", loc=4, prop={'size': 12})
    ax.tick_params(axis='both', which='major', labelsize=13)
    plt.tight_layout()
    plt.show()


In [None]:
# making it generalizable in case we want to add more genomes to the pipeline:
# the data directory lives in one place, so pointing the notebook at another
# machine's copy of the genomes is a single-line change
FASTA_DIR = '/Users/Supraj1/qcb455/fastaFiles'

# FASTA file names, one per genome. Order matters: plotBarChart labels the bars
# positionally (Ash1, ASM, hg, h38, T2T, mouse, dog, cattle, pig).
FASTA_NAMES = [
    'GCA_Ash1.fna',
    'GCA_ASM.fna',
    'GCA_hg.fna',
    'GCF_GRCh38.fna',
    'GCF_T2T.fna',
    'GCF_mouse.fna',
    'GCF_dog.fna',
    'GCF_cattle.fna',
    'GCF_pig.fna'
]

# joined with "/" so the paths are exactly the ones previously written out in full
fasta_files = [FASTA_DIR + '/' + name for name in FASTA_NAMES]

# get frequency of each base and plot
base_counts = countBases(fasta_files)
plotBarChart(base_counts, fasta_files)