---
# **Viral Genomes**: Extract Codon Usage

---

## Imports and Globals

In [1]:
!pip3 install num2words



In [2]:
# imports

from Bio import SeqIO
import pandas as pd
from collections import Counter
from num2words import num2words

In [3]:
# Initialize Codons data

# Every DNA codon, enumerated with each of the three positions cycling through
# T, C, A, G (the first base varies slowest, the third base fastest).
CODONS_LIST = [
    first + second + third
    for first in "TCAG"
    for second in "TCAG"
    for third in "TCAG"
]

# Display name for each one-letter amino-acid code; "*" marks a stop codon.
_AMINO_ACID_NAMES = {
    "F": "Phenylalanine",
    "L": "Leucine",
    "S": "Serine",
    "Y": "Tyrosine",
    "*": "STOP",
    "C": "Cysteine",
    "W": "Tryptophan",
    "P": "Proline",
    "H": "Histidine",
    "Q": "Glutamine",
    "R": "Arginine",
    "I": "Isoleucine",
    "M": "Methionine",
    "T": "Threonine",
    "N": "Asparagine",
    "K": "Lysine",
    "V": "Valine",
    "A": "Alanine",
    "D": "Aspartic Acid",
    "E": "Glutamic Acid",
    "G": "Glycine",
}

# One-letter translation of each codon, listed in T, C, A, G enumeration order
# (first base slowest, third base fastest). One row per first base.
_TRANSLATION_LETTERS = (
    "FFLLSSSSYY**CC*W"  # T..
    "LLLLPPPPHHQQRRRR"  # C..
    "IIIMTTTTNNKKSSRR"  # A..
    "VVVVAAAADDEEGGGG"  # G..
)

# Codons in the same enumeration order as the translation letters above.
_CODONS_IN_TCAG_ORDER = [
    first + second + third
    for first in "TCAG"
    for second in "TCAG"
    for third in "TCAG"
]

# Maps each codon to a label of the form "[X] Name".
CODONS_AMINO_ACIDS_MAP = {
    codon: "[" + letter + "] " + _AMINO_ACID_NAMES[letter]
    for codon, letter in zip(_CODONS_IN_TCAG_ORDER, _TRANSLATION_LETTERS)
}

## Function that counts codons in a given CDS string

In [4]:
#
#
# 
def count_codons(cds):
    """Count how often each codon of ``CODONS_LIST`` occurs in a coding sequence.

    The sequence is read in frame from its first character and cut into
    consecutive, non-overlapping chunks of three characters.

    Parameters
    ----------
    cds : str
        Nucleotide sequence of a CDS. Matching against ``CODONS_LIST`` is
        exact, so it is case-sensitive.

    Returns
    -------
    list of int
        One count per entry of ``CODONS_LIST``, in the same order. Chunks that
        are not in ``CODONS_LIST`` (for example a trailing chunk shorter than
        three characters) are not reported.
    """
    chunk_counts = Counter(cds[i:i + 3] for i in range(0, len(cds), 3))
    # A Counter yields 0 for keys it has never seen, so codons that are absent
    # from the sequence need no special handling.
    return [chunk_counts[codon] for codon in CODONS_LIST]

## Extract gene information

In [7]:
# Read the GBFF file
gbff_file = "./viral.1.genomic.gbff"

# Metadata columns of the output table, followed by one count column per codon.
tsv_header = [
    "ncbi_id",
    "tax_url",
    "organism",
    "strain",
    "host",
    "gene",
    "protein_id",
    "protein_name",
] + CODONS_LIST

# The context manager closes the output file even if parsing fails part-way.
with open("./viral_genes_and_proteins.tsv", "w") as genes_and_proteins_tsv:
    genes_and_proteins_tsv.write("\t".join(tsv_header) + "\n")

    # Iterate over the parser directly so records are handled one at a time
    # instead of materialising the whole file as a list first.
    for record in SeqIO.parse(gbff_file, "genbank"):
        ncbi_id = record.id
        organism = record.annotations["organism"]
        taxonomy = ";".join(record.annotations["taxonomy"])

        # Placeholders used until a "source" feature of this record is seen.
        strain = "UNKNOWN"
        host = "UNKNOWN"
        tax_url = "ERROR"

        for feature in record.features:
            if feature.type == "source":
                if "host" in feature.qualifiers:
                    host = feature.qualifiers["host"][0]
                if "strain" in feature.qualifiers:
                    strain = feature.qualifiers["strain"][0]
                # Fall back to a generic "Viruses" lineage when the record
                # carries no taxonomy entries.
                lineage = taxonomy if record.annotations["taxonomy"] else "Viruses"
                tax_url = host + "://" + lineage + ";" + organism + ";" + strain
            elif feature.type == "CDS":  # coding sequence
                gene_name = feature.qualifiers.get("gene", ["UNKNOWN"])[0]
                protein_id = feature.qualifiers.get("protein_id", ["UNKNOWN"])[0]
                protein_name = feature.qualifiers.get("product", ["UNKNOWN"])[0]

                # Nucleotide sequence of the CDS (not the translated protein).
                cds = str(feature.extract(record.seq))
                codon_fields = [str(count) for count in count_codons(cds)]

                row = [
                    ncbi_id,
                    tax_url,
                    organism,
                    strain,
                    host,
                    gene_name,
                    protein_id,
                    protein_name,
                ] + codon_fields
                genes_and_proteins_tsv.write("\t".join(row) + "\n")



In [10]:


# Load the tab-separated table of per-CDS codon counts into a DataFrame.
tsv_path = "./viral_genes_and_proteins.tsv"
gpdf = pd.read_csv(tsv_path, delimiter="\t")


In [12]:

# Non-null value count of every column per organism, ordered by the ncbi_id
# count (smallest first).
counts_by_organism = gpdf.groupby(["organism"]).count()
counts_by_organism.sort_values(by="ncbi_id")

Unnamed: 0_level_0,ncbi_id,tax_url,strain,host,gene,protein_id,protein_name,TTT,TTC,TTA,...,GCA,GCG,GAT,GAC,GAA,GAG,GGT,GGC,GGA,GGG
organism,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
unidentified entomopoxvirus,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Sclerotinia sclerotiorum mitovirus 1 HC025,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Sclerotinia sclerotiorum mitovirus 2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Sclerotinia sclerotiorum mitovirus 3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Sclerotinia sclerotiorum mitovirus 6,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pandoravirus quercus,1185,1185,1185,1185,1185,1185,1185,1185,1185,1185,...,1185,1185,1185,1185,1185,1185,1185,1185,1185,1185
Orpheovirus IHUMI-LCC2,1199,1199,1199,1199,1199,1199,1199,1199,1199,1199,...,1199,1199,1199,1199,1199,1199,1199,1199,1199,1199
Pandoravirus salinus,1430,1430,1430,1430,1430,1430,1430,1430,1430,1430,...,1430,1430,1430,1430,1430,1430,1430,1430,1430,1430
African swine fever virus,1557,1557,1557,1557,1557,1557,1557,1557,1557,1557,...,1557,1557,1557,1557,1557,1557,1557,1557,1557,1557


## Process codon counts across all proteins

In [None]:
# Total occurrences of each codon over all rows. The codon columns are selected
# by name: a positional slice such as columns[2:] would also pull in the text
# columns (organism, strain, host, gene, protein_id, protein_name).
final_count = gpdf[CODONS_LIST].sum()

In [None]:
# Show the contents of final_count.
print(final_count)

In [None]:

# Labels of the smallest and the largest entry in final_count, each printed
# together with its value.
min_label = final_count.idxmin()
max_label = final_count.idxmax()
print(min_label, final_count[min_label])
print(max_label, final_count[max_label])

In [None]:
# Entries of final_count in ascending order.
ascending_counts = final_count.sort_values()
print(ascending_counts)

# Sum over all entries of final_count.
overall_total = final_count.sum()
print(overall_total)


In [None]:
# Derive the total from the loaded table rather than a hand-pasted number, so
# the message always matches the data in gpdf. Each codon is three nucleotides.
total_codons = int(gpdf[CODONS_LIST].sum().sum())
print(num2words(total_codons) + " Codons in Viral Proteins")
print(num2words(total_codons * 3) + " Nucleotides in Viral Proteins")

In [None]:
# Number of rows in gpdf.
len(gpdf)

In [None]:

import nglview as nv

# Open ./sample.pdb with nglview; leaving the result as the cell's last
# expression renders it as the cell output.
pdb_view = nv.show_file("./sample.pdb")
pdb_view