---
# **Human Protein CDS**: Extract Codon Usage

---

## Imports and Globals

In [3]:
!pip3 install num2words


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
# imports

from Bio import SeqIO
import pandas as pd
from collections import Counter
from num2words import num2words

In [5]:
# Initialize Codons data

# All 64 codons in T, C, A, G table order: the first base varies slowest and
# the third base fastest ("TTT", "TTC", "TTA", "TTG", "TCT", ..., "GGG").
# The position of a codon in this list is also its column position in the
# codon-count rows produced further down.
CODONS_LIST = [
    first + second + third
    for first in "TCAG"
    for second in "TCAG"
    for third in "TCAG"
]

# One-letter amino-acid code for every codon, written in T, C, A, G order
# (first base slowest, third base fastest). "*" marks a stop codon.
_CODON_LETTERS = (
    "FFLLSSSSYY**CC*W"  # codons starting with T
    "LLLLPPPPHHQQRRRR"  # codons starting with C
    "IIIMTTTTNNKKSSRR"  # codons starting with A
    "VVVVAAAADDEEGGGG"  # codons starting with G
)

# Display name for each one-letter code.
_AMINO_ACID_NAMES = {
    "A": "Alanine",
    "C": "Cysteine",
    "D": "Aspartic Acid",
    "E": "Glutamic Acid",
    "F": "Phenylalanine",
    "G": "Glycine",
    "H": "Histidine",
    "I": "Isoleucine",
    "K": "Lysine",
    "L": "Leucine",
    "M": "Methionine",
    "N": "Asparagine",
    "P": "Proline",
    "Q": "Glutamine",
    "R": "Arginine",
    "S": "Serine",
    "T": "Threonine",
    "V": "Valine",
    "W": "Tryptophan",
    "Y": "Tyrosine",
    "*": "STOP",
}

# Codon -> "[<letter>] <name>" label, e.g. "ATG" -> "[M] Methionine".
CODONS_AMINO_ACIDS_MAP = {
    codon: "[" + letter + "] " + _AMINO_ACID_NAMES[letter]
    for codon, letter in zip(
        (b1 + b2 + b3 for b1 in "TCAG" for b2 in "TCAG" for b3 in "TCAG"),
        _CODON_LETTERS,
    )
}

## Function that counts codons in a given CDS string

In [6]:
def count_codons(cds):
    """Count how often each codon of ``CODONS_LIST`` occurs in a coding sequence.

    The sequence is read in frame from position 0 as consecutive,
    non-overlapping triplets. Triplets that are not in ``CODONS_LIST``
    (for example a trailing fragment shorter than three bases, or a triplet
    containing a letter other than T/C/A/G) are ignored.

    Parameters
    ----------
    cds : str
        Nucleotide sequence. It is converted with ``str()`` and upper-cased
        before counting, so lowercase input is counted as well.

    Returns
    -------
    list of int
        One count per codon, in the same order as ``CODONS_LIST``.
    """
    sequence = str(cds).upper()
    triplet_counts = Counter(
        sequence[start:start + 3] for start in range(0, len(sequence), 3)
    )
    # Counter returns 0 for keys it has never seen, so absent codons need no
    # special-casing.
    return [triplet_counts[codon] for codon in CODONS_LIST]

## Extract gene information

In [10]:
#
# Re-create the TSV file: running this cell OVERWRITES
# ./human_genes_and_proteins.tsv with one row per retained CDS record.
#
# Keys kept from the "[key=value]" fields of each FASTA description line.
wanted_keys = ("gene", "protein", "protein_id")

proteins = []

with open("./cds_from_genomic.fna") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        desc = record.description
        # Keep the second "|"-separated piece of the record id.
        tokens = ["id=" + record.id.split("|")[1]]
        # Text between each "[" and the following "]" is one "key=value" field.
        annotations = [field.split("]")[0] for field in desc.split("[")[1:]]
        tokens += [ann for ann in annotations if ann.split("=")[0] in wanted_keys]
        tokens.append("cds=" + str(record.seq))
        proteins.append(tokens)

# Keep only records with exactly five tokens: id, three matched annotations, cds.
coding_proteins = [protein for protein in proteins if len(protein) == 5]
# Strip the "key=" prefix; maxsplit=1 keeps values that themselves contain "=".
just_gene_protein_id = [[value.split("=", 1)[1] for value in rec] for rec in coding_proteins]

# The context manager closes (and therefore flushes) the file, so the complete
# table is on disk before any later cell reads it back.
with open("./human_genes_and_proteins.tsv", "w") as genes_and_proteins_tsv:
    genes_and_proteins_tsv.write(
        "ncbi_id\tgene\tprotein_id\tprotein_name\t" + "\t".join(CODONS_LIST) + "\n"
    )
    for p in just_gene_protein_id:
        # Assumes each description lists gene, protein, protein_id in that
        # order, i.e. p = [ncbi_id, gene, protein_name, protein_id, cds]
        # -- TODO confirm against the input file.
        genes_and_proteins_tsv.write(
            p[0] + "\t" + p[1] + "\t" + p[3] + "\t" + p[2] + "\t"
            + "\t".join(map(str, count_codons(p[4]))) + "\n"
        )


In [6]:


# Load the tab-separated codon-count table into a DataFrame.
gpdf = pd.read_table("./human_genes_and_proteins.tsv")

# Per gene: number of non-null entries in every other column.
gpdf.groupby(by=["gene"]).count()

Unnamed: 0_level_0,protein,TTT,TTC,TTA,TTG,TCT,TCC,TCA,TCG,TAT,...,GCA,GCG,GAT,GAC,GAA,GAG,GGT,GGC,GGA,GGG
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
A1CF,14,14,14,14,14,14,14,14,14,14,...,14,14,14,14,14,14,14,14,14,14
A2M,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
A2ML1,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
A3GALT2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
ZYG11B,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
ZYX,8,8,8,8,8,8,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8
ZZEF1,8,8,8,8,8,8,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8


## Python code to process protein Codons

In [82]:
# Select the codon columns by name rather than by position: the TSV header
# written by this notebook has four leading metadata columns (ncbi_id, gene,
# protein_id, protein_name), so a positional slice such as columns[2:] would
# pull text columns into the sum.
final_count = gpdf[CODONS_LIST].sum()

In [83]:
# Show the per-column totals computed from gpdf in the previous cell.
print(final_count)

TTT    1611118
TTC    1627492
TTA     833315
TTG    1278242
TCT    1607659
        ...   
GAG    3771832
GGT    1025541
GGC    1867973
GGA    1616254
GGG    1442185
Length: 64, dtype: int64


In [103]:

# Least- and most-frequent codon, each printed next to its total count.
rarest_codon = final_count.idxmin()
commonest_codon = final_count.idxmax()
print(rarest_codon, final_count[rarest_codon])
print(commonest_codon, final_count[commonest_codon])

TAG 30731
GAG 3771832


In [86]:
# Totals in ascending order: the smallest counts come first, the largest last.
print(final_count.sort_values())

# Grand total over all entries of final_count.
print(final_count.sum())


TAG      30731
TAA      38263
TGA      69014
TCG     385999
CGT     431421
        ...   
AAG    3016615
GAA    3189350
CAG    3384175
CTG    3397427
GAG    3771832
Length: 64, dtype: int64
94510990


In [104]:
# Derive the total from final_count instead of hard-coding 94510990, so the
# sentences below stay correct when the input data changes.
# int(...) hands num2words a plain Python int, as the original literal did.
total_codons = int(final_count.sum())
print(num2words(total_codons) + " Codons in Human Proteins")
# Three nucleotides per counted codon.
print(num2words(total_codons * 3) + " Nucleotides in Human Proteins")

ninety-four million, five hundred and ten thousand, nine hundred and ninety Codons in Human Proteins
two hundred and eighty-three million, five hundred and thirty-two thousand, nine hundred and seventy Nucleotides in Human Proteins


In [None]:
# Number of rows in gpdf, i.e. data rows read from the TSV file.
len(gpdf)

In [111]:

# NOTE(review): this import sits outside the notebook's imports cell, and the
# cell reads only ./sample.pdb -- it uses none of the codon data computed
# above. Presumably a leftover experiment; confirm whether it belongs here.
import nglview as nv

# Pass the PDB file to nglview; the widget returned by the last expression is
# what the notebook displays.
nv.show_file("./sample.pdb")

NGLWidget()