In [2]:
import torch
from typing import Dict, List

In [3]:
CODON_TABLE : Dict[str, List[str]] = {
    'A': ['GCU', 'GCC', 'GCA', 'GCG'],
    'C': ['UGU', 'UGC'],
    'D': ['GAU', 'GAC'],
    'E': ['GAA', 'GAG'],
    'F': ['UUU', 'UUC'],
    'G': ['GGU', 'GGC', 'GGA', 'GGG'],
    'H': ['CAU', 'CAC'],
    'I': ['AUU', 'AUC', 'AUA'],
    'K': ['AAA', 'AAG'],
    'L': ['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG'],
    'M': ['AUG'],
    'N': ['AAU', 'AAC'],
    'P': ['CCU', 'CCC', 'CCA', 'CCG'],
    'Q': ['CAA', 'CAG'],
    'R': ['CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
    'S': ['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC'],
    'T': ['ACU', 'ACC', 'ACA', 'ACG'],
    'V': ['GUU', 'GUC', 'GUA', 'GUG'],
    'W': ['UGG'],
    'Y': ['UAU', 'UAC'],
    '*': ['UAA', 'UAG', 'UGA'],  # Stop codons
}

# Dictionary ambiguous amino acids to standard amino acids
AMBIGUOUS_AMINOACID_MAP: Dict[str, list[str]] = {
    "B": ["N", "D"],  # Asparagine (N) or Aspartic acid (D)
    "Z": ["Q", "E"],  # Glutamine (Q) or Glutamic acid (E)
    "X": ["A"],  # Any amino acid (typically replaced with Alanine)
    "J": ["L", "I"],  # Leucine (L) or Isoleucine (I)
    "U": ["C"],  # Selenocysteine (typically replaced with Cysteine)
    "O": ["K"],  # Pyrrolysine (typically replaced with Lysine)
}

AA_LIST = list(CODON_TABLE.keys())

In [None]:
def protein_to_tensor(protein):

    amino_acid_counts = [0] * len(protein)

    if protein is None or protein == '':
        return torch.tensor(amino_acid_counts, dtype=torch.float)
    
    for amino_acid in protein:
        if amino_acid in AA_LIST:
            idx = AA_LIST.index(amino_acid)
            amino_acid_counts[idx] += 1

    return torch.tensor(amino_acid_counts, dtype=torch.float)

In [9]:
protein = 'MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGG'

In [10]:
t = protein_to_tensor(protein)
t

tensor([ 8.,  2.,  2.,  5.,  3.,  7.,  2.,  0.,  1., 14.,  2.,  1.,  4.,  3.,
         4.,  1.,  2.,  5.,  2.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [12]:
len(protein)

70

In [13]:
amino_acid_counts = [0] * len(protein)

for amino_acid in protein:
    if amino_acid in AA_LIST:
        idx = AA_LIST.index(amino_acid)
        amino_acid_counts[idx] += 1