## Create a one-hot encoding map for amino acids

This map is intended to transform peptide sequences into the input format for a PyTorch `Embedding` layer: each amino acid letter is assigned an integer index (not a one-hot vector). The padding character is `-`, and it is assigned index `0`.

## Imports

#### Standard pydata imports

This also creates a default `logger` for use in the notebook.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from argparse import Namespace

# Set up the notebook-wide `logger`. The pyllars IPython logger is preferred;
# if it cannot be created, fall back to a standard-library logger so that
# `logger` is always defined for the rest of the notebook.
try:
    import pyllars.logging_utils as logging_utils
    logger = logging_utils.get_ipython_logger()
    logger.setLevel('INFO')
except Exception:
    # `Exception` (rather than a bare `except`) keeps this best-effort while
    # still letting KeyboardInterrupt / SystemExit propagate.
    import logging
    logger = logging.getLogger(__name__)
    logger.setLevel('INFO')

# standard pydata imports
import joblib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pathlib
import seaborn as sns; sns.set(style='white', color_codes=True)
import tqdm
import yaml

INFO     : NumExpr defaulting to 4 threads.


#### Custom imports

In [2]:
import pyllars.collection_utils as collection_utils
import pyllars.mpl_utils as mpl_utils
import pyllars.pandas_utils as pd_utils

import lifesci.amino_acid_utils as aa_utils

#### Constants

In [3]:
# Directory for intermediate data; the output path in `args.out` is built
# relative to it.
BASE = pathlib.Path("/prj/peptide-encoder/data/intermediate/")

#### Functions

## Start code

In [4]:
# Notebook "arguments" gathered in a single namespace; `out` is the file the
# encoding map is written to.
args = Namespace(out=BASE / "oh-aa-encoding-map.jpkl")

In [9]:
# Work on a copy of the amino-acid letters so the collection owned by
# `aa_utils` is never touched from this notebook.
aa_letters = aa_utils.aa_letters.copy()

# Index 0 is reserved for the padding character, so the amino acids are
# numbered from 1. `enumerate` yields plain Python ints, which keeps NumPy
# scalar objects out of the map (and out of the file it is later saved to).
oh_aa_encoding_map = {
    aa: idx for idx, aa in enumerate(aa_letters, start=1)
}

# Padding character.
oh_aa_encoding_map['-'] = 0

# Display the finished map as the cell output.
oh_aa_encoding_map

{'A': 1,
 'R': 2,
 'N': 3,
 'D': 4,
 'C': 5,
 'E': 6,
 'Q': 7,
 'G': 8,
 'H': 9,
 'I': 10,
 'L': 11,
 'K': 12,
 'M': 13,
 'F': 14,
 'P': 15,
 'S': 16,
 'T': 17,
 'W': 18,
 'Y': 19,
 'V': 20,
 '-': 0}

In [10]:
# Serialize the encoding map to the path in `args.out` using joblib.
joblib.dump(oh_aa_encoding_map, args.out)

['/prj/peptide-encoder/data/intermediate/oh-aa-encoding-map.jpkl']