In [32]:
# Imports and add ons

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pickle
import os
import Bio

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split

from pathlib import Path

In [33]:
# Load the clean data into variables
Processed_Folder = Path(r"ANN.ipynb").parent.absolute() / Path("Processed Data")

dicts = ['EFI_ID_List', 'metabolite_dict', 'Protein_seq_dict']

with open(Processed_Folder / Path('EFI_ID_List.p'), 'rb') as EFI_ID:
    EFI_ID_List = pickle.load(EFI_ID)

with open(Processed_Folder / Path('metabolite_dict.p'), 'rb') as metabolite:
    metabolite_dict = pickle.load(metabolite)

with open(Processed_Folder / Path('Protein_seq_dict.p'), 'rb') as Protein_seq:
    Protein_seq_dict = pickle.load(Protein_seq)

activations = pd.read_csv(Processed_Folder / Path('activations.csv'), index_col=0)

In [36]:
from Bio.Alphabet.IUPAC import IUPACProtein

# +1 and start at 1 in this dictionary indexing in order to allow for 0 to be used when padding
Amino_lookup_dict = {list(IUPACProtein.letters)[i]: i for i in range(0,len(IUPACProtein.letters))}

def Single_One_Hot_Encoder(Protein_Seq=str):
    # Each amino acid entry in the protein sequence is numbered 
    Sequence_list = [Amino_lookup_dict[i] for i in list(Protein_Seq)]
    

    Num_Acids = len(IUPACProtein.letters) # Number of unique amino acids possible in proteins, should be 20
    encoded_acid = np.zeros((len(Protein_Seq),Num_Acids))
    # Each column vector represents a single entry in the protein sequence

    for i in range(0,len(Protein_Seq)):
        acid = Protein_Seq[i]
        Acid_Number = Amino_lookup_dict[acid]
        encoded_acid[i,Acid_Number] = 1
    
    # Optional depends on how you want the data
    encoded_acid = encoded_acid.T

    print(encoded_acid)
    return(encoded_acid)

def One_Hot_Encoder(ID_List:list, Protein_Dict:dict):
    from Bio.Seq import Seq
    from Bio.Alphabet import IUPAC
    from Bio.Alphabet.IUPAC import IUPACProtein

    ''' Since the sequences are all different length arrays, 
    we will solve the problem by storing them in dictionaries
    '''
    
    data_dict = {}
    # Key = EFI ID
    # Value = numpy Array containing one_hot_encoding of protein sequence

    longest_acid = max(Protein_seq_dict.values(), key = len) 
    num_proteins = len(Protein_Dict)

    for i in range(0,len(ID_List)):
        ID = ID_List[i]
        P_seq = Protein_Dict[ID]
        P_seq = str(P_seq).upper()
        acid_sequence = Single_One_Hot_Encoder(Protein_Seq=P_seq)
        # Only update the sequences, so automatically 0 padding used
        encoded_data[i,0:len(acid_sequence)] = list(acid_sequence)
        
    encoded_data = encoded_data.astype(np.int)

    print(encoded_data)
    return(encoded_data)

In [37]:
P_seq = str(Protein_seq_dict[EFI_ID_List[1]]).upper()
x = Amino_Acid_Sequencer(Protein_Seq=P_seq)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [13]:
x = 'abc'
len(x)

3