In [29]:
# Imports and add ons

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pickle
import os
import Bio

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split

from pathlib import Path

In [30]:
# Load the cleaned inputs from the "Processed Data" folder in the working directory
Processed_Folder = Path.cwd() / "Processed Data"

# Names of the pickled objects read below (the list itself is not used in this cell)
dicts = ['EFI_ID_List', 'metabolite_dict', 'Protein_seq_dict']

# NOTE: pickle.load can run arbitrary code - only open files produced by this project.
with (Processed_Folder / 'EFI_ID_List.p').open('rb') as EFI_ID:
    EFI_ID_List = pickle.load(EFI_ID)

with (Processed_Folder / 'metabolite_dict.p').open('rb') as metabolite:
    metabolite_dict = pickle.load(metabolite)

with (Processed_Folder / 'Protein_seq_dict.p').open('rb') as Protein_seq:
    Protein_seq_dict = pickle.load(Protein_seq)

# First CSV column becomes the DataFrame index
activations = pd.read_csv(Processed_Folder / 'activations.csv', index_col=0)

In [42]:
# One-letter codes of the 20 standard amino acids, in IUPAC order.
# This is the string that `Bio.Alphabet.IUPAC.IUPACProtein.letters` used to hold
# (still available as `Bio.Data.IUPACData.protein_letters`). `Bio.Alphabet` was
# removed in Biopython 1.78, so the letters are spelled out here rather than
# imported from a module that no longer exists.
PROTEIN_LETTERS = "ACDEFGHIKLMNPQRSTVWY"

# Codes start at 1 so that 0 stays free to be used as the padding value.
Amino_lookup_dict = {letter: code for code, letter in enumerate(PROTEIN_LETTERS, start=1)}

def Amino_Acid_Sequencer(Protein_Seq: str):
    """Translate a protein sequence into a list of integer codes.

    Parameters
    ----------
    Protein_Seq : str
        One-letter residue codes. Letters are looked up as-is in
        ``Amino_lookup_dict``, so the caller is responsible for matching its
        case.

    Returns
    -------
    list
        One ``Amino_lookup_dict`` value per letter, in sequence order.

    Raises
    ------
    KeyError
        If a letter is not a key of ``Amino_lookup_dict``.
    """
    # `Protein_Seq: str` is an annotation; the previous `Protein_Seq=str` made
    # the *type object* the default value, which could never be encoded.
    return [Amino_lookup_dict[residue] for residue in Protein_Seq]

def Amino_Sequence_Encoder(ID_List: list, Protein_Dict: dict, verbose: bool = True):
    """Encode protein sequences as a zero-padded integer matrix.

    Row ``i`` holds the codes returned by ``Amino_Acid_Sequencer`` for the
    sequence stored under ``ID_List[i]``. Positions past the end of a sequence
    are left at 0, which serves as the padding value.

    Parameters
    ----------
    ID_List : list
        Keys of ``Protein_Dict``, in the row order wanted in the output.
    Protein_Dict : dict
        Maps each ID to its sequence. Each value is converted with ``str()``
        and upper-cased before encoding.
    verbose : bool, default True
        Print the encoded matrix before returning it (the previous behaviour).

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(ID_List), width)`` where ``width`` is the
        length of the longest sequence in ``Protein_Dict``.

    Raises
    ------
    KeyError
        If an ID is missing from ``Protein_Dict`` or a sequence contains a
        letter that ``Amino_Acid_Sequencer`` cannot look up.
    """
    # Width is taken from the dictionary that was passed in (not from a
    # notebook-level global), so the function works on any input.
    # `default=0` keeps an empty dictionary from raising ValueError.
    width = max(map(len, Protein_Dict.values()), default=0)

    # One row per requested ID. Allocating as int up front replaces the old
    # float allocation + `.astype(np.int)`; `np.int` was removed in NumPy 1.24.
    encoded_data = np.zeros((len(ID_List), width), dtype=int)

    for row, seq_id in enumerate(ID_List):
        residues = str(Protein_Dict[seq_id]).upper()
        codes = Amino_Acid_Sequencer(Protein_Seq=residues)
        # Only the leading positions are written, so the tail stays 0-padded.
        encoded_data[row, :len(codes)] = codes

    if verbose:
        print(encoded_data)
    return encoded_data

In [36]:
# Longest sequence (by length) among all loaded proteins; raises ValueError if the dict is empty.
longest_acid = max(Protein_seq_dict.values(), key = len) 

In [44]:
# Encode every protein listed in EFI_ID_List into the zero-padded integer matrix
# (the encoder also prints the matrix, which is the output shown below).
input_data = Amino_Sequence_Encoder(ID_List=EFI_ID_List, Protein_Dict=Protein_seq_dict)

[[11 17  8 ...  0  0  0]
 [11  9 10 ...  0  0  0]
 [11  3 17 ...  0  0  0]
 ...
 [11 16  7 ...  0  0  0]
 [11  4 17 ...  0  0  0]
 [11 16 10 ...  0  0  0]]


In [45]:
# Show the longest protein sequence computed in an earlier cell, for inspection.
longest_acid

'mkrqwkfivvlvlgaiglitdnvlhipgiaevlfdiagayisfslgkemfgdlkeghwgidvlaliavismmitrdywaewmilvmstggesledyatgqanrelralldknprvagklvdgkvvevkvddlqigdqvlikpgqqvpvdgtiiegssvfdqssltgesvpvdktvgddlmsgslngetavtmevkklakdseyqtivelvkssaaqpakfvkmadryavpftiisliigiaawvttgnftrfaevmvvaspcplliaapvalvagmssmskhhiivksgstleqlakaktfafdktgtltqnqlviqdvlpengfdketiqsyaasleqqsdhiianslvqgtdknliqavsnlqestgngvsgtvdgknvmvgklsyvapdanvnkakttavyvsvdgkyagcitfkdrlrpetpqtlarlrkqgakhimmltgdnkdvaqaiadvagvddvrasllpaqkieaiknvapenrpvvmtgdgindapsltaadvgiamgakgasaasesadavimvndlskindavaiakhtmkvaeigiitaivvviilelvaftglipafwgavlqevvdmisiclallaktepknpkqtgl'