In [4]:
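# Imports and add-ons
# NOTE(review): the Bio.Alphabet imports below require Biopython < 1.78; the module was removed in later releases.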

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pickle
import os
import Bio

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.Alphabet.IUPAC import IUPACProtein

from pathlib import Path

In [11]:
# Load the cleaned data sets from the "Processed Data" folder into variables.
# The notebook file name has no directory component, so `.parent.absolute()`
# resolves to the current working directory.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
Processed_Folder = Path(r"One_Hot_Encoder.ipynb").parent.absolute() / "Processed Data"

# Names of the pickled objects loaded below (not referenced again in this cell).
dicts = ['EFI_ID_List', 'metabolite_dict', 'Protein_seq_dict']

# List of EFI IDs.
with (Processed_Folder / 'EFI_ID_List.p').open('rb') as EFI_ID:
    EFI_ID_List = pickle.load(EFI_ID)

# Metabolite dictionary.
with (Processed_Folder / 'metabolite_dict.p').open('rb') as metabolite:
    metabolite_dict = pickle.load(metabolite)

# Protein sequence dictionary (indexed by ID in the encoder further down).
with (Processed_Folder / 'Protein_seq_dict.p').open('rb') as Protein_seq:
    Protein_seq_dict = pickle.load(Protein_seq)

# Activation table; the first CSV column becomes the index.
activations = pd.read_csv(Processed_Folder / 'activations.csv', index_col=0)

In [12]:
# Log number of amino acids in each protein into dictionary
def amino_acid_counter(Protein_Seq):
    """Count each of the 20 standard amino acids in a protein sequence.

    The sequence is upper-cased before counting, so the result is
    case-insensitive. Characters outside the 20 standard one-letter codes
    (e.g. ``X``, ``B``, ``*``) are ignored.

    Parameters
    ----------
    Protein_Seq : str
        Protein sequence in one-letter code. Any object providing
        ``.upper()`` and ``.count()`` with string semantics also works.

    Returns
    -------
    dict
        Maps each letter of ``"ACDEFGHIKLMNPQRSTVWY"`` (in that order) to the
        number of times it occurs in ``Protein_Seq``.
    """
    # Same letters, in the same order, as Biopython's IUPACProtein.letters.
    # Spelled out here so the function does not depend on Bio.Alphabet,
    # which was removed in Biopython 1.78.
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"

    my_prot = Protein_Seq.upper()

    # Dict insertion order follows `amino_acids`, which fixes the column order
    # for callers that use `.values()`.
    return {amino_acid: my_prot.count(amino_acid) for amino_acid in amino_acids}

In [13]:
# Encodes protein sequence using one-hot method and counting
def one_hot_encoder_count(ID_List:list, Protein_Dict:dict):
    """Build an amino-acid count matrix for the given protein IDs.

    Despite the name, each row holds per-residue counts (as returned by
    ``amino_acid_counter``), not a positional one-hot encoding.

    Parameters
    ----------
    ID_List : list
        Protein IDs; row ``i`` of the result belongs to ``ID_List[i]``.
    Protein_Dict : dict
        Maps each ID to its protein sequence.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(ID_List), n_amino_acids)``. The matrix
        is also printed before it is returned.

    Raises
    ------
    KeyError
        If an ID in ``ID_List`` is not a key of ``Protein_Dict``.
    """
    # Take the column count from the counter itself so the matrix width can
    # never disagree with the dictionaries written into it.
    num_acids = len(amino_acid_counter(Protein_Seq=""))

    # One row per requested ID. Sizing by len(Protein_Dict) overflowed or left
    # all-zero rows whenever the two lengths differed.
    encoded_data = np.zeros((len(ID_List), num_acids), dtype=int)

    for i, ID in enumerate(ID_List):
        acid_dict = amino_acid_counter(Protein_Seq=Protein_Dict[ID])
        encoded_data[i, :] = list(acid_dict.values())

    print(encoded_data)
    return encoded_data

In [14]:
# Amino-acid count matrix: one row per EFI ID.
input_matrix = one_hot_encoder_count(EFI_ID_List, Protein_seq_dict)

# Label rows with the EFI IDs and columns with the IUPAC protein letters.
input_df = pd.DataFrame(
    input_matrix,
    index=EFI_ID_List,
    columns=[*IUPACProtein.letters],
)

[[ 8  1 10 ...  5  4  5]
 [21  3 11 ... 14  3  6]
 [25  7 12 ... 12  6  0]
 ...
 [65  4 39 ... 49  4  3]
 [19  1 11 ... 13  0  4]
 [29  3 18 ... 11  5  2]]
