## Create modified one-hot encoded protein sequences

In [1]:
# from collections import defaultdict
import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
import numpy as np
import pickle
import gzip

from sklearn.preprocessing import OneHotEncoder

import sys
sys.path.insert(0, '../src')
from pssm_scoring import *
from sequence_removal import *

In [2]:
# Load the dataset of proteins with different localization signals, including NLS.
# Columns shown in the preview below: ACC, AnnotEncoded, Sequence, Types, Length.
new_df = pd.read_csv('../data/finalized_df_cleaned.csv')
print(new_df.head())

      ACC                                       AnnotEncoded  \
0  O75439  3333333333333333333333333333333333333333333330...   
1  Q2TBK2  3333333333333333333333333333333333333333333333...   
2  Q5VY80  0000000000000000000000000000000000000000000000...   
3  Q9BZM6  0000000000000000000000000000000000000000000000...   
4  O75489  3333333333333333333333333333333333330000000000...   

                                            Sequence Types  Length  
0  MAAAAARVVLSSAARRRLWGFSESLLIRGAAGRSLYFGENRLRSTQ...    MT     489  
1  MAAAAFAVPRGVQLRVLTERLLRGGVRELLRPRLSGSTPGSERDFS...    MT     268  
2  MAAAAIPALLLCLPLLFLLFGWSRARRDDPHSLCYDITVIPKFRPG...   GPI     246  
3  MAAAASPAFLLCLPLLHLLSGWSRAGWVDTHCLCYDFIITPKSRPE...   GPI     244  
4  MAAAAVARLWWRGILGASALTRGTGRPSVLLLPVRRESAGADTRPT...    MT     264  


In [3]:
# Clean data: pass the frame and the name of its sequence column to the project helper
# remove_sequences (star-imported from ../src).
# NOTE(review): presumably this drops rows with unwanted sequences — confirm in ../src.
cleaned_new_df = remove_sequences(new_df, 'Sequence')

In [4]:
# Load the nuclear PSSM object from the file.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only point this
# at a trusted, locally generated nls_pssm.pkl.
with open('../data/nls_pssm.pkl', 'rb') as f:
    nls_pssm = pickle.load(f)

In [5]:
"""
Function that computes the one-hot encoding of a protein sequence. We modify the 1's by the pssm scores of the amino acids.
"""

# Alphabet used for the one-hot columns; the column order follows this string
standard_amino_acids = 'ACDEFGHIKLMNPQRSTVWY'  # 20 standard amino acids
list_standard_amino_acids = list(standard_amino_acids)

# Shared encoder with a fixed category list, so every sequence gets the same 20 columns.
# sparse_output=False makes it return a dense NumPy array instead of a sparse matrix.
encoder = OneHotEncoder(categories =[list_standard_amino_acids], sparse_output=False)

def create_modified_one_hot_encoding(df, column_name, target_length=1000):
    """
    Build PSSM-weighted one-hot encodings for every sequence in a DataFrame column.

    Each sequence is one-hot encoded over ``list_standard_amino_acids`` with the
    module-level ``encoder``. Row ``i`` of that matrix is then multiplied by the
    ``i``-th value returned by ``nls_pssm.calculate(sequence)``; rows for which
    no score is returned (the tail of the sequence) are multiplied by zero.
    Finally each matrix is zero-padded or truncated to ``target_length`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the protein sequences.
    column_name : str
        Name of the column in ``df`` that contains the sequence strings.
    target_length : int, default 1000
        Number of rows every returned matrix has.

    Returns
    -------
    list of numpy.ndarray
        One ``(target_length, n_amino_acids)`` array per row of ``df``, in the
        same order as ``df[column_name]``.

    Raises
    ------
    ValueError
        Raised by the encoder (default ``handle_unknown='error'``) when a
        sequence contains a character outside ``list_standard_amino_acids``.
    """
    # The categories are fixed, so fitting does not depend on the sequence:
    # fit once per call instead of refitting for every sequence.
    encoder.fit(np.array(list_standard_amino_acids).reshape(-1, 1))

    modified_one_hot_encoded_list = []

    for protein_seq in df[column_name]:
        data = np.array(list(protein_seq)).reshape(-1, 1)
        one_hot_encoded = encoder.transform(data)
        seq_length = one_hot_encoded.shape[0]

        # Obtain PSSM scores and bring them to exactly one value per residue.
        # The number of missing trailing scores is derived from the data rather
        # than hard-coded, so this does not depend on the PSSM window width.
        pssm_scores = nls_pssm.calculate(protein_seq)
        n_missing = max(seq_length - np.size(pssm_scores), 0)
        position_scores = np.append(pssm_scores, np.zeros(n_missing))[:seq_length]

        # Scale each one-hot row by the score of its position
        modified_one_hot_encoded = one_hot_encoded * position_scores[:, None]

        # Pad with zero rows or truncate so the matrix has exactly target_length rows
        fixed_length = np.zeros(
            (target_length, modified_one_hot_encoded.shape[1]),
            dtype=modified_one_hot_encoded.dtype,
        )
        n_kept = min(seq_length, target_length)
        fixed_length[:n_kept] = modified_one_hot_encoded[:n_kept]

        modified_one_hot_encoded_list.append(fixed_length)

    return modified_one_hot_encoded_list


def create_one_hot_encoding(df, column_name, target_length=1000):
    """
    Build plain one-hot encodings for every sequence in a DataFrame column.

    Each sequence is one-hot encoded over ``list_standard_amino_acids`` with the
    module-level ``encoder`` and then zero-padded or truncated to
    ``target_length`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the protein sequences.
    column_name : str
        Name of the column in ``df`` that contains the sequence strings.
    target_length : int, default 1000
        Number of rows every returned matrix has.

    Returns
    -------
    list of numpy.ndarray
        One ``(target_length, n_amino_acids)`` array per row of ``df``, in the
        same order as ``df[column_name]``.

    Raises
    ------
    ValueError
        Raised by the encoder (default ``handle_unknown='error'``) when a
        sequence contains a character outside ``list_standard_amino_acids``.
    """
    # The categories are fixed, so fitting does not depend on the sequence:
    # fit once per call instead of refitting for every sequence.
    encoder.fit(np.array(list_standard_amino_acids).reshape(-1, 1))

    one_hot_encoded_list = []

    for protein_seq in df[column_name]:
        data = np.array(list(protein_seq)).reshape(-1, 1)
        one_hot_encoded = encoder.transform(data)

        # Pad with zero rows or truncate so the matrix has exactly target_length rows
        fixed_length = np.zeros(
            (target_length, one_hot_encoded.shape[1]),
            dtype=one_hot_encoded.dtype,
        )
        n_kept = min(one_hot_encoded.shape[0], target_length)
        fixed_length[:n_kept] = one_hot_encoded[:n_kept]

        # Append the fixed-length one-hot encoding to the list
        one_hot_encoded_list.append(fixed_length)

    return one_hot_encoded_list


In [6]:
# Encode every cleaned sequence with the PSSM-weighted one-hot scheme
modified_one_hot_encoding_list = create_modified_one_hot_encoding(cleaned_new_df, 'Sequence')
# Plain one-hot variant (disabled); note the cleaned frame is named cleaned_new_df
# one_hot_encoding_list = create_one_hot_encoding(cleaned_new_df, 'Sequence')

In [16]:
# Assemble the dataset: accession, encoding matrix, and a binary label that is 1
# when the Types string contains 'NLS' (missing Types count as 0)
encoded_new_df = cleaned_new_df[['ACC']].assign(
    Encoding=modified_one_hot_encoding_list,
    Label=cleaned_new_df['Types'].str.contains('NLS', na=False).astype(int),
)

encoded_new_df.head()

Unnamed: 0,ACC,Encoding,Label
0,O75439,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0
1,Q2TBK2,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0
2,Q5VY80,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0
3,Q9BZM6,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0
4,O75489,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0


In [18]:
# Save the encoded DataFrame as a gzip-compressed pickle
with gzip.open('../data/cnn_input_data.pkl.gz', 'wb') as f:
    pickle.dump(encoded_new_df, f)


In [None]:
# naive_encoded_new_df = pd.DataFrame({'Encoding': one_hot_encoding_list})
# naive_encoded_new_df['Label'] = cleaned_new_df['Types'].str.contains('NLS', na=False).astype(int)


# with gzip.open('naive_encoded_cnn_input_data.pkl.gz', 'wb') as f:
#     pickle.dump(naive_encoded_new_df, f)

# Create testing datasets

In [9]:
# Load the nuclear (NLS) and non-nuclear protein sequence tables used for the testing set
nuclear_protein_df = pd.read_csv('../data/data_NLS.csv')
non_nuclear_proteins_df = pd.read_csv('../data/data_non_nuclear_proteins.csv')

In [10]:
# Clean data

# Cutoff handed to remove_short_sequences below.
# NOTE(review): presumably sequences shorter than this are dropped — confirm in ../src.
length_cutoff = 40

# The nuclear table keeps its sequences in 'Sequence_y', the non-nuclear one in 'Sequence'.
# NOTE(review): only the non-nuclear set goes through remove_short_sequences — confirm
# the nuclear set does not need the same cutoff.
nuclear_protein_df_cleaned = remove_sequences(nuclear_protein_df,'Sequence_y')
non_nuclear_proteins_df_cleaned = remove_short_sequences(remove_sequences(non_nuclear_proteins_df, 'Sequence'),length_cutoff)

In [11]:
# Downsample the non-nuclear proteins so both classes have the same number of rows.
# DataFrame.sample draws without replacement by default, so this raises ValueError if the
# cleaned non-nuclear set has fewer rows than the cleaned nuclear set.

downsampled_non_nuclear_df = non_nuclear_proteins_df_cleaned.sample(n=len(nuclear_protein_df_cleaned), random_state= 30)  # Match the number of nuclear samples; fixed seed keeps the draw reproducible


# Encode both classes with the PSSM-weighted one-hot scheme
nls_encoding_list = create_modified_one_hot_encoding(nuclear_protein_df_cleaned, 'Sequence_y')
non_nls_encoding_list = create_modified_one_hot_encoding(downsampled_non_nuclear_df, 'Sequence')

In [12]:
# Check that there are equal numbers of nuclear and non-nuclear sequences
print(len(nls_encoding_list),len(non_nls_encoding_list))

1357 1357


In [13]:
# Label each set of encodings (1 = NLS, 0 = non-NLS), stack them, then shuffle
encoded_nls_df = pd.DataFrame({'Encoding' : nls_encoding_list}).assign(Label=1)
encoded_non_nls_df = pd.DataFrame({'Encoding' : non_nls_encoding_list}).assign(Label=0)

encoded_all_df = pd.concat([encoded_nls_df, encoded_non_nls_df])

# frac=1 returns every row in random order; the fixed seed keeps the order reproducible
shuffled_encoded_all_df = (
    encoded_all_df
    .sample(frac=1, random_state=50)
    .reset_index(drop=True)
)

print(shuffled_encoded_all_df.head(10))


                                            Encoding  Label
0  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
1  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
2  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
3  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
4  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
5  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
6  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
7  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
8  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
9  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0


In [14]:
# Save the shuffled, labelled testing set as a gzip-compressed pickle
with gzip.open('../data/cnn_testing_data.pkl.gz', 'wb') as f:
    pickle.dump(shuffled_encoded_all_df , f)


In [None]:
## Create the standard one-hot encoding
# nls_naive_encoding_list = create_one_hot_encoding(nuclear_protein_df_cleaned, 'Sequence_y')
# non_nls_naive_encoding_list = create_one_hot_encoding(downsampled_non_nuclear_df, 'Sequence')

# naive_encoded_nls_df = pd.DataFrame({'Encoding' : nls_naive_encoding_list})
# naive_encoded_nls_df['Label'] = 1

# naive_encoded_non_nls_df = pd.DataFrame({'Encoding' : non_nls_naive_encoding_list})
# naive_encoded_non_nls_df['Label'] = 0

# naive_encoded_all_df = pd.concat([naive_encoded_nls_df, naive_encoded_non_nls_df], axis=0)
# shuffled_naive_encoded_all_df = naive_encoded_all_df.sample(frac=1, random_state=50).reset_index(drop=True)

# print(shuffled_naive_encoded_all_df.head(10))



In [None]:
# with gzip.open('testing_data_naive_encoded.pkl.gz', 'wb') as f:
#     pickle.dump(shuffled_naive_encoded_all_df , f)