## Turn nuclear and non nuclear protein sequences into feature vectors of size n=20 using pssm

In [1]:
from collections import defaultdict
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle

import sys
sys.path.insert(0, './src')
from pssm_scoring import *
from sequence_removal import *

In [2]:
# Find the top n scores
def compute_top_n_scores(array, length=20):
    top_n = np.partition(array, -length)[-length:]
    top_n_sorted = np.sort(top_n)[::-1]  # Sort and reverse to show largest first
    return top_n_sorted

In [3]:
# Create a DataFrame from the list of top n scores and label
def create_feature_vectors(df, pssm, column_name, feature_vec_length, label ):
    top_n_scores_list = [compute_top_n_scores(pssm.calculate(protein_seq),feature_vec_length) for protein_seq in df[column_name]]
    scores_df = pd.DataFrame(top_n_scores_list, columns=[f'Score_{i+1}' for i in range(feature_vec_length)])
    scores_df['Label'] = label
    return scores_df


In [4]:
# Load the nuclear pssm from the file
with open('data/nls_pssm.pkl', 'rb') as f:
    nls_pssm = pickle.load(f)

In [5]:
# Load nuclear and non-nuclear protein sequences
nuclear_protein_df = pd.read_csv('data/data_NLS.csv')
non_nuclear_proteins_df = pd.read_csv('data/data_non_nuclear_proteins.csv')

In [6]:
# Define the length of the feature vectors
feature_vec_length = 20
length_cutoff = feature_vec_length + 20

In [7]:
# Clean data
nuclear_protein_df_cleaned = remove_sequences(nuclear_protein_df,'Sequence_y')
non_nuclear_proteins_df_cleaned = remove_short_sequences(remove_sequences(non_nuclear_proteins_df, 'Sequence'),length_cutoff)

In [8]:
# Create nuclear and non-nuclear feature vectors with labels
NLS_feature_df = create_feature_vectors(nuclear_protein_df_cleaned,nls_pssm, 'Sequence_y',feature_vec_length, 1)
non_NLS_feature_df = create_feature_vectors(non_nuclear_proteins_df_cleaned,nls_pssm, 'Sequence', feature_vec_length, 0)

In [9]:
NLS_feature_df.sample(5)

Unnamed: 0,Score_1,Score_2,Score_3,Score_4,Score_5,Score_6,Score_7,Score_8,Score_9,Score_10,...,Score_12,Score_13,Score_14,Score_15,Score_16,Score_17,Score_18,Score_19,Score_20,Label
327,8.091913,7.089682,5.21138,4.072215,3.116793,2.629029,2.158111,2.150467,1.927975,1.890708,...,1.351728,1.257393,1.151911,0.629349,0.313196,0.295512,0.057089,0.049096,-0.098203,1
180,24.2966,19.652372,19.294035,18.255049,18.19759,18.178928,17.280661,17.249743,16.55913,16.178709,...,15.747438,15.551126,14.307144,13.836783,13.834719,13.570904,13.221354,12.934178,12.266491,1
350,3.212575,0.847767,-0.697915,-1.116895,-1.749096,-2.091114,-2.462132,-2.5976,-2.999829,-3.069537,...,-3.627649,-3.722806,-3.842481,-4.187544,-4.526583,-4.648609,-4.673775,-4.779947,-4.964108,1
589,15.614695,14.208913,14.040953,13.919377,13.398337,12.446679,12.434358,12.257321,12.091343,11.884075,...,11.459797,10.560224,10.338562,10.253052,10.159348,10.143777,9.900549,9.304184,9.079195,1
580,7.209533,7.18223,3.884188,3.738303,3.632991,2.897428,2.832086,2.182937,1.522273,0.438137,...,-0.149894,-0.303083,-0.734945,-0.770374,-0.828553,-1.00292,-1.175865,-1.355783,-1.563299,1


In [10]:
non_NLS_feature_df.sample(5)

Unnamed: 0,Score_1,Score_2,Score_3,Score_4,Score_5,Score_6,Score_7,Score_8,Score_9,Score_10,...,Score_12,Score_13,Score_14,Score_15,Score_16,Score_17,Score_18,Score_19,Score_20,Label
6141,3.264045,3.261103,2.884362,1.825447,1.651902,0.515872,0.467728,-0.07983,-0.505814,-0.508887,...,-0.550152,-1.126066,-1.181561,-1.400623,-1.819449,-1.954868,-2.0478,-2.313533,-2.340401,0
48172,12.325925,10.567476,9.258512,7.614379,6.612967,6.298271,6.11987,5.516294,5.458074,4.110182,...,2.989952,2.685587,2.55811,2.489893,1.388809,0.818246,0.480644,-0.302769,-0.345985,0
5393,3.057035,-0.386865,-0.953474,-1.002151,-1.104077,-1.340664,-1.388157,-1.398591,-1.554675,-1.690446,...,-2.060084,-2.113668,-2.178853,-2.211115,-2.51313,-2.590217,-2.729263,-2.765656,-2.821556,0
26318,2.43211,1.19086,0.604529,-0.07114,-0.07114,-0.266154,-0.444183,-1.102603,-1.649574,-1.87534,...,-1.883958,-1.931231,-2.109113,-2.63071,-2.916106,-2.94459,-2.947003,-2.960735,-3.020169,0
61079,5.325541,3.787784,3.281542,2.961138,2.871904,2.236572,1.589586,0.982262,0.836103,0.694267,...,0.267906,-0.097262,-0.248081,-0.303597,-0.990495,-1.028134,-1.103318,-1.352198,-1.479391,0


In [11]:
print(len(NLS_feature_df),len(non_NLS_feature_df))

1357 65590


In [12]:
# Downsample the non-nuclear proteins and combine with nuclear proteins
downsampled_non_nuclear_df = non_NLS_feature_df.sample(n=len(NLS_feature_df), random_state=50)  # Match the number of nuclear samples

feature_df = pd.concat([NLS_feature_df,downsampled_non_nuclear_df], axis=0)
shuffled_feature_df = feature_df.sample(frac=1, random_state=42).reset_index(drop=True)

## Binary classification

In [13]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import classification_report

In [14]:
# define training and further testing data
X = shuffled_feature_df.iloc[:,:feature_vec_length]
y = shuffled_feature_df.iloc[:,feature_vec_length]

further_test_df = non_NLS_feature_df.sample(n=1000, random_state=100)  


In [15]:
# Initialize and train classifier
classifier_dict = {'logistic_regression' : LogisticRegression(),
                   'support_vector_machine' : SVC(kernel='linear'),
                    'decision_tree' : DecisionTreeClassifier(random_state=43),
                    'random_forest' : RandomForestClassifier(n_estimators=100, random_state=43),
                    'gradient_boosting_machines' : GradientBoostingClassifier(random_state=43),
                    'naive_bayes' : GaussianNB(),
                    'MLP' : MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, activation='relu', solver='adam', random_state=43)}


In [71]:
# Binary classification 
def train_and_predict(X,y,further_test_df, feature_vec_length, classifier_name):
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)  # 60% training, 40% temp
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # Split temp into 50% test, 50% validation
    # Create futher non nuclear data set and make prediction
    X_further_test = further_test_df.iloc[:,:feature_vec_length]
    y_further_test = further_test_df.iloc[:,feature_vec_length]


    classifier = classifier_dict[classifier_name]
    classifier.fit(X_train, y_train)

    print(f"The classifier is {classifier_name}")
    # Predict on the validation set    
    val_predictions = classifier.predict(X_val)
    print("Validation Report:")
    print(classification_report(y_val, val_predictions))

    # Predict on the test set
    test_predictions = classifier.predict(X_test)

    # Evaluate the model on the test set
    print("Test Report:")
    print(classification_report(y_test, test_predictions))

    # Predict on the further dataset
    further_test_predictions = classifier.predict(X_further_test)

    # Evaluate the model on the test set
    print("Further Test Report:")
    print(classification_report(y_further_test, further_test_predictions))

    return classifier

        

In [72]:
# Save classifiers for further predictions
fitted_classifiers = dict()

# Binary classification using different classifiers
for classifier_name in classifier_dict:
    fitted_classifiers[classifier_name] = train_and_predict(X,y,further_test_df, feature_vec_length, classifier_name)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classifier is logistic_regression
Validation Report:
              precision    recall  f1-score   support

           0       0.69      0.73      0.71       261
           1       0.74      0.70      0.72       282

    accuracy                           0.71       543
   macro avg       0.71      0.71      0.71       543
weighted avg       0.71      0.71      0.71       543

Test Report:
              precision    recall  f1-score   support

           0       0.72      0.66      0.69       285
           1       0.66      0.72      0.69       258

    accuracy                           0.69       543
   macro avg       0.69      0.69      0.69       543
weighted avg       0.69      0.69      0.69       543

Further Test Report:
              precision    recall  f1-score   support

           0       1.00      0.72      0.84      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.72      1000
   macro avg       0.50      0.36    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classifier is random_forest
Validation Report:
              precision    recall  f1-score   support

           0       0.77      0.71      0.74       261
           1       0.75      0.80      0.78       282

    accuracy                           0.76       543
   macro avg       0.76      0.76      0.76       543
weighted avg       0.76      0.76      0.76       543

Test Report:
              precision    recall  f1-score   support

           0       0.81      0.67      0.73       285
           1       0.69      0.83      0.75       258

    accuracy                           0.74       543
   macro avg       0.75      0.75      0.74       543
weighted avg       0.75      0.74      0.74       543

Further Test Report:
              precision    recall  f1-score   support

           0       1.00      0.73      0.84      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.73      1000
   macro avg       0.50      0.36      0.42

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classifier is gradient_boosting_machines
Validation Report:
              precision    recall  f1-score   support

           0       0.75      0.68      0.72       261
           1       0.73      0.79      0.76       282

    accuracy                           0.74       543
   macro avg       0.74      0.74      0.74       543
weighted avg       0.74      0.74      0.74       543

Test Report:
              precision    recall  f1-score   support

           0       0.78      0.65      0.70       285
           1       0.67      0.79      0.73       258

    accuracy                           0.72       543
   macro avg       0.72      0.72      0.72       543
weighted avg       0.73      0.72      0.72       543

Further Test Report:
              precision    recall  f1-score   support

           0       1.00      0.69      0.82      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.69      1000
   macro avg       0.50      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classifier is MLP
Validation Report:
              precision    recall  f1-score   support

           0       0.75      0.64      0.69       261
           1       0.70      0.80      0.75       282

    accuracy                           0.72       543
   macro avg       0.73      0.72      0.72       543
weighted avg       0.73      0.72      0.72       543

Test Report:
              precision    recall  f1-score   support

           0       0.77      0.60      0.67       285
           1       0.64      0.80      0.71       258

    accuracy                           0.69       543
   macro avg       0.71      0.70      0.69       543
weighted avg       0.71      0.69      0.69       543

Further Test Report:
              precision    recall  f1-score   support

           0       1.00      0.65      0.79      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.65      1000
   macro avg       0.50      0.32      0.39      1000

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Further Test on a different dataset using random forest classifier

In [22]:
# Load a different dataset that contains proteins with different localization signals including nls
# Remove certain proteins
new_test_proteins_df = pd.read_csv('data/finalized_df_cleaned.csv')
new_test_proteins_df_cleaned = remove_short_sequences(remove_sequences(new_test_proteins_df, 'Sequence'), length_cutoff)

In [68]:
# Clean data
new_feature_df = create_feature_vectors(new_test_proteins_df_cleaned,nls_pssm,'Sequence',feature_vec_length,0)
new_feature_df['Label'] = new_test_proteins_df_cleaned['Types'].str.contains('NLS', na=False).astype(int).reset_index(drop=True)

print(new_feature_df.sample())

       Score_1   Score_2   Score_3   Score_4   Score_5   Score_6   Score_7  \
2822  11.82003  9.280058  9.130585  7.368783  6.653957  6.238807  5.527428   

      Score_8   Score_9  Score_10  ...  Score_12  Score_13  Score_14  \
2822   5.3856  4.470175  4.304822  ...  3.200207  3.084001  2.797248   

      Score_15  Score_16  Score_17  Score_18  Score_19  Score_20  Label  
2822  2.648286  2.641572  2.484987  2.278341  2.273378  2.197906      1  

[1 rows x 21 columns]


In [75]:
# Binary classification using random_forest classifier
X_new_test = new_feature_df.iloc[:,:feature_vec_length]
y_new_test = new_feature_df.iloc[:,feature_vec_length]


# random_forest classfier
classifier = fitted_classifiers['random_forest']

# Predict on the test set
y_new_prediction = classifier.predict(X_new_test)

# Evaluate the model on the test set
print("New Test Report:")
print(classification_report(y_new_test, y_new_prediction))

New Test Report:
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      1467
           1       0.87      0.92      0.90      1484

    accuracy                           0.89      2951
   macro avg       0.89      0.89      0.89      2951
weighted avg       0.89      0.89      0.89      2951

