## Turn nuclear and non nuclear protein sequences into feature vectors of size n=20 using pssm

In [1]:
# from collections import defaultdict
import pandas as pd
import numpy as np
import pickle
import gzip

import sys
sys.path.insert(0, './src')
from pssm_scoring import *
from sequence_removal import *
from pssm_feature import *

In [2]:
# Load the nuclear pssm from the file
with open('data/nls_pssm.pkl', 'rb') as f:
    nls_pssm = pickle.load(f)

In [3]:
# Load nuclear and non-nuclear protein sequences
nuclear_protein_df = pd.read_csv('data/data_NLS.csv')
non_nuclear_proteins_df = pd.read_csv('data/data_non_nuclear_proteins.csv')

In [4]:
# Define the length of the feature vectors
feature_vec_length = 20
length_cutoff = feature_vec_length + 20

In [5]:
# Clean data
nuclear_protein_df_cleaned = remove_sequences(nuclear_protein_df,'Sequence_y')
non_nuclear_proteins_df_cleaned = remove_short_sequences(remove_sequences(non_nuclear_proteins_df, 'Sequence'),length_cutoff)

In [6]:
# Create nuclear and non-nuclear feature vectors with labels
NLS_feature_df = create_feature_vectors(nuclear_protein_df_cleaned,nls_pssm, 'Sequence_y',feature_vec_length, 1)
non_NLS_feature_df = create_feature_vectors(non_nuclear_proteins_df_cleaned,nls_pssm, 'Sequence', feature_vec_length, 0)

In [7]:
NLS_feature_df.sample(5)

Unnamed: 0,Score_1,Score_2,Score_3,Score_4,Score_5,Score_6,Score_7,Score_8,Score_9,Score_10,...,Score_12,Score_13,Score_14,Score_15,Score_16,Score_17,Score_18,Score_19,Score_20,Label
1115,11.42607,7.875871,7.800985,7.195592,6.410354,5.919086,5.527107,5.491777,5.427918,5.098845,...,4.711524,3.634714,3.266969,3.040648,2.884757,2.788844,2.78684,2.525939,2.303292,1
409,7.687488,5.929789,4.189566,4.17198,3.686165,3.46762,3.043916,2.60461,2.46307,1.843196,...,1.620038,1.45576,1.279249,0.915139,0.883927,0.829598,0.514552,0.084073,-0.214935,1
693,11.253031,9.775498,5.815189,5.435114,4.003334,3.280595,3.102313,2.830429,2.515435,2.469193,...,1.247197,1.22644,0.825012,0.588262,0.535348,0.324697,0.06347,-0.222296,-0.565046,1
42,16.32733,14.889406,14.494901,14.383837,14.233326,13.259383,11.978874,11.730679,11.23161,10.551913,...,8.836652,8.373981,6.731974,6.382881,6.02738,5.83215,5.428034,5.150562,5.145815,1
364,3.107134,2.818884,2.102334,0.981704,-0.368037,-0.54976,-0.577193,-0.678083,-0.678674,-0.896989,...,-1.085971,-1.447215,-1.48206,-1.513124,-1.709867,-1.82437,-2.398071,-2.440276,-2.575475,1


In [8]:
non_NLS_feature_df.sample(5)

Unnamed: 0,Score_1,Score_2,Score_3,Score_4,Score_5,Score_6,Score_7,Score_8,Score_9,Score_10,...,Score_12,Score_13,Score_14,Score_15,Score_16,Score_17,Score_18,Score_19,Score_20,Label
34646,3.129185,3.110964,2.239217,1.612866,1.538147,0.875427,0.120678,-0.316506,-1.074622,-1.516077,...,-1.603001,-1.641082,-1.746533,-1.877511,-1.941735,-1.949277,-2.146586,-2.201319,-2.20777,0
18798,2.232332,0.696551,-0.219857,-0.467209,-0.736757,-1.96911,-2.200509,-2.351579,-2.89494,-3.117262,...,-3.664215,-3.818806,-3.853388,-5.001136,-5.056853,-5.074793,-5.162594,-5.293609,-5.313851,0
19983,-2.012743,-3.219804,-3.373306,-3.491336,-3.842223,-4.068383,-4.42967,-4.457203,-4.535611,-4.697738,...,-5.292454,-5.649302,-5.702418,-5.730498,-5.783978,-6.174837,-6.467794,-6.553637,-6.62017,0
43810,-0.044492,-1.474638,-2.501108,-2.652296,-3.162738,-3.507363,-3.97186,-4.498648,-4.751318,-5.245487,...,-6.062774,-6.368373,-6.644309,-7.091137,-7.323839,-7.508155,-7.765316,-7.896367,-7.912983,0
1199,1.989124,0.862909,0.057283,-1.093435,-1.982563,-2.074936,-2.165924,-2.204551,-2.2685,-2.291738,...,-2.965136,-2.980721,-3.469015,-3.736507,-3.804231,-3.80482,-4.129289,-4.196879,-4.368314,0


In [9]:
print(len(NLS_feature_df),len(non_NLS_feature_df))

1357 65590


In [10]:
# Downsample the non-nuclear proteins and combine with nuclear proteins
downsampled_non_nuclear_df = non_NLS_feature_df.sample(n=len(NLS_feature_df), random_state=50)  # Match the number of nuclear samples

feature_df = pd.concat([NLS_feature_df,downsampled_non_nuclear_df], axis=0)
shuffled_feature_df = feature_df.sample(frac=1, random_state=42).reset_index(drop=True)

## Binary classification

In [11]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import classification_report

In [12]:
# define training and further testing data
X = shuffled_feature_df.iloc[:,:feature_vec_length]
y = shuffled_feature_df.iloc[:,feature_vec_length]

further_test_df = non_NLS_feature_df.sample(n=1000, random_state=100)  


In [13]:
# Initialize and train classifier
classifier_dict = {'logistic_regression' : LogisticRegression(),
                   'support_vector_machine' : SVC(kernel='linear'),
                    'decision_tree' : DecisionTreeClassifier(random_state=43),
                    'random_forest' : RandomForestClassifier(n_estimators=100, random_state=43),
                    'gradient_boosting_machines' : GradientBoostingClassifier(random_state=43),
                    'naive_bayes' : GaussianNB(),
                    'MLP' : MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, activation='relu', solver='adam', random_state=43)}


In [14]:
# Binary classification 
def train_and_predict(X,y,further_test_df, feature_vec_length, classifier_name):
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)  # 60% training, 40% temp
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # Split temp into 50% test, 50% validation
    # Create futher non nuclear data set and make prediction
    X_further_test = further_test_df.iloc[:,:feature_vec_length]
    y_further_test = further_test_df.iloc[:,feature_vec_length]


    classifier = classifier_dict[classifier_name]
    classifier.fit(X_train, y_train)

    print(f"The classifier is {classifier_name}")
    # Predict on the validation set    
    val_predictions = classifier.predict(X_val)
    print("Validation Report:")
    print(classification_report(y_val, val_predictions))

    # Predict on the test set
    test_predictions = classifier.predict(X_test)

    # Evaluate the model on the test set
    print("Test Report:")
    print(classification_report(y_test, test_predictions))

    # Predict on the further dataset
    further_test_predictions = classifier.predict(X_further_test)

    # Evaluate the model on the test set
    print("Further Test Report:")
    print(classification_report(y_further_test, further_test_predictions))

    return classifier

        

In [15]:
# Save classifiers for further predictions
fitted_classifiers = dict()

# Binary classification using different classifiers
for classifier_name in classifier_dict:
    fitted_classifiers[classifier_name] = train_and_predict(X,y,further_test_df, feature_vec_length, classifier_name)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classifier is logistic_regression
Validation Report:
              precision    recall  f1-score   support

           0       0.69      0.73      0.71       261
           1       0.74      0.70      0.72       282

    accuracy                           0.71       543
   macro avg       0.71      0.71      0.71       543
weighted avg       0.71      0.71      0.71       543

Test Report:
              precision    recall  f1-score   support

           0       0.72      0.66      0.69       285
           1       0.66      0.72      0.69       258

    accuracy                           0.69       543
   macro avg       0.69      0.69      0.69       543
weighted avg       0.69      0.69      0.69       543

Further Test Report:
              precision    recall  f1-score   support

           0       1.00      0.72      0.84      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.72      1000
   macro avg       0.50      0.36    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classifier is random_forest
Validation Report:
              precision    recall  f1-score   support

           0       0.77      0.71      0.74       261
           1       0.75      0.80      0.78       282

    accuracy                           0.76       543
   macro avg       0.76      0.76      0.76       543
weighted avg       0.76      0.76      0.76       543

Test Report:
              precision    recall  f1-score   support

           0       0.81      0.67      0.73       285
           1       0.69      0.83      0.75       258

    accuracy                           0.74       543
   macro avg       0.75      0.75      0.74       543
weighted avg       0.75      0.74      0.74       543

Further Test Report:
              precision    recall  f1-score   support

           0       1.00      0.73      0.84      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.73      1000
   macro avg       0.50      0.36      0.42

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classifier is gradient_boosting_machines
Validation Report:
              precision    recall  f1-score   support

           0       0.75      0.68      0.72       261
           1       0.73      0.79      0.76       282

    accuracy                           0.74       543
   macro avg       0.74      0.74      0.74       543
weighted avg       0.74      0.74      0.74       543

Test Report:
              precision    recall  f1-score   support

           0       0.78      0.65      0.70       285
           1       0.67      0.79      0.73       258

    accuracy                           0.72       543
   macro avg       0.72      0.72      0.72       543
weighted avg       0.73      0.72      0.72       543

Further Test Report:
              precision    recall  f1-score   support

           0       1.00      0.69      0.82      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.69      1000
   macro avg       0.50      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classifier is MLP
Validation Report:
              precision    recall  f1-score   support

           0       0.75      0.64      0.69       261
           1       0.70      0.80      0.75       282

    accuracy                           0.72       543
   macro avg       0.73      0.72      0.72       543
weighted avg       0.73      0.72      0.72       543

Test Report:
              precision    recall  f1-score   support

           0       0.77      0.60      0.67       285
           1       0.64      0.80      0.71       258

    accuracy                           0.69       543
   macro avg       0.71      0.70      0.69       543
weighted avg       0.71      0.69      0.69       543

Further Test Report:
              precision    recall  f1-score   support

           0       1.00      0.65      0.79      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.65      1000
   macro avg       0.50      0.32      0.39      1000

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
with gzip.open('trained_classifiers/nls_random_forest_classifier.pkl.gz' , 'wb') as f:
    pickle.dump(fitted_classifiers['random_forest'], f)

## Further Test on a different dataset using random forest classifier

In [17]:
# Load a different dataset that contains proteins with different localization signals including nls
# Remove certain proteins
new_test_proteins_df = pd.read_csv('data/finalized_df_cleaned.csv')
new_test_proteins_df_cleaned = remove_short_sequences(remove_sequences(new_test_proteins_df, 'Sequence'), length_cutoff)

In [18]:
# Clean data
new_feature_df = create_feature_vectors(new_test_proteins_df_cleaned,nls_pssm,'Sequence',feature_vec_length,0)
new_feature_df['Label'] = new_test_proteins_df_cleaned['Types'].str.contains('NLS', na=False).astype(int).reset_index(drop=True)

print(new_feature_df.sample())

      Score_1   Score_2   Score_3  Score_4   Score_5   Score_6   Score_7  \
644  4.591737  1.642749  0.954031  0.73697  0.335538 -0.220655 -0.916818   

      Score_8   Score_9  Score_10  ...  Score_12  Score_13  Score_14  \
644 -1.082439 -1.170015 -1.339304  ... -1.644007  -2.04879 -2.117718   

     Score_15  Score_16  Score_17  Score_18  Score_19  Score_20  Label  
644 -2.674177 -2.742957 -3.080559 -3.131519 -3.306662 -3.407368      0  

[1 rows x 21 columns]


In [19]:
# Binary classification using random_forest classifier
X_new_test = new_feature_df.iloc[:,:feature_vec_length]
y_new_test = new_feature_df.iloc[:,feature_vec_length]


# random_forest classfier
classifier = fitted_classifiers['random_forest']

# Predict on the test set
y_new_prediction = classifier.predict(X_new_test)

# Evaluate the model on the test set
print("New Test Report:")
print(classification_report(y_new_test, y_new_prediction))

New Test Report:
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      1467
           1       0.87      0.92      0.90      1484

    accuracy                           0.89      2951
   macro avg       0.89      0.89      0.89      2951
weighted avg       0.89      0.89      0.89      2951

