## Turn nuclear and non nuclear protein sequences into feature vectors of size n=20 using pssm

In [20]:
from collections import defaultdict
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle

import sys
sys.path.insert(0, './src')
from pssm_scoring import *

In [21]:
# Find the top n scores
def compute_top_n_scores(array, length=20):
    top_n = np.partition(array, -length)[-length:]
    top_n_sorted = np.sort(top_n)[::-1]  # Sort and reverse to show largest first
    return top_n_sorted

In [22]:
# Remove sequences that contain unknown aa
standard_amino_acids = 'ACDEFGHIKLMNPQRSTVWY'  # 20 standard amino acids
def remove_sequences(df, column_name):
    remove_index = []
    for i,seq in enumerate(df[column_name]):
        if any(aa not in standard_amino_acids for aa in seq):
            remove_index.append(i)

    return df.drop(df.index[remove_index])

def remove_short_sequences(df, length_cutoff):
    return df[df['Length'] >= length_cutoff]

In [23]:
# Create a DataFrame from the list of top n scores and label
def create_feature_vectors(df, pssm, column_name, feature_vec_length, label ):
    top_n_scores_list = [compute_top_n_scores(pssm.calculate(protein_seq),feature_vec_length) for protein_seq in df[column_name]]
    scores_df = pd.DataFrame(top_n_scores_list, columns=[f'Score_{i+1}' for i in range(feature_vec_length)])
    scores_df['Label'] = label
    return scores_df


In [24]:
# Load the nuclear pssm from the file
with open('data/nls_pssm.pkl', 'rb') as f:
    nls_pssm = pickle.load(f)

In [5]:
# Load nuclear and non-nuclear protein sequences
nuclear_protein_df = pd.read_csv('data/data_NLS.csv')
non_nuclear_proteins_df = pd.read_csv('data/data_non_nuclear_proteins.csv')

In [6]:
# Define the length of the feature vectors
feature_vec_length = 20
length_cutoff = feature_vec_length + 20

In [7]:
# Clean data
nuclear_protein_df_cleaned = remove_sequences(nuclear_protein_df,'Sequence_y')
non_nuclear_proteins_df_cleaned = remove_short_sequences(remove_sequences(non_nuclear_proteins_df, 'Sequence'),length_cutoff)

In [25]:
# Create nuclear and non-nuclear feature vectors with labels
NLS_feature_df = create_feature_vectors(nuclear_protein_df_cleaned,nls_pssm, 'Sequence_y',feature_vec_length, 1)
non_NLS_feature_df = create_feature_vectors(non_nuclear_proteins_df_cleaned,nls_pssm, 'Sequence', feature_vec_length, 0)

In [26]:
NLS_feature_df.sample(5)

Unnamed: 0,Score_1,Score_2,Score_3,Score_4,Score_5,Score_6,Score_7,Score_8,Score_9,Score_10,...,Score_12,Score_13,Score_14,Score_15,Score_16,Score_17,Score_18,Score_19,Score_20,Label
1170,5.141886,4.724116,3.683609,3.546735,2.712349,2.26228,2.219584,1.77003,1.181785,1.135787,...,0.747513,0.521171,0.234614,0.108036,0.089892,-0.170862,-0.392231,-0.749415,-0.807705,1
118,4.058798,1.69356,1.621346,1.299358,1.095644,0.844506,0.8215,0.713321,0.495712,-0.36681,...,-0.995496,-1.196517,-1.310946,-1.346947,-1.372618,-1.468269,-1.54827,-1.801031,-1.844818,1
96,9.118779,6.277234,6.016829,5.968548,5.509716,5.458274,4.329693,4.247753,4.035923,3.515393,...,3.037692,2.706254,2.5577,2.546307,2.505921,2.198168,2.046007,1.845421,1.8292,1
1000,11.004908,9.298845,8.020465,7.322304,5.91903,5.633911,5.140097,4.920547,4.852774,4.217218,...,3.759845,3.251276,2.708628,2.40648,2.201303,1.982353,1.918615,1.916733,1.903443,1
899,24.906069,23.218674,22.737,22.333403,18.150452,17.886356,17.237713,15.608488,14.106956,14.028743,...,13.140636,12.696486,12.556838,9.502391,9.047017,8.617747,8.589288,6.926116,6.421696,1


In [27]:
non_NLS_feature_df.sample(5)

Unnamed: 0,Score_1,Score_2,Score_3,Score_4,Score_5,Score_6,Score_7,Score_8,Score_9,Score_10,...,Score_12,Score_13,Score_14,Score_15,Score_16,Score_17,Score_18,Score_19,Score_20,Label
48421,7.557083,6.647229,5.341608,5.119096,4.808097,4.502225,3.100829,3.037402,2.926094,2.333342,...,1.78218,1.739136,1.282862,1.043963,1.038223,0.876496,0.739709,0.703845,0.416444,0
60415,1.679797,-1.127315,-1.696114,-2.168103,-2.176711,-2.530419,-2.65095,-2.738926,-2.870982,-3.215358,...,-3.5237,-3.791667,-3.84198,-3.857994,-4.071139,-4.082572,-4.095625,-4.10074,-4.364736,0
33898,1.113741,0.625018,0.500162,0.234711,-0.197442,-0.417494,-0.451419,-0.62839,-1.510631,-1.655857,...,-2.012618,-2.378323,-2.555069,-2.573125,-2.627829,-2.725794,-2.778367,-2.857371,-2.917282,0
23978,10.813778,8.499226,8.227059,6.469109,5.334997,5.008693,4.558183,3.993275,3.471257,3.449827,...,2.835068,2.704193,2.618517,2.59441,2.519485,2.493372,2.471543,2.318758,2.287037,0
2086,-3.281637,-3.783387,-3.82087,-4.281095,-4.782397,-4.866241,-5.004274,-5.531252,-5.750714,-5.792025,...,-6.427528,-6.855747,-7.284554,-7.872125,-8.144029,-9.092268,-9.120149,-9.344419,-9.404174,0


In [28]:
print(len(NLS_feature_df),len(non_NLS_feature_df))

1357 65590


In [29]:
# Downsample the non-nuclear proteins and combine with nuclear proteins
downsampled_non_nuclear_df = non_NLS_feature_df.sample(n=len(NLS_feature_df), random_state=42)  # Match the number of nuclear samples

feature_df = pd.concat([NLS_feature_df,downsampled_non_nuclear_df], axis=0)
shuffled_feature_df = feature_df.sample(frac=1, random_state=42).reset_index(drop=True)

## Binary classification

In [30]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import classification_report

In [31]:
# define training and further testing data
X = shuffled_feature_df.iloc[:,:feature_vec_length]
y = shuffled_feature_df.iloc[:,feature_vec_length]

further_test_df = non_NLS_feature_df.sample(n=1000, random_state=100)  


In [32]:
# Initialize and train classifier
classifier_dict = {'logistic_regression' : LogisticRegression(),
                   'support_vector_machine' : SVC(kernel='linear'),
                    'decision_tree' : DecisionTreeClassifier(random_state=43),
                    'random_forest' : RandomForestClassifier(n_estimators=100, random_state=43),
                    'gradient_boosting_machines' : GradientBoostingClassifier(random_state=43),
                    'naive_bayes' : GaussianNB(),
                    'MLP' : MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, activation='relu', solver='adam', random_state=43)}


In [33]:
# Binary classification 
def train_and_predict(X,y,further_test_df, feature_vec_length, classifier_name):
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)  # 60% training, 40% temp
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # Split temp into 50% test, 50% validation
    # Create futher non nuclear data set and make prediction
    X_further_test = further_test_df.iloc[:,:feature_vec_length]
    y_further_test = further_test_df.iloc[:,feature_vec_length]


    classifier = classifier_dict[classifier_name]
    classifier.fit(X_train, y_train)

    print(f"The classifier is {classifier_name}")
    # Predict on the validation set    
    val_predictions = classifier.predict(X_val)
    print("Validation Report:")
    print(classification_report(y_val, val_predictions))

    # Predict on the test set
    test_predictions = classifier.predict(X_test)

    # Evaluate the model on the test set
    print("Test Report:")
    print(classification_report(y_test, test_predictions))

    # Predict on the further dataset
    further_test_predictions = classifier.predict(X_further_test)

    # Evaluate the model on the test set
    print("Further Test Report:")
    print(classification_report(y_further_test, further_test_predictions))
        

In [34]:
# Binary classification using different classifiers
for classifier_name in classifier_dict:
    train_and_predict(X,y,further_test_df, feature_vec_length, classifier_name)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classifier is logistic_regression
Validation Report:
              precision    recall  f1-score   support

           0       0.71      0.74      0.72       261
           1       0.75      0.72      0.74       282

    accuracy                           0.73       543
   macro avg       0.73      0.73      0.73       543
weighted avg       0.73      0.73      0.73       543

Test Report:
              precision    recall  f1-score   support

           0       0.75      0.71      0.73       285
           1       0.70      0.74      0.72       258

    accuracy                           0.72       543
   macro avg       0.72      0.72      0.72       543
weighted avg       0.73      0.72      0.72       543

Further Test Report:
              precision    recall  f1-score   support

           0       1.00      0.71      0.83      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.71      1000
   macro avg       0.50      0.36    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classifier is random_forest
Validation Report:
              precision    recall  f1-score   support

           0       0.78      0.72      0.75       261
           1       0.76      0.81      0.78       282

    accuracy                           0.77       543
   macro avg       0.77      0.77      0.77       543
weighted avg       0.77      0.77      0.77       543

Test Report:
              precision    recall  f1-score   support

           0       0.82      0.69      0.75       285
           1       0.71      0.83      0.77       258

    accuracy                           0.76       543
   macro avg       0.77      0.76      0.76       543
weighted avg       0.77      0.76      0.76       543

Further Test Report:
              precision    recall  f1-score   support

           0       1.00      0.71      0.83      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.71      1000
   macro avg       0.50      0.36      0.42

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classifier is gradient_boosting_machines
Validation Report:
              precision    recall  f1-score   support

           0       0.76      0.68      0.72       261
           1       0.73      0.80      0.76       282

    accuracy                           0.74       543
   macro avg       0.75      0.74      0.74       543
weighted avg       0.75      0.74      0.74       543

Test Report:
              precision    recall  f1-score   support

           0       0.79      0.67      0.73       285
           1       0.69      0.81      0.74       258

    accuracy                           0.74       543
   macro avg       0.74      0.74      0.74       543
weighted avg       0.74      0.74      0.74       543

Further Test Report:
              precision    recall  f1-score   support

           0       1.00      0.67      0.81      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.67      1000
   macro avg       0.50      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classifier is MLP
Validation Report:
              precision    recall  f1-score   support

           0       0.75      0.70      0.72       261
           1       0.74      0.78      0.76       282

    accuracy                           0.74       543
   macro avg       0.74      0.74      0.74       543
weighted avg       0.74      0.74      0.74       543

Test Report:
              precision    recall  f1-score   support

           0       0.79      0.66      0.72       285
           1       0.68      0.81      0.74       258

    accuracy                           0.73       543
   macro avg       0.74      0.73      0.73       543
weighted avg       0.74      0.73      0.73       543

Further Test Report:
              precision    recall  f1-score   support

           0       1.00      0.67      0.80      1000
           1       0.00      0.00      0.00         0

    accuracy                           0.67      1000
   macro avg       0.50      0.33      0.40      1000

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Test on a different dataset

In [36]:
# Load a different dataset that contains proteins with different localization signals including nls
new_df = pd.read_csv('data/finalized_df_cleaned.csv')
print(new_df.head())

      ACC                                       AnnotEncoded  \
0  O75439  3333333333333333333333333333333333333333333330...   
1  Q2TBK2  3333333333333333333333333333333333333333333333...   
2  Q5VY80  0000000000000000000000000000000000000000000000...   
3  Q9BZM6  0000000000000000000000000000000000000000000000...   
4  O75489  3333333333333333333333333333333333330000000000...   

                                            Sequence Types  Length  
0  MAAAAARVVLSSAARRRLWGFSESLLIRGAAGRSLYFGENRLRSTQ...    MT     489  
1  MAAAAFAVPRGVQLRVLTERLLRGGVRELLRPRLSGSTPGSERDFS...    MT     268  
2  MAAAAIPALLLCLPLLFLLFGWSRARRDDPHSLCYDITVIPKFRPG...   GPI     246  
3  MAAAASPAFLLCLPLLHLLSGWSRAGWVDTHCLCYDFIITPKSRPE...   GPI     244  
4  MAAAAVARLWWRGILGASALTRGTGRPSVLLLPVRRESAGADTRPT...    MT     264  


In [39]:
# Clean data
cleaned_new_df = remove_sequences(new_df, 'Sequence')

In [188]:
# def create_scores(df, column_name):

#     sequence_scores_list = [pssm.calculate(protein_seq) for protein_seq in df[column_name]]
#     return sequence_scores_list

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(categories =[list_standard_amino_acids], sparse_output=False)  # Using sparse=False to get a dense array

def create_modified_one_hot_encoding(df, column_name):
    modified_one_hot_encoded_list = []

    for protein_seq in df[column_name]:
        data = np.array(list(protein_seq)).reshape(-1,1)


        one_hot_encoded = encoder.fit_transform(data)

        # Obtain PSSM scores, and pad or truncate as necessary
        pssm_scores = pssm.calculate(protein_seq)
        padded_pssm_scores = np.append(pssm_scores, np.zeros(17))
        
        # Element-wise multiplication of one-hot encoded matrix with PSSM scores
        modified_one_hot_encoded = one_hot_encoded * padded_pssm_scores[:len(one_hot_encoded)][:, None]


        # Ensure each matrix has exactly 1000 rows
        target_length = 1000
        current_length = modified_one_hot_encoded.shape[0]
        
        if current_length < target_length:
            # Pad with zeros if fewer than 1000 rows
            padding = np.zeros((target_length - current_length, modified_one_hot_encoded.shape[1]))
            modified_one_hot_encoded = np.vstack([modified_one_hot_encoded, padding])
        else:
            # Truncate if more than 1000 rows
            modified_one_hot_encoded = modified_one_hot_encoded[:target_length]
        # Append the final modified encoding to the list
        modified_one_hot_encoded_list.append(modified_one_hot_encoded)

    return modified_one_hot_encoded_list


def create_one_hot_encoding(df, column_name):
    one_hot_encoded_list = []

    for protein_seq in df[column_name]:
        data = np.array(list(protein_seq)).reshape(-1,1)


        one_hot_encoded = encoder.fit_transform(data)


        # Ensure each matrix has exactly 1000 rows
        target_length = 1000
        current_length = one_hot_encoded.shape[0]
        
        if current_length < target_length:
            # Pad with zeros if fewer than 1000 rows
            padding = np.zeros((target_length - current_length, one_hot_encoded.shape[1]))
            one_hot_encoded = np.vstack([one_hot_encoded, padding])
        else:
            # Truncate if more than 1000 rows
            one_hot_encoded = one_hot_encoded[:target_length]
        # Append the final modified encoding to the list
        one_hot_encoded_list.append(one_hot_encoded)

    return one_hot_encoded_list


In [189]:
modified_one_hot_encoding_list = create_modified_one_hot_encoding(new_df_cleaned, 'Sequence')
one_hot_encoding_list = create_one_hot_encoding(new_df_cleaned, 'Sequence')

In [161]:
encoded_new_df = new_df_cleaned [['ACC']].copy()
encoded_new_df['Encoding'] = modified_one_hot_encoding_list
encoded_new_df['Label'] = new_df_cleaned['Types'].str.contains('NLS', na=False).astype(int)


encoded_new_df.head()

Unnamed: 0,ACC,Encoding,Label
0,O75439,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0
1,Q2TBK2,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0
2,Q5VY80,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0
3,Q9BZM6,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0
4,O75489,"[[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...",0


In [163]:
import gzip
import pickle

with gzip.open('cnn_input_data.pkl.gz', 'wb') as f:
    pickle.dump(encoded_new_df, f)


In [196]:
naive_encoded_new_df = pd.DataFrame({'Encoding': one_hot_encoding_list})
naive_encoded_new_df['Label'] = new_df_cleaned['Types'].str.contains('NLS', na=False).astype(int)


with gzip.open('naive_encoded_cnn_input_data.pkl.gz', 'wb') as f:
    pickle.dump(encoded_new_df, f)

# Create testing datasets

In [165]:
# Load nuclear and non-nuclear protein sequences
nuclear_protein_df = pd.read_csv('data_NLS.csv')
non_nuclear_proteins_df = pd.read_csv('data_non_nuclear_proteins.csv')

In [173]:
# Clean data
nuclear_protein_df_cleaned = remove_sequences(nuclear_protein_df,'Sequence_y')
non_nuclear_proteins_df_cleaned = remove_short_sequences(remove_sequences(non_nuclear_proteins_df, 'Sequence'),length_cutoff)

In [174]:
nuclear_protein_df_cleaned.head()

Unnamed: 0,ID,Begin,End,Sequence_x,Length,Sequence_y
0,Q14738,548,565,KRTVETEAVQMLKDIKKE,18,MPYKLKKEKEPPKVAKCTAKPSSSGKDGGGENTEEAQPQPQPQPQP...
1,Q13362,416,422,KLKEKLK,7,MLTCNKAGSRMVVDAANSNGPFQPVVLLHIRDVPPADQEKLFIQKL...
2,Q9NRA8,195,211,RREFGDSKRVFGERRRN,17,MDRRSMGETESGDAFLDLKKPPASKCPHRYTKEELLDIKELPHSKQ...
3,P42684,658,660,KKR,3,MGQQVGRVGEAPGLQQPQPRGIRGSSAARPSGRRRDPAGRTTETGF...
4,Q4JIM5,659,661,KKR,3,MGQQVGRVGEAPGLQQPQPRGIRGSSAARPSGRRRDPAGRTADAGF...


In [203]:
# Downsample the non-nuclear proteins 

downsampled_non_nuclear_df = non_nuclear_proteins_df_cleaned.sample(n=len(nuclear_protein_df_cleaned), random_state= 30)  # Match the number of nuclear samples


nls_encoding_list = create_modified_one_hot_encoding(nuclear_protein_df_cleaned, 'Sequence_y')
non_nls_encoding_list = create_modified_one_hot_encoding(downsampled_non_nuclear_df, 'Sequence')

In [185]:
print(len(nls_encoding_list),len(non_nls_encoding_list))

1357 1357


In [186]:
encoded_nls_df = pd.DataFrame({'Encoding' : nls_encoding_list})
encoded_nls_df['Label'] = 1

encoded_non_nls_df = pd.DataFrame({'Encoding' : non_nls_encoding_list})
encoded_non_nls_df['Label'] = 0

encoded_all_df = pd.concat([encoded_nls_df, encoded_non_nls_df], axis=0)
shuffled_encoded_all_df = encoded_all_df.sample(frac=1, random_state=50).reset_index(drop=True)

print(shuffled_encoded_all_df.head(10))


                                            Encoding  Label
0  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
1  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
2  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
3  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
4  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
5  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...      0
6  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
7  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
8  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
9  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0


In [187]:
import gzip
import pickle

with gzip.open('testing_data.pkl.gz', 'wb') as f:
    pickle.dump(shuffled_encoded_all_df , f)


In [204]:
nls_naive_encoding_list = create_one_hot_encoding(nuclear_protein_df_cleaned, 'Sequence_y')
non_nls_naive_encoding_list = create_one_hot_encoding(downsampled_non_nuclear_df, 'Sequence')

naive_encoded_nls_df = pd.DataFrame({'Encoding' : nls_naive_encoding_list})
naive_encoded_nls_df['Label'] = 1

naive_encoded_non_nls_df = pd.DataFrame({'Encoding' : non_nls_naive_encoding_list})
naive_encoded_non_nls_df['Label'] = 0

naive_encoded_all_df = pd.concat([naive_encoded_nls_df, naive_encoded_non_nls_df], axis=0)
shuffled_naive_encoded_all_df = naive_encoded_all_df.sample(frac=1, random_state=50).reset_index(drop=True)

print(shuffled_naive_encoded_all_df.head(10))



                                            Encoding  Label
0  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...      0
1  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...      1
2  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...      0
3  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...      0
4  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...      1
5  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...      0
6  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...      1
7  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...      1
8  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...      0
9  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...      0


In [205]:
with gzip.open('testing_data_naive_encoded.pkl.gz', 'wb') as f:
    pickle.dump(shuffled_naive_encoded_all_df , f)