In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the raw BioDataset sheet and the two labelled interaction tables
bio_dataset = pd.read_excel('BioDataset.xlsx', sheet_name='vzs3zr1726592551_s_list.mitab_p')
labeled_interactions = pd.read_csv('labeled_interactions.csv')
labeled_interactions_type = pd.read_csv('labeled_interactionsTYPE.csv')

# Inner-join the two labelled tables on the columns they have in common
merge_keys = ['protein_1', 'protein_2', 'sequence_1', 'sequence_2', 'label']
merged = labeled_interactions.merge(labeled_interactions_type, on=merge_keys)

# Keep only the two BioDataset columns used as fallback sequences, under readable names
bio_dataset_filtered = bio_dataset[['Unnamed: 21', 'Unnamed: 22']].rename(
    columns={'Unnamed: 21': 'sequence_1_bio', 'Unnamed: 22': 'sequence_2_bio'}
)

# Fill missing sequences in `merged` from the BioDataset columns.
# NOTE(review): combine_first aligns on the row index, not on protein IDs, so this
# presumably relies on both frames listing interactions in the same row order — confirm.
fallback_columns = (
    ('sequence_1', 'sequence_1_bio'),
    ('sequence_2', 'sequence_2_bio'),
)
for target_col, bio_col in fallback_columns:
    merged[target_col] = merged[target_col].combine_first(bio_dataset_filtered[bio_col])

# Report how many rows fall into each binary label
label_distribution = merged['label'].value_counts()
print("Class Distribution:\n", label_distribution)

# Integer-encode interaction types in order of first appearance, then one-hot them
interaction_types = merged['interaction_type'].unique()
interaction_type_dict = dict(zip(interaction_types, range(len(interaction_types))))
merged['interaction_type_encoded'] = merged['interaction_type'].map(interaction_type_dict)
y_type = to_categorical(merged['interaction_type_encoded'])

In [None]:
# Tokenize and pad sequences.
# Protein sequences are unbroken strings of residue letters, so the tokenizer has to
# work per character. The default word-level mode splits on whitespace and would turn
# each whole sequence into a single token, leaving the model nothing to learn from.
tokenizer = Tokenizer(char_level=True)
all_sequences = merged['sequence_1'].tolist() + merged['sequence_2'].tolist()
tokenizer.fit_on_texts(all_sequences)

# Residues kept per protein. pad_sequences pads and truncates at the front by default,
# so sequences longer than this keep only their last `max_seq_length` residues.
max_seq_length = 500  # Adjust based on input data
X_seq1 = pad_sequences(tokenizer.texts_to_sequences(merged['sequence_1']), maxlen=max_seq_length)
X_seq2 = pad_sequences(tokenizer.texts_to_sequences(merged['sequence_2']), maxlen=max_seq_length)

# Combine the padded sequences side by side: each row is [protein 1 | protein 2]
X = np.hstack((X_seq1, X_seq2))

# Binary interaction labels
y_binary = merged['label']

# Train-test split. Features and both label sets are split in ONE call so the three
# stay row-aligned by construction rather than by two calls happening to share a seed.
(X_train, X_test,
 y_train_binary, y_test_binary,
 y_train_type, y_test_type) = train_test_split(
    X, y_binary, y_type, test_size=0.2, random_state=42
)

# Check for overlapping sequences (data leakage): identical encoded protein pairs
# appearing on both sides of the split
train_sequences = set(map(tuple, X_train))
test_sequences = set(map(tuple, X_test))
overlap = train_sequences & test_sequences
print(f'Number of overlapping sequences between train and test sets: {len(overlap)}')

In [None]:
# Define input: one row holds both padded sequences concatenated end to end
input_layer = Input(shape=(2*max_seq_length,))

# Embedding + stacked LSTM trunk shared by both output heads.
# The vocabulary size is len(word_index) + 1 because index 0 is reserved for padding.
# `input_length` is intentionally not passed: the Input layer already fixes the
# sequence length, and the argument is deprecated in Keras 3.
x = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128)(input_layer)
x = LSTM(64, return_sequences=True)(x)  # keep per-timestep outputs for the next LSTM
x = Dropout(0.2)(x)
x = LSTM(32)(x)  # collapse the sequence to a single vector
x = Dense(16, activation='relu')(x)


# Output for binary classification (interaction vs. no interaction)
output_binary = Dense(1, activation='sigmoid', name='binary_output')(x)

# Output for interaction type classification (one unit per distinct interaction type)
output_multiclass = Dense(len(interaction_types), activation='softmax', name='type_output')(x)

# Define model: single input, two named outputs (names are used as keys in compile/fit)
model = Model(inputs=input_layer, outputs=[output_binary, output_multiclass])


In [None]:
# Compile model: one loss and one accuracy metric per named output head
model.compile(optimizer='adam',
              loss={'binary_output': 'binary_crossentropy', 'type_output': 'categorical_crossentropy'},
              metrics={'binary_output': 'accuracy', 'type_output': 'accuracy'})

# Stop when the validation loss has not improved for a few epochs and roll back to
# the best weights, instead of always running all 20 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Plain NumPy targets: `validation_split` only accepts arrays/tensors, and np.asarray
# also drops the shuffled pandas index left on the binary labels by train_test_split.
train_targets = {
    'binary_output': np.asarray(y_train_binary),
    'type_output': np.asarray(y_train_type),
}

# Train model. Validation uses the last 10% of the training rows rather than the test
# split, so early stopping does not tune on the data later used for final evaluation.
history = model.fit(X_train,
                    train_targets,
                    validation_split=0.1,
                    epochs=20,
                    batch_size=32,
                    callbacks=[early_stopping])

In [None]:
# Plot training vs validation loss (total loss, i.e. summed over both tasks)
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_title('Overall Loss Trends')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()


# Evaluate the model on the held-out test split.
# return_dict=True gives metrics by name; the length and order of the list form
# differ between Keras versions, so positional unpacking is fragile.
eval_results = model.evaluate(
    X_test,
    {'binary_output': np.asarray(y_test_binary), 'type_output': np.asarray(y_test_type)},
    return_dict=True,
)

loss = eval_results['loss']
# Per-output losses are not reported by every Keras version, hence .get()
binary_loss = eval_results.get('binary_output_loss')
multiclass_loss = eval_results.get('type_output_loss')
binary_acc = eval_results['binary_output_accuracy']
multiclass_acc = eval_results['type_output_accuracy']

print(f'Binary Classification Accuracy: {binary_acc}')
print(f'Multiclass Classification Accuracy: {multiclass_acc}')