In [1]:
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


# Code for NN model trained with oversampling of cleaned dataset with additional features

In [2]:
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.11.4


In [3]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd



path = 'drive/MyDrive/StructuralBioinformaticsUNIPD/Final Project'

df = pd.read_csv(path + "/clean_additional_features_dataset.csv")
df = df.drop("Unnamed: 0", axis=1)

# Remove all rows with NaN in at least one column
# including rows with missing class (they could be false negatives)
df.dropna(inplace=True)

# drop hydrophobicity feature since highly correlated with s_aa1
df = df.drop(['s_residue_hydrophobicity', 't_residue_hydrophobicity'], axis=1)


# Fit and transform the categorical columns

label_encoder = LabelEncoder()
encoded_values = label_encoder.fit_transform(df['s_ss8'])
df['s_ss8'] = encoded_values

label_encoder = LabelEncoder()
encoded_values = label_encoder.fit_transform(df['s_ss3'])
df['s_ss3'] = encoded_values

label_encoder = LabelEncoder()
encoded_values = label_encoder.fit_transform(df['s_ss8'])
df['t_ss8'] = encoded_values

label_encoder = LabelEncoder()
encoded_values = label_encoder.fit_transform(df['t_ss3'])
df['t_ss3'] = encoded_values

In [4]:
# Convert dataset to multilabeled by grouping entries by residue identifying columns, using one-hot encoding
# in X_list and y_list we have the lists containing the dataset entries and labels, that will be used for training

grouped_df = df.drop(["pdb_id"], axis=1).groupby(['s_ch', 's_resi', 's_ins', 's_resn', 't_ch', 't_resi', 't_ins', 't_resn' ])

# List to store the values from all groups
all_group_values = []
X_list = []
y_list = []

label_dict = {"HBOND": 0,
              "IONIC": 1,
              "PICATION": 2,
              "PIPISTACK": 3,
              "SSBOND": 4,
              "VDW": 5 }

# Iterate over the groups and extract values
for group_name, group_df in grouped_df:
    values_within_group = group_df['Interaction'].tolist()
    row = group_df.iloc[:1,:-1].drop(['s_ch', 's_resi', 's_ins', 's_resn', 't_ch', 't_resi', 't_ins', 't_resn'], axis =1).values
    X_list.append(list(row[0]))

    # one-hot encoding
    labels_list = [0,0,0,0,0,0]
    for label in values_within_group:
      labels_list[label_dict[label]]= 1

    # print(labels_list)
    y_list.append(labels_list)


In [7]:
# MLSMOTE - code for oversampling of the dataset
# -*- coding: utf-8 -*-
# Importing required Library
import numpy as np
import pandas as pd
import random
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors


def get_tail_label(df):
    """
    Give tail label colums of the given target dataframe

    args
    df: pandas.DataFrame, target label df whose tail label has to identified

    return
    tail_label: list, a list containing column name of all the tail label
    """
    columns = df.columns
    n = len(columns)
    irpl = np.zeros(n)
    for column in range(n):
        irpl[column] = df[columns[column]].value_counts()[1]
    irpl = max(irpl)/irpl
    mir = np.average(irpl)
    tail_label = []
    for i in range(n):
        if irpl[i] > mir:
            tail_label.append(columns[i])
    return tail_label

def get_index(df):
  """
  give the index of all tail_label rows
  args
  df: pandas.DataFrame, target label df from which index for tail label has to identified

  return
  index: list, a list containing index number of all the tail label
  """
  tail_labels = get_tail_label(df)
  index = set()
  for tail_label in tail_labels:
    sub_index = set(df[df[tail_label]==1].index)
    index = index.union(sub_index)
  return list(index)

def get_minority_instace(X, y):
    """
    Give minority dataframe containing all the tail labels

    args
    X: pandas.DataFrame, the feature vector dataframe
    y: pandas.DataFrame, the target vector dataframe

    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    """
    index = get_index(y)
    X_sub = X[X.index.isin(index)].reset_index(drop = True)
    y_sub = y[y.index.isin(index)].reset_index(drop = True)
    return X_sub, y_sub

def nearest_neighbour(X):
    """
    Give index of 5 nearest neighbor of all the instance

    args
    X: np.array, array whose nearest neighbor has to find

    return
    indices: list of list, index of 5 NN of each element in X
    """
    nbs=NearestNeighbors(n_neighbors=5,metric='euclidean',algorithm='kd_tree').fit(X)
    euclidean,indices= nbs.kneighbors(X)
    return indices

def MLSMOTE(X,y, n_sample):
    """
    Give the augmented data using MLSMOTE algorithm

    args
    X: pandas.DataFrame, input vector DataFrame
    y: pandas.DataFrame, feature vector dataframe
    n_sample: int, number of newly generated sample

    return
    new_X: pandas.DataFrame, augmented feature vector data
    target: pandas.DataFrame, augmented target vector data
    """
    indices2 = nearest_neighbour(X)
    n = len(indices2)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0,n-1)
        neighbour = random.choice(indices2[reference,1:])
        all_point = indices2[reference]
        nn_df = y[y.index.isin(all_point)]
        ser = nn_df.sum(axis = 0, skipna = True)
        target[i] = np.array([1 if val>2 else 0 for val in ser])
        ratio = random.random()
        gap = X.loc[reference,:] - X.loc[neighbour,:]
        new_X[i] = np.array(X.loc[reference,:] + ratio * gap)
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    new_X = pd.concat([X, new_X], axis=0)
    target = pd.concat([y, target], axis=0)
    return new_X, target


In [8]:
# mlp for multi-label classification
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import accuracy_score
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import(
    classification_report,
    matthews_corrcoef,
    balanced_accuracy_score,
    average_precision_score,
    roc_auc_score,
)
from sklearn.metrics import multilabel_confusion_matrix
import keras


def NN_model(n_inputs, n_outputs):
	model = Sequential()
	model.add(Dense(64, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
	model.add(Dense(32, activation='relu'))
	model.add(Dense(20,  activation='relu'))
	model.add(Dropout(0.1))
	model.add(Dense(n_outputs, activation='sigmoid'))
	model.compile(loss='binary_crossentropy', optimizer='adam') #, metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()])
	return model

In [9]:
# train the NN model

import torch
from torchmetrics.functional.classification import multilabel_matthews_corrcoef
from tensorflow.keras.models import save_model

def train_NN_model(X, y):
	n_inputs, n_outputs = X.shape[1], y.shape[1]

	X_train = X
	y_train = y

	X_sub, y_sub = get_minority_instace(pd.DataFrame(X_train), pd.DataFrame(y_train, columns=['HBOND', 'IONIC', 'PICATION', 'PIPISTACK', 'SSBOND', 'VDW']).astype(int))   #Getting minority instance of that datframe
	X_res,y_res =MLSMOTE(X_sub, y_sub, int(0.1*len(X_train)))

	X_train = np.concatenate((X_train, X_res[len(X_sub):].values))
	y_train = np.concatenate((y_train, y_res[len(y_sub):].values))

	# define model
	model = NN_model(n_inputs, n_outputs)
	# fit model
	early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
	history = model.fit(X_train, y_train, verbose=1, epochs=100, callbacks=[early_stopping])

	save_model(model, path + '/final_model.h5')


	return model

# load dataset
X1, y1 = np.array(X_list), np.array(y_list)
# evaluate model
model= train_NN_model(X1, y1)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100


In [7]:
import tensorflow as tf
import numpy as np

# just a check if model is saved and loaded properly
X1, y1 = np.array(X_list), np.array(y_list)

# Load the saved model
model = tf.keras.models.load_model(path + '/final_model.h5')

# Assuming you want to make a prediction on the first element of X1
single_input = X1[0]  # Select the desired element

# Reshape the input to match the expected shape by the model
single_input = np.expand_dims(single_input, axis=0)

print(single_input)

# Make the prediction on the single input
y_pred = model.predict(single_input)

# Round the probabilities to class labels
y_pred = np.round(y_pred)

# Print the predicted class label
print(y_pred)

[[ 6.          0.528       2.         13.         -1.611      -0.394
   0.         -0.032       0.326       2.213       0.908       1.313
   6.          0.356      12.         15.         -1.113      -0.35
   0.          1.831      -0.561       0.533      -0.277       1.648
   5.07838     5.18498936  8.75005207  0.          0.        ]]
[[1. 0. 0. 0. 0. 0.]]
