In [1]:
# general imports

import os
import sys
import pandas as pd
import torch
import time

In [2]:
# import Paragraph code

# Resolve the directory two levels above the current working directory and the
# "Paragraph" directory inside it, then make both importable.
module_path = os.path.abspath(os.path.join('../..'))
src_path = os.path.join(module_path, "Paragraph")

# Append each location only when it is missing, so re-running this cell does
# not add duplicate entries to sys.path.
for import_dir in (module_path, src_path):
    if import_dir not in sys.path:
        sys.path.append(import_dir)

from Paragraph.dataset import ParagraphDataset
from Paragraph.model import EGNN_Model
from Paragraph.predict import get_dataloader, evaluate_model

In [3]:
# filesystem locations used by this example

# abspath("") resolves to the current working directory
example_dir = os.path.abspath("")

# inputs that live next to the example
pdb_H_L_csv = os.path.join(example_dir, "pdb_H_L_key.csv")
pdb_folder_path = os.path.join(example_dir, "pdbs")

# "trained_model" directory inside the Paragraph source directory
trained_model_path = os.path.join(src_path, "trained_model")

In [4]:
# seed torch's random number generator so the dummy tensors below are repeatable
seed = 0
torch.manual_seed(seed)

# prefer CUDA when torch reports it as available, otherwise run on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Device being used: {device}\n")

Device being used: cpu



### Data processing for input

In [5]:
# examine dataset object contents

ds = ParagraphDataset(pdb_H_L_csv=pdb_H_L_csv, pdb_folder_path=pdb_folder_path)

# Index the dataset (ds[0]) instead of calling the __getitem__ dunder directly.
# The first sample unpacks into the three graph inputs (node features,
# co-ordinates, edge features) and a tuple of per-node metadata.
(feats, coors, edges), (pdb_code, AAs, AtomNum, chain, chain_type, IMGT, x, y, z) = ds[0]

print(pdb_code)
print(f"Co-ordinate dimensions: \t{coors.shape}")
print(f"Node feature dimensions: \t{feats.shape}")
print(f"Edge feature dimensions: \t{edges.shape}\n")

# Tabulate the per-node metadata for a quick visual check
# (chain_type is unpacked above but is not included in this preview table).
df = pd.DataFrame(zip(AAs, AtomNum, chain, IMGT, x, y, z),
                  columns=["AAs", "AtomNum", "chain", "IMGT", "x", "y", "z"])
df.head()

4edw
Co-ordinate dimensions: 	torch.Size([72, 3])
Node feature dimensions: 	torch.Size([72, 22])
Edge feature dimensions: 	torch.Size([72, 72, 1])



Unnamed: 0,AAs,AtomNum,chain,IMGT,x,y,z
0,VAL,1011,H,25,9.294,-11.476,-36.29
1,SER,1018,H,26,12.006,-13.6,-38.105
2,GLY,1024,H,27,12.275,-15.929,-41.183
3,PHE,1028,H,28,10.463,-13.73,-43.752
4,SER,1039,H,29,10.485,-10.124,-45.044


### EGNN model

In [6]:
# test model architecture

# use small number of features and samples so results can be easily read manually
num_samples = 3
num_feats = 6

# one graph layer and 1 hidden linear layer (half as wide as the input features)
graph_hidden_layer_output_dims = [num_feats]
linear_hidden_layer_output_dims = [num_feats // 2]

# create dummy data and examine this; unsqueeze(0) adds a leading dimension
# of size 1 to each random tensor
feats = torch.rand((num_samples, num_feats), device=device).unsqueeze(0)
coors = torch.rand((num_samples, 3), device=device).unsqueeze(0)
edges = torch.rand((num_samples, num_samples, 1), device=device).unsqueeze(0)
print("Input features:\n", feats)

# create our model
dummy_net = EGNN_Model(num_feats = num_feats,
                       graph_hidden_layer_output_dims = graph_hidden_layer_output_dims,
                       linear_hidden_layer_output_dims = linear_hidden_layer_output_dims)

# add model to gpu if possible
dummy_net = dummy_net.to(device)

# pass our data through our model and examine the new embeddings.
# Call the module itself rather than .forward() so that any hooks registered
# on the module run as part of the call.
# NOTE: `feats` is rebound to the model output here, as in the original cell.
feats = dummy_net(feats, coors, edges)
print("\nOutput features (i.e. node predictions):\n", feats)

# uncomment to inspect the layer structure:
# print("\nExample network architecture:\n", dummy_net)

Input features:
 tensor([[[0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341],
         [0.4901, 0.8964, 0.4556, 0.6323, 0.3489, 0.4017],
         [0.0223, 0.1689, 0.2939, 0.5185, 0.6977, 0.8000]]])

Output features (i.e. node predictions):
 tensor([[[-0.4605],
         [-0.4715],
         [-0.5491]]], grad_fn=<AddBackward0>)


### Predictions

In [7]:
# layer sizes used in the pre-trained model

# node features: 20D one-hot encoding of AA type and 2D one-hot encoding of chain ID
num_feats = 22

# six graph layers that keep the feature width, then two hidden linear layers of width 10
graph_hidden_layer_output_dims = [num_feats for _ in range(6)]
linear_hidden_layer_output_dims = [10 for _ in range(2)]

In [8]:
# where predictions are written and where the pre-trained weights are read from

predictions_output_path = os.path.join(example_dir, "example_predictions.csv")
saved_model_path = os.path.join(trained_model_path, "pretrained_weights.pt")

In [9]:
# load net with pre-trained weights

dl = get_dataloader(pdb_H_L_csv, pdb_folder_path)

saved_net = EGNN_Model(num_feats = num_feats,
                       graph_hidden_layer_output_dims = graph_hidden_layer_output_dims,
                       linear_hidden_layer_output_dims = linear_hidden_layer_output_dims)

# Remap the stored tensors onto the active device while loading. This replaces
# the previous "try, then retry on RuntimeError with map_location='cpu'" logic,
# which also re-ran the load when load_state_dict itself failed (e.g. on a
# key/shape mismatch) before surfacing the real error.
# NOTE(review): torch.load unpickles the file, so only load weight files that
# come from a trusted source.
state_dict = torch.load(saved_model_path, map_location=device)
saved_net.load_state_dict(state_dict)
saved_net = saved_net.to(device)

In [10]:
# Run the loaded model over the dataloader, save the per-node predictions to
# CSV and report how long this took.

# Only the part of each path after the last "Paragraph" is printed.
print("Evaluating using weight file:\n{}\n".format(saved_model_path.split("Paragraph")[-1]))

# perf_counter is a monotonic, high-resolution clock, so the elapsed time is
# not affected by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()

detailed_record_df = evaluate_model(model = saved_net,
                                    dataloader = dl,
                                    device = device)
detailed_record_df.to_csv(predictions_output_path, index=False)

print("Results saved to:\n{}\n".format(predictions_output_path.split("Paragraph")[-1]))
print("Total time to evaluate against test-set {:.3f}s".format(time.perf_counter()-start_time))

Evaluating using weight file:
/trained_model/pretrained_weights.pt

Results saved to:
/example/example_predictions.csv

Total time to evaluate against test-set 0.095s


In [11]:
# read the predictions CSV back from disk and show it as the cell output
df_pred = pd.read_csv(filepath_or_buffer=predictions_output_path)
df_pred

Unnamed: 0,pdb,chain_type,chain_id,IMGT,AA,atom_num,x,y,z,pred
0,4edw,H,H,25,VAL,1011,9.294,-11.476,-36.290,0.010000
1,4edw,H,H,26,SER,1018,12.006,-13.600,-38.105,0.010074
2,4edw,H,H,27,GLY,1024,12.275,-15.929,-41.183,0.279932
3,4edw,H,H,28,PHE,1028,10.463,-13.730,-43.752,0.615672
4,4edw,H,H,29,SER,1039,10.485,-10.124,-45.044,0.482708
...,...,...,...,...,...,...,...,...,...,...
67,4edw,L,L,115,PRO,3223,-13.784,-8.117,-42.343,0.042529
68,4edw,L,L,116,TYR,3230,-12.503,-11.678,-41.940,0.812428
69,4edw,L,L,117,THR,3242,-14.895,-13.611,-39.653,0.002904
70,4edw,L,L,118,PHE,3249,-14.576,-16.948,-37.805,0.004652
