# Data preprocessing

Including:
- [Statistics](#Statistics)
- [Graph construction](#Graph-construction)

In [1]:
# imports
import os
import pickle
import re
import shutil
import sys

import numpy as np
import pandas as pd
import torch
import torch_geometric.nn as pyg_nn
from biopandas.pdb import PandasPdb
from torch_geometric.data import Data, InMemoryDataset
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

In [13]:
# Three-letter residue name -> one-letter amino-acid code.
# Covers the 20 standard residues plus force-field variant names
# (ASH, CYX, HID, HIE, HIP) and a few modified residues (MSE, HYP, PYL).
# 'UNK' maps to the unknown-residue symbol 'X'.
amino3to1dict = {'ASH': 'D',  # protonated aspartic acid -> same letter as ASP (was wrongly 'A')
                 'ALA': 'A',
                 'CYX': 'C',  # disulfide-bonded cysteine
                 'CYS': 'C',
                 'ASP': 'D',
                 'GLU': 'E',
                 'PHE': 'F',
                 'GLY': 'G',
                 'HIS': 'H',
                 'HID': 'H',  # histidine protonation variants
                 'HIE': 'H',
                 'HIP': 'H',
                 'ILE': 'I',
                 'LYS': 'K',
                 'LEU': 'L',
                 'MET': 'M',
                 'MSE': 'M',  # selenomethionine
                 'ASN': 'N',
                 'PYL': 'O',  # pyrrolysine
                 'HYP': 'P',  # hydroxyproline
                 'PRO': 'P',
                 'GLN': 'Q',
                 'ARG': 'R',
                 'SER': 'S',
                 'THR': 'T',
                 # NOTE(review): presumably selenocysteine; the usual PDB code is
                 # 'SEC' -- confirm which name the pdb files actually use.
                 'SEL': 'U',
                 'VAL': 'V',
                 'TRP': 'W',
                 'TYR': 'Y',
                 'UNK': 'X'}

## Statistics

In [2]:
# Load the pretrained tokenizer and encoder once for the whole notebook.
# Both use the same checkpoint id and the same local cache directory.
_PROT_BERT_CHECKPOINT = "Rostlab/prot_bert"
_PROT_BERT_CACHE_DIR = './cache_model/'

tokenizer = BertTokenizer.from_pretrained(
    _PROT_BERT_CHECKPOINT,
    do_lower_case=False,
    cache_dir=_PROT_BERT_CACHE_DIR,
)
pretrain_model = BertModel.from_pretrained(
    _PROT_BERT_CHECKPOINT,
    cache_dir=_PROT_BERT_CACHE_DIR,
)

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def get_protein_features(seq):
    """Embed a protein sequence with the notebook-level pretrained model.

    Relies on the module-level ``tokenizer`` and ``pretrain_model``.

    Parameters
    ----------
    seq : str
        Amino-acid sequence in one-letter codes.

    Returns
    -------
    torch.Tensor
        The model's last hidden state with the batch dimension squeezed out
        and the first and last token rows removed (presumably the special
        start/end tokens added by the tokenizer, leaving one row per residue
        -- TODO confirm). The tensor carries no autograd history.
    """
    # Separate residues with spaces so each letter is tokenized on its own.
    spaced_seq = ' '.join(seq)
    # Replace the letters U, Z, O and B with the unknown symbol X.
    spaced_seq = re.sub(r"[UZOB]", "X", spaced_seq)
    encoded_input = tokenizer(spaced_seq, return_tensors='pt')
    # Pure feature extraction: skip autograd bookkeeping so no computation
    # graph is built (and kept alive) for every sequence.
    with torch.no_grad():
        last_hidden = pretrain_model(**encoded_input).last_hidden_state.squeeze(0)[1:-1, :]
    return last_hidden.detach()

## BioLiP pdb

In [23]:
# pdb file consistency check
# Each record in the dataset file spans three lines: the first holds the pdb id
# (characters 1-5), the second the sequence. For every record, rebuild the
# sequence from the CA atoms of the matching pdb file and stop at the first
# record whose sequence differs, printing its id.
with open("./Dataset/TE639.txt", 'r') as dataset_file:
    lines = dataset_file.readlines()
for i in range(0 ,len(lines), 3):
    pdb_id = lines[i][1:6]
    atom = PandasPdb().read_pdb("./pdb_files/" + pdb_id + '.pdb').df['ATOM']
    atom = atom[(atom['atom_name'] == 'CA')]
    # sort_values returns a new frame, so the result must be assigned; the
    # original call discarded it. A stable sort keeps file order for rows that
    # share a residue_number.
    atom = atom.sort_values('residue_number', kind='mergesort')

    seq = ''.join(amino3to1dict[name] for name in atom['residue_name'])
    if lines[i+1].strip() != seq:
        print(pdb_id)
        break

## Graph-construction

In [25]:
# For every dataset file, build a dict
#   pdb_id -> (sequence features, third record line, CA coordinates)
# and pickle it to ./Dataset_pkl/raw/<dataset name>.pkl.
for sets in ['TE125.txt', 'TE639.txt', 'TR640.txt', 'TR1154.txt']:
    print(f"processing file {sets}........")
    data_dict = {}
    inconsistent = []  # never populated in this cell; kept in case later cells read it
    # Read the file inside a context manager so the handle is closed promptly.
    with open("./Dataset/" + sets, 'r') as dataset_file:
        lines = dataset_file.readlines()

    print("processing data........")
    # Records span three lines: id line, sequence line, and a third line that
    # is stored as-is (stripped) next to the features.
    for l in tqdm(range(0, len(lines), 3)):
        pdb_id = lines[l][1:6]
        cpdb = PandasPdb().read_pdb("./pdb_files/" + pdb_id + '.pdb').df['ATOM']
        alpha_c_df = cpdb[(cpdb['atom_name'] == 'CA')]
        # Stable sort: rows sharing a residue_number keep their file order, so
        # the coordinate order is deterministic.
        alpha_c_df = alpha_c_df.sort_values('residue_number', kind='mergesort')
        data_dict[pdb_id] = (get_protein_features(lines[l+1].strip()), lines[l+2].strip(), alpha_c_df[['x_coord', 'y_coord', 'z_coord']])


    print(f"file {sets} processing finished!")
    print(f"Storage file length: {len(data_dict)}")
    # Create the output directory on a fresh checkout and close the pickle
    # file deterministically so the data is fully flushed before "Data saved!".
    os.makedirs("./Dataset_pkl/raw/", exist_ok=True)
    with open("./Dataset_pkl/raw/" + sets.split('.')[0] + '.pkl', 'wb') as pkl_file:
        pickle.dump(data_dict, pkl_file)
    print("Data saved!")
    print('\n')

processing file TE125.txt........
processing data........


100%|██████████| 125/125 [02:40<00:00,  1.28s/it]


file TE125.txt processing finished!
Storage file length: 125
Data saved!


processing file TE639.txt........
processing data........


100%|██████████| 639/639 [13:13<00:00,  1.24s/it]


file TE639.txt processing finished!
Storage file length: 639
Data saved!


processing file TR640.txt........
processing data........


100%|██████████| 640/640 [14:23<00:00,  1.35s/it]


file TR640.txt processing finished!
Storage file length: 640
Data saved!


processing file TR1154.txt........
processing data........


100%|██████████| 1154/1154 [24:07<00:00,  1.25s/it]


file TR1154.txt processing finished!
Storage file length: 1154
Data saved!


