# Data preprocessing

Including:
- [Statistics](#Statistics)
- [Graph construction](#Graph-construction)

In [1]:
# imports
import os
import pickle
import re
import shutil
import sys

import numpy as np
import pandas as pd
import torch
import torch_geometric.nn as pyg_nn
from biopandas.pdb import PandasPdb
from torch_geometric.data import Data, InMemoryDataset
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

In [None]:
# Lookup table: three-letter residue name -> one-letter amino-acid code.
# Besides the standard residues it contains force-field style variants
# (ASH, CYX, HID/HIE/HIP), a few modified residues (MSE, HYP, PYL, SEL)
# and UNK, which maps to the placeholder 'X'.
amino3to1dict = {
    # ASH is protonated aspartic acid, so it belongs to D (it was mapped to 'A').
    'ASH': 'D',
    'ALA': 'A',
    'CYX': 'C',
    'CYS': 'C',
    'ASP': 'D',
    'GLU': 'E',
    'PHE': 'F',
    'GLY': 'G',
    'HIS': 'H',
    'HID': 'H',
    'HIE': 'H',
    'HIP': 'H',
    'ILE': 'I',
    'LYS': 'K',
    'LEU': 'L',
    'MET': 'M',
    'MSE': 'M',
    'ASN': 'N',
    'PYL': 'O',
    'HYP': 'P',
    'PRO': 'P',
    'GLN': 'Q',
    'ARG': 'R',
    'SER': 'S',
    'THR': 'T',
    # NOTE(review): the usual PDB code for selenocysteine is SEC; 'SEL' is kept
    # unchanged here -- confirm which name the input files actually use.
    'SEL': 'U',
    'VAL': 'V',
    'TRP': 'W',
    'TYR': 'Y',
    'UNK': 'X',
}

## Statistics

In [None]:
# Load the ProtBERT tokenizer and encoder once; both are cached on disk so the
# weights are only downloaded on the first run.
PROT_BERT_CHECKPOINT = "Rostlab/prot_bert"
MODEL_CACHE_DIR = './cache_model/'

tokenizer = BertTokenizer.from_pretrained(
    PROT_BERT_CHECKPOINT,
    do_lower_case=False,  # residue letters are upper-case; do not fold them
    cache_dir=MODEL_CACHE_DIR,
)
pretrain_model = BertModel.from_pretrained(
    PROT_BERT_CHECKPOINT,
    cache_dir=MODEL_CACHE_DIR,
)

In [None]:
def get_protein_features(seq):
    """Embed a protein sequence with the pretrained BERT encoder.

    Parameters
    ----------
    seq : str
        Amino-acid sequence in one-letter codes. The letters U, Z, O and B
        are replaced by ``X`` before tokenisation.

    Returns
    -------
    torch.Tensor
        The encoder's last hidden state with the batch dimension squeezed out
        and the first and last positions dropped (presumably the special
        tokens the tokenizer adds around the sequence -- confirm). The tensor
        carries no autograd history.
    """
    # The tokenizer is fed one residue per whitespace-separated token.
    spaced_sequence = ' '.join(seq)
    spaced_sequence = re.sub(r"[UZOB]", "X", spaced_sequence)
    encoded_input = tokenizer(spaced_sequence, return_tensors='pt')

    # Feature extraction only: disable autograd so no graph is built for the
    # forward pass (the original built one and detached afterwards).
    with torch.no_grad():
        hidden_states = pretrain_model(**encoded_input).last_hidden_state

    return hidden_states.squeeze(0)[1:-1, :].detach()

## BioLiP pdb

In [None]:
# pdb file consistency check
# The dataset file is walked in strides of three lines: the first line of each
# record carries the PDB id (characters 1-5) and the second line the sequence.
# For every record the sequence rebuilt from the CA atoms of the PDB file is
# compared with the stored one; the first mismatching id is printed and the
# scan stops.
with open("./Dataset/TE639.txt", 'r') as dataset_file:
    lines = dataset_file.readlines()

for i in range(0, len(lines), 3):
    pdb_id = lines[i][1:6]
    atom = PandasPdb().read_pdb("./pdb_files/" + pdb_id + '.pdb').df['ATOM']
    atom = atom[atom['atom_name'] == 'CA']
    # sort_values returns a new frame; the result used to be discarded, so the
    # rows were never actually ordered. A stable sort keeps the file order of
    # rows that share a residue number.
    atom = atom.sort_values('residue_number', kind='mergesort')

    seq = ''.join(amino3to1dict[residue] for residue in atom['residue_name'])
    if lines[i+1].strip() != seq:
        print(pdb_id)
        break

## Graph-construction

In [None]:
# Build one pickle per dataset split. Each dataset file is read in strides of
# three lines: id line, sequence line, and a third line that is stored as-is
# (presumably the per-residue labels -- confirm against the dataset format).
for sets in ['TE125.txt', 'TE639.txt', 'TR640.txt', 'TR1154.txt']:
    print(f"processing file {sets}........")
    data_dict = {}
    # PDB ids whose sequence length differs from the number of CA atoms.
    inconsistent = []
    with open("./Dataset/" + sets, 'r') as dataset_file:
        lines = dataset_file.readlines()

    print("processing data........")
    for start in tqdm(range(0, len(lines), 3)):
        pdb_id = lines[start][1:6]
        sequence = lines[start + 1].strip()
        third_line = lines[start + 2].strip()

        cpdb = PandasPdb().read_pdb("./pdb_files/" + pdb_id + '.pdb').df['ATOM']
        alpha_c_df = cpdb[cpdb['atom_name'] == 'CA']
        # Stable sort: rows sharing a residue number keep their file order.
        alpha_c_df = alpha_c_df.sort_values('residue_number', kind='mergesort')

        # Embeddings are computed from the stored sequence while coordinates
        # come from the PDB file; record entries where the two disagree in length.
        if len(alpha_c_df) != len(sequence):
            inconsistent.append(pdb_id)

        data_dict[pdb_id] = (
            get_protein_features(sequence),
            third_line,
            alpha_c_df[['x_coord', 'y_coord', 'z_coord']],
        )

    print(f"file {sets} processing finished!")
    print(f"Storage file length: {len(data_dict)}")
    if inconsistent:
        print(f"Length mismatch (sequence vs. CA atoms) for: {inconsistent}")

    # Make sure the target directory exists and close the file deterministically
    # so the pickle is fully flushed to disk.
    output_dir = "./Dataset_pkl/raw/"
    os.makedirs(output_dir, exist_ok=True)
    with open(output_dir + sets.split('.')[0] + '.pkl', 'wb') as pickle_file:
        pickle.dump(data_dict, pickle_file)
    print("Data saved!")
    print('\n')

In [25]:
# NOTE(review): this cell is a verbatim copy of the preceding graph-construction
# cell; consider keeping only one of them.
# Build one pickle per dataset split. Each dataset file is read in strides of
# three lines: id line, sequence line, and a third line that is stored as-is
# (presumably the per-residue labels -- confirm against the dataset format).
for sets in ['TE125.txt', 'TE639.txt', 'TR640.txt', 'TR1154.txt']:
    print(f"processing file {sets}........")
    data_dict = {}
    # PDB ids whose sequence length differs from the number of CA atoms.
    inconsistent = []
    with open("./Dataset/" + sets, 'r') as dataset_file:
        lines = dataset_file.readlines()

    print("processing data........")
    for start in tqdm(range(0, len(lines), 3)):
        pdb_id = lines[start][1:6]
        sequence = lines[start + 1].strip()
        third_line = lines[start + 2].strip()

        cpdb = PandasPdb().read_pdb("./pdb_files/" + pdb_id + '.pdb').df['ATOM']
        alpha_c_df = cpdb[cpdb['atom_name'] == 'CA']
        # Stable sort: rows sharing a residue number keep their file order.
        alpha_c_df = alpha_c_df.sort_values('residue_number', kind='mergesort')

        # Embeddings are computed from the stored sequence while coordinates
        # come from the PDB file; record entries where the two disagree in length.
        if len(alpha_c_df) != len(sequence):
            inconsistent.append(pdb_id)

        data_dict[pdb_id] = (
            get_protein_features(sequence),
            third_line,
            alpha_c_df[['x_coord', 'y_coord', 'z_coord']],
        )

    print(f"file {sets} processing finished!")
    print(f"Storage file length: {len(data_dict)}")
    if inconsistent:
        print(f"Length mismatch (sequence vs. CA atoms) for: {inconsistent}")

    # Make sure the target directory exists and close the file deterministically
    # so the pickle is fully flushed to disk.
    output_dir = "./Dataset_pkl/raw/"
    os.makedirs(output_dir, exist_ok=True)
    with open(output_dir + sets.split('.')[0] + '.pkl', 'wb') as pickle_file:
        pickle.dump(data_dict, pickle_file)
    print("Data saved!")
    print('\n')

processing file TE125.txt........
processing data........


  9%|▉         | 11/125 [00:09<01:19,  1.44it/s]